In [1]:
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd
from BertEmbeddings import BertEmbeddings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from string import punctuation
import numpy as np
from scipy.stats import entropy
from math import log, e
import pandas as pd
import timeit

import json
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson.natural_language_understanding_v1 import Features, KeywordsOptions, EntitiesOptions

In [2]:
# Input data
# Source: the Google Sheet in the "Data" folder. Combine all 4 sub-sheets into
# one sheet and rename it so that a single csv file with this name exists.
file = "COMP All courses.csv"
df_train = pd.read_csv(file, encoding='utf-8')

# Drop rows whose label (5th column) is "N" (unrelated to OH requests), then
# drop every row that still has a missing value in any column.
# The label column is addressed by position: an earlier attempt to select it by
# its full header text was noted as failing with a column retrieval error.
# Header text of that column, for reference:
#   "Type (Good(G), Lack of Time element(T), Lack of Reason element(R), No asking for OH(N))"
df_train = df_train[df_train.iloc[:, 4] != "N"].dropna()

# 3rd column is iterated as the request text by later cells; 5th column holds the labels.
content = df_train.iloc[:, 2]
labels = df_train.iloc[:, 4]

#### Add columns for fine labels

In [4]:
# Derive two binary target columns from the label text:
# 1 when the label contains the letter "T" (resp. "R"), otherwise 0.
time = np.array([1 if "T" in label else 0 for label in labels])
reason = np.array([1 if "R" in label else 0 for label in labels])
df_train["Lack of time compartment?"] = time
df_train["Lack of reason compartment?"] = reason

#### Time feature 1: Does the request contain a substring of the form "at|before|after|around X", "Xam/AM/pm/PM" or "X am/AM/pm/PM" (such as "at 10" or "10am"), an HH:MM time (such as 10:00), or an HH-HH range (such as 10-11)?

In [6]:
# Three independent patterns; a post is flagged (1) as soon as any of them matches.
time_patterns = (
    # preposition + number, or number followed by am/pm (with or without a space)
    re.compile(r'at [0-9]+|before [0-9]+|after [0-9]+|around [0-9]+|[0-9]+am|[0-9]+pm|[0-9]+ am|[0-9]+ pm|[0-9]+AM|[0-9]+PM|[0-9]+ AM|[0-9]+ PM'),
    # digits:digits, e.g. 10:00
    re.compile(r'[0-9]+:[0-9]+'),
    # digits-digits, e.g. 10-11
    re.compile(r'[0-9]+-[0-9]+'),
)
time_feature_1 = [
    1 if any(pattern.search(post) for pattern in time_patterns) else 0
    for post in content
]
df_train['Time Feature 1'] = time_feature_1

#### Time feature 2: Does the request contain the substring "after someone" or "with someone" (such as "I would like to come after/with someone...")?

In [7]:
# "after"/"with" followed by a capitalised word (one uppercase letter, then
# at least one lowercase letter).
companion_patterns = (
    re.compile(r'after [A-Z][a-z]+'),
    re.compile(r'with [A-Z][a-z]+'),
)
time_feature_2 = [
    1 if any(pattern.search(post) for pattern in companion_patterns) else 0
    for post in content
]
df_train['Time Feature 2'] = time_feature_2

#### Time feature 3: Does the request contain a substring of the form "MM/DD HH" (such as "11/17 11")?

In [8]:
# digits/digits, a space, then more digits (e.g. "11/17 11").
date_hour_pattern = re.compile(r'[0-9]+/[0-9]+ [0-9]+')
time_feature_3 = [
    1 if date_hour_pattern.search(post) else 0
    for post in content
]
df_train['Time Feature 3'] = time_feature_3

#### Reason feature 1: How many words does the request have? (Usually short requests are incomplete)

In [9]:
# Word count per post: strip punctuation characters first, then count the
# pieces obtained by splitting on single spaces.
reason_feature_1 = [
    len(''.join(ch for ch in post if ch not in punctuation).split(" "))
    for post in content
]
df_train['Reason Feature 1'] = reason_feature_1

#### Reason feature 2: What is the entropy of the request? (Higher entropy means more unique words, which leads to more specific reasoning)

In [10]:
def entropy1(labels, base=None):
    """Entropy of the value distribution in ``labels`` via ``scipy.stats.entropy``.

    Parameters:
        labels: sequence of values; each distinct value is one class.
        base: logarithm base forwarded to ``scipy.stats.entropy``
            (``None`` is passed through unchanged).

    Returns:
        The entropy computed from the per-class occurrence counts.
    """
    _, occurrences = np.unique(labels, return_counts=True)
    return entropy(occurrences, base=base)

def entropy2(labels, base=None):
    """Compute the entropy of the value distribution in ``labels``.

    Parameters:
        labels: sequence of values; each distinct value is one class.
        base: logarithm base; ``None`` selects the natural logarithm.

    Returns:
        0 when there is at most one label or at most one class, otherwise
        the entropy ``-sum(p * log(p, base))`` over the class probabilities.
    """
    total = len(labels)
    if total <= 1:
        return 0

    _, occurrences = np.unique(labels, return_counts=True)
    probabilities = occurrences / total
    if np.count_nonzero(probabilities) <= 1:
        return 0

    log_base = e if base is None else base

    # Accumulate term by term, in order, starting from a float zero.
    result = 0.
    for probability in probabilities:
        result -= probability * log(probability, log_base)
    return result

def entropy3(labels, base=None):
    """Entropy of the value distribution in ``labels`` using pandas value counts.

    Parameters:
        labels: sequence of values; each distinct value is one class.
        base: logarithm base; ``None`` selects the natural logarithm.

    Returns:
        ``-sum(p * log(p) / log(base))`` over the normalised value counts.
    """
    frequencies = pd.Series(labels).value_counts(normalize=True, sort=False)
    log_base = e if base is None else base
    weighted_logs = frequencies * np.log(frequencies)
    return -(weighted_logs / np.log(log_base)).sum()

def entropy4(labels, base=None):
    """Entropy of the value distribution in ``labels`` using only numpy.

    Parameters:
        labels: sequence of values; each distinct value is one class.
        base: logarithm base; ``None`` selects the natural logarithm.

    Returns:
        ``-sum(p * log(p) / log(base))`` over the normalised class counts.
    """
    _, occurrences = np.unique(labels, return_counts=True)
    probabilities = occurrences / occurrences.sum()
    log_base = e if base is None else base
    weighted_logs = probabilities * np.log(probabilities)
    return -(weighted_logs / np.log(log_base)).sum()

In [11]:
# Word-level entropy per post: strip punctuation characters, split on single
# spaces, and feed the resulting words to entropy2 (natural-log base).
reason_feature_2 = [
    entropy2(''.join(ch for ch in post if ch not in punctuation).split(" "))
    for post in content
]
df_train['Reason Feature 2'] = reason_feature_2

#### Reason feature 3: What is the average TF-IDF of each request (the average of the TF-IDF of each word in the request)?

In [12]:
def computeTF(wordDict, bow):
    """Term frequency of every word in ``wordDict``.

    Parameters:
        wordDict: mapping of word -> occurrence count.
        bow: the document's word list; only its length is used, as the
            denominator.

    Returns:
        A new dict mapping each word to ``count / len(bow)``.
    """
    total_words = float(len(bow))
    return {term: occurrences / total_words for term, occurrences in wordDict.items()}

def computeIDF(docList):
    """Inverse document frequency for every word seen in ``docList``.

    Parameters:
        docList: list of dicts, one per document, mapping word -> count.
            A word "appears" in a document when its count is > 0.

    Returns:
        Dict mapping each word to ``log10(N / document_frequency)`` where
        ``N`` is the number of documents. Words whose count is 0 in every
        document get 0.0 (their term frequency is 0 as well, so this avoids
        a division by zero without affecting any TF-IDF product).
        An empty ``docList`` yields an empty dict.
    """
    import math

    N = len(docList)
    if N == 0:
        return {}

    # Seed with the first document's keys (keeps the original key order), and
    # add any word that only shows up in a later document instead of failing.
    docFrequency = dict.fromkeys(docList[0].keys(), 0)
    for doc in docList:
        for word, val in doc.items():
            if word not in docFrequency:
                docFrequency[word] = 0
            if val > 0:
                docFrequency[word] += 1

    idfDict = {}
    for word, frequency in docFrequency.items():
        if frequency > 0:
            idfDict[word] = math.log10(N / float(frequency))
        else:
            idfDict[word] = 0.0

    return idfDict

def computeTFIDF(tfBow, idfs):
    """Combine term frequencies with inverse document frequencies.

    Parameters:
        tfBow: mapping of word -> term frequency for one document.
        idfs: mapping of word -> IDF; must contain every word in ``tfBow``.

    Returns:
        A new dict mapping each word of ``tfBow`` to ``tf * idf``.
    """
    return {term: frequency * idfs[term] for term, frequency in tfBow.items()}

In [13]:
# Build word set: the vocabulary of all posts after removing punctuation
# characters and splitting on single spaces.
word_set = set()
for post in content:
    stripped_post = ''.join(ch for ch in post if ch not in punctuation)
    word_set |= set(stripped_post.split(" "))

In [14]:
# Build word dictionary list: one dict per post, keyed by the full vocabulary
# (word_set), holding how often each word occurs in that post.
dictionary_list = []
for post in content:
    post_words = ''.join(ch for ch in post if ch not in punctuation).split(" ")
    post_counts = dict.fromkeys(word_set, 0)
    for word in post_words:
        post_counts[word] += 1
    dictionary_list.append(post_counts)

In [15]:
# Calculate each post's average TF-IDF
# The IDF table depends only on the whole corpus (dictionary_list), not on the
# current post, so it is computed once here rather than once per post.
# The guard keeps an empty corpus from calling computeIDF at all.
corpus_idf = computeIDF(dictionary_list) if dictionary_list else {}

reason_feature_3 = []
for post in content:
    # Remove punctuation
    post = ''.join([c for c in post if c not in punctuation])
    word_array = post.split(" ")
    # Per-post word counts over the full vocabulary
    word_dict = dict.fromkeys(word_set, 0)
    for word in word_array:
        word_dict[word] = word_dict[word] + 1
    post_tf = computeTF(word_dict, word_array)
    post_tfidf = computeTFIDF(post_tf, corpus_idf)
    # Average over the post's words; a word that occurs k times is counted k times
    reason_feature_3.append(np.average([post_tfidf[word] for word in word_array]))
df_train['Reason Feature 3'] = reason_feature_3

#### Reason feature 4: How many keywords are in the post? Usually fewer keywords indicate incompleteness.

In [None]:
# Keyword count per post, obtained from IBM Watson Natural Language Understanding.
reason_feature_4 = []

# Credentials are read from the environment when available so that real
# secrets never need to be written into the notebook; the placeholder strings
# remain as the fallback and must be replaced if the variables are not set.
authenticator = IAMAuthenticator(
    os.environ.get(
        'NATURAL_LANGUAGE_UNDERSTANDING_APIKEY',
        'Your IBM Watson Natural Language Understanding API password here',
    )
)
natural_language_understanding = NaturalLanguageUnderstandingV1(
    version='2021-08-01',
    authenticator=authenticator
)

natural_language_understanding.set_service_url(
    os.environ.get(
        'NATURAL_LANGUAGE_UNDERSTANDING_URL',
        'Your IBM Watson Natural Language Understanding API service URL here',
    )
)

progress_count = 0
failed_count = 0
last_error = None
for post in content:
    try:
        response = natural_language_understanding.analyze(text = post, features=Features(keywords=KeywordsOptions(sentiment=True,emotion=True,limit=999))).get_result()
        reason_feature_4.append(len(response["keywords"]))
    except Exception as error:
        # Best effort: a failed request still yields a feature value (0) so the
        # column stays aligned with the posts. Catching Exception (not a bare
        # except) lets KeyboardInterrupt stop this long-running loop.
        reason_feature_4.append(0)
        failed_count = failed_count + 1
        last_error = error
    progress_count = progress_count + 1
    if (progress_count % 50 == 0):
        print(str(progress_count) + " posts processed")

# Surface failures instead of hiding them: a column of zeros caused by bad
# credentials would otherwise look like valid data.
if failed_count > 0:
    print(str(failed_count) + " posts failed and were given 0 keywords; last error: " + repr(last_error))

df_train['Reason Feature 4'] = reason_feature_4

#### Reason feature 5: Does the request contain a link to a Piazza post using "@"? This also indicates completeness of reason.

In [16]:
# "@" immediately followed by one or more digits.
post_link_pattern = re.compile(r'@[0-9]+')
reason_feature_5 = [
    1 if post_link_pattern.search(post) else 0
    for post in content
]
df_train['Reason Feature 5'] = reason_feature_5

#### Reason feature 6: How many times does the request contain keyword "to"?

In [17]:
# Number of space-separated tokens that are exactly "to" (punctuation is not stripped).
reason_feature_6 = [post.split(" ").count("to") for post in content]
df_train['Reason Feature 6'] = reason_feature_6

#### Reason feature 7: How many times does the request contain keyword "I"?

In [18]:
# Number of space-separated tokens that are exactly "I" (punctuation is not stripped).
reason_feature_7 = [post.split(" ").count("I") for post in content]
df_train['Reason Feature 7'] = reason_feature_7

#### Reason feature 8: How many times does the request contain keyword "the"?

In [19]:
# Number of space-separated tokens that are exactly "the" (punctuation is not stripped).
reason_feature_8 = [post.split(" ").count("the") for post in content]
df_train['Reason Feature 8'] = reason_feature_8

#### Reason feature 9: Does the request contain keyword "think"?

In [20]:
# 1 when at least one space-separated token is exactly "think", else 0.
reason_feature_9 = [
    1 if "think" in post.split(" ") else 0
    for post in content
]
df_train['Reason Feature 9'] = reason_feature_9

#### Reason feature 10: Does the request contain keyword "Hello!"?

In [21]:
# 1 when at least one space-separated token is exactly "Hello!" (including the
# exclamation mark), else 0.
reason_feature_10 = [
    1 if "Hello!" in post.split(" ") else 0
    for post in content
]
df_train['Reason Feature 10'] = reason_feature_10

#### Reason feature 11: Does the request contain keyword "OHs"?

In [22]:
# 1 when at least one space-separated token is exactly "OHs", else 0.
reason_feature_11 = [
    1 if "OHs" in post.split(" ") else 0
    for post in content
]
df_train['Reason Feature 11'] = reason_feature_11

#### Reason feature 12: Does the request contain keyword "use"?

In [23]:
# 1 when at least one space-separated token is exactly "use", else 0.
reason_feature_12 = [
    1 if "use" in post.split(" ") else 0
    for post in content
]
df_train['Reason Feature 12'] = reason_feature_12

#### Reason feature 13: Does the request contain keyword "some"?

In [24]:
# 1 when at least one space-separated token is exactly "some", else 0.
reason_feature_13 = [
    1 if "some" in post.split(" ") else 0
    for post in content
]
df_train['Reason Feature 13'] = reason_feature_13

#### Reason feature 14: Does the request contain keyword "question(s)"?

In [25]:
# 1 when at least one space-separated token is exactly "question" or
# "questions", else 0.
reason_feature_14 = []
for post in content:
    tokens = post.split(" ")
    has_keyword = "question" in tokens or "questions" in tokens
    reason_feature_14.append(1 if has_keyword else 0)
df_train['Reason Feature 14'] = reason_feature_14

#### Reason feature 15: Does the request contain keyword "error(s)"?

In [26]:
# 1 when at least one space-separated token is exactly "error" or "errors",
# else 0.
reason_feature_15 = []
for post in content:
    tokens = post.split(" ")
    has_keyword = "error" in tokens or "errors" in tokens
    reason_feature_15.append(1 if has_keyword else 0)
df_train['Reason Feature 15'] = reason_feature_15

#### Split labelled data with features into train and test

In [27]:
# 80/20 split with a fixed seed, then write each part next to the input file.
# Output names reuse the input name up to its first "." plus a suffix.
train, test = train_test_split(df_train, test_size=0.20, random_state=4242)
output_stem = file.split(".")[0]
for suffix, subset in ((" (train).csv", train), (" (test).csv", test)):
    subset.to_csv(output_stem + suffix, index=False)

In [28]:
# Echo the input file name that the train/test CSV names were derived from.
print(file)

COMP All courses.csv
