In [21]:
import os
import string, unidecode, re

import matplotlib as plt
import numpy as np
import pandas as pd
from IPython.core.display import display, HTML
from keras.layers import Dense
from keras.models import Sequential
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Restrict notebook cells to 80% of the page width and centre them.
display(HTML("<style>.jp-Cell { width: 80% !important; margin: 0 auto;}</style>"))

# Location of the comments CSV. When the COMMENTS_CSV_PATH environment variable is set
# it takes precedence, so the notebook can run on machines other than the one this
# absolute path was written for; otherwise the original path is used unchanged.
path = os.environ.get('COMMENTS_CSV_PATH',
                      'E:\\M2 EconStat\\Web Mining\\Project\\comments_students.csv')

# NLTK's English stop-word list; text_cleaning drops these tokens from the comments.
my_stopwords = stopwords.words('english')

In [22]:
# Scratch exploration kept for reference; none of these lines are executed.
# NOTE(review): `df['body'] == None` is an element-wise comparison and is not expected
# to flag missing values -- `df['body'].isna().sum()` is the usual way to count them.
# TODO: either restore this as a real exploration step or remove the cell.
#df = pd.read_csv(path, header = 0, nrows = 200000)
#df2 = df.sort_values('ups', ascending = False, ignore_index = True)
#np.sum(df['body'] == None)

## Import dataset

In [23]:
# Read the first 10,000 rows of the comments file.
raw_comments = pd.read_csv(path, header = 0, nrows = 10000)

# Keep a row only if its body is not a deletion placeholder and both the body
# and the 'ups' score are present.
is_placeholder = raw_comments['body'].isin(['deleted', '[deleted]'])
is_complete = raw_comments['ups'].notna() & raw_comments['body'].notna()
ind = ~is_placeholder & is_complete

# Cast the text column to pandas' dedicated string dtype.
df = raw_comments.loc[ind].astype({'body': 'string'})

## Extract more features

- *Timing*: time since root, time since parent (in hours), number of later comments, and number of previous comments

- *Author*: a binary indicator as to whether the author is the original poster, and number of comments made by the author in the conversation

- *Graph-location*: depth of the comment (distance from the root), and number of siblings

- *Graph-response*: number of children (direct replies to the comment), height of the subtree rooted at the node, size of that subtree, number of children normalized within each thread (2 normalization techniques), and subtree size normalized within each thread (2 normalization techniques).

## Text cleaning

In [24]:
def text_cleaning(df, colname, stop_words = None):
    """Return a copy of ``df`` whose ``colname`` column holds normalised text.

    Each value is lower-cased, stripped of every character that is neither an
    ASCII letter/digit nor a ``-`` or ``/`` sitting between two word characters
    (so "well-known" and "either/or" survive), split on whitespace, filtered
    against the stop-word list and re-joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column. It is not modified.
    colname : hashable
        Label of the text column to clean.
    stop_words : iterable of str, optional
        Tokens to drop. Defaults to the notebook-level ``my_stopwords`` list.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``colname`` replaced by the cleaned strings.
        Missing values in ``colname`` become empty strings.
    """
    if stop_words is None:
        stop_words = my_stopwords
    # A set gives constant-time membership tests; the list lookup was linear per token.
    stop_set = frozenset(stop_words)

    # Missing values are treated as empty text so the tokenising steps below
    # do not fail on them.
    text = df[colname].fillna('').str.lower()

    # Line breaks first: '.' in the pattern below does not match '\n'.
    # regex is spelled out because its default differs between pandas versions.
    text = text.str.replace('\n', ' ', regex = False)
    text = text.str.replace('\r', ' ', regex = False)

    # Blank out every character that is not alphanumeric and not an
    # intra-word '-' or '/'.
    text = text.str.replace(r"((?!(\b[-/]\b|[a-zA-Z0-9])).)", ' ', regex = True)

    # Tokenise, drop stop words, then glue the remaining tokens back together.
    tokens = text.str.split()
    kept = tokens.apply(lambda words: [w for w in words if w not in stop_set])

    # Work on a copy so the caller's frame keeps its raw text.
    cleaned = df.copy()
    cleaned[colname] = kept.map(' '.join)

    return cleaned

# Replace the raw comment text in 'body' with its cleaned form before vectorising.
df = text_cleaning(df, 'body')

In [25]:
# Model input is the cleaned comment text; the regression target is the 'ups' score.
data, y = df['body'], df['ups']

## TF-IDF

In [26]:
# Upper bound on the number of TF-IDF terms kept as features.
n_feature = 50

tfi_df_vec = TfidfVectorizer(max_features = n_feature, use_idf = True)

# NOTE(review): the vectoriser is fitted on every row before the train/test split,
# so the vocabulary and IDF weights presumably also reflect the rows later held out
# for evaluation -- consider fitting on the training rows only.
X = tfi_df_vec.fit_transform(data)

## Split dataset into test and train sets

In [27]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.33, random_state = 42
)

## Random forest model

In [28]:
# Fit a random forest on the TF-IDF features using all available cores.
# The fixed random_state makes the forest, and therefore the reported error,
# reproducible across kernel restarts.
model = RandomForestRegressor(n_jobs = -1, random_state = 42)
model.fit(X_train, y_train)

# Mean absolute error on the held-out test split
y_pred = model.predict(X_test)
MAE = mean_absolute_error(y_test , y_pred)
print('Random forest validation MAE = ', MAE)

Random forest validation MAE =  21.796178664497134


## XGBoost model

In [29]:
# Disabled experiment, kept for reference; none of these lines are executed.
# TODO: either re-enable this cell (moving the xgboost import to the import cell
# at the top of the notebook) or remove it.
# from xgboost import XGBRegressor
# XGBModel = XGBRegressor()
# XGBModel.fit(X_train, y_train , verbose = False)

# # Get the mean absolute error on the validation data :
# XGBpredictions = XGBModel.predict(X_test)
# MAE = mean_absolute_error(y_test , XGBpredictions)
# print('XGBoost validation MAE = ', MAE)

## Preliminary deep neural network

In [30]:
# Disabled experiment, kept for reference; none of these lines are executed.
# NOTE(review): input_dim = 1000 does not match the TF-IDF step, which caps the
# feature count at n_feature = 50 -- confirm the intended input width before enabling.
# NOTE(review): 'accuracy' is a classification metric; with a mean-absolute-error loss
# on a numeric target it is presumably not informative -- confirm.
# NOTE(review): re-enabling this cell would rebind `model`, which currently holds the
# fitted random forest.
# model = Sequential()
# model.add(Dense(1000, input_dim = 1000, activation = 'relu', kernel_initializer='normal'))
# model.add(Dense(8, activation = 'relu', kernel_initializer='normal'))
# model.add(Dense(1, activation = 'linear', kernel_initializer='normal'))
# model.compile(loss = 'mean_absolute_error',
#               optimizer = 'adam',
#               metrics = ['accuracy'])
# print(model.summary())
# model.fit(X_train, y_train,
#           epochs = 3,
#           batch_size = 10,
#           validation_data = (X_test, y_test),
#           verbose = 1)