In [19]:
import pandas as pd
import numpy as np
import string, unidecode, re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.core.display import display, HTML
from sklearn.model_selection import train_test_split
import matplotlib as plt
from sklearn.metrics import mean_absolute_error

display(HTML("<style>.jp-Cell { width: 80% !important; margin: 0 auto;}</style>"))
path = 'E:\\M2 EconStat\\Web Mining\\Project\\comments_students.csv'

## Import dataset

In [20]:
df = pd.read_csv(path, header = 0, nrows = 100000)
df2 = df.sort_values('ups', ascending = False, ignore_index = True)

## Text cleaning

In [21]:
def convert_text_to_lowercase(df, colname):
    df[colname] = df[colname].str.lower()
    return df
    
def not_regex(pattern):
        return r"((?!{}).)".format(pattern)
    
# Delete punctuation
def remove_punctuation(df, colname):
    df[colname] = df[colname].str.replace('\n', ' ')
    df[colname] = df[colname].str.replace('\r', ' ')
    df[colname] = df[colname].apply(lambda x: unidecode.unidecode(x))
    alphanumeric_characters_extended = '(\\b[-/]\\b|[a-zA-Z0-9])'
    df[colname] = df[colname].str.replace(not_regex(alphanumeric_characters_extended), ' ', regex = True)
    return df

def tokenize_sentence(df, colname):
    df[colname] = df[colname].str.split()
    return df

# Delete stop words
def remove_stop_words(df, colname):
    my_stopwords = stopwords.words('english')
    df[colname] = df[colname].apply(lambda x: [word for word in x if word not in my_stopwords])
    return df

def reverse_tokenize_sentence(df, colname):
    df[colname] = df[colname].map(lambda word: ' '.join(word))
    return df


def text_cleaning(df, colname):
    """
    Takes in a string of text, then performs the following:
    1. convert text to lowercase
    2. remove punctuation and new line characters '\n'
    3. Tokenize sentences
    4. Remove all stopwords
    5. convert tokenized text to text
    """
    df = (
        df
        .pipe(convert_text_to_lowercase, colname)
        .pipe(remove_punctuation, colname)
        .pipe(tokenize_sentence, colname)
        .pipe(remove_stop_words, colname)
        .pipe(reverse_tokenize_sentence, colname)
    )
    return df

In [22]:
df3 = text_cleaning(df, 'body')
data = df3['body']
y = df3['ups']

## TF-IDF

In [23]:
n_feature = 50

tfi_df_vec = TfidfVectorizer(max_df = 0.5,
                             min_df = 2,
                             use_idf = True,
                             stop_words = 'english',
                             max_features = n_feature)

X = tfi_df_vec.fit_transform(data)

## Split dataset into test and train sets

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.33,
                                                    random_state = 42)

## Preliminary deep neural network

In [25]:
from keras.models import Sequential
from keras.layers import Dense

# define the architecture
model = Sequential()
model.add(Dense(1000, input_dim = 1000, activation = 'relu', kernel_initializer='normal'))
model.add(Dense(8, activation = 'relu', kernel_initializer='normal'))
model.add(Dense(1, activation = 'linear', kernel_initializer='normal'))
model.compile(loss = 'mean_absolute_error',
              optimizer = 'adam',
              metrics = ['accuracy'])
print(model.summary())
# model.fit(X_train, y_train,
#           epochs = 3,
#           batch_size = 10,
#           validation_data = (X_test, y_test),
#           verbose = 1)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
dense_7 (Dense)              (None, 8)                 8008      
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 9         
Total params: 1,009,017
Trainable params: 1,009,017
Non-trainable params: 0
_________________________________________________________________
None


## Random forest model

In [26]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Get the mean absolute error on the validation data
predicted_prices = model.predict(X_test)
MAE = mean_absolute_error(y_test , predicted_prices)
print('Random forest validation MAE = ', MAE)

Random forest validation MAE =  21.653275848425533


## XGBoost model

In [27]:
from xgboost import XGBRegressor
XGBModel = XGBRegressor()
XGBModel.fit(X_train, y_train , verbose = False)

# Get the mean absolute error on the validation data :
XGBpredictions = XGBModel.predict(X_test)
MAE = mean_absolute_error(y_test , XGBpredictions)
print('XGBoost validation MAE = ', MAE)

XGBoost validation MAE =  22.30182093093011
