In [1]:
import pandas as pd
import numpy as np
import string, unidecode, re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.core.display import display, HTML
from sklearn.model_selection import train_test_split
import matplotlib as plt
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from keras.models import Sequential
from keras.layers import Dense
import networkx as nx
from multiprocessing import dummy
from networkx.algorithms.traversal.depth_first_search import dfs_tree

display(HTML("<style>.jp-Cell { width: 80% !important; margin: 0 auto;}</style>"))
path = 'E:\\M2 EconStat\\Web Mining\\Project\\comments_students.csv'
my_stopwords = stopwords.words('english')
P = dummy.Pool(processes = 12)

In [2]:
#df = pd.read_csv(path, header = 0, nrows = 200000)
#df2 = df.sort_values('ups', ascending = False, ignore_index = True)
#df2 = df.sort_values(by=['link_id'])
#np.sum(df['body'] == None)
#df[ind] = df[ind].astype({'body': 'string'})
#df['body']
#df['body'][ind].str.lower().copy()

# Import dataset

In [3]:
df = pd.read_csv(path, header = 0)

# ind is the indices of "normal" rows that will be fed into the model later
ind = (df['body'] != 'deleted') & (df['body'] != '[deleted]') & df['body'].notna()

# Text cleaning

In [4]:
def text_cleaning(df, colname, ind):
    
    tmp = df[colname][ind].copy()
    
    # Convert text to lowercase
    tmp = tmp.str.lower()
    
    # Delete punctuation
    tmp = tmp.str.replace('\n', ' ')
    tmp = tmp.str.replace('\r', ' ')    
    tmp = tmp.str.replace(r"((?!{}).)".format('(\\b[-/]\\b|[a-zA-Z0-9])'), ' ', regex = True)
    
    # Tokenize
    tmp = tmp.str.split()
    
    # Delete stop words
    tmp = tmp.apply(lambda x: [w for w in x if w not in my_stopwords])
    
    # Reverse tokenize
    df.loc[ind, colname] = tmp.map(lambda word: ' '.join(word))
    
    return df

df = text_cleaning(df, 'body', ind)

# Generate subgraphs and extract more features for each comment

- *Timing*: time since root, time since parent (in hours), number of later comments, and number of previous comments

- *Author*: a binary indicator as to whether the author is the original poster, and number of comments made by the author in the conversation

- *Graph-location*: depth of the comment (distance from the root), and number of siblings

- *Graph-response*: number of children (direct replies to the comment), height of the subtree rooted from the node, size of that subtree, number of children normalized for each thread (2 normalization techniques), subtree size normalized for each thread (2 normalization techniques).

In [7]:
link_ids = np.unique(df['link_id']).tolist()  
atts = ['created_utc', 'ups', 'link_id', 'author', 'body', 'parent_id']

def create_subgraph(link_id):
    ind = df['link_id'] == link_id
    temp_df = df[ind]
    g = nx.from_pandas_edgelist(temp_df, source = 'parent_id', target = 'name', create_using = nx.DiGraph())
    
    # Features that already exist in original dataframe
    for att in atts:
        nx.set_node_attributes(g,
                               temp_df[['name', att]].set_index('name').T.to_dict('records')[0],
                               name = att)
    
    ## Graph-location
    # Depth of a comment (distance from the root)    
    nx.set_node_attributes(g, nx.shortest_path_length(g, link_id), name = 'depth')
    
    # Number of siblings
    nx.set_node_attributes(g,
                           dict(zip(temp_df.name, temp_df.groupby('parent_id')['parent_id'].transform('count') - 1)),
                           name = 'num_siblings')
    
    ## Graph-response
    # Number of children (direct replies to a comment)
    nx.set_node_attributes(g,
                           temp_df.groupby(by = 'parent_id').size().reindex(temp_df['name'], fill_value = 0).to_dict(),
                           name = 'num_children')
    
    num_comment = dict.fromkeys(g.nodes, 0)
    height_subtree = dict.fromkeys(g.nodes, 0)
    
    for node in g.nodes:
        sub_tree = dfs_tree(g, node)
        num_comment[node] = sub_tree.number_of_nodes()
        height_subtree[node] = nx.dag_longest_path_length(sub_tree)
        
    # Number of comments in the subtree rooted from the comment
    nx.set_node_attributes(g, num_comment, name = 'comments_in_subtree')
        

    # Height of the subtree rooted from the comment
    nx.set_node_attributes(g, height_subtree, name = 'height_subtree')    
    
    ## Author
    # Number of comments made by the author in the conversation
    nx.set_node_attributes(g,
                           dict(zip(temp_df.name, temp_df.groupby('author')['author'].transform('count'))),
                           name = 'num_comments_by_author')
    
    ## Timing
    count_utc = temp_df.groupby('created_utc').size()
    
    # Number of previous comments
    cum_counts = count_utc.sort_index(ascending = True).shift(fill_value = 0).cumsum()
    nx.set_node_attributes(g,
                           dict(zip(temp_df['name'], temp_df['created_utc'].map(cum_counts))),
                           name = 'num_previous_comments')  
    
    # number of later comments
    cum_counts = count_utc.sort_index(ascending = False).shift(fill_value = 0).cumsum()
    nx.set_node_attributes(g,
                           dict(zip(temp_df['name'], temp_df['created_utc'].map(cum_counts))),
                           name = 'num_later_comments')
    
    # Time since root (in unix timestamp https://www.epochconverter.com/)
    #intuitively, the earliest comment in our subgraph will be the root. 
    #we take the minimum of the times and subtract that from the time of comment to calculate the time since root.
    roottime = np.min(temp_df.created_utc)
    mytime = temp_df.
    nx.set_node_attributes(g,
                           dict(zip(temp_df['id'], abs(temp_df['created_utc'] - roottime))),
                           name = 'time_since_root')
    
    # Time since parent (in hours)
    
    nx.set_node_attributes(g, #not sure if this will work, the iloc.
                           dict(zip(temp_df['id'], abs(temp_df['created_utc'] - temp_df.iloc['parent_id']['created_utc']))),
                           name = 'time_since_parent')
    
    return g

g = create_subgraph('t3_35jfjt')
g.nodes['t1_cr56nez']

{'created_utc': 1431317199,
 'ups': 6761.0,
 'link_id': 't3_35jfjt',
 'author': 'buckus69',
 'body': 'got one night standoff',
 'depth': 3,
 'num_siblings': 67,
 'num_children': 214,
 'comments_in_subtree': 311,
 'height_subtree': 7,
 'num_comments_by_author': 8,
 'num_previous_comments': 1393,
 'num_later_comments': 11324}

## TF-IDF

In [26]:
data = df['body']
y = df['ups']

n_feature = 50

tfi_df_vec = TfidfVectorizer(use_idf = True,
                             max_features = n_feature)

X = tfi_df_vec.fit_transform(data)

## Split dataset into test and train sets

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.33,
                                                    random_state = 42)

## Random forest model

In [28]:
model = RandomForestRegressor(n_jobs = -1)
model.fit(X_train, y_train)

# Get the mean absolute error on the validation data
y_pred = model.predict(X_test)
MAE = mean_absolute_error(y_test , y_pred)
print('Random forest validation MAE = ', MAE)

Random forest validation MAE =  21.796178664497134


## XGBoost model

In [29]:
# from xgboost import XGBRegressor
# XGBModel = XGBRegressor()
# XGBModel.fit(X_train, y_train , verbose = False)

# # Get the mean absolute error on the validation data :
# XGBpredictions = XGBModel.predict(X_test)
# MAE = mean_absolute_error(y_test , XGBpredictions)
# print('XGBoost validation MAE = ', MAE)

## Preliminary deep neural network

In [30]:
# model = Sequential()
# model.add(Dense(1000, input_dim = 1000, activation = 'relu', kernel_initializer='normal'))
# model.add(Dense(8, activation = 'relu', kernel_initializer='normal'))
# model.add(Dense(1, activation = 'linear', kernel_initializer='normal'))
# model.compile(loss = 'mean_absolute_error',
#               optimizer = 'adam',
#               metrics = ['accuracy'])
# print(model.summary())
# model.fit(X_train, y_train,
#           epochs = 3,
#           batch_size = 10,
#           validation_data = (X_test, y_test),
#           verbose = 1)