# Import necessary packages and configure multiprocessing

In [1]:
import pandas as pd
import numpy as np
import string, unidecode, re, os, json
import matplotlib as plt
import networkx as nx
from networkx.algorithms.traversal.depth_first_search import dfs_tree
from dask.distributed import Client
import dask.dataframe as dd
from nltk.corpus import stopwords
from multiprocessing import dummy

# Leave two logical CPUs free for the OS and the notebook, but never go below one
# worker: os.cpu_count() may return None, and on machines with two or fewer CPUs
# the plain subtraction would give 0 or a negative count, which Pool rejects.
core = max(1, (os.cpu_count() or 1) - 2)
# Local dask cluster with one single-threaded worker per reserved core.
# NOTE(review): memory_limit is applied per worker in dask.distributed, so the
# combined cap is core x 25GB — confirm this fits the machine's RAM.
client = Client(n_workers = core, threads_per_worker = 1, memory_limit = '25GB')
# English stop-word list consumed by the text-cleaning step.
my_stopwords = stopwords.words('english')
# multiprocessing.dummy.Pool is a thread-backed pool sharing the multiprocessing API.
P = dummy.Pool(processes = core)

# Import dataset

In [None]:
# Location of the raw comments CSV.
path = 'E:\\M2 EconStat\\Web Mining\\Project\\comments_students.csv'
# Load only the seven listed columns; the first line of the file is the header.
df = pd.read_csv(path, header = 0, engine = 'c',
                 usecols = ['created_utc', 'ups', 'link_id', 'id', 'author', 'body', 'parent_id'])

We remove prefix so that columns `parent_id` and `link_id` have the same format as column `id`.

In [None]:
# Split each prefixed identifier on '_' and keep the second piece, so that both
# columns use the same bare-id format as column `id`.
df = df.assign(
    parent_id = df['parent_id'].str.split('_', expand = True)[1],
    link_id = df['link_id'].str.split('_', expand = True)[1],
)

`ind` is a boolean mask selecting the "normal" rows (body present and not marked as deleted) that will be fed into the model later.

In [None]:
# True for rows whose body is present and is not one of the two "deleted" markers.
ind = df['body'].notna() & ~df['body'].isin(['deleted', '[deleted]'])

We save this dataset for later use.

In [None]:
# NOTE(review): this is a raw string, so every `\\` stays a literal double
# backslash in the path; presumably Windows tolerates the doubled separators — confirm.
path = r'E:\\M2 EconStat\\Web Mining\\prefix_removed.csv'
# Persist the prefix-stripped frame as UTF-8 without writing the pandas index.
df.to_csv(path, index = False, encoding =  'utf-8')

# Find the root for every tree

In [None]:
# Reload the prefix-stripped dataset written by the previous section.
path = r'E:\\M2 EconStat\\Web Mining\\prefix_removed.csv'
# The first line of the file is the header; all columns are read.
df = pd.read_csv(path, header = 0, engine = 'c')

The idea is that a comment is a root if it does not reply to any comment in the dataset; hence its `in_degree` is $0$. In particular, a node whose `link_id` equals its `parent_id` replies directly to the submission, so that submission is a root.

In [18]:
# Directed reply graph: one edge parent -> child per row of the dataframe.
G = nx.from_pandas_edgelist(df, 'parent_id', 'id', create_using = nx.DiGraph())
# A root is any node that never appears as the target of an edge.
roots = [node for node in G.nodes if G.in_degree(node) == 0]

Function `para_func` below takes a root as input and returns a dataframe. Column `id` contains all the nodes of the tree characterized by the root, excluding the root itself; we extract this tree with a depth-first traversal. All elements in column `root` are the same, i.e. the root itself.

In [20]:
def para_func(r):
    """Return every descendant of root `r` paired with that root.

    Reads the module-level directed graph `G`.

    Parameters: `r` is a node of `G`.
    Returns: a DataFrame with column `id` (the nodes reachable from `r`, in
    depth-first preorder, `r` itself excluded) and column `root` (always `r`).
    """
    # dfs_preorder_nodes yields `r` first and then its descendants in the same
    # order dfs_tree would insert them, without materialising a throw-away graph.
    # The unused G.subgraph(...) call of the earlier version is dropped.
    descendants = list(nx.dfs_preorder_nodes(G, r))[1:]
    return pd.DataFrame({'id': descendants, 'root': [r] * len(descendants)})

This operation is not computationally intensive, so I use `multiprocessing.dummy.Pool` for simple implementation.

In [None]:
# One (id, root) table per root, computed on the thread pool ...
root_node_list = P.map(func = para_func, iterable = roots)
# ... then stacked into a single lookup table.
root_node = pd.concat(objs = root_node_list)

We add this root-node information to original dataframe. Notice that the original dataframe does not contain comments that are roots. That's why we specify `how = 'left'`.

In [None]:
# Left join: every row of `df` is kept and gains a `root` column where a match exists.
df = pd.merge(df, root_node, on = 'id', how = 'left')

We save this dataset for later use.

In [None]:
# Output file for the dataframe enriched with the `root` column.
path = r'E:\\M2 EconStat\\Web Mining\\root_included.csv'
# Write as UTF-8 without the pandas index column.
df.to_csv(path, index = False, encoding =  'utf-8')

# Extract graph-related features

In [2]:
# Reload the root-annotated dataset, this time as a dask dataframe (`dd`)
# rather than a pandas one.
path = r'E:\\M2 EconStat\\Web Mining\\root_included.csv'
df = dd.read_csv(path, header = 0)

We create column `timestamp` to compute time interval later.

In [3]:
# `created_utc` holds epoch seconds; convert it to timezone-aware (UTC) datetimes.
df = df.assign(timestamp = dd.to_datetime(df['created_utc'], unit = 's', utc = True))

Below is the list of features we will extract.

- *Timing*: time since root, time since parent (in hours), number of later comments, and number of previous comments

- *Author*: a binary indicator as to whether the author is the original poster, and number of comments made by the author in the conversation

- *Graph-location*: depth of the comment (distance from the root), and number of siblings

- *Graph-response*: number of children (direct replies to the comment), height of the subtree rooted from the node, size of that subtree, number of children normalized for each thread (2 normalization techniques), subtree size normalized for each thread (2 normalization techniques).

I could not find any package containing functions to compute all features, except for *depth of the comment*, *height of the subtree*, and *size of that subtree*. Luckily, our subgraphs are of a special kind, [arborescence](https://www.wikiwand.com/en/Tree_(graph_theory)#/Rooted_tree). This allows us to utilize package `dask` and highly optimized functions from package `pandas` to speed up computation.

First, we compute features related to the subgraph characterized by column `root`. Comments with the same `root` are in the same subgraph.

In [4]:
def features_extract(sub_df):
    """Compute timing, author and graph-location features for one thread.

    Parameters: `sub_df` is a pandas DataFrame holding the comments that share
    one value of column `root`. The columns read are `root`, `id`, `parent_id`,
    `author`, `created_utc` and `timestamp`.

    Returns: a DataFrame with one row per row of `sub_df`, in the same order,
    and columns `id`, `depth`, `num_siblings`, `num_children`,
    `num_comments_author`, `num_previous_comments`, `num_later_comments`,
    `time_since_root` and `time_since_parent` (both times in hours).

    Raises: KeyError if an `id` cannot be reached from the root through the
    parent -> child edges of `sub_df`.
    """
    r = sub_df['root'].iloc[0]
    tree = nx.from_pandas_edgelist(sub_df,
                                   source = 'parent_id',
                                   target = 'id',
                                   create_using = nx.DiGraph())

    # Number of comments per distinct creation time. Shifting by one before the
    # cumulative sum counts only strictly earlier (resp. strictly later) comments.
    count_utc = sub_df.groupby('created_utc').size()
    cum_previous_counts = count_utc.sort_index(ascending = True).shift(fill_value = 0).cumsum()
    cum_later_counts = count_utc.sort_index(ascending = False).shift(fill_value = 0).cumsum()
    # Timestamp of each comment's parent; missing when the parent is not in sub_df.
    time_parent = sub_df.parent_id.map(dict(zip(sub_df.id, sub_df.timestamp)))

    # A single breadth-first pass from the root gives every depth at once,
    # instead of running one shortest-path search per comment.
    depth_of = dict(nx.single_source_shortest_path_length(tree, r))
    depth = [depth_of[n] for n in sub_df.id]
    # Rows sharing the same parent_id; the count includes the comment itself.
    num_siblings = sub_df.groupby('parent_id').parent_id.transform('count')
    # Direct replies per comment; comments that are nobody's parent get 0.
    num_children = sub_df.groupby('parent_id').size().reindex(sub_df.id, fill_value = 0)
    num_comments_author = sub_df.groupby('author').author.transform('count')
    num_previous_comments = sub_df.created_utc.map(cum_previous_counts)
    num_later_comments = sub_df.created_utc.map(cum_later_counts)
    # Measured from the earliest timestamp present in this group.
    time_since_root = (sub_df.timestamp - sub_df.timestamp.min()) / pd.Timedelta(hours = 1)
    # 0 when the parent's timestamp is unknown (parent not among the group's ids).
    time_since_parent = ((sub_df.timestamp - time_parent) / pd.Timedelta(hours = 1)).fillna(0)

    data = list(zip(sub_df.id, depth, num_siblings, num_children, num_comments_author,
                    num_previous_comments, num_later_comments, time_since_root, time_since_parent))

    columns = ['id', 'depth', 'num_siblings', 'num_children', 'num_comments_author',
              'num_previous_comments', 'num_later_comments', 'time_since_root', 'time_since_parent']

    return pd.DataFrame(data, columns = columns)

In [5]:
# Output schema dask needs up front: one object column, six integer counts, two float durations.
meta = {'id': object,
        **dict.fromkeys(['depth', 'num_siblings', 'num_children', 'num_comments_author',
                         'num_previous_comments', 'num_later_comments'], np.int64),
        **dict.fromkeys(['time_since_root', 'time_since_parent'], np.float64)}

# Apply the per-thread extractor to every `root` group, materialise the result
# with the process-based scheduler, and replace the index with a fresh 0..n-1 one.
result = (df.groupby('root')
            .apply(features_extract, meta = meta)
            .compute(scheduler = 'processes')
            .reset_index(drop = True))
result

Unnamed: 0,id,depth,num_siblings,num_children,num_comments_author,num_previous_comments,num_later_comments,time_since_root,time_since_parent
0,cr1qq1w,1,1,0,1,0,0,0.000000,0.000000
1,cqvdyoy,1,1,0,1,0,0,0.000000,0.000000
2,cqxivp8,1,1,0,1,0,0,0.000000,0.000000
3,cqxiruo,1,1,0,1,0,0,0.000000,0.000000
4,cqws27u,1,1,0,1,0,0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
4234965,cquiajn,1,1,1,1,0,2,0.000000,0.000000
4234966,cqumskx,2,1,1,1,1,1,2.025556,2.025556
4234967,cquxm86,3,1,0,1,2,0,11.754444,9.728889
4234968,cqugv6d,1,1,0,1,0,0,0.000000,0.000000


*Remark 1:*

- At first I used `.compute()`, or equivalently `.compute(scheduler = 'threads')`. The execution time dropped to $30$ minutes and CPU utilization rose from $25\%$ to $50\%$. Previously, I used `multiprocessing.dummy.Pool()` and my laptop could not finish the computation in 3 hours, so I had no choice but to interrupt the Python kernel.

- Then I came across this [answer](https://stackoverflow.com/a/31364127/7357673) and changed to `.compute(scheduler = 'processes')`. CPU utilization is now almost $100\%$ and the execution time is reduced to just $12$ minutes. This is awesome.

*Remark 2:*

- At first I thought `dask` only worked with functions that return a dictionary. This meant I had to do $2$ more steps, i.e. converting the resulting dictionaries to pandas dataframes and then concatenating them together. Even when I used `multiprocessing.dummy.Pool()`, the conversion still took up to $2.5$ minutes.

- Then I came across this [documentation](https://docs.dask.org/en/latest/dataframe-design.html#metadata). It explains that `dask` works perfectly with functions that return a pandas dataframe (it should ^_^). We only need to specify the correct data types with option `meta`. One advantage of this approach is that `dask` automatically concatenates the resulting dataframes.

## Number of comments and height of the subtree rooted from the comment itself

I first create a graph from the whole dataset.

In [6]:
# Directed reply graph over the whole dataset: one edge parent -> child per comment.
G = nx.from_pandas_edgelist(df, 'parent_id', 'id', create_using = nx.DiGraph())

In this case, `multiprocessing.dummy.Pool` and `dask.Series.apply` take almost the same amount of time to finish. This makes sense because `features_extract` is applied on **every** comment in the dataset. Hence `dask.Series.apply` does not have any advantage by cleverly partitioning the dataframe.

Last but not least, `dask.Series.apply` has a disadvantage, i.e. it can not be called on an existing Dask dataframe. To use it, we need to import our dataset again with `pd.read_csv`.

In [7]:
def features_extract(r):
    """Return `[r, size, height]` for the subtree of `G` rooted at node `r`.

    `size` is the number of nodes in the depth-first tree grown from `r`
    (so `r` itself is counted) and `height` is the length, in edges, of the
    longest path inside that tree. Reads the module-level graph `G`.
    """
    reachable = dfs_tree(G, r)
    return [r, reachable.number_of_nodes(), nx.dag_longest_path_length(reachable)]

# Evaluate the per-comment extractor for every id on the thread pool and
# tabulate the [id, size, height] triples.
result2 = pd.DataFrame(P.map(features_extract, df.id),
                       columns = ['id', 'num_comments_subtree', 'height_subtree'])
result2

Unnamed: 0,id,num_comments_subtree,height_subtree
0,cqug90j,1,0
1,cqug90k,1,0
2,cqug90z,1,0
3,cqug91c,1,0
4,cqug91e,3,2
...,...,...,...
4234965,crrbelu,1,0
4234966,crrbelv,1,0
4234967,crrbemp,1,0
4234968,crrbenh,1,0


We add these features to our original dataframe. By construction, `df`, `result`, and `result2` have exactly the same elements in column `id`. That's why we don't need to specify parameter `how`.

In [8]:
path = r'E:\\M2 EconStat\\Web Mining\\root_included.csv'
# Reload the root-annotated data with pandas and attach both feature tables by comment id.
df = (pd.read_csv(path, header = 0, engine = 'c')
        .merge(result, on = 'id')
        .merge(result2, on = 'id'))
df

Unnamed: 0,created_utc,ups,link_id,id,author,body,parent_id,root,depth,num_siblings,num_children,num_comments_author,num_previous_comments,num_later_comments,time_since_root,time_since_parent,num_comments_subtree,height_subtree
0,1430438400,3.0,34f9rh,cqug90j,jesse9o3,No one has a European accent either because i...,cqug2sr,cqug2sr,1,1,0,1,0,0,0.000000,0.000000,1,0
1,1430438400,3.0,34fvry,cqug90k,beltfedshooter,That the kid ..reminds me of Kevin. so sad :-(,34fvry,34fvry,1,1443,0,1,0,3512,0.000000,0.000000,1,0
2,1430438400,5.0,34ffo5,cqug90z,InterimFatGuy,NSFL,cqu80zb,cqu80zb,1,5,0,1,0,11,0.000000,0.000000,1,0
3,1430438401,1.0,34aqsn,cqug91c,JuanTutrego,I'm a guy and I had no idea this was a thing g...,cqtdj4m,cqtdj4m,1,4,0,1,0,4,0.000000,0.000000,1,0
4,1430438401,101.0,34f9rh,cqug91e,dcblackbelt,"Mid twenties male rocking skinny jeans/pants, ...",cquc4rc,cquc4rc,1,2,1,1,0,3,0.000000,0.000000,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4234965,1433116795,,37y5rx,crrbelu,monster860,Does the sun set in the west/rise in the east?...,37y5rx,37y5rx,1,1496,0,1,3265,0,12.452222,0.000000,1,0
4234966,1433116795,,37y5rx,crrbelv,jsimo36,Coffee.,37y5rx,37y5rx,1,1496,0,1,3265,0,12.452222,0.000000,1,0
4234967,1433116796,,380jx2,crrbemp,torianicole731,"people who cannot make up their mind, my bulls...",380jx2,380jx2,1,9,0,1,8,0,0.522500,0.000000,1,0
4234968,1433116797,,37yawp,crrbenh,bellypouch,Give them to Irish people in exchange for doin...,37yawp,37yawp,1,1779,0,1,3519,0,11.398056,0.000000,1,0


We save it for later use.

In [9]:
# Output file for the dataframe with all graph-related features attached.
path = r'E:\\M2 EconStat\\Web Mining\\features_included.csv'
# Write as UTF-8 without the pandas index column.
df.to_csv(path, index = False, encoding =  'utf-8')

# Text cleaning

In [None]:
def text_cleaning(df, colname, ind, stop_words = None):
    """Normalise the text of the selected rows of one column, in place.

    The selected strings are lower-cased, line breaks and every character that
    is neither alphanumeric nor a `-` or `/` between two word characters are
    turned into spaces, stop words are dropped, and the remaining tokens are
    re-joined with single spaces.

    Parameters:
        df: DataFrame holding the text; it is modified in place.
        colname: name of the text column.
        ind: boolean mask of the rows to clean; the selected values must be
            strings. Rows outside the mask are left untouched.
        stop_words: optional iterable of words to drop. Defaults to the
            module-level `my_stopwords`.

    Returns: the same `df` object.
    """
    # A set makes each membership test O(1) instead of scanning the list
    # once per token.
    stop_set = set(my_stopwords if stop_words is None else stop_words)

    # Convert text to lowercase
    tmp = df.loc[ind, colname].str.lower()

    # Delete punctuation. Line breaks are replaced literally first because
    # `.` in the pattern below does not match a newline.
    tmp = tmp.str.replace('\n', ' ', regex = False)
    tmp = tmp.str.replace('\r', ' ', regex = False)
    tmp = tmp.str.replace(r"((?!(\b[-/]\b|[a-zA-Z0-9])).)", ' ', regex = True)

    # Tokenize, delete stop words, and join the survivors back into one string
    tmp = tmp.str.split().map(lambda words: ' '.join(w for w in words if w not in stop_set))

    df.loc[ind, colname] = tmp

    return df

# Clean the comment bodies of the rows selected by `ind`; other rows stay as they are.
df = text_cleaning(df, colname = 'body', ind = ind)

## TF-IDF

In [None]:
# Only rows with a real comment body (`ind`) and a known target can be used:
# missing bodies are not valid documents for the vectorizer, and rows whose
# `ups` is missing cannot serve as regression targets.
# NOTE(review): assumes `ind` is still aligned with the current index of `df` — confirm.
usable = ind & df['ups'].notna()
data = df.loc[usable, 'body']
y = df.loc[usable, 'ups']

# Vocabulary size kept by the vectorizer.
n_feature = 50

# NOTE(review): TfidfVectorizer is not imported in the visible cells —
# presumably `from sklearn.feature_extraction.text import TfidfVectorizer`; confirm.
tfi_df_vec = TfidfVectorizer(use_idf = True,
                             max_features = n_feature)

X = tfi_df_vec.fit_transform(data)

## Split dataset into test and train sets

In [None]:
# Hold out 33% of the rows for evaluation; the seed is fixed for reproducibility.
# NOTE(review): train_test_split is not imported in the visible cells —
# presumably from sklearn.model_selection; confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.33, random_state = 42)

## Random forest model

In [None]:
# Fit a random forest on the training split, using all available cores.
# NOTE(review): RandomForestRegressor and mean_absolute_error are not imported
# in the visible cells — presumably from sklearn.ensemble / sklearn.metrics; confirm.
model = RandomForestRegressor(n_jobs = -1).fit(X_train, y_train)

# Mean absolute error on the held-out split
y_pred = model.predict(X_test)
MAE = mean_absolute_error(y_test, y_pred)
print('Random forest validation MAE = ', MAE)

## XGBoost model

In [None]:
# from xgboost import XGBRegressor
# XGBModel = XGBRegressor()
# XGBModel.fit(X_train, y_train , verbose = False)

# # Get the mean absolute error on the validation data :
# XGBpredictions = XGBModel.predict(X_test)
# MAE = mean_absolute_error(y_test , XGBpredictions)
# print('XGBoost validation MAE = ', MAE)

## Preliminary deep neural network

In [None]:
# model = Sequential()
# model.add(Dense(1000, input_dim = 1000, activation = 'relu', kernel_initializer='normal'))
# model.add(Dense(8, activation = 'relu', kernel_initializer='normal'))
# model.add(Dense(1, activation = 'linear', kernel_initializer='normal'))
# model.compile(loss = 'mean_absolute_error',
#               optimizer = 'adam',
#               metrics = ['accuracy'])
# print(model.summary())
# model.fit(X_train, y_train,
#           epochs = 3,
#           batch_size = 10,
#           validation_data = (X_test, y_test),
#           verbose = 1)