# Sentiment Classification Code 
### Author: Saksham Arora

### Install node2vec

In [None]:
!pip install node2vec

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting node2vec
  Downloading node2vec-0.4.6-py3-none-any.whl (7.0 kB)
Collecting networkx<3.0,>=2.5 (from node2vec)
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: networkx, node2vec
  Attempting uninstall: networkx
    Found existing installation: networkx 3.1
    Uninstalling networkx-3.1:
      Successfully uninstalled networkx-3.1
Successfully installed networkx-2.8.8 node2vec-0.4.6


In [None]:
import ast
import math
import warnings

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import scipy
import sklearn
# import community
from node2vec import Node2Vec
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import normalized_mutual_info_score

In [None]:
# Column -> dtype schema for the pre-processed comment table. The same mapping
# also selects which CSV columns are read (via `usecols`).
dtypes = dict(
    id='str',
    author='str',
    score='int',
    upvote_ratio='float',
    num_comments='int',
    subreddit='str',
    comment='object',
    comment_sentiment_dict='object',
    comment_sentiment_class='str',
)
cornell_comment_class_df = pd.read_csv(
    'processed_data/cornell/cornell_comment_sentiment_class.csv',
    dtype=dtypes,
    usecols=dtypes.keys(),
)

In [None]:
# Display the loaded comment table as the cell output (one row per comment,
# carrying its post-level columns and its sentiment class).
cornell_comment_class_df

Unnamed: 0,id,author,score,upvote_ratio,num_comments,subreddit,comment,comment_sentiment_dict,comment_sentiment_class
0,jryusf,yoyoyaass,374,0.97,43,berkeley,"{'author': 'buckyspunisher', 'body': 'Haha yea...","{'neg': 0.282, 'neu': 0.483, 'pos': 0.235, 'co...",N
1,jryusf,yoyoyaass,374,0.97,43,berkeley,"{'author': 'funkyfaithy', 'body': 'This is the...","{'neg': 0.164, 'neu': 0.77, 'pos': 0.066, 'com...",VN
2,jryusf,yoyoyaass,374,0.97,43,berkeley,"{'author': 'novared19', 'body': 'it took me a ...","{'neg': 0.097, 'neu': 0.875, 'pos': 0.028, 'co...",N
3,jryusf,yoyoyaass,374,0.97,43,berkeley,"{'author': 'DragoSphere', 'body': 'Midterm in ...","{'neg': 0.0, 'neu': 0.571, 'pos': 0.429, 'comp...",P
4,jryusf,yoyoyaass,374,0.97,43,berkeley,"{'author': 'None', 'body': '[deleted]', 'score...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",NEU
...,...,...,...,...,...,...,...,...,...
18945,n5wa1v,-feelalive-,514,0.98,77,berkeley,"{'author': 'KNJI03', 'body': ':)', 'score': 6,...","{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound...",P
18946,n5wa1v,-feelalive-,514,0.98,77,berkeley,"{'author': 'Xxb30wulfxX', 'body': 'I would say...","{'neg': 0.045, 'neu': 0.803, 'pos': 0.152, 'co...",P
18947,n5wa1v,-feelalive-,514,0.98,77,berkeley,"{'author': 'Right-Advertising367', 'body': 'Id...","{'neg': 0.048, 'neu': 0.787, 'pos': 0.165, 'co...",VP
18948,n5wa1v,-feelalive-,514,0.98,77,berkeley,"{'author': 'walter_evertonshire', 'body': 'In ...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",NEU


In [None]:
# Each 'comment' cell holds the string repr of a Python dict (keys such as
# 'author', 'body', 'score', 'created_utc'). Parse it with ast.literal_eval
# instead of eval: the text originates from scraped Reddit data, and literal_eval
# only accepts Python literals, so it can never execute embedded code.
# It raises ValueError/SyntaxError on a cell that is not a plain literal.
cornell_comment_class_df['comment'] = cornell_comment_class_df['comment'].apply(ast.literal_eval)

In [None]:
# Promote fields of the parsed comment dicts to top-level columns.
# 'author' and 'score' deliberately reuse existing column names, so the values
# loaded from the CSV are replaced by the comment-level author and score.
comment_dicts = cornell_comment_class_df['comment']
cornell_comment_class_df['author'] = comment_dicts.map(lambda c: c['author'])
cornell_comment_class_df['comment_text'] = comment_dicts.map(lambda c: c['body'])
cornell_comment_class_df['score'] = comment_dicts.map(lambda c: c['score'])
cornell_comment_class_df['created_time_utc'] = comment_dicts.map(lambda c: c['created_utc'])

In [None]:
# Discard comments whose body was removed; they carry no text to classify.
is_deleted = cornell_comment_class_df['comment_text'].eq('[deleted]')
cornell_comment_class_df = cornell_comment_class_df[~is_deleted]

In [None]:
!pip install vaderSentiment

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
# Load the graph from its edge-list file. Later cells look nodes up by comment
# author (see `node_ids`), so node labels are presumably author names —
# NOTE(review): confirm against the script that wrote this edgelist.
cornell_graph = nx.read_edgelist('processed_data/cornell/cornell_graph_comment.edgelist')

In [None]:
%%time
node2vec = Node2Vec(cornell_graph, dimensions=128, walk_length=80, num_walks=10, workers=1)  # Use temp_folder for big graphs
model = node2vec.fit(window=10, min_count=1, batch_words=4)  

Computing transition probabilities:   0%|          | 0/5980 [00:00<?, ?it/s]

In [None]:
# Build row-aligned inputs for the classifiers: one entry per remaining comment.
node_ids = cornell_comment_class_df['author'].values.tolist()
# Look up the node2vec vector of each comment's author. This line must run:
# `embeddings` is consumed by the DataFrame and train/test-split cells below,
# which otherwise fail with NameError.
# `model` has to be the node2vec model here; the classifier cells further down
# rebind `model`, so re-run the node2vec cell before re-running this one.
# NOTE(review): an author that is not a node of the graph presumably raises
# KeyError here — confirm every comment author appears in the edgelist.
embeddings = np.array([model.wv[str(node)] for node in node_ids])
targets = cornell_comment_class_df['comment_sentiment_class'].values.tolist()

In [None]:
# Wrap the embedding matrix and the label list in DataFrames.
# Embedding columns are labelled 0..127, one per embedding dimension.
embeddings_df = pd.DataFrame(data=embeddings, columns=[dim for dim in range(128)])
targets_df = pd.DataFrame(data=targets)

In [None]:
# Build the feature table as a NEW frame: embedding columns plus the label column.
# A plain `cornell_feature_df = embeddings_df` would only alias the same object,
# so adding the label would silently insert it into `embeddings_df` as well.
# `targets_df[0]` is the single column created by pd.DataFrame(targets); it
# shares the default integer index with `embeddings_df`, so rows stay aligned.
cornell_feature_df = embeddings_df.assign(comment_sentiment_class=targets_df[0])

In [None]:
# Display the feature table (embedding columns 0..127 plus the sentiment label)
# as the cell output.
cornell_feature_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,comment_sentiment_class
0,-0.005053,0.154495,0.126255,-0.103662,0.088851,-0.293839,0.380883,0.196578,-0.300757,0.123030,...,0.066300,0.147360,-0.173618,-0.114226,0.127423,0.139200,-0.115003,-0.079569,-0.074510,NEU
1,-0.036723,0.172970,0.332766,0.020840,-0.071663,-0.130393,0.399502,0.040201,0.013044,-0.024582,...,0.090879,-0.441134,-0.007421,0.362586,0.364319,-0.161145,0.156700,0.034117,0.085246,N
2,-0.209586,0.197000,0.092381,-0.053890,0.189721,-0.256567,0.157380,0.097021,0.042921,0.007310,...,0.069556,-0.098143,-0.159944,0.115349,0.166395,0.179107,-0.045794,0.092897,0.085286,NEU
3,-0.035096,0.144254,-0.252204,-0.031182,0.176053,0.049873,0.002641,-0.023567,0.129240,-0.292440,...,-0.221809,-0.060979,-0.017270,-0.069755,0.081001,0.037078,-0.083885,-0.400756,-0.021449,VP
4,-0.171048,0.140349,0.054417,-0.071818,0.234886,-0.154765,0.340058,-0.005405,-0.026236,0.041267,...,0.016899,0.075523,-0.244101,-0.093958,0.248183,0.096905,0.036983,-0.127759,0.411964,P
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4117,0.081979,-0.465591,0.218207,-0.332556,0.007064,-0.121298,0.349834,0.144585,-0.195300,0.118537,...,-0.015438,-0.010255,0.149976,0.346196,0.257508,-0.387366,-0.031274,0.197870,0.439841,N
4118,-0.066315,-0.103202,0.091733,0.116861,0.023815,0.077006,0.474686,-0.046073,0.289066,-0.062201,...,-0.241741,-0.007812,0.195650,0.033870,0.234619,-0.097853,-0.321625,0.084369,0.269935,NEU
4119,-0.037851,-0.677482,0.064357,-0.290314,-0.096774,-0.031396,0.311618,0.012856,0.102354,-0.047596,...,0.193960,-0.045655,-0.071975,-0.048440,-0.011747,-0.345474,-0.058489,-0.196581,0.081686,P
4120,0.066436,-0.023699,0.106220,-0.162501,0.227659,-0.019291,0.429362,0.010465,-0.030707,-0.063433,...,0.247931,-0.018426,0.093356,-0.056099,0.093991,-0.211784,-0.095648,-0.154910,0.013870,VN


In [None]:
# Integer codes for the five sentiment labels, keyed by the column they apply to.
# The codes are plain identifiers (NEU=0, P=1, VP=2, N=3, VN=4), not an ordering.
numeric_var = {
    "comment_sentiment_class": dict(zip(("NEU", "P", "VP", "N", "VN"), range(5))),
}
cornell_feature_df = cornell_feature_df.replace(to_replace=numeric_var)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Hold out 20% of the comments for evaluation. Features are the author
# embeddings, labels are the sentiment-class strings; the fixed random_state
# keeps the partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    targets,
    test_size=0.2,
    random_state=42,
)

# Disabled baseline: multinomial logistic regression on the same split.
# model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
# model.fit(X_train, y_train)

# Disabled: evaluate the baseline on the held-out set.
# predictions = model.predict(X_test)
# print(classification_report(y_test, predictions))


NameError: ignored

In [None]:
from sklearn import svm

# Train a support-vector classifier with default settings on the current split,
# then report per-class precision/recall/F1 on the held-out set.
# Note: this rebinds `model`, which previously held the node2vec model.
model = svm.SVC().fit(X_train, y_train)
predictions = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a 100-tree random forest (seeded for repeatability) on the current split,
# then report per-class precision/recall/F1 on the held-out set.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
predictions = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           N       0.04      0.02      0.03        43
         NEU       0.44      0.67      0.53       300
           P       0.20      0.09      0.13       163
          VN       0.07      0.03      0.04        39
          VP       0.48      0.42      0.45       280

    accuracy                           0.41       825
   macro avg       0.24      0.25      0.23       825
weighted avg       0.37      0.41      0.37       825



In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train a single decision tree (seeded for repeatability) on the current split,
# then report per-class precision/recall/F1 on the held-out set.
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
predictions = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           N       0.04      0.05      0.04        43
         NEU       0.43      0.63      0.52       300
           P       0.20      0.12      0.15       163
          VN       0.05      0.03      0.03        39
          VP       0.47      0.36      0.41       280

    accuracy                           0.38       825
   macro avg       0.24      0.24      0.23       825
weighted avg       0.36      0.38      0.36       825



In [None]:
# Structural node features of the graph; each call returns a dict keyed by node.
deg_centrality = nx.degree_centrality(cornell_graph)
# Betweenness is computed here but not consumed by the feature-building cells
# visible below.
btw_centrality = nx.betweenness_centrality(cornell_graph, normalized=True,
                                           endpoints=False)
eig_centrality = nx.eigenvector_centrality(cornell_graph)
clustering_coefficients = nx.clustering(cornell_graph)
# The second positional argument of nx.pagerank is the damping factor `alpha`.
# Passing 0 there removes all link-following, so every node receives the same
# score (1/N) and the feature carries no information. Use the standard damping
# factor (networkx's documented default) and pass it by keyword.
pagerank = nx.pagerank(cornell_graph, alpha=0.85)

In [None]:
# Expand the per-node measures to one value per comment by looking up each
# comment's author; the lists are row-aligned with `node_ids`.
# A missing author raises KeyError, as with direct dict indexing.
eig_targets = list(map(eig_centrality.__getitem__, node_ids))
deg_targets = list(map(deg_centrality.__getitem__, node_ids))
cluster_targets = list(map(clustering_coefficients.__getitem__, node_ids))
pagerank_targets = list(map(pagerank.__getitem__, node_ids))


In [None]:
# Disabled alternative that also included the embeddings:
# X = np.hstack([eig_targets, embeddings])

# Turn each per-comment list into a column vector of shape (n_comments, 1).
eig_2d = np.asarray(eig_targets).reshape(-1, 1)
deg_2d = np.asarray(deg_targets).reshape(-1, 1)
cluster_2d = np.asarray(cluster_targets).reshape(-1, 1)
pagerank_2d = np.asarray(pagerank_targets).reshape(-1, 1)

# Feature matrix with four columns, in order:
# eigenvector centrality, degree centrality, clustering coefficient, PageRank.
X = np.concatenate([eig_2d, deg_2d, cluster_2d, pagerank_2d], axis=1)


In [None]:
# Re-split using the structural feature matrix `X` instead of the embeddings.
# Same 80/20 ratio and seed as the earlier split; this overwrites the previous
# X_train/X_test/y_train/y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    targets,
    test_size=0.2,
    random_state=42,
)


NameError: ignored

In [None]:
# Sanity check: (rows, feature columns) of the training matrix from the most
# recently executed split.
X_train.shape

(14584, 132)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Same random-forest configuration as the earlier cell (100 trees, seed 42),
# refit on whatever X_train/y_train the most recent split produced, followed by
# the per-class report on the held-out set.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
predictions = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           N       0.04      0.02      0.03        43
         NEU       0.44      0.67      0.53       300
           P       0.18      0.08      0.11       163
          VN       0.07      0.03      0.04        39
          VP       0.49      0.44      0.46       280

    accuracy                           0.41       825
   macro avg       0.24      0.25      0.23       825
weighted avg       0.36      0.41      0.37       825



In [None]:
import statsmodels.api as st

# Multinomial logit of sentiment class on eigenvector centrality alone;
# add_constant supplies the intercept column alongside the single regressor.
mdl = st.MNLogit(endog=targets, exog=st.add_constant(eig_targets))

In [None]:
# List the distinct sentiment labels present in `targets`.
set(targets)

{'N', 'NEU', 'P', 'VN', 'VP'}

In [None]:
# Estimate the multinomial logit defined in `mdl`; fit() prints the optimizer's
# convergence message.
mdl_fit = mdl.fit()
# Show the fitted-model summary (coefficients per outcome class) as the cell output.
mdl_fit.summary()

Optimization terminated successfully.
         Current function value: 1.435336
         Iterations 6


0,1,2,3
Dep. Variable:,y,No. Observations:,2811.0
Model:,MNLogit,Df Residuals:,2803.0
Method:,MLE,Df Model:,4.0
Date:,"Thu, 25 May 2023",Pseudo R-squ.:,0.0007101
Time:,13:35:24,Log-Likelihood:,-4034.7
converged:,True,LL-Null:,-4037.6
Covariance Type:,nonrobust,LLR p-value:,0.2199

y=NEU,coef,std err,z,P>|z|,[0.025,0.975]
const,1.4020,0.074,18.832,0.000,1.256,1.548
x1,0.0879,0.465,0.189,0.850,-0.824,1.000
y=P,coef,std err,z,P>|z|,[0.025,0.975]
const,0.6460,0.082,7.839,0.000,0.485,0.808
x1,-0.0865,0.520,-0.166,0.868,-1.106,0.933
y=VN,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.3661,0.104,-3.531,0.000,-0.569,-0.163
x1,0.4515,0.618,0.731,0.465,-0.759,1.662
y=VP,coef,std err,z,P>|z|,[0.025,0.975]
const,0.7685,0.081,9.475,0.000,0.610,0.927
