In [19]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing


In [21]:
# Load the review-level table from disk.
df_emotion_and_LIWC = pd.read_csv("LIWC_and_emotions.csv")

# Split the reviews by their "flagged" label.
pos = df_emotion_and_LIWC[df_emotion_and_LIWC["flagged"] == "Y"]
neg = df_emotion_and_LIWC[df_emotion_and_LIWC["flagged"] == "N"]

# Undersample the "N" rows down to the number of "Y" rows so both classes are
# equally represented. The draw is seeded so the balanced sample (and every
# statistic and model score derived from it) is identical on each run.
# NOTE: sampling without replacement requires len(neg) >= len(pos).
df_neg_only = neg.sample(n=len(pos), random_state=1)

df_emotion = pd.concat([df_neg_only, pos])


In [12]:
# Summarise the balanced frame: index range, column names, non-null counts and dtypes.
df_emotion.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16320 entries, 2360 to 16409
Data columns (total 27 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          16320 non-null  object 
 1   reviewID      16320 non-null  object 
 2   reviewerID    16320 non-null  object 
 3   review        16320 non-null  object 
 4   rating        16320 non-null  int64  
 5   flagged       16320 non-null  object 
 6   restaurantID  16320 non-null  object 
 7   WC            16320 non-null  int64  
 8   Analytic      16320 non-null  float64
 9   Clout         16320 non-null  float64
 10  Authentic     16320 non-null  float64
 11  Tone          16320 non-null  float64
 12  WPS           16320 non-null  float64
 13  Sixltr        16320 non-null  float64
 14  pronoun       16320 non-null  float64
 15  ppron         16320 non-null  float64
 16  i             16320 non-null  float64
 17  we            16320 non-null  float64
 18  you           16320 non

## Replicating LIWC PACIS paper: T-tests

Features:

**LIWC**
- WC
- Analytic
- Tone
- Authentic
- Clout

**Other**
- Readability (Coleman-Liau index)
- Rating extremity (the absolute difference between the review's star rating and the business's existing star rating)
- Days (days from the review's date to the data collection date)

**To try**
- Days (days from first review to this review)
- Emotion extremity


In [14]:
# Strip the identifier and raw-text columns; unlike the regression cell further
# down, the star rating and the pronoun columns are kept here.
df_paired = df_emotion.drop(
    labels=["date", "reviewID", "reviewerID", "review", "restaurantID"],
    axis="columns",
)
df_paired


Unnamed: 0,rating,flagged,WC,Analytic,Clout,Authentic,Tone,WPS,Sixltr,pronoun,...,we,you,shehe,they,ipron,sadness,anger,disgust,joy,fear
2360,4,N,100,63.83,27.43,56.63,96.76,16.67,15.00,7.00,...,0.00,1.00,0.00,0.00,1.00,0.121112,0.054115,0.046659,0.705281,0.066371
12241,2,N,179,39.76,14.42,55.21,84.76,10.53,8.38,15.08,...,0.56,1.12,0.00,0.00,8.38,0.503424,0.090788,0.080941,0.639578,0.063570
2785,1,N,78,77.75,30.40,30.55,89.84,13.00,11.54,8.97,...,0.00,0.00,0.00,1.28,2.56,0.175217,0.066383,0.506963,0.685059,0.063170
7857,2,N,71,49.49,83.80,5.25,25.77,11.83,18.31,15.49,...,4.23,0.00,2.82,0.00,5.63,0.594249,0.136808,0.105732,0.527662,0.063603
603,3,N,137,35.17,58.65,24.13,99.00,17.12,11.68,17.52,...,1.46,0.73,2.19,1.46,8.03,0.163281,0.083947,0.062964,0.700832,0.124351
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16403,4,Y,95,52.71,41.67,70.38,97.58,11.88,18.95,17.89,...,2.11,0.00,1.05,0.00,5.26,0.577808,0.085841,0.070605,0.688406,0.059792
16404,5,Y,79,72.31,84.44,22.44,99.00,15.80,8.86,7.59,...,0.00,3.80,0.00,0.00,2.53,0.185005,0.071661,0.025063,0.735421,0.006916
16405,5,Y,41,3.52,83.54,7.12,99.00,5.12,17.07,17.07,...,7.32,0.00,0.00,0.00,9.76,0.473392,0.053337,0.057628,0.708207,0.092933
16407,3,Y,176,56.25,41.03,35.37,35.71,17.60,15.34,17.61,...,2.27,0.00,1.14,0.00,5.68,0.585019,0.593745,0.497132,0.545144,0.118086


In [20]:
# Mean and median word count (WC) over the balanced sample, one value per line.
print(np.mean(df_paired["WC"]), np.median(df_paired["WC"]), sep="\n")

123.67640931372549
90.0


In [24]:
# Word-count (WC) mean, median and standard deviation for the sampled
# flagged == "N" reviews, computed with the same numpy reducers in one pass.
wc_mean_real, wc_median_real, wc_std_real = (
    reducer(df_neg_only["WC"]) for reducer in (np.mean, np.median, np.std)
)


## Regression

In [6]:
# Build the modelling frame: drop identifiers, the raw review text, the star
# rating and several pronoun columns, keeping the label plus remaining features.
df_reg_emotion = df_emotion.drop(columns=["date","reviewID","reviewerID","review","rating","restaurantID","pronoun","i", "we", "you", "shehe", "they"])

# Encode the "flagged" label as integers. LabelEncoder orders classes by sorted
# value, so "N" -> 0 and "Y" -> 1.
le_emotion = preprocessing.LabelEncoder()
df_reg_emotion["flagged"] = le_emotion.fit_transform(df_reg_emotion["flagged"])

X = df_reg_emotion.drop(columns=["flagged"])
Y = df_reg_emotion["flagged"]

# 80/20 shuffled split with a fixed seed. The resulting X_train / X_test /
# Y_train / Y_test are reused by the random forest cell below.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1, shuffle=True)
clf = LogisticRegression(random_state=1, max_iter=1000).fit(X_train, Y_train)

# Mean accuracy on the held-out split. score() runs predict() internally, so a
# separate predict call whose result is discarded is unnecessary.
clf.score(X_test, Y_test)

0.6127450980392157

In [7]:
# Random forest baseline on the same train/test split as the logistic regression.
# RandomForestClassifier and accuracy_score are already imported in the
# notebook's first (imports) cell, so they are not re-imported here.
rf_model = RandomForestClassifier(random_state=1, n_estimators=1000)
rf_model.fit(X_train, Y_train)
rf_predictions = rf_model.predict(X_test)

# accuracy_score is documented as (y_true, y_pred). Accuracy itself is symmetric,
# but keeping the documented order avoids silent errors if this line is later
# adapted to an asymmetric metric such as precision or recall.
acc = accuracy_score(Y_test, rf_predictions)
acc

0.6366421568627451