In [19]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing


In [121]:
# Load the combined LIWC + emotion feature table.
df_emotion_and_LIWC = pd.read_csv("LIWC_and_emotions.csv")

# Split the reviews by their "flagged" label ("Y" vs "N").
pos = df_emotion_and_LIWC[df_emotion_and_LIWC["flagged"] == "Y"]
neg = df_emotion_and_LIWC[df_emotion_and_LIWC["flagged"] == "N"]

# Undersample the "N" reviews to the size of the "Y" group so both classes are
# equally represented. The seed is fixed (same value the models further down
# use) so that Restart & Run All reproduces the same balanced sample.
df_neg_only = neg.sample(n=len(pos), random_state=1)

# Balanced working frame: sampled "N" rows first, then all "Y" rows.
# The original row labels from the CSV frame are kept.
df_emotion = pd.concat([df_neg_only, pos])


In [122]:
# Print the balanced frame's index range, column dtypes and non-null counts.
df_emotion.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16320 entries, 6263 to 16409
Data columns (total 27 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          16320 non-null  object 
 1   reviewID      16320 non-null  object 
 2   reviewerID    16320 non-null  object 
 3   review        16320 non-null  object 
 4   rating        16320 non-null  int64  
 5   flagged       16320 non-null  object 
 6   restaurantID  16320 non-null  object 
 7   WC            16320 non-null  int64  
 8   Analytic      16320 non-null  float64
 9   Clout         16320 non-null  float64
 10  Authentic     16320 non-null  float64
 11  Tone          16320 non-null  float64
 12  WPS           16320 non-null  float64
 13  Sixltr        16320 non-null  float64
 14  pronoun       16320 non-null  float64
 15  ppron         16320 non-null  float64
 16  i             16320 non-null  float64
 17  we            16320 non-null  float64
 18  you           16320 non

## Replicating LIWC PACIS paper: T-tests

Features:

**LIWC**
- WC
- Analytic
- Tone
- Authentic
- Clout

**Other**
- Readability (Coleman–Liau index)
- Rating extremity (the absolute difference between the review's star rating and the business's existing star rating)
- Days (days from the review's date to the data collection date)

**To try**
- Days (days from first review to this review)
- Emotion extremity


In [99]:
#df_paired = df_emotion.drop(columns=["date","reviewID","reviewerID","review","rating","restaurantID","pronoun","i", "we", "you", "shehe", "they"])
#df_paired = df_emotion.drop(columns=["date","reviewID","reviewerID","review","restaurantID"])
#df_paired


In [123]:
# Readability (Coleman-Liau index):
#     CLI = 0.0588 * L - 0.296 * S - 15.8
# where L is letters per 100 words and S is sentences per 100 words.

# Sentence count recovered from word count and WPS
# (WPS is presumably words-per-sentence -- confirm against the LIWC output).
df_emotion["Sentences"] = df_emotion["WC"] / df_emotion["WPS"]

# Alphabetic characters per review, computed in one pass over the column
# instead of a row-by-row iterrows()/.at loop. Cast to float so the column
# keeps the dtype the loop-based version produced.
df_emotion["Letters"] = (
    df_emotion["review"]
    .map(lambda text: sum(ch.isalpha() for ch in text))
    .astype(float)
)

# Both terms are scaled to "per 100 words" by the * 100 / WC factor.
df_emotion["Readability"] = (
    (df_emotion["Letters"] * 100 / df_emotion["WC"] * 0.0588)
    - (0.296 * df_emotion["Sentences"] * 100 / df_emotion["WC"])
    - 15.8
)



In [131]:
# Stand-alone word-count statistics for the sampled "N" (real) reviews.
wc_mean_real = np.mean(df_neg_only["WC"])
wc_median_real = np.median(df_neg_only["WC"])
wc_std_real = np.std(df_neg_only["WC"])


def _summarise(feature, label, values):
    """Build one summary row for a feature / review-group pair.

    Uses np.mean, np.median and np.std (NumPy's default ddof=0) on `values`.
    """
    return {
        'Feature': feature,
        'Reviews': label,
        'Mean': np.mean(values),
        'Median': np.median(values),
        'Stdev': np.std(values),
    }


summary_rows = []

# LIWC features are read from the two class frames: a "Real" row followed by
# a "Fake" row for every feature.
for feature in ('WC', 'Analytic', 'Tone', 'Clout', 'Authentic'):
    summary_rows.append(_summarise(feature, 'Real', df_neg_only[feature]))
    summary_rows.append(_summarise(feature, 'Fake', pos[feature]))

# Readability is read from df_emotion, split on the "flagged" label.
for label, flag in (('Real', 'N'), ('Fake', 'Y')):
    readability = df_emotion[df_emotion["flagged"] == flag]["Readability"]
    summary_rows.append(_summarise('Readability', label, readability))

# Column-oriented dict (one list per column), in display order.
data = {
    column: [entry[column] for entry in summary_rows]
    for column in ('Feature', 'Reviews', 'Mean', 'Median', 'Stdev')
}

paired_stats = pd.DataFrame(data)

paired_stats

Unnamed: 0,Feature,Reviews,Mean,Median,Stdev
0,WC,Real,146.210784,114.0,121.098144
1,WC,Fake,101.407475,71.0,99.644241
2,Analytic,Real,57.071744,58.74,23.504155
3,Analytic,Fake,56.974985,58.68,25.955518
4,Tone,Real,81.758081,94.5,25.004482
5,Tone,Fake,80.651121,97.58,28.672983
6,Clout,Real,47.643725,47.14,23.492169
7,Clout,Fake,51.715174,50.0,25.854837
8,Authentic,Real,48.123295,47.07,27.914343
9,Authentic,Fake,47.877473,47.07,30.546632


In [132]:
# Transposed view: one column per feature / review-group pair.
paired_stats.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Feature,WC,WC,Analytic,Analytic,Tone,Tone,Clout,Clout,Authentic,Authentic,Readability,Readability
Reviews,Real,Fake,Real,Fake,Real,Fake,Real,Fake,Real,Fake,Real,Fake
Mean,146.211,101.407,57.0717,56.975,81.7581,80.6511,47.6437,51.7152,48.1233,47.8775,6.61554,6.58927
Median,114,71,58.74,58.68,94.5,97.58,47.14,50,47.07,47.07,6.60937,6.59962
Stdev,121.098,99.6442,23.5042,25.9555,25.0045,28.673,23.4922,25.8548,27.9143,30.5466,1.99912,2.42207


In [133]:
# Display the rows whose "flagged" label is "N".
df_emotion.loc[df_emotion["flagged"] == "N"]

Unnamed: 0,date,reviewID,reviewerID,review,rating,flagged,restaurantID,WC,Analytic,Clout,...,they,ipron,sadness,anger,disgust,joy,fear,Sentences,Letters,Readability
6263,5/18/2012,kzfRJq8looL9rjU5vrJMcg,xOunEwtayNchLXE9IM7XHQ,"Yes, this restaurant is awesome. the only Bad ...",5,N,LMaoM2Ue2BR_HI9ba3JsZg,131,55.29,87.39,...,0.76,5.34,0.129982,0.097658,0.068855,0.658567,0.706310,11.996337,601.0,8.465560
1674,1/25/2011,C1LiRnlMV95zIiQRIr5o-w,t16vWq9ZYH1lYSoHKKfqZw,Very friendly pub atmosphere. We've been twice...,5,N,OKOfoisduqP24uK3WYvx7Q,115,73.46,50.00,...,1.74,4.35,0.510948,0.083127,0.067928,0.698898,0.079495,7.997218,467.0,6.019499
8061,2/6/2011,LuN1gwGwt0RneIowwB,o9sjufIabihnayQbdmJDiQ,Love this place. I have been here three times ...,5,N,mlhHPnHmjO2_ypcP5Vc9Zw,64,42.94,56.21,...,1.56,6.25,0.115946,0.088060,0.029878,0.731274,0.054353,5.998126,263.0,5.588992
9588,3/10/2009,FHSmkH2zShjD6i,SFa-GP9EXJc298frDq-oUg,What happened Frontera? This is the last and f...,4,N,AqgG-1aD6JYj9D6OmBWO3w,254,47.99,88.87,...,0.79,7.48,0.568259,0.114541,0.451547,0.521832,0.099869,11.998111,1034.0,6.738488
11072,11/13/2010,l1RBvNmvKkCJ5PS4lbcYSw,eqVWf0donIn6fD2DOCajFA,A friend and I went here for Saturday brunch. ...,5,N,AqgG-1aD6JYj9D6OmBWO3w,270,55.85,28.91,...,0.74,4.81,0.635465,0.117987,0.078354,0.638787,0.090032,18.000000,1113.0,6.465333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9022,8/7/2010,2Hcoqet5w2ZHjeGzxz4wHw,pt9vY72eWMIs-koHzkxb_g,"In a few words, great brunch joint. A key poin...",4,N,RYw_no9ohtCg09aKorVffg,151,54.95,83.98,...,2.65,4.64,0.446580,0.473803,0.108356,0.677692,0.088323,12.994836,695.0,8.716244
5861,8/12/2006,km25lpRBA-3IkTEMccXgHw,dhuWV1YmyMhjCTS4bh1uoA,Really yummy deep dish pizza! But you gotta ha...,5,N,tW2jfL-qMccAYZSghPBbHA,80,23.10,73.40,...,3.75,7.50,0.157558,0.073714,0.065535,0.714233,0.063741,6.001500,344.0,7.263445
7053,4/27/2012,oMMtcwO,ifMlBSNnQTE5i9D7Zao0DA,I was so stoked about writing this review beca...,5,N,LMaoM2Ue2BR_HI9ba3JsZg,398,87.60,46.98,...,0.25,5.03,0.116470,0.076068,0.111267,0.695240,0.079263,27.001357,1804.0,8.843919
12625,12/15/2011,UMK643UFCgPXgsHkmNsB2g,LZt80LbsfQ9kOz96H7c1bA,"People are going to hate me for this, and I ki...",3,N,IuFEm-19YYh49T5ZlvwkDA,124,40.85,20.99,...,1.61,7.26,0.224602,0.563090,0.496468,0.576496,0.094027,7.001694,492.0,5.858950


## Regression

In [6]:
# Columns excluded from the model input: identifiers, date, raw text, star
# rating and the listed pronoun columns.
non_feature_columns = [
    "date", "reviewID", "reviewerID", "review", "rating", "restaurantID",
    "pronoun", "i", "we", "you", "shehe", "they",
]
df_reg_emotion = df_emotion.drop(columns=non_feature_columns)

# Encode the "flagged" labels as integers for the classifiers.
le_emotion = preprocessing.LabelEncoder()
df_reg_emotion["flagged"] = le_emotion.fit_transform(df_reg_emotion["flagged"])

# Target is the encoded flag; every remaining column is a feature.
Y = df_reg_emotion["flagged"]
X = df_reg_emotion.drop(columns=["flagged"])

# Shuffled 80/20 split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, shuffle=True, random_state=1
)

# Logistic-regression baseline; the last expression shows its score on the test split.
clf = LogisticRegression(max_iter=1000, random_state=1)
clf.fit(X_train, Y_train)
clf.predict(X_test)
clf.score(X_test, Y_test)

0.6127450980392157

In [7]:
# Random-forest classifier trained on the same train/test split as the
# logistic regression. RandomForestClassifier and accuracy_score are already
# imported in the notebook's import cell, so they are not re-imported here.
rf_model = RandomForestClassifier(random_state=1, n_estimators=1000)
rf_model.fit(X_train, Y_train)
rf_predictions = rf_model.predict(X_test)

# accuracy_score takes (y_true, y_pred): ground truth first, predictions second.
acc = accuracy_score(Y_test, rf_predictions)
acc

0.6366421568627451