In [19]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing


In [121]:
df_emotion_and_LIWC = pd.read_csv("LIWC_and_emotions.csv")
pos = (df_emotion_and_LIWC[df_emotion_and_LIWC["flagged"]=="Y"])
neg= (df_emotion_and_LIWC[df_emotion_and_LIWC["flagged"]=="N"])
df_neg_only = neg.sample(n=len(pos))

df_emotion = pd.concat([df_neg_only, pos])


In [122]:
df_emotion.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16320 entries, 6263 to 16409
Data columns (total 27 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          16320 non-null  object 
 1   reviewID      16320 non-null  object 
 2   reviewerID    16320 non-null  object 
 3   review        16320 non-null  object 
 4   rating        16320 non-null  int64  
 5   flagged       16320 non-null  object 
 6   restaurantID  16320 non-null  object 
 7   WC            16320 non-null  int64  
 8   Analytic      16320 non-null  float64
 9   Clout         16320 non-null  float64
 10  Authentic     16320 non-null  float64
 11  Tone          16320 non-null  float64
 12  WPS           16320 non-null  float64
 13  Sixltr        16320 non-null  float64
 14  pronoun       16320 non-null  float64
 15  ppron         16320 non-null  float64
 16  i             16320 non-null  float64
 17  we            16320 non-null  float64
 18  you           16320 non

## Replicating LIWC PACIS paper: T-tests

Features:

**LIWC**
- WC
- Analytical
- Tone
- Authentic
- Clout

**Other**
- Readability (Coleman Liau index)
- Rating extremity (the absolute difference of review star rating and existing business star rating)
- Days (days from the reviews date to the data collection date)

**To try**
- Days (days from first review to this review)
- Emotion extremity


In [99]:
#df_paired = df_emotion.drop(columns=["date","reviewID","reviewerID","review","rating","restaurantID","pronoun","i", "we", "you", "shehe", "they"])
#df_paired = df_emotion.drop(columns=["date","reviewID","reviewerID","review","restaurantID"])
#df_paired


In [123]:
# calculating Readability (Coleman Liau index)
df_emotion["Sentences"] = df_emotion["WC"]/df_emotion["WPS"]

for i,row in df_emotion.iterrows():
    df_emotion.at[i,"Letters"] = sum(c.isalpha() for c in df_emotion.at[i,"review"])
    #len(df_emotion.at[i,"review"])
   # print(df_emotion.at[i,"review"])

df_emotion["Readability"] = (df_emotion["Letters"]*100/df_emotion["WC"]*0.0588)-(0.296*df_emotion["Sentences"]*100/df_emotion["WC"])-15.8



In [45]:
wc_mean_real = np.mean(df_neg_only["WC"])
wc_median_real = np.median(df_neg_only["WC"])
wc_std_real = np.std(df_neg_only["WC"])

data = {'Feature':  ['WC', 'WC', 
                     'Analytic','Analytic',
                     'Tone','Tone',
                     'Clout','Clout',
                     'Authentic','Authentic'],
        'Flag': ['Real', 'Fake', 'Real', 'Fake','Real', 'Fake','Real', 'Fake','Real', 'Fake'],
        'Mean': [np.mean(df_neg_only["WC"]), np.mean(pos["WC"]),
                 np.mean(df_neg_only["Analytic"]), np.mean(pos["Analytic"]),
                 np.mean(df_neg_only["Tone"]), np.mean(pos["Tone"]),
                 np.mean(df_neg_only["Clout"]), np.mean(pos["Clout"]),
                 np.mean(df_neg_only["Authentic"]), np.mean(pos["Authentic"])],

        'Median': [np.median(df_neg_only["WC"]), np.median(pos["WC"]),
                   np.median(df_neg_only["Analytic"]), np.median(pos["Analytic"]),
                   np.median(df_neg_only["Tone"]), np.median(pos["Tone"]),
                   np.median(df_neg_only["Clout"]), np.median(pos["Clout"]),
                   np.median(df_neg_only["Authentic"]), np.median(pos["Authentic"])],
        
        'Stdev': [np.std(df_neg_only["WC"]), np.std(pos["WC"]),
                  np.std(df_neg_only["Analytic"]), np.std(pos["Analytic"]),
                  np.std(df_neg_only["Tone"]), np.std(pos["Tone"]),
                  np.std(df_neg_only["Clout"]), np.std(pos["Clout"]),
                  np.median(df_neg_only["Authentic"]), np.median(pos["Authentic"])]
    
        }

paired_stats = pd.DataFrame(data)

paired_stats

Unnamed: 0,Feature,Flag,Mean,Median,Stdev
0,WC,Real,146.008211,114.0,120.94349
1,WC,Fake,101.407475,71.0,99.644241
2,Analytic,Real,57.059205,58.8,23.50995
3,Analytic,Fake,56.974985,58.68,25.955518
4,Tone,Real,81.754786,94.53,24.998607
5,Tone,Fake,80.651121,97.58,28.672983
6,Clout,Real,47.635384,47.12,23.483563
7,Clout,Fake,51.715174,50.0,25.854837
8,Authentic,Real,48.161558,47.15,47.15
9,Authentic,Fake,47.877473,47.07,47.07


## Regression

In [6]:
df_reg_emotion = df_emotion.drop(columns=["date","reviewID","reviewerID","review","rating","restaurantID","pronoun","i", "we", "you", "shehe", "they"])
le_emotion = preprocessing.LabelEncoder()
df_reg_emotion["flagged"] = le_emotion.fit_transform(df_reg_emotion["flagged"])

X = df_reg_emotion.drop(columns=["flagged"])
Y = df_reg_emotion["flagged"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1, shuffle=True)
clf = LogisticRegression(random_state=1, max_iter = 1000).fit(X_train, Y_train)
clf.predict(X_test)
clf.score(X_test, Y_test)

0.6127450980392157

In [7]:
from sklearn.metrics import accuracy_score

#from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state = 1, n_estimators=1000)
#X = feature_selection(X_train,Y_train, 150)

rf_model.fit(X_train, Y_train)
rf_predictions = rf_model.predict(X_test)
acc = accuracy_score(rf_predictions,Y_test)
acc

0.6366421568627451