In [229]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

from scipy import stats


In [230]:
df_emotion_and_LIWC = pd.read_csv("LIWC_and_emotions.csv")
pos = (df_emotion_and_LIWC[df_emotion_and_LIWC["flagged"]=="Y"])
neg= (df_emotion_and_LIWC[df_emotion_and_LIWC["flagged"]=="N"])
df_neg_only = neg.sample(n=len(pos))

df_emotion = pd.concat([df_neg_only, pos])


In [231]:
df_emotion.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16320 entries, 1907 to 16409
Data columns (total 27 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          16320 non-null  object 
 1   reviewID      16320 non-null  object 
 2   reviewerID    16320 non-null  object 
 3   review        16320 non-null  object 
 4   rating        16320 non-null  int64  
 5   flagged       16320 non-null  object 
 6   restaurantID  16320 non-null  object 
 7   WC            16320 non-null  int64  
 8   Analytic      16320 non-null  float64
 9   Clout         16320 non-null  float64
 10  Authentic     16320 non-null  float64
 11  Tone          16320 non-null  float64
 12  WPS           16320 non-null  float64
 13  Sixltr        16320 non-null  float64
 14  pronoun       16320 non-null  float64
 15  ppron         16320 non-null  float64
 16  i             16320 non-null  float64
 17  we            16320 non-null  float64
 18  you           16320 non

## Replicating LIWC PACIS paper: T-tests

Features:

**LIWC**
- WC
- Analytical
- Tone
- Authentic
- Clout

**Other**
- Readability (Coleman Liau index)
- Rating extremity (the absolute difference of review star rating and existing business star rating)
- Days (days from the reviews date to the data collection date)

**To try**
- Days (days from first review to this review)
- Emotion extremity
- Neg/pos rating


In [232]:
#df_paired = df_emotion.drop(columns=["date","reviewID","reviewerID","review","rating","restaurantID","pronoun","i", "we", "you", "shehe", "they"])
#df_paired = df_emotion.drop(columns=["date","reviewID","reviewerID","review","restaurantID"])
res_ids = tuple(df_emotion["restaurantID"])


In [233]:
import sqlite3

def connect_db(db_file):
    conn = None
    try:
        conn = sqlite3.connect(db_file)
    except Error as e:
        print(e)

    return conn

conn = connect_db("yelpResData.db")

cur = conn.cursor()
query = "SELECT * FROM restaurant WHERE restaurantID in {}".format(res_ids)
query_dist = "SELECT count(*),count(distinct restaurantID) FROM restaurant"

cur.execute(query_dist)
rows_dist = cur.fetchall()
print(rows_dist) # checking for duplicates to see if maybe same record with diff ratings exists for a restaurant

# get restaurants matching our balanced reviews dataset
cur.execute(query)
rows = cur.fetchall()

OperationalError: no such table: restaurant

In [238]:
#for i,row in df_emotion.iterrows():
 #   df_emotion.at[i,"ResRating"] = rows[[rows[0]]==df_emotion.at[i,"restaurantID"]][4]
 
for i in range (0,len(rows)):
    df_emotion.at[df_emotion["restaurantID"]==rows[i][0],"ResRating"]= rows[i][4]

    
#df_emotion["ResRating"] = rows[[rows[0]]==df_emotion["restaurantID"]][4]



In [239]:
df_emotion["RatingDiff"] = np.abs(df_emotion["rating"]-df_emotion["ResRating"])
df_emotion

Unnamed: 0,date,reviewID,reviewerID,review,rating,flagged,restaurantID,WC,Analytic,Clout,...,shehe,they,ipron,sadness,anger,disgust,joy,fear,ResRating,RatingDiff
1907,1/22/2009,cBD6KENvFfUNpFrGRl7UoA,fNRFVr2VfZRdSHhJ0bAuIg,I am biased to Chicago Diner because I am a Ch...,5,N,o54U2VkQama8FI30qDkWvw,312,16.45,12.43,...,0.00,1.28,5.45,0.526334,0.058953,0.415800,0.659447,0.469948,4.0,1.0
14793,2/21/2012,3DaGmy3A9,ZYZNcugF3xUEGyLOVGiZ0Q,Sunday afternoon and we need some food! So whe...,4,N,GEpXn9rtTMtIj5NsGFiNQw,485,65.80,73.19,...,1.65,2.06,7.01,0.533953,0.073339,0.087051,0.608863,0.084966,4.0,0.0
511,9/3/2012,B7Kh3Q0qzMeNUEZJuXHOag,OnNFiXpYhrf0d_tqtK_hQA,I had the jalapeno scramble with poached eggs ...,3,N,tCkwBTuEFvhuaBoEUWqbuQ,101,71.80,61.69,...,0.00,0.00,1.98,0.490463,0.067765,0.067651,0.745796,0.032441,4.0,1.0
144,9/12/2011,vW69xu8tJVijKjo4Z2tYwg,Uia_yCmUkEiIfvY-z8PK5w,great fried chicken! up there with the best an...,4,N,WBU0yq9J8qiYQfI_fh2P1Q,33,57.84,38.08,...,0.00,0.00,3.03,0.033796,0.008173,0.007715,0.918996,0.010600,4.0,0.0
16070,9/15/2010,6lUnaRNl5d2lf8UNKjaajw,tobbLmgyO0Sb1U732tJcwA,"First, I have to say that the outdoor patio is...",2,N,JMa9cTNQA5mm4GbR2W8Oiw,477,42.24,12.88,...,0.21,0.42,6.71,0.553102,0.099967,0.120556,0.638909,0.135840,4.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16403,11/19/2010,BKXex6CFqUeQetnTvtibKg,oNFaRkiq17cfFc2q111nug,First time visiting and we were being treated ...,4,Y,8d_DiHWB-pjBVW3a7D8EWg,95,52.71,41.67,...,1.05,0.00,5.26,0.577808,0.085841,0.070605,0.688406,0.059792,4.0,0.0
16404,3/1/2011,ukeqNQkw3rlFaQuwz8p_xw,lH913sX4Mr7e8v2LMEAyJA,"Great food, great service, and great atmospher...",5,Y,6gZTET0y7ARZMrbJRmI2mw,79,72.31,84.44,...,0.00,0.00,2.53,0.185005,0.071661,0.025063,0.735421,0.006916,3.5,1.5
16405,5/4/2010,vwBJwQLYGumSvpe_cxxY1A,FtiocxJuP_RebtIqotRrpg,This place was incredible.. EXCELLENT service....,5,Y,_UJ3KqtHIHjExbAg-iIGUA,41,3.52,83.54,...,0.00,0.00,9.76,0.473392,0.053337,0.057628,0.708207,0.092933,4.0,1.0
16407,5/10/2010,7aAqvADp2FlbvY3OJ3I9OQ,BgPMUkC8LlLl1fc17RvUog,"So, it's nearly impossible for me to write a c...",3,Y,n-mj6IJkWyCu4BuD2dvm0A,176,56.25,41.03,...,1.14,0.00,5.68,0.585019,0.593745,0.497132,0.545144,0.118086,4.0,1.0


In [240]:
# calculating Readability (Coleman Liau index)
df_emotion["Sentences"] = df_emotion["WC"]/df_emotion["WPS"]

for i,row in df_emotion.iterrows():
    df_emotion.at[i,"Letters"] = sum(c.isalpha() for c in df_emotion.at[i,"review"])
    #len(df_emotion.at[i,"review"])
   # print(df_emotion.at[i,"review"])

df_emotion["Readability"] = (df_emotion["Letters"]*100/df_emotion["WC"]*0.0588)-(0.296*df_emotion["Sentences"]*100/df_emotion["WC"])-15.8
df_emotion.rename(columns={'rating': 'Rating'}, inplace=True)


In [246]:
stats_lst = []
no_emotions_lst = []

feature_lst = [ # includes emotions
 'WC',
 'Analytic',
 'Tone',
 'Clout',
 'Authentic',
 'joy',
 'anger',
 'sadness',
 'disgust',
 'fear',
 'Readability',
 'Rating',
 'RatingDiff']

################ without emotions ##############
linguist_feature_lst = [ # the Top Reviewer paper features only
 'WC',
 'Analytic',
 'Tone',
 'Clout',
 'Authentic',
 'Readability',
 'Rating',
 'RatingDiff']


for feature in feature_lst:
    mean_real = np.mean(df_emotion[df_emotion["flagged"]=="N"][feature])
    mean_fake = np.mean(df_emotion[df_emotion["flagged"]=="Y"][feature])
    med_real = np.median(df_emotion[df_emotion["flagged"]=="N"][feature])
    med_fake = np.median(df_emotion[df_emotion["flagged"]=="Y"][feature])
    stdev_real = np.std(df_emotion[df_emotion["flagged"]=="N"][feature])
    stdev_fake = np.std(df_emotion[df_emotion["flagged"]=="Y"][feature])
    stats_lst.append([feature,"real",mean_real,med_real,stdev_real])
    stats_lst.append([feature,"fake",mean_fake,med_fake,stdev_fake])

    


In [254]:
paired_stats = pd.DataFrame(stats_lst,columns=["Feature","Review","Mean","Median","Stdev"])

################ without emotions ##############
paired_stats_original_paper = paired_stats[paired_stats["Feature"].isin(linguist_feature_lst)]

In [257]:
paired_stats

################ without emotions ##############
#paired_stats_original_paper  

Unnamed: 0,Feature,Review,Mean,Median,Stdev
0,WC,real,146.257843,114.0,121.148709
1,WC,fake,101.407475,71.0,99.644241
2,Analytic,real,57.0641,58.73,23.515106
3,Analytic,fake,56.974985,58.68,25.955518
4,Tone,real,81.775381,94.47,24.968413
5,Tone,fake,80.651121,97.58,28.672983
6,Clout,real,47.670039,47.205,23.447147
7,Clout,fake,51.715174,50.0,25.854837
8,Authentic,real,48.125395,47.085,27.903612
9,Authentic,fake,47.877473,47.07,30.546632


In [244]:
t_test_df = []
for feature in feature_lst:
    t_test = stats.ttest_ind(df_emotion[df_emotion["flagged"]=="N"][feature],df_emotion[df_emotion["flagged"]=="Y"][feature])
    t_stat = round(t_test[0],3)
    p_val = round(t_test[1],3)
    t_test_df.append([feature,t_stat,p_val])

pd.DataFrame(t_test_df, columns=["Feature","t-stat","p-value"])

Unnamed: 0,Feature,t-stat,p-value
0,WC,25.826,0.0
1,Analytic,0.23,0.818
2,Tone,2.671,0.008
3,Clout,-10.469,0.0
4,Authentic,0.541,0.588
5,joy,-1.797,0.072
6,anger,4.184,0.0
7,sadness,15.646,0.0
8,disgust,4.565,0.0
9,fear,10.155,0.0


In [186]:
round(stats.ttest_ind(df_neg_only["WC"],pos["WC"])[0],3)
#stats.ttest_ind(df_emotion[df_emotion["flagged"]=="N"]["RatingDiff"],df_emotion[df_emotion["flagged"]=="N"]["RatingDiff"])

25.806

## Regression

In [6]:
df_reg_emotion = df_emotion.drop(columns=["date","reviewID","reviewerID","review","rating","restaurantID","pronoun","i", "we", "you", "shehe", "they"])
le_emotion = preprocessing.LabelEncoder()
df_reg_emotion["flagged"] = le_emotion.fit_transform(df_reg_emotion["flagged"])

X = df_reg_emotion.drop(columns=["flagged"])
Y = df_reg_emotion["flagged"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1, shuffle=True)
clf = LogisticRegression(random_state=1, max_iter = 1000).fit(X_train, Y_train)
clf.predict(X_test)
clf.score(X_test, Y_test)

0.6127450980392157

In [7]:
from sklearn.metrics import accuracy_score

#from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state = 1, n_estimators=1000)
#X = feature_selection(X_train,Y_train, 150)

rf_model.fit(X_train, Y_train)
rf_predictions = rf_model.predict(X_test)
acc = accuracy_score(rf_predictions,Y_test)
acc

0.6366421568627451