In [294]:
import pandas as pd
import random
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import statsmodels.api as sm


from scipy import stats


In [282]:
random.seed(1)

df_emotion_and_LIWC = pd.read_csv("LIWC_and_emotions.csv")
pos = (df_emotion_and_LIWC[df_emotion_and_LIWC["flagged"]=="Y"])
neg= (df_emotion_and_LIWC[df_emotion_and_LIWC["flagged"]=="N"])
df_neg_only = neg.sample(n=len(pos))

df_emotion = pd.concat([df_neg_only, pos])


In [283]:
df_emotion.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16320 entries, 11007 to 16409
Data columns (total 27 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          16320 non-null  object 
 1   reviewID      16320 non-null  object 
 2   reviewerID    16320 non-null  object 
 3   review        16320 non-null  object 
 4   rating        16320 non-null  int64  
 5   flagged       16320 non-null  object 
 6   restaurantID  16320 non-null  object 
 7   WC            16320 non-null  int64  
 8   Analytic      16320 non-null  float64
 9   Clout         16320 non-null  float64
 10  Authentic     16320 non-null  float64
 11  Tone          16320 non-null  float64
 12  WPS           16320 non-null  float64
 13  Sixltr        16320 non-null  float64
 14  pronoun       16320 non-null  float64
 15  ppron         16320 non-null  float64
 16  i             16320 non-null  float64
 17  we            16320 non-null  float64
 18  you           16320 no

## Replicating LIWC PACIS paper: T-tests

Features:

**LIWC**
- WC
- Analytical
- Tone
- Authentic
- Clout

**Other**
- Readability (Coleman Liau index)
- Rating extremity (the absolute difference of review star rating and existing business star rating)
- Days (days from the reviews date to the data collection date)

**To try**
- Days (days from first review to this review)
- Emotion extremity
- Neg/pos rating


In [284]:
#df_paired = df_emotion.drop(columns=["date","reviewID","reviewerID","review","rating","restaurantID","pronoun","i", "we", "you", "shehe", "they"])
#df_paired = df_emotion.drop(columns=["date","reviewID","reviewerID","review","restaurantID"])
res_ids = tuple(df_emotion["restaurantID"])


In [285]:
import sqlite3

def connect_db(db_file):
    conn = None
    try:
        conn = sqlite3.connect(db_file)
    except Error as e:
        print(e)

    return conn

conn = connect_db("/Users/jamesfyfe/summer vac misc/yelpResData.db")

cur = conn.cursor()
query = "SELECT * FROM restaurant WHERE restaurantID in {}".format(res_ids)
query_dist = "SELECT count(*),count(distinct restaurantID) FROM restaurant"

cur.execute(query_dist)
rows_dist = cur.fetchall()
print(rows_dist) # checking for duplicates to see if maybe same record with diff ratings exists for a restaurant

# get restaurants matching our balanced reviews dataset
cur.execute(query)
rows = cur.fetchall()

[(242652, 242652)]


In [286]:
#for i,row in df_emotion.iterrows():
 #   df_emotion.at[i,"ResRating"] = rows[[rows[0]]==df_emotion.at[i,"restaurantID"]][4]
 
for i in range (0,len(rows)):
    df_emotion.at[df_emotion["restaurantID"]==rows[i][0],"ResRating"]= rows[i][4]

    
#df_emotion["ResRating"] = rows[[rows[0]]==df_emotion["restaurantID"]][4]



In [287]:
df_emotion["RatingDiff"] = np.abs(df_emotion["rating"]-df_emotion["ResRating"])
df_emotion

Unnamed: 0,date,reviewID,reviewerID,review,rating,flagged,restaurantID,WC,Analytic,Clout,...,shehe,they,ipron,sadness,anger,disgust,joy,fear,ResRating,RatingDiff
11007,5/15/2010,HBTRNWQGAW8VTfzqj5j1zA,sQ4RrXt8I-C41jzIIP9b7g,I'm downgrading Icosium for some recent change...,4,N,Rlg07N7Sl1qTzjc7zc6Whg,147,23.01,13.82,...,0.00,0.68,5.44,0.491672,0.102453,0.082873,0.541424,0.108875,4.0,0.0
2311,10/15/2008,h73VpwyDGGFiag9h9BCVSg,BzfxA_QOitHQIvCR23Kncw,This is the only Ruths Chris I've been to that...,5,N,n-mj6IJkWyCu4BuD2dvm0A,154,50.49,27.95,...,0.00,1.30,5.84,0.204665,0.112385,0.154933,0.686811,0.039286,4.0,1.0
13117,4/29/2011,HbReHBMC8n1-yr4aZT8Aaw,ny7Bdvl0R-zQ-4mR1CHsyA,"NO way will I ever ever wait for a ground ""who...",1,N,HOJqzz1WvOmeR9oESJ4d9A,57,84.03,75.87,...,0.00,0.00,7.02,0.035153,0.485899,0.366629,0.279918,0.051897,4.5,3.5
2070,10/28/2007,bPnBmv9fRr1W7ihOMgb7nA,yYMSq-x44SoExJkKgqWBpw,07/25/2010: Hmmm. Reading my earlier review ma...,2,N,xjZ9eqYrzMrl2v6_1Kv6hA,219,72.01,27.65,...,0.00,0.91,3.20,0.610605,0.056033,0.053013,0.666116,0.108468,4.0,2.0
3636,12/10/2010,4BYfbB9sWJZXcX8373pMOA,vzxzviEmA5Ao_tsvfXVwzg,THE OWNER IS A TOTAL DOUCHE. He is SO rude. We...,1,N,_b69OdBNLhysN1jFlYNQiA,137,15.71,41.35,...,5.11,0.00,3.65,0.590570,0.579410,0.127471,0.120227,0.104933,4.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16403,11/19/2010,BKXex6CFqUeQetnTvtibKg,oNFaRkiq17cfFc2q111nug,First time visiting and we were being treated ...,4,Y,8d_DiHWB-pjBVW3a7D8EWg,95,52.71,41.67,...,1.05,0.00,5.26,0.577808,0.085841,0.070605,0.688406,0.059792,4.0,0.0
16404,3/1/2011,ukeqNQkw3rlFaQuwz8p_xw,lH913sX4Mr7e8v2LMEAyJA,"Great food, great service, and great atmospher...",5,Y,6gZTET0y7ARZMrbJRmI2mw,79,72.31,84.44,...,0.00,0.00,2.53,0.185005,0.071661,0.025063,0.735421,0.006916,3.5,1.5
16405,5/4/2010,vwBJwQLYGumSvpe_cxxY1A,FtiocxJuP_RebtIqotRrpg,This place was incredible.. EXCELLENT service....,5,Y,_UJ3KqtHIHjExbAg-iIGUA,41,3.52,83.54,...,0.00,0.00,9.76,0.473392,0.053337,0.057628,0.708207,0.092933,4.0,1.0
16407,5/10/2010,7aAqvADp2FlbvY3OJ3I9OQ,BgPMUkC8LlLl1fc17RvUog,"So, it's nearly impossible for me to write a c...",3,Y,n-mj6IJkWyCu4BuD2dvm0A,176,56.25,41.03,...,1.14,0.00,5.68,0.585019,0.593745,0.497132,0.545144,0.118086,4.0,1.0


In [288]:
# calculating Readability (Coleman Liau index)
df_emotion["Sentences"] = df_emotion["WC"]/df_emotion["WPS"]

for i,row in df_emotion.iterrows():
    df_emotion.at[i,"Letters"] = sum(c.isalpha() for c in df_emotion.at[i,"review"])
    #len(df_emotion.at[i,"review"])
   # print(df_emotion.at[i,"review"])

df_emotion["Readability"] = (df_emotion["Letters"]*100/df_emotion["WC"]*0.0588)-(0.296*df_emotion["Sentences"]*100/df_emotion["WC"])-15.8
df_emotion.rename(columns={'rating': 'Rating'}, inplace=True)


In [310]:
stats_lst = []
no_emotions_lst = []

feature_lst = [ # includes emotions
 'WC',
 'Analytic',
 'Tone',
 'Clout',
 'Authentic',
 'joy',
 'anger',
 'sadness',
 'disgust',
 'fear',
 'Readability',
 'Rating',
 'RatingDiff',
 'ipron', #impersonal pronouns
    'i',
    'we',
    'you',
 'shehe',
 'they']

################ without emotions ##############
linguist_feature_lst = [ # the Top Reviewer paper features only
 'WC',
 'Analytic',
 'Tone',
 'Clout',
 'Authentic',
 'Readability',
 'Rating',
 'RatingDiff']


for feature in feature_lst:
    mean_real = np.mean(df_emotion[df_emotion["flagged"]=="N"][feature])
    mean_fake = np.mean(df_emotion[df_emotion["flagged"]=="Y"][feature])
    med_real = np.median(df_emotion[df_emotion["flagged"]=="N"][feature])
    med_fake = np.median(df_emotion[df_emotion["flagged"]=="Y"][feature])
    stdev_real = np.std(df_emotion[df_emotion["flagged"]=="N"][feature])
    stdev_fake = np.std(df_emotion[df_emotion["flagged"]=="Y"][feature])
    stats_lst.append([feature,"real",mean_real,med_real,stdev_real])
    stats_lst.append([feature,"fake",mean_fake,med_fake,stdev_fake])

    


In [311]:
paired_stats = pd.DataFrame(stats_lst,columns=["Feature","Review","Mean","Median","Stdev"])

################ without emotions ##############
paired_stats_original_paper = paired_stats[paired_stats["Feature"].isin(linguist_feature_lst)]

In [312]:
paired_stats

################ without emotions ##############
#paired_stats_original_paper  

Unnamed: 0,Feature,Review,Mean,Median,Stdev
0,WC,real,146.27549,114.0,121.077323
1,WC,fake,101.407475,71.0,99.644241
2,Analytic,real,57.05524,58.73,23.501866
3,Analytic,fake,56.974985,58.68,25.955518
4,Tone,real,81.763053,94.47,24.982012
5,Tone,fake,80.651121,97.58,28.672983
6,Clout,real,47.620385,47.095,23.456895
7,Clout,fake,51.715174,50.0,25.854837
8,Authentic,real,48.138125,47.07,27.909566
9,Authentic,fake,47.877473,47.07,30.546632


In [313]:
t_test_df = []
for feature in feature_lst:
    t_test = stats.ttest_ind(df_emotion[df_emotion["flagged"]=="N"][feature],df_emotion[df_emotion["flagged"]=="Y"][feature])
    t_stat = round(t_test[0],3)
    p_val = round(t_test[1],3)
    t_test_df.append([feature,t_stat,p_val])

pd.DataFrame(t_test_df, columns=["Feature","t-stat","p-value"])

Unnamed: 0,Feature,t-stat,p-value
0,WC,25.846,0.0
1,Analytic,0.207,0.836
2,Tone,2.641,0.008
3,Clout,-10.595,0.0
4,Authentic,0.569,0.569
5,joy,-1.757,0.079
6,anger,4.165,0.0
7,sadness,15.713,0.0
8,disgust,4.633,0.0
9,fear,10.175,0.0


## Regression

In [330]:
df_reg_emotion = df_emotion.drop(columns=["date","reviewID","reviewerID","review","Rating","restaurantID","ResRating","Sixltr","Sentences","Letters","pronoun","ppron"])

le_emotion = preprocessing.LabelEncoder()
df_reg_emotion["flagged"] = le_emotion.fit_transform(df_reg_emotion["flagged"])

X = df_reg_emotion.drop(columns=["flagged"])
Y = df_reg_emotion["flagged"]
X = sm.add_constant(X)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1, shuffle=True)

model = sm.Logit(np.asarray(Y_train), np.asarray(X_train))
result = model.fit(method='newton')
result.predict(np.asarray(X_test))
print("\nAccuracy: " + str(sum((result.predict(np.asarray(X_test)) >= 0.5).astype(int) == Y_test)/len(Y_test)))
result.summary()



Optimization terminated successfully.
         Current function value: 0.631163
         Iterations 6

Accuracy: 0.640625


0,1,2,3
Dep. Variable:,y,No. Observations:,13056.0
Model:,Logit,Df Residuals:,13036.0
Method:,MLE,Df Model:,19.0
Date:,"Tue, 19 Jan 2021",Pseudo R-squ.:,0.08941
Time:,15:33:11,Log-Likelihood:,-8240.5
converged:,True,LL-Null:,-9049.6
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.7190,0.180,-3.995,0.000,-1.072,-0.366
x1,-0.0049,0.000,-19.924,0.000,-0.005,-0.004
x2,-0.0022,0.001,-2.225,0.026,-0.004,-0.000
x3,0.0145,0.001,10.689,0.000,0.012,0.017
x4,0.0033,0.001,4.302,0.000,0.002,0.005
x5,-0.0009,0.001,-0.949,0.342,-0.003,0.001
x6,0.0381,0.003,11.089,0.000,0.031,0.045
x7,0.0073,0.008,0.912,0.362,-0.008,0.023
x8,-0.0647,0.015,-4.313,0.000,-0.094,-0.035


In [None]:
x = sm.add_constant(x)
model = sm.Logit(y, x)
result = model.fit(method='newton')
result.predict(x)
(result.predict(x) >= 0.5).astype(int)
result.summary()

# standardise and try penalty

In [None]:
df_reg_emotion = df_emotion.drop(columns=["date","reviewID","reviewerID","review","rating","restaurantID","pronoun","i", "we", "you", "shehe", "they"])

le_emotion = preprocessing.LabelEncoder()
df_reg_emotion["flagged"] = le_emotion.fit_transform(df_reg_emotion["flagged"])

X = df_reg_emotion.drop(columns=["flagged"])
Y = df_reg_emotion["flagged"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1, shuffle=True)
clf = LogisticRegression(random_state=1, max_iter = 1000).fit(X_train, Y_train)
clf.predict(X_test)
clf.score(X_test, Y_test)

In [7]:
from sklearn.metrics import accuracy_score

#from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state = 1, n_estimators=1000)
#X = feature_selection(X_train,Y_train, 150)

rf_model.fit(X_train, Y_train)
rf_predictions = rf_model.predict(X_test)
acc = accuracy_score(rf_predictions,Y_test)
acc

0.6366421568627451