In [42]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import prince

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [43]:
# Load the raw tweet dataset (decoded as Latin-1) and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="tweets-dataset.csv", encoding="latin1")
df.head(5)

Unnamed: 0,T1_Tweets,T1_Positive_Tweets,T1_Winner_Prediction,T2_Tweets,T2_Positive_Tweets,T2_Winner_Prediction,Winner
0,12,12,0,9,8,0,T2
1,31,24,0,6,6,0,T2
2,56,47,1,101,84,2,T1
3,13,13,0,15,13,0,T2
4,32,26,0,14,13,0,T1


In [44]:
# Encode the match winner as a binary target: 1 when Winner == 'T1', 0 for
# any other value (so 'T2', but also anything unexpected or missing).
#
# The dtype guard keeps this cell idempotent: once the column is numeric,
# comparing it against 'T1' again would silently reset every label to 0,
# which is exactly what happens when the cell is re-run without reloading df.
if not pd.api.types.is_numeric_dtype(df['Winner']):
    df['Winner'] = (df['Winner'] == 'T1').astype(int)
df.head()

Unnamed: 0,T1_Tweets,T1_Positive_Tweets,T1_Winner_Prediction,T2_Tweets,T2_Positive_Tweets,T2_Winner_Prediction,Winner
0,12,12,0,9,8,0,0
1,31,24,0,6,6,0,0
2,56,47,1,101,84,2,1
3,13,13,0,15,13,0,0
4,32,26,0,14,13,0,1


`T1_Twitter_Vol = T1_Tweets / (T1_Tweets + T2_Tweets)`

`T1_Fans_Sent = T1_Positive_Tweets / (T1_Tweets + T2_Tweets)`

`T1_Fans_Pred = T1_Winner_Prediction / (T1_Winner_Prediction + T2_Winner_Prediction)`

In [45]:
# Twitter volume: each team's share of the combined T1 + T2 tweet count.
# Vectorised column arithmetic replaces the row-wise df.apply; the shared
# denominator is computed once. A row whose combined count is 0 yields NaN
# (0 / 0), which is left as-is rather than imputed.
total_tweets = df['T1_Tweets'] + df['T2_Tweets']
df['T1_Twitter_Vol'] = df['T1_Tweets'] / total_tweets
df['T2_Twitter_Vol'] = df['T2_Tweets'] / total_tweets
df.head()

Unnamed: 0,T1_Tweets,T1_Positive_Tweets,T1_Winner_Prediction,T2_Tweets,T2_Positive_Tweets,T2_Winner_Prediction,Winner,T1_Twitter_Vol,T2_Twitter_Vol
0,12,12,0,9,8,0,0,0.571429,0.428571
1,31,24,0,6,6,0,0,0.837838,0.162162
2,56,47,1,101,84,2,1,0.356688,0.643312
3,13,13,0,15,13,0,0,0.464286,0.535714
4,32,26,0,14,13,0,1,0.695652,0.304348


In [46]:
# Fans sentiment: each team's positive-tweet count divided by the combined
# T1 + T2 tweet count (note: the denominator is the total for BOTH teams,
# not the team's own tweet count).
# Vectorised column arithmetic replaces the row-wise df.apply; a row whose
# combined count is 0 yields NaN (0 / 0).
combined_tweets = df['T1_Tweets'] + df['T2_Tweets']
df['T1_Fans_Sent'] = df['T1_Positive_Tweets'] / combined_tweets
df['T2_Fans_Sent'] = df['T2_Positive_Tweets'] / combined_tweets
df.head()

Unnamed: 0,T1_Tweets,T1_Positive_Tweets,T1_Winner_Prediction,T2_Tweets,T2_Positive_Tweets,T2_Winner_Prediction,Winner,T1_Twitter_Vol,T2_Twitter_Vol,T1_Fans_Sent,T2_Fans_Sent
0,12,12,0,9,8,0,0,0.571429,0.428571,0.571429,0.380952
1,31,24,0,6,6,0,0,0.837838,0.162162,0.648649,0.162162
2,56,47,1,101,84,2,1,0.356688,0.643312,0.299363,0.535032
3,13,13,0,15,13,0,0,0.464286,0.535714,0.464286,0.464286
4,32,26,0,14,13,0,1,0.695652,0.304348,0.565217,0.282609


In [47]:
# Fans prediction: each team's share of the combined winner-prediction count.
# Vectorised division replaces the row-wise df.apply with an inline conditional.
# Rows where the combined count is exactly 0 are set to 0 for both teams
# (instead of the NaN that 0 / 0 would produce); rows with a missing count
# keep NaN, because NaN != 0 is True and the quotient is therefore retained.
prediction_total = df['T1_Winner_Prediction'] + df['T2_Winner_Prediction']
has_predictions = prediction_total != 0
df['T1_Fans_Pred'] = (df['T1_Winner_Prediction'] / prediction_total).where(has_predictions, 0)
df['T2_Fans_Pred'] = (df['T2_Winner_Prediction'] / prediction_total).where(has_predictions, 0)
df.head()

Unnamed: 0,T1_Tweets,T1_Positive_Tweets,T1_Winner_Prediction,T2_Tweets,T2_Positive_Tweets,T2_Winner_Prediction,Winner,T1_Twitter_Vol,T2_Twitter_Vol,T1_Fans_Sent,T2_Fans_Sent,T1_Fans_Pred,T2_Fans_Pred
0,12,12,0,9,8,0,0,0.571429,0.428571,0.571429,0.380952,0.0,0.0
1,31,24,0,6,6,0,0,0.837838,0.162162,0.648649,0.162162,0.0,0.0
2,56,47,1,101,84,2,1,0.356688,0.643312,0.299363,0.535032,0.333333,0.666667
3,13,13,0,15,13,0,0,0.464286,0.535714,0.464286,0.464286,0.0,0.0
4,32,26,0,14,13,0,1,0.695652,0.304348,0.565217,0.282609,0.0,0.0


In [48]:
# Remove the raw count columns so that only the target and the derived
# ratio features remain; df itself is left untouched (drop returns a copy).
raw_count_columns = [
    'T1_Tweets', 'T2_Tweets',
    'T1_Positive_Tweets', 'T2_Positive_Tweets',
    'T1_Winner_Prediction', 'T2_Winner_Prediction',
]
data = df.drop(columns=raw_count_columns)
data.head()

Unnamed: 0,Winner,T1_Twitter_Vol,T2_Twitter_Vol,T1_Fans_Sent,T2_Fans_Sent,T1_Fans_Pred,T2_Fans_Pred
0,0,0.571429,0.428571,0.571429,0.380952,0.0,0.0
1,0,0.837838,0.162162,0.648649,0.162162,0.0,0.0
2,1,0.356688,0.643312,0.299363,0.535032,0.333333,0.666667
3,0,0.464286,0.535714,0.464286,0.464286,0.0,0.0
4,1,0.695652,0.304348,0.565217,0.282609,0.0,0.0


In [49]:
# Display the (rows, columns) dimensions of the feature table as a quick sanity check.
data.shape

(519, 7)

In [50]:
# Save the feature table to 'twitter-featured.csv' in the working directory.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# also written as an extra, unnamed first column — readers of this file should
# account for it (e.g. index_col=0); confirm this is intended.
data.to_csv('twitter-featured.csv')

## APPENDIX

In [None]:
# End-to-end feature pipeline in a single cell: load the raw tweet dataset,
# encode the target, derive the ratio features, drop the raw counts and save.
# All ratios use vectorised column arithmetic instead of row-wise df.apply,
# and each shared denominator is computed once.
df = pd.read_csv("tweets-dataset.csv", encoding='latin1')

# Encode the winner as 1 when it is 'T1' and 0 for any other value.
df['Winner'] = (df['Winner'] == 'T1').astype(int)

# Twitter volume: share of the combined T1 + T2 tweet count.
# A row whose combined count is 0 yields NaN (0 / 0).
total_tweets = df['T1_Tweets'] + df['T2_Tweets']
df['T1_Twitter_Vol'] = df['T1_Tweets'] / total_tweets
df['T2_Twitter_Vol'] = df['T2_Tweets'] / total_tweets

# Fans sentiment: positive tweets divided by the combined tweet count of both teams.
df['T1_Fans_Sent'] = df['T1_Positive_Tweets'] / total_tweets
df['T2_Fans_Sent'] = df['T2_Positive_Tweets'] / total_tweets

# Fans prediction: share of the combined winner-prediction count; rows where
# that count is exactly 0 are set to 0 rather than NaN.
prediction_total = df['T1_Winner_Prediction'] + df['T2_Winner_Prediction']
has_predictions = prediction_total != 0
df['T1_Fans_Pred'] = (df['T1_Winner_Prediction'] / prediction_total).where(has_predictions, 0)
df['T2_Fans_Pred'] = (df['T2_Winner_Prediction'] / prediction_total).where(has_predictions, 0)

# Remove the raw count columns, keeping the target and the derived features.
data = df.drop(['T1_Tweets', 'T1_Positive_Tweets', 'T1_Winner_Prediction',
                'T2_Tweets', 'T2_Positive_Tweets', 'T2_Winner_Prediction'], axis=1)

# Save the dataset (the index is written too, as to_csv is left at its defaults).
data.to_csv('twitter-featured.csv')