In [24]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import prince

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [25]:
# Load the raw tweet-count table from CSV, decoding the file as Latin-1,
# and preview the first rows.
df = pd.read_csv(
    "tweets-correct-WC.csv",
    encoding='latin1',
)
df.head()

Unnamed: 0,T1_Tweets,T1_Positive_Tweets,T1_Winner_Prediction,T2_Tweets,T2_Positive_Tweets,T2_Winner_Prediction,Winner
0,557,450,5,235,215,1,T2
1,7952,6988,238,4147,3751,109,T1
2,678,464,2,143,123,0,T1
3,144,117,0,98,80,0,T1
4,297,279,0,136,110,1,T1


In [26]:
# Encode the Winner label as an int flag: 1 where the label is 'T1',
# 0 for every other value.
# The already-encoded value 1 is accepted as well so the cell is idempotent:
# comparing against 'T1' alone turns every label into 0 if the cell is run a
# second time on the converted column.
winner_label = df['Winner']
df['Winner'] = ((winner_label == 'T1') | (winner_label == 1)).astype(int)
df.head()

Unnamed: 0,T1_Tweets,T1_Positive_Tweets,T1_Winner_Prediction,T2_Tweets,T2_Positive_Tweets,T2_Winner_Prediction,Winner
0,557,450,5,235,215,1,0
1,7952,6988,238,4147,3751,109,1
2,678,464,2,143,123,0,1
3,144,117,0,98,80,0,1
4,297,279,0,136,110,1,1


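T1_Twitter_Vol = T1_Tweets / (T1_Tweets + T2_Tweets)

T1_Fans_Sent = T1_Positive_Tweets / (T1_Tweets + T2_Tweets)

T1_Fans_Pred = T1_Winner_Prediction / (T1_Winner_Prediction + T2_Winner_Prediction), set to 0 when T1_Winner_Prediction + T2_Winner_Prediction is 0. The T2_* features are computed the same way, with the T2 count in the numerator.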

In [27]:
# Create the Twitter Volume features: each team's share of the combined
# T1 + T2 tweet count.
# Uses whole-column arithmetic instead of a row-by-row apply; the shared
# denominator is computed once and reused for both teams.
total_tweets = df['T1_Tweets'] + df['T2_Tweets']
df['T1_Twitter_Vol'] = df['T1_Tweets'] / total_tweets
df['T2_Twitter_Vol'] = df['T2_Tweets'] / total_tweets
df.head()

Unnamed: 0,T1_Tweets,T1_Positive_Tweets,T1_Winner_Prediction,T2_Tweets,T2_Positive_Tweets,T2_Winner_Prediction,Winner,T1_Twitter_Vol,T2_Twitter_Vol
0,557,450,5,235,215,1,0,0.703283,0.296717
1,7952,6988,238,4147,3751,109,1,0.657244,0.342756
2,678,464,2,143,123,0,1,0.825822,0.174178
3,144,117,0,98,80,0,1,0.595041,0.404959
4,297,279,0,136,110,1,1,0.685912,0.314088


In [28]:
# Create the Fans Sentiment features: each team's positive-tweet count
# divided by the combined T1 + T2 tweet count (note: the denominator is all
# tweets, not only the positive ones).
# Uses whole-column arithmetic instead of a row-by-row apply.
total_tweets = df['T1_Tweets'] + df['T2_Tweets']
df['T1_Fans_Sent'] = df['T1_Positive_Tweets'] / total_tweets
df['T2_Fans_Sent'] = df['T2_Positive_Tweets'] / total_tweets
df.head()

Unnamed: 0,T1_Tweets,T1_Positive_Tweets,T1_Winner_Prediction,T2_Tweets,T2_Positive_Tweets,T2_Winner_Prediction,Winner,T1_Twitter_Vol,T2_Twitter_Vol,T1_Fans_Sent,T2_Fans_Sent
0,557,450,5,235,215,1,0,0.703283,0.296717,0.568182,0.271465
1,7952,6988,238,4147,3751,109,1,0.657244,0.342756,0.577568,0.310026
2,678,464,2,143,123,0,1,0.825822,0.174178,0.565164,0.149817
3,144,117,0,98,80,0,1,0.595041,0.404959,0.483471,0.330579
4,297,279,0,136,110,1,1,0.685912,0.314088,0.644342,0.254042


In [29]:
# Create the Fans Prediction features: each team's share of the combined
# winner-prediction count.
# Uses whole-column arithmetic instead of a row-by-row apply. Rows where the
# combined count is 0 would divide by zero, so both features are set to 0
# for those rows.
total_predictions = df['T1_Winner_Prediction'] + df['T2_Winner_Prediction']
has_predictions = total_predictions != 0
df['T1_Fans_Pred'] = (df['T1_Winner_Prediction'] / total_predictions).where(has_predictions, 0)
df['T2_Fans_Pred'] = (df['T2_Winner_Prediction'] / total_predictions).where(has_predictions, 0)
df.head()

Unnamed: 0,T1_Tweets,T1_Positive_Tweets,T1_Winner_Prediction,T2_Tweets,T2_Positive_Tweets,T2_Winner_Prediction,Winner,T1_Twitter_Vol,T2_Twitter_Vol,T1_Fans_Sent,T2_Fans_Sent,T1_Fans_Pred,T2_Fans_Pred
0,557,450,5,235,215,1,0,0.703283,0.296717,0.568182,0.271465,0.833333,0.166667
1,7952,6988,238,4147,3751,109,1,0.657244,0.342756,0.577568,0.310026,0.685879,0.314121
2,678,464,2,143,123,0,1,0.825822,0.174178,0.565164,0.149817,1.0,0.0
3,144,117,0,98,80,0,1,0.595041,0.404959,0.483471,0.330579,0.0,0.0
4,297,279,0,136,110,1,1,0.685912,0.314088,0.644342,0.254042,0.0,1.0


In [30]:
# Remove the unwanted raw count columns, keeping the label and the derived
# ratio features in a new frame.
data = df.drop(
    columns=[
        'T1_Tweets',
        'T1_Positive_Tweets',
        'T1_Winner_Prediction',
        'T2_Tweets',
        'T2_Positive_Tweets',
        'T2_Winner_Prediction',
    ]
)
data.head()

Unnamed: 0,Winner,T1_Twitter_Vol,T2_Twitter_Vol,T1_Fans_Sent,T2_Fans_Sent,T1_Fans_Pred,T2_Fans_Pred
0,0,0.703283,0.296717,0.568182,0.271465,0.833333,0.166667
1,1,0.657244,0.342756,0.577568,0.310026,0.685879,0.314121
2,1,0.825822,0.174178,0.565164,0.149817,1.0,0.0
3,1,0.595041,0.404959,0.483471,0.330579,0.0,0.0
4,1,0.685912,0.314088,0.644342,0.254042,0.0,1.0


In [31]:
# Display the (rows, columns) dimensions of the reduced feature table.
data.shape

(14, 7)

In [33]:
# Save the feature table to CSV. index=True is the pandas default, spelled
# out here because it means the row index is written as the first (unnamed)
# column of the file.
data.to_csv('twitter-featured-WC.csv', index=True)