In [1]:
# imports
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import string as str

In [2]:
# load data
data = pd.read_csv('combined_samples_gt.csv')

# peek data
data.head()

Unnamed: 0,ID,Tweet,Sentiment,Time,Age,Country,Population -2020,Land Area (KmÂ²),Density (P/KmÂ²),Ground Truth
0,d1d7ed938d,great site photos fantastic useful visit pemb...,positive,morning,46-60,Egypt,102334404,995450,103,happiness
1,be0a927a80,_n too want ooze speaking either why lame jok...,neutral,noon,60-70,Guinea-Bissau,1968001,28120,70,df
2,6b4eb2871c,thanks link ive voted ill send too,positive,noon,21-30,Bulgaria,6948445,108560,64,df
3,9c503adc87,also liptons sparkling green tea gross,neutral,noon,21-30,Somalia,15893222,627340,25,disgust
4,3e1f7acd3c,know lender was yea people definitely sucked b...,negative,night,70-100,Monaco,39242,1,26337,anger


In [3]:
# drop nan
data = data.dropna()

In [4]:
# initialize X and y
X = data['Tweet']
y = data['Ground Truth']

In [5]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=42)

In [6]:
# create logistic regression model
model = Pipeline(steps=[('cv',CountVectorizer()),('lr',LogisticRegression())])

In [7]:
# fit data
model.fit(X_train,y_train)

In [8]:
# insert model predictions
data['Predictions'] = ''
for i in (data.index - 1):
    data.iloc[i, 10] = model.predict([data.iloc[i, 1]])

data.head()

Unnamed: 0,ID,Tweet,Sentiment,Time,Age,Country,Population -2020,Land Area (KmÂ²),Density (P/KmÂ²),Ground Truth,Predictions
0,d1d7ed938d,great site photos fantastic useful visit pemb...,positive,morning,46-60,Egypt,102334404,995450,103,happiness,[happiness]
1,be0a927a80,_n too want ooze speaking either why lame jok...,neutral,noon,60-70,Guinea-Bissau,1968001,28120,70,df,[df]
2,6b4eb2871c,thanks link ive voted ill send too,positive,noon,21-30,Bulgaria,6948445,108560,64,df,[df]
3,9c503adc87,also liptons sparkling green tea gross,neutral,noon,21-30,Somalia,15893222,627340,25,disgust,[disgust]
4,3e1f7acd3c,know lender was yea people definitely sucked b...,negative,night,70-100,Monaco,39242,1,26337,anger,[anger]


In [9]:
# match column
data['Match'] = 'True'

for i in (data.index - 1):
    if (data['Predictions'].iloc[i] != data['Ground Truth'].iloc[i]):
        data['Match'].iloc[i] = False
        
data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Match'].iloc[i] = False


Unnamed: 0,ID,Tweet,Sentiment,Time,Age,Country,Population -2020,Land Area (KmÂ²),Density (P/KmÂ²),Ground Truth,Predictions,Match
0,d1d7ed938d,great site photos fantastic useful visit pemb...,positive,morning,46-60,Egypt,102334404,995450,103,happiness,[happiness],True
1,be0a927a80,_n too want ooze speaking either why lame jok...,neutral,noon,60-70,Guinea-Bissau,1968001,28120,70,df,[df],True
2,6b4eb2871c,thanks link ive voted ill send too,positive,noon,21-30,Bulgaria,6948445,108560,64,df,[df],True
3,9c503adc87,also liptons sparkling green tea gross,neutral,noon,21-30,Somalia,15893222,627340,25,disgust,[disgust],True
4,3e1f7acd3c,know lender was yea people definitely sucked b...,negative,night,70-100,Monaco,39242,1,26337,anger,[anger],True


In [10]:
# count correct and incorrect
result = data['Match'].value_counts()

# find correct count
correct = result.iloc[0]

# find incorrect count
incorrect = result.iloc[1]

# find total
total = correct + incorrect

# find accuracy
accuracy = ((correct / total) * 100).round(2)

# print accuracy
print("Model accuracy is: {}%".format(accuracy))

Model accuracy is: 88.36%


In [11]:
# export dataframe
# data.to_csv('LR Model Trained Data.csv', index = False)

In [12]:
# Get the original dataset and run the model prediction on the Tweets (non-samples)
original_df = pd.read_csv("original_dataset.csv", encoding = "ISO-8859-1")

original_df = original_df.dropna()
original_df['Predictions'] = ''
original_df.head(30)

Unnamed: 0,ID,Tweet,Sentiment,Time,Age,Country,Population -2020,Land Area (KmÃÂ²),Density (P/KmÃÂ²),Predictions
0,1e551df661,think normally drawn people familiar similar u...,positive,morning,46-60,Georgia,3989167.0,69490.0,57.0,
1,5d470d3192,days aight cleaning mostly went mcds check had...,negative,noon,60-70,Libya,6871292.0,1759540.0,4.0,
2,0c89ab7ac0,happy star wars day im going make pasta get sh...,positive,night,70-100,Chile,19116201.0,743532.0,26.0,
3,98fe7210fa,ouch back im sick pamela anderson esque chest,negative,night,31-45,Portugal,10196709.0,91590.0,111.0,
4,f0bb64692f,wishing sunny day might bothered get outta bed...,negative,night,70-100,Netherlands,17134872.0,33720.0,508.0,
5,3e2e986a68,im happy,negative,noon,60-70,Cyprus,1207359.0,9240.0,131.0,
6,af3cb04ffc,day long text conversation me,neutral,morning,46-60,Liberia,5057681.0,96320.0,53.0,
7,56437e0860,know exactly mean ive lost many friends feel,negative,night,31-45,Zimbabwe,14862924.0,386850.0,38.0,
8,c930341ca9,funny thanks,positive,night,70-100,Mexico,128932753.0,1943950.0,66.0,
9,5a17107aba,done absolutely nothing day todayhow pitiful t...,negative,noon,21-30,Colombia,50882891.0,1109500.0,46.0,


In [13]:
# insert model predictions
for i, row in original_df.iterrows():
    original_df.at[i, 'Predictions'] = model.predict([row['Tweet']])

original_df.head(30)

Unnamed: 0,ID,Tweet,Sentiment,Time,Age,Country,Population -2020,Land Area (KmÃÂ²),Density (P/KmÃÂ²),Predictions
0,1e551df661,think normally drawn people familiar similar u...,positive,morning,46-60,Georgia,3989167.0,69490.0,57.0,[df]
1,5d470d3192,days aight cleaning mostly went mcds check had...,negative,noon,60-70,Libya,6871292.0,1759540.0,4.0,[happiness]
2,0c89ab7ac0,happy star wars day im going make pasta get sh...,positive,night,70-100,Chile,19116201.0,743532.0,26.0,[happiness]
3,98fe7210fa,ouch back im sick pamela anderson esque chest,negative,night,31-45,Portugal,10196709.0,91590.0,111.0,[sadness]
4,f0bb64692f,wishing sunny day might bothered get outta bed...,negative,night,70-100,Netherlands,17134872.0,33720.0,508.0,[happiness]
5,3e2e986a68,im happy,negative,noon,60-70,Cyprus,1207359.0,9240.0,131.0,[happiness]
6,af3cb04ffc,day long text conversation me,neutral,morning,46-60,Liberia,5057681.0,96320.0,53.0,[happiness]
7,56437e0860,know exactly mean ive lost many friends feel,negative,night,31-45,Zimbabwe,14862924.0,386850.0,38.0,[sadness]
8,c930341ca9,funny thanks,positive,night,70-100,Mexico,128932753.0,1943950.0,66.0,[happiness]
9,5a17107aba,done absolutely nothing day todayhow pitiful t...,negative,noon,21-30,Colombia,50882891.0,1109500.0,46.0,[sadness]


In [14]:
original_df.to_csv('LR_full_predictions.csv', index = False)