In [53]:
# naive bayes for twitter mood detection
# Data used in this programme:
#    https://www.kaggle.com/datasets/kazanova/sentiment140/download?datasetVersionNumber=2

In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [55]:
raw = pd.read_csv('data/twitter.csv', encoding='ISO-8859-1', low_memory=False)

In [56]:
# We just need the tweet and the mood
# 0 = negative, 1 = positive
data = pd.DataFrame()
data["tweet"] = raw['@switchfoot http://twitpic.com/2y1zl - Awww, that\'s a bummer.  You shoulda got David Carr of Third Day to do it. ;D']
data["mood"] = raw['0']
data['mood'] = data['mood'].replace(4,1)

In [57]:
# Class balance check: how many tweets carry each mood label.
mood_counts = data.loc[:, 'mood'].value_counts()
mood_counts

mood
1    800000
0    799999
Name: count, dtype: int64

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [59]:
X, y = data['tweet'], data['mood']

In [60]:
from sklearn.feature_extraction.text import CountVectorizer

In [61]:
vectorizer = CountVectorizer()

In [62]:
X = vectorizer.fit_transform(X)

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [64]:
# Multinomial Naive Bayes over word counts.
# alpha is the additive smoothing strength; 11 is presumably hand-tuned — TODO confirm.
clf = MultinomialNB(
    alpha=11,
    force_alpha=True,
)

In [65]:
# Train the classifier on the training split.
clf.fit(X=X_train, y=y_train)

In [66]:
# Predicted mood labels for the held-out split, used by the metric cells below.
y_pred = clf.predict(X=X_test)


In [67]:
from sklearn.metrics import accuracy_score

In [68]:
# Fraction of held-out tweets whose predicted mood matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7754

In [69]:
from sklearn.metrics import confusion_matrix

In [70]:
# Rows are the true labels (0, 1); columns are the predicted labels (0, 1).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[135102,  24986],
       [ 46886, 113026]])

In [71]:
from sklearn.metrics import classification_report

In [72]:
# The report is a multi-line string, so print it to keep the table layout.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.74      0.84      0.79    160088
           1       0.82      0.71      0.76    159912

    accuracy                           0.78    320000
   macro avg       0.78      0.78      0.77    320000
weighted avg       0.78      0.78      0.77    320000



In [73]:
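# Optional: load a previously saved classifier instead of retraining.
# Only unpickle files you created yourself (pickle can execute arbitrary code),
# and make sure `vectorizer` is the one the classifier was trained with.
# import pickle
# with open('Models/15-05-2023.pkl', "rb") as f:
#     clf = pickle.load(f)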

In [74]:
# Predicting with custom tweets
# Put the text to classify in `tweet`; it is vectorized with the fitted
# vectorizer and passed to the trained classifier.
# NOTE(review): an empty string contains no words, so the result then does not
# depend on any tweet content.
tweet = ""
tweet_features = vectorizer.transform([tweet])
is_positive = clf.predict(tweet_features) == 1
print("Positive" if is_positive else "Negative")

Positive


In [60]:
# Peek at the first five rows of the raw frame as loaded from disk.
raw.head(n=5)

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [79]:
import os
import pickle
import time

# Persist the trained classifier under a date-stamped name (dd-mm-YYYY).
os.makedirs('Models', exist_ok=True)  # the folder may not exist on a fresh checkout
stamp = time.strftime("%d-%m-%Y")

# A context manager guarantees the file is flushed and closed even if
# pickling fails part-way; a bare open() leaves the handle dangling.
with open(f'Models/{stamp}.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

# The classifier only understands feature vectors produced by this exact
# fitted vectorizer, so store it next to the model for later reuse.
with open(f'Models/{stamp}-vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)