In [None]:
from sklearn import datasets
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB


# https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Coimbra
# https://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/
"""This notebook applies Naive Bayes classifiers to Covid tweets created in the last year. We compare each Covid tweet with the overall sentiment the tweet expresses.
I have used the bag-of-words representation with both CNB and MNB; I have additionally experimented with a TF-IDF vectorizer combined with a linear SVC.
Pavle Janev 217127556, York University, 2024"""

In [460]:
# Read Corona_NLP_train.csv into a DataFrame and preview its first rows.
df = pd.read_csv('Corona_NLP_train.csv')
df.head()


Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [461]:
def drop(p):
    """Remove the metadata columns that are not used for sentiment modelling.

    Parameters
    ----------
    p : pandas.DataFrame
        Frame to strip. It is modified in place.

    Returns
    -------
    None

    Notes
    -----
    ``errors="ignore"`` makes the call idempotent: running the cell a second
    time (when the columns are already gone) no longer raises ``KeyError``.
    """
    p.drop(
        columns=["UserName", "ScreenName", "Location", "TweetAt"],
        inplace=True,
        errors="ignore",
    )

# Strip the unused metadata columns from df in place (drop returns nothing).
drop(df)


In [462]:
# Preview the first rows of df again to inspect the remaining columns.
df.head()

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [463]:
# Class counts of the Sentiment column. The result is not echoed, because only
# the final expression of a cell is displayed.
df["Sentiment"].value_counts()

# Integer code for each of the five sentiment classes, ordered from the most
# negative (0) to the most positive (4).
sentiment_codes = {
    "Extremely Negative": 0,
    "Negative": 1,
    "Neutral": 2,
    "Positive": 3,
    "Extremely Positive": 4,
}
convert_to_digits = {"Sentiment": sentiment_codes}

# Swap the string labels for their integer codes, modifying df in place.
df.replace(to_replace=convert_to_digits, inplace=True)
 

In [464]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html?highlight=countvectorizer#sklearn.feature_extraction.text.CountVectorizer
# Build a bag-of-words vectorizer with default settings and echo its repr.
c_vectorizer = CountVectorizer()   # bag of words 
c_vectorizer

CountVectorizer()

In [465]:
# Features are the raw tweet texts; targets are the Sentiment column.
X, y = df['OriginalTweet'], df['Sentiment']

In [466]:
# Split into train and test sets: test_size=0.6 holds out 60% of the rows for
# testing, and random_state fixes the shuffle so the split is reproducible.
# NOTE(review): an unusually large test share — confirm that 0.6 is intentional.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.6, random_state=30)

In [467]:
# Multinomial Naive Bayes classifier created with its default settings.
mnb_model = MultinomialNB()

In [468]:
from sklearn.pipeline import Pipeline

# Pipeline reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

# Chain the bag-of-words vectorizer with the MNB classifier so that raw tweet
# text can be passed straight to fit/predict.
mnb_steps = [
    ('vectorizer', c_vectorizer),
    ('classifier', mnb_model),
]
v_mnb_model = Pipeline(steps=mnb_steps)

v_mnb_model

Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('classifier', MultinomialNB())])

In [469]:
# Fit the vectorizer + MNB pipeline on the training tweets and their labels.
v_mnb_model.fit(X_train, y_train)      

Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('classifier', MultinomialNB())])

In [470]:
# Score the fitted MNB pipeline on the held-out test split.
v_mnb_model.score(X_test, y_test)


0.43478436930552744

In [471]:
from sklearn.metrics import accuracy_score

# Predict a label for every test tweet with the MNB pipeline, then measure the
# share of predictions that match the true labels.
predictions = v_mnb_model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions)

0.43478436930552744

In [472]:
from sklearn.naive_bayes import ComplementNB
# Complement Naive Bayes classifier created with its default settings.
cnb_model = ComplementNB()

In [473]:
from sklearn.base import clone

# Give this pipeline its own unfitted copy of the vectorizer. A Pipeline keeps
# the exact step objects it is handed, so passing c_vectorizer itself would make
# fitting this pipeline re-fit the very instance that v_mnb_model also holds.
v_cnb_model = Pipeline(steps=[('vectorizer', clone(c_vectorizer)), ('classifier', cnb_model)])

v_cnb_model

Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('classifier', ComplementNB())])

In [474]:
# Fit the CNB pipeline, then score it on the same training split it was fitted
# on (this is a training score, not a held-out one).
v_cnb_model.fit(X_train, y_train) 
v_cnb_model.score(X_train,y_train)

0.9029887012513668

In [475]:
from sklearn.metrics import accuracy_score

# Predict a label for every test tweet with the CNB pipeline, then measure the
# share of predictions that match the true labels.
predictions_cnb = v_cnb_model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions_cnb)

0.44389552541000205

Testing the bag-of-words vectorizer with both the MNB and CNB models

In [476]:
# Classify one hand-written tweet with the MNB pipeline and describe the result.
text = ['Not because Im paranoid, but because my food stock is litteraly empty. The coronavirus is a serious thing, but please, dont panic. It causes shortage...']
predictions = v_mnb_model.predict(text)

# Message for each sentiment code; any other value gets the fallback message.
sentiment_messages = {
    0: 'This tweet is extremely negative',
    1: 'This tweet is negative!',
    2: 'This tweet is neutral.',
    3: 'This tweet is positive!',
    4: 'This tweet is super positive!',
}
# .item() extracts the single predicted label (it raises if there is not exactly one).
print(sentiment_messages.get(predictions.item(), "I am a little unsure what this could be?"))

print(predictions)


This tweet is negative!
[1]


In [477]:
# Classify one hand-written tweet with the CNB pipeline and describe the result.
text = ['Not because Im paranoid, but because my food stock is litteraly empty. The coronavirus is a serious thing, but please, dont panic. It causes shortage...']
predictions_cnb = v_cnb_model.predict(text)

# Message for each sentiment code; any other value gets the fallback message.
sentiment_messages = {
    0: 'This tweet is extremely negative',
    1: 'This tweet is negative!',
    2: 'This tweet is neutral.',
    3: 'This tweet is positive!',
    4: 'This tweet is super positive!',
}
# .item() extracts the single predicted label (it raises if there is not exactly one).
print(sentiment_messages.get(predictions_cnb.item(), "I am a little unsure what this could be?"))

print(predictions_cnb)


This tweet is extremely negative
[0]


In [478]:
# Classify a short negative phrase with the MNB pipeline and describe the result.
text = ['Looking very negative']
predictions = v_mnb_model.predict(text)

# Message for each sentiment code; any other value gets the fallback message.
sentiment_messages = {
    0: 'This tweet is extremely negative',
    1: 'This tweet is negative!',
    2: 'This tweet is neutral.',
    3: 'This tweet is positive!',
    4: 'This tweet is super positive!',
}
# .item() extracts the single predicted label (it raises if there is not exactly one).
print(sentiment_messages.get(predictions.item(), "I am a little unsure what this could be?"))

print(predictions)


This tweet is negative!
[1]


In [479]:
# Classify a short negative phrase with the CNB pipeline and describe the result.
text = ['Looking very negative']
predictions_cnb = v_cnb_model.predict(text)

# Message for each sentiment code; any other value gets the fallback message.
sentiment_messages = {
    0: 'This tweet is extremely negative',
    1: 'This tweet is negative!',
    2: 'This tweet is neutral.',
    3: 'This tweet is positive!',
    4: 'This tweet is super positive!',
}
# .item() extracts the single predicted label (it raises if there is not exactly one).
print(sentiment_messages.get(predictions_cnb.item(), "I am a little unsure what this could be?"))

print(predictions_cnb)


This tweet is extremely negative
[0]


In [480]:
# Classify an optimistic tweet with the MNB pipeline and describe the result.
text = ['I think covid will be over soon, we will then be very happy!']
predictions = v_mnb_model.predict(text)

# Message for each sentiment code; any other value gets the fallback message.
sentiment_messages = {
    0: 'This tweet is extremely negative',
    1: 'This tweet is negative!',
    2: 'This tweet is neutral.',
    3: 'This tweet is positive!',
    4: 'This tweet is super positive!',
}
# .item() extracts the single predicted label (it raises if there is not exactly one).
print(sentiment_messages.get(predictions.item(), "I am a little unsure what this could be?"))

print(predictions)


This tweet is super positive!
[4]


In [481]:
# Classify an optimistic tweet with the CNB pipeline and describe the result.
text = ['I think covid will be over soon, we will then be very happy!']
predictions_cnb = v_cnb_model.predict(text)

# Message for each sentiment code; any other value gets the fallback message.
sentiment_messages = {
    0: 'This tweet is extremely negative',
    1: 'This tweet is negative!',
    2: 'This tweet is neutral.',
    3: 'This tweet is positive!',
    4: 'This tweet is super positive!',
}
# .item() extracts the single predicted label (it raises if there is not exactly one).
print(sentiment_messages.get(predictions_cnb.item(), "I am a little unsure what this could be?"))

print(predictions_cnb)


This tweet is super positive!
[4]


In [482]:

# Classify a two-word tweet with the MNB pipeline and describe the result.
text = ['Covid bad']
predictions = v_mnb_model.predict(text)

# Message for each sentiment code. Code 4 ("super positive") was previously
# tested as a second `== 1` branch, which could never run, so a prediction of 4
# fell through to the "unsure" message.
sentiment_messages = {
    0: 'This tweet is extremely negative',
    1: 'This tweet is negative!',
    2: 'This tweet is neutral.',
    3: 'This tweet is positive!',
    4: 'This tweet is super positive!',
}
# .item() extracts the single predicted label (it raises if there is not exactly one).
print(sentiment_messages.get(predictions.item(), "I am a little unsure what this could be?"))

print(predictions)

This tweet is extremely negative
[0]


In [483]:
# Classify a two-word tweet with the CNB pipeline and describe the result.
text = ['Covid bad']
predictions_cnb = v_cnb_model.predict(text)

# Message for each sentiment code; any other value gets the fallback message.
sentiment_messages = {
    0: 'This tweet is extremely negative',
    1: 'This tweet is negative!',
    2: 'This tweet is neutral.',
    3: 'This tweet is positive!',
    4: 'This tweet is super positive!',
}
# .item() extracts the single predicted label (it raises if there is not exactly one).
print(sentiment_messages.get(predictions_cnb.item(), "I am a little unsure what this could be?"))

print(predictions_cnb)

This tweet is extremely negative
[0]


In [484]:

# Classify an off-topic complaint with the MNB pipeline and describe the result.
text = ['What is the deal with airline food? I mean it really sucks!']
predictions = v_mnb_model.predict(text)

# Message for each sentiment code. Code 4 ("super positive") was previously
# tested as a second `== 1` branch, which could never run, so a prediction of 4
# fell through to the "unsure" message.
sentiment_messages = {
    0: 'This tweet is extremely negative',
    1: 'This tweet is negative!',
    2: 'This tweet is neutral.',
    3: 'This tweet is positive!',
    4: 'This tweet is super positive!',
}
# .item() extracts the single predicted label (it raises if there is not exactly one).
print(sentiment_messages.get(predictions.item(), "I am a little unsure what this could be?"))

print(predictions)

This tweet is negative!
[1]


In [485]:
# Classify an off-topic complaint with the CNB pipeline and describe the result.
text = ['What is the deal with airline food? I mean it really sucks!']
# Store the result in predictions_cnb, as the other CNB cells do. The cell used
# to assign to `predictions` but print `predictions_cnb`, i.e. a stale value
# left over from an earlier cell rather than this prediction.
predictions_cnb = v_cnb_model.predict(text)

# Message for each sentiment code. Code 4 ("super positive") was previously
# tested as a second `== 1` branch, which could never run, so a prediction of 4
# fell through to the "unsure" message.
sentiment_messages = {
    0: 'This tweet is extremely negative',
    1: 'This tweet is negative!',
    2: 'This tweet is neutral.',
    3: 'This tweet is positive!',
    4: 'This tweet is super positive!',
}
# .item() extracts the single predicted label (it raises if there is not exactly one).
print(sentiment_messages.get(predictions_cnb.item(), "I am a little unsure what this could be?"))

print(predictions_cnb)

This tweet is extremely negative
[0]


TF-IDF vectorizer with a linear SVC model

In [486]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
# Pipeline of a TfidfVectorizer followed by a LinearSVC, both with default settings.
# Despite the "mnb" in the variable name, this pipeline contains no Naive Bayes step.
v_mnb_tfid_linear = Pipeline([('tfidf',TfidfVectorizer()),('svm',LinearSVC())])

v_mnb_tfid_linear

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('svm', LinearSVC())])

In [487]:
# Fit the TF-IDF + LinearSVC pipeline, then score it on the same training split
# it was fitted on (this is a training score, not a held-out one).
v_mnb_tfid_linear.fit(X_train, y_train) 
v_mnb_tfid_linear.score(X_train,y_train)

0.9835985906937189

In [488]:
# Predict a label for every test tweet with the TF-IDF + LinearSVC pipeline,
# then measure the share of predictions that match the true labels.
predictions_tfid = v_mnb_tfid_linear.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions_tfid)

0.5228588783154484

In [489]:
# Classify a political tweet with the TF-IDF + LinearSVC pipeline and describe the result.
text = ['Joe Biden has to be responsible for Covid-19 if he only gives me $400 for a stimulus check! ']
predictions = v_mnb_tfid_linear.predict(text)

# Message for each sentiment code. Code 4 ("super positive") was previously
# tested as a second `== 1` branch, which could never run, so a prediction of 4
# fell through to the "unsure" message.
sentiment_messages = {
    0: 'This tweet is extremely negative',
    1: 'This tweet is negative!',
    2: 'This tweet is neutral.',
    3: 'This tweet is positive!',
    4: 'This tweet is super positive!',
}
# .item() extracts the single predicted label (it raises if there is not exactly one).
print(sentiment_messages.get(predictions.item(), "I am a little unsure what this could be?"))

print(predictions)

This tweet is neutral.
[2]


In [490]:
# Classify a political tweet with the TF-IDF + LinearSVC pipeline and describe the result.
# NOTE(review): this cell uses the same tweet and model as the preceding cell.
text = ['Joe Biden has to be responsible for Covid-19 if he only gives me $400 for a stimulus check! ']
predictions = v_mnb_tfid_linear.predict(text)

# Message for each sentiment code. Code 4 ("super positive") was previously
# tested as a second `== 1` branch, which could never run, so a prediction of 4
# fell through to the "unsure" message.
sentiment_messages = {
    0: 'This tweet is extremely negative',
    1: 'This tweet is negative!',
    2: 'This tweet is neutral.',
    3: 'This tweet is positive!',
    4: 'This tweet is super positive!',
}
# .item() extracts the single predicted label (it raises if there is not exactly one).
print(sentiment_messages.get(predictions.item(), "I am a little unsure what this could be?"))

print(predictions)

This tweet is neutral.
[2]


In [491]:
# Classify an angry off-topic tweet with the TF-IDF + LinearSVC pipeline and describe the result.
text = ['housing rentals have become so damn expensive because of the stupid property owners! I hate them all extremely! ']
predictions = v_mnb_tfid_linear.predict(text)

# Message for each sentiment code. Code 4 ("super positive") was previously
# tested as a second `== 1` branch, which could never run, so a prediction of 4
# fell through to the "unsure" message.
sentiment_messages = {
    0: 'This tweet is extremely negative',
    1: 'This tweet is negative!',
    2: 'This tweet is neutral.',
    3: 'This tweet is positive!',
    4: 'This tweet is super positive!',
}
# .item() extracts the single predicted label (it raises if there is not exactly one).
print(sentiment_messages.get(predictions.item(), "I am a little unsure what this could be?"))

print(predictions)

This tweet is extremely negative
[0]
