In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import spacy

In [None]:
# Column names for the CSV; they are passed via `names=`, so pandas reads the
# first line of the file as data rather than as a header row.
# NOTE(review): the 'country' column shows values such as "Borderlands" in the
# preview below — it looks like a topic/entity rather than a country. The name
# is kept as-is; confirm before renaming.
columns = ['id','country','Label','Text']
df = pd.read_csv("twitter_training.csv", names=columns)

# Quick sanity check: row/column count and the first few records.
print(df.shape)
df.head(5)

(46295, 4)


Unnamed: 0,id,country,Label,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [None]:
# Class-balance check: number of rows per sentiment label.
df['Label'].value_counts()


Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
Positive,13710
Negative,12300
Neutral,11483
Irrelevant,8802


In [None]:
# Load spaCy's small English pipeline once; `preprocess` calls it to tokenize
# text and read each token's stop-word/punctuation flags and lemma.
nlp = spacy.load("en_core_web_sm")

In [None]:
def preprocess(text):
    """Reduce a raw tweet to a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Stop
    words, punctuation and whitespace-only tokens (for example the blank
    lines inside a multi-paragraph tweet) are dropped; every remaining token
    is replaced by its lemma.

    Parameters
    ----------
    text : Any
        Raw text. Non-string values (e.g. a missing value in the 'Text'
        column) are accepted and treated as an empty document.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Keep the output column purely textual: anything that is not a
        # string becomes an empty document instead of raising.
        return ""

    # spaCy emits runs of newlines/spaces as their own tokens; without the
    # `is_space` check they would end up in the joined output as "\n\n".
    return " ".join(
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct or token.is_space)
    )

In [None]:
# Run the spaCy-based cleaner over every tweet and store the result next to
# the raw text so the two can be compared row by row.
cleaned_text = df['Text'].apply(preprocess)
df['Preprocessed Text'] = cleaned_text
df


Unnamed: 0,id,country,Label,Text,Preprocessed Text
0,2401,Borderlands,3,im getting on borderlands and i will murder yo...,m get borderland murder
1,2401,Borderlands,3,I am coming to the borders and I will kill you...,come border kill
2,2401,Borderlands,3,im getting on borderlands and i will kill you ...,m get borderland kill
3,2401,Borderlands,3,im coming on borderlands and i will murder you...,m come borderland murder
4,2401,Borderlands,3,im getting on borderlands 2 and i will murder ...,m get borderland 2 murder
...,...,...,...,...,...
46290,11944,Verizon,2,The last 3 August’s I have broken my phone. Th...,3 August break phone year different Joseph Ver...
46291,11944,Verizon,2,The last 3 August's I've broken my phone. This...,3 August break phone year different Joseph Ver...
46292,11944,Verizon,2,The last time I broke my phone was on August 3...,time break phone August 3 year different time ...
46293,11944,Verizon,2,The last 3 August’s I have broken my phone. Th...,3 August break phone year different Joseph Ver...


In [None]:
# Turn the string sentiment labels into integer codes for the classifier.
# The 'Label' column is overwritten in place, so re-running this cell on its
# own refits the encoder on the integer codes instead of the label names.
le_model = LabelEncoder()
encoded_labels = le_model.fit_transform(df['Label'])
df['Label'] = encoded_labels

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed makes the split
# repeatable.
features = df['Preprocessed Text']
labels = df['Label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [None]:
# Confirm the sizes of the training and held-out splits.
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

Shape of X_train:  (37036,)
Shape of X_test:  (9259,)


In [None]:
# Text-classification pipeline: TF-IDF features feeding a multinomial
# Naive Bayes classifier.
# NOTE(review): the first step is named 'vectorizer_tri_grams', but
# TfidfVectorizer() is created with default arguments and no ngram_range is
# passed here — confirm whether tri-grams were intended.
clf = Pipeline(steps=[
    ('vectorizer_tri_grams', TfidfVectorizer()),
    ('naive_bayes', MultinomialNB()),
])

In [None]:
# Fit the vectorizer and the classifier on the training split only.
clf.fit(X_train, y_train)

In [None]:
# Predict encoded labels for the held-out split.
y_pred = clf.predict(X_test)


In [None]:
# Overall fraction of held-out tweets whose predicted label matches the true one.
print(accuracy_score(y_test, y_pred))

0.7555891564963819


In [None]:
# Per-class precision/recall/F1. The row labels are the integer codes that the
# LabelEncoder cell wrote into the 'Label' column, not the label names.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.52      0.67      1760
           1       0.75      0.84      0.79      2460
           2       0.86      0.68      0.76      2297
           3       0.66      0.89      0.76      2742

    accuracy                           0.76      9259
   macro avg       0.80      0.73      0.75      9259
weighted avg       0.79      0.76      0.75      9259



In [None]:
# Load the separate validation file with the same column names as the training
# data (reuses `columns` from the training-load cell).
test_df = pd.read_csv('twitter_validation.csv', names=columns)
test_df.head()

Unnamed: 0,id,country,Label,Text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [None]:
# Spot check: take the validation tweet at index label 10 and show it next to
# its true label.
test_text = test_df['Text'][10]
print(f"{test_text} ===> {test_df['Label'][10]}")

The professional dota 2 scene is fucking exploding and I completely welcome it.

Get the garbage out. ===> Positive


In [None]:
# Apply the same cleaning that was used on the training text. The result is
# wrapped in a one-element list so it can be passed to clf.predict as a
# collection of documents.
test_text_processed = [preprocess(test_text)]
test_text_processed

['professional dota 2 scene fucking explode completely welcome \n\n garbage']

In [None]:
# NOTE: this rebinds `test_text` from the tweet string to the array of
# predicted label codes returned by predict; the original text is no longer
# available under this name afterwards.
test_text = clf.predict(test_text_processed)


In [None]:
# Integer code -> sentiment name. LabelEncoder numbers the classes in sorted
# order, so 'Negative' (code 1) comes before 'Neutral' (code 2); the list must
# follow that order or predictions are reported under the wrong name.
# (When the encoder was fit on the string labels, le_model.classes_ holds the
# same mapping.)
classes = ['Irrelevant', 'Negative', 'Neutral', 'Positive']

# At this point `test_text` holds the array returned by clf.predict(...), not
# the tweet text; its single element is the predicted class code.
predicted_code = test_text[0]

print(f"True Label: {test_df['Label'][10]}")
print(f'Predict Label: {classes[predicted_code]}')

True Label: Positive
Predict Label: Positive
