# Imported data and cleaned it

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv("/Users/mahakkaurchhabra/Downloads/twitter_sentiment/twitter_training.csv",names = ['id','source','sentiment','tweet'])

In [3]:
data.head()

Unnamed: 0,id,source,sentiment,tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
data.shape

(74682, 4)

In [5]:
data.drop(['id'],axis=1,inplace=True)

In [6]:
data.isnull().sum()

source         0
sentiment      0
tweet        686
dtype: int64

In [7]:
data.dropna()

Unnamed: 0,source,sentiment,tweet
0,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,Borderlands,Positive,I am coming to the borders and I will kill you...
2,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,Borderlands,Positive,im coming on borderlands and i will murder you...
4,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...
74677,Nvidia,Positive,Just realized that the Windows partition of my...
74678,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,Nvidia,Positive,Just realized between the windows partition of...


In [16]:
data.sentiment.value_counts()

Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: sentiment, dtype: int64

In [17]:
sentiment_counts = pd.DataFrame(data.sentiment.value_counts())

In [18]:
sentiment_counts

Unnamed: 0,sentiment
Negative,22542
Positive,20832
Neutral,18318
Irrelevant,12990


# Preprocessing by removing unnecessary characters

In [37]:
import re
def clean_text(tweet):
    tweet = re.sub(r'http\S+', '', str(tweet))
    tweet = re.sub(r'@[A-Za-z0-9_]+|#[A-Za-z0-9_]+', '', str(tweet))
    tweet = re.sub(r'[^A-Za-z\s]', '', str(tweet))
    tweet = re.sub(r'\bRT\b', '', str(tweet))
    return tweet.lower()

In [38]:
data['tweet'] = data['tweet'].apply(clean_text)
data.head()

Unnamed: 0,source,sentiment,tweet
0,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,Borderlands,Positive,i am coming to the borders and i will kill you...
2,Borderlands,Positive,im getting on borderlands and i will kill you all
3,Borderlands,Positive,im coming on borderlands and i will murder you...
4,Borderlands,Positive,im getting on borderlands and i will murder y...


# Labelling the output

In [39]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
data.sentiment = encoder.fit_transform(data.sentiment)

In [40]:
encoder.classes_

array(['Irrelevant', 'Negative', 'Neutral', 'Positive'], dtype=object)

In [42]:
X = data['tweet']
y = data['sentiment']

# Vectorization and Training the model

In [44]:
from sklearn.model_selection import train_test_split 
x_train, x_test, y_train,y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
v = TfidfVectorizer()
X_train_normalized = v.fit_transform(x_train)
X_test_normalized = v.transform(x_test)

In [46]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=60)
rf_clf.fit(X_train_normalized,y_train)

RandomForestClassifier(n_estimators=60)

In [47]:
from sklearn.metrics import accuracy_score, classification_report
y_pred = rf_clf.predict(X_test_normalized)
print(accuracy_score(y_test,y_pred))

0.8945571399879494


In [48]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      0.80      0.88      2598
           1       0.89      0.92      0.91      4509
           2       0.86      0.90      0.88      3664
           3       0.89      0.92      0.90      4166

    accuracy                           0.89     14937
   macro avg       0.90      0.89      0.89     14937
weighted avg       0.90      0.89      0.89     14937



# Validation dataset 

In [49]:
test = pd.read_csv("/Users/mahakkaurchhabra/Downloads/twitter_sentiment/twitter_validation.csv",names = ['id','source','sentiment','tweet'])
test.head()

Unnamed: 0,id,source,sentiment,tweet
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [56]:
y_test = test['sentiment'].loc[3]
x = test['tweet'].loc[3]

In [57]:
x = clean_text(x)

In [58]:
x

'csgo matchmaking is so full of closet hacking its a truly awful game'

In [59]:
x_testing = v.transform([x])

In [60]:
x_testing.shape

(1, 34851)

In [62]:
y_pred = rf_clf.predict(x_testing)
print(f"the predicted output is {y_pred} and it corresponds to {encoder.classes_[y_pred]} and the true value is {y_test}")

the predicted output is [1] and it corresponds to ['Negative'] and the true value is Negative


In [63]:
print(y_test)

Negative


# Creating a function for new input and checking its classification result 

In [66]:
def unseen_input(text):
    text = clean_text(text)
    text = v.transform([text])
    y_pred = rf_clf.predict(text)
    print(f"the predicted output is {encoder.classes_[y_pred]}")


In [67]:
unseen_input("Hard Working Individuals!")

the predicted output is ['Neutral']


In [82]:
unseen_input("suicide.")

the predicted output is ['Neutral']


In [69]:
unseen_input("a dear friend of mine committed suicide with a shotgun two years ago")

the predicted output is ['Neutral']


In [71]:
unseen_input("Sensex plunges 790 points")

the predicted output is ['Neutral']


In [72]:
unseen_input("die")

the predicted output is ['Negative']


In [73]:
unseen_input("she won a lottery !!!")

the predicted output is ['Neutral']


# Saving the trained model 

In [74]:
import pickle

In [75]:
filename = "trained_model.sav"
# or .pkl

In [76]:
pickle.dump(rf_clf,open(filename,'wb'))

# Using saved model for future Predictions  

In [77]:
#loading model
loaded_model = pickle.load(open('trained_model.sav','rb'))

In [78]:
X_new = test['tweet'].loc[200]
y_test = test['sentiment'].loc[200]
print(y_test)

Neutral


In [79]:
print(test.loc[200])

id                                                        6889
source                                         johnson&johnson
sentiment                                              Neutral
tweet        Johnson & Johnson pauses COVID-19 vaccine tria...
Name: 200, dtype: object


In [80]:
def new_input(text):
    text = clean_text(text)
    text = v.transform([text])
    y_pred = loaded_model.predict(text)
    print(f"the predicted output is {encoder.classes_[y_pred]}")

In [83]:
new_input(X_new)

the predicted output is ['Neutral']


In [None]:
# https://youtu.be/4YGkfAd2iXM?si=i-okjsETDCSIDEuI
# https://www.kaggle.com/code/ranahafezz/twitter-sentiment-analysis-nlp
# https://www.kaggle.com/code/abdallahwagih/twitter-sentiment-analysis