<a href="https://colab.research.google.com/github/munavarhs/Sentiment-Analysis-of-Twitter-Data/blob/main/Twitter_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Basic libraries
import pandas as pd
import re
import string
import numpy as np
import matplotlib.pyplot as plt
import nltk

#using sklearn model to split our testing and training data
from sklearn.model_selection import train_test_split

#Sentiment Analysis Models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from gensim.models import Word2Vec
import tensorflow_hub as hub

# Loading pre-trained Word2Vec model
import gensim.downloader as api

#using SVM model to preprocess the data
from sklearn.svm import SVC


#this library is used to predict the accuracy score of the model
from sklearn.metrics import accuracy_score

#supress warnings
import warnings

from sklearn.exceptions import ConvergenceWarning

# Suppressing ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)


In [2]:
df = pd.read_csv("dataset.csv")
df.head()


Unnamed: 0,id,text,text_sentiment,username,hashtags,created_at,user followers count,replycount,retweetcount,likecount,quotecount,language,media,retweetedTweet,quotedtweet,inReplyToTweetId,inReplyToUser,mentionedUsers
0,1538666561615015938,When will the #NYSE #stockmarketcrash happen?,Neutral,tradexlnc,"['NYSE', 'stockmarketcrash']",2022-06-19 23:34:29+00:00,10669,0,0,1,0,en,,,,,,
1,1538665013799489536,Aaj ka gyan:\n\nIf a company isn't a quality c...,Negative,niftymonday,"['stockmarkets', 'stockmarketcrash', 'trading'...",2022-06-19 23:28:20+00:00,100,0,1,8,0,en,,,,,,
2,1538660868027830274,The stock market needs to crash hard to make i...,Negative,kyle132313,"['stockmarketcrash', 'economy', 'rich', 'Fed']",2022-06-19 23:11:52+00:00,0,0,0,0,0,en,,,,,,
3,1538657239849836544,"Those who are ""Buying on DIP"" will very soon b...",Neutral,ChintanRajput16,"['stockmarketcrash', 'StocksToBuy', 'stockstow...",2022-06-19 22:57:27+00:00,54,0,2,2,0,en,,,,,,
4,1538654339044196358,@rdrhwke I wish our so-called President were t...,Positive,DrPCJustice,"['Bidenomics', 'inflation', 'recession', 'stoc...",2022-06-19 22:45:55+00:00,28,0,0,0,0,en,,,,1.538653e+18,https://twitter.com/rdrhwke,"[User(username='rdrhwke', id=43753976, display..."


In [3]:
#PRE-PROCESSING

In [4]:
def preprocess(input_text):
    input_text = input_text.lower()
    input_text = re.sub(r'https?:\/\/\S+', '', input_text)
    input_text = re.sub(r"www\.[a-z]?\.?(com)+|[a-z]+\.(com)", '', input_text)
    input_text = re.sub(r'{link}', '', input_text)
    input_text = re.sub(r"\[video\]", '', input_text)
    input_text = re.sub(r'&[a-z]+;', '', input_text)
    input_text = re.sub(r"[^a-z\s\(\-:\)\\\/\];='#]", '', input_text)
    input_text = re.sub(r'@', '', input_text)
    return input_text

In [5]:
#Labelling the preprocessed data
df['preprocessed_text'] = df['text'].apply(preprocess)

In [6]:
#Split the preprocessed data into labels and features
X = df['preprocessed_text']
y = df['text_sentiment']

In [7]:
#Splitting the preprocessed data into taining and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=17)

In [12]:
#Converting text into a bag-of-words model
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

chi2_sqaure = SelectKBest(chi2, k=100)
X_train_new_bow = chi2_sqaure.fit_transform(X_train_vec,y_train)
X_test_new_bow = chi2_sqaure.transform(X_test_vec)


# Train an SVM classifier
svm_classifier = SVC(max_iter=5000)
svm_classifier.fit(X_train_new_bow, y_train)

# Predict on test data with SVM Model
y_pred_svm = svm_classifier.predict(X_test_new_bow)

# Report accuracy
svm_bag_accuracy = accuracy_score(y_test, y_pred_svm)
print(f'Accuracy of Bag of words model: {svm_bag_accuracy:.2f}')


Accuracy of Bag of words model: 0.67


In [13]:
#Converting text into a TF-IDF model
tf_idf = TfidfVectorizer()
X_train_tf_idf = tf_idf.fit_transform(X_train)
X_test_tf_idf = tf_idf.transform(X_test)

chi2_sqaure = SelectKBest(chi2, k=100)
X_train_new_bow = chi2_sqaure.fit_transform(X_train_vec,y_train)
X_test_new_bow = chi2_sqaure.transform(X_test_vec)

# Train an SVM classifier
svm_classifier = SVC(max_iter=5000)
svm_classifier.fit(X_train_new_bow, y_train)

# Predict on test data with SVM Model
y_pred_svm = svm_classifier.predict(X_test_new_bow)

# Report accuracy
svm_tf_idf_accuracy = accuracy_score(y_test, y_pred_svm)
print(f'Accuracy of TF-IDF model using SVM model: {svm_tf_idf_accuracy:.2f}')


Accuracy of TF-IDF model using SVM model: 0.67


In [14]:
#Techniques for Sentiment Analysis

In [8]:
word2vec_model = api.load('word2vec-google-news-300')



In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=17)


# Function to compute average Word2Vec embedding for each sentence
def get_word2vec_embeddings(text):
    words = text.split()
    word_vecs = [word2vec_model[word] for word in words if word in word2vec_model]
    if len(word_vecs) == 0:
        return np.zeros(300)
    return np.mean(word_vecs, axis=0)

X_word2vec = np.array([get_word2vec_embeddings(text) for text in X_train])
X_word2vec_test = np.array([get_word2vec_embeddings(text) for text in X_test])


# Train the SVM
svm_classifier = SVC()
svm_classifier.fit(X_word2vec, y_train)

# Predict on test data using SVM model
y_pred_w2v = svm_classifier.predict(X_word2vec_test)

#Report the accuracy score
accuracy_w2v = accuracy_score(y_test, y_pred_w2v)
print(f'Accuracy with Word2Vec using SVM Model: {accuracy_w2v:.2f}')


Accuracy with Word2Vec using SVM Model: 0.76


In [10]:
glove_model = api.load("glove-wiki-gigaword-300")



In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=17)

# Function to compute GloVe embedding for each sentence
def get_glove_embeddings(input_text):
    words = input_text.split()
    word_vecs = [glove_model[word] for word in words if word in glove_model]
    if len(word_vecs) == 0:
        return np.zeros(300)
    return np.mean(word_vecs, axis=0)

X_glove = np.array([get_glove_embeddings(text) for text in X_train])
X_glove_test = np.array([get_glove_embeddings(text) for text in X_test])


# Train the SVM
svm_classifier = SVC()
svm_classifier.fit(X_glove, y_train)

# Predict on test data using SVM Model
y_pred_glove = svm_classifier.predict(X_glove_test)

#Report the accuracy
accuracy_glove = accuracy_score(y_test, y_pred_glove)
print(f'Accuracy with GloVe using SVM model: {accuracy_glove:.2f}')

Accuracy with GloVe using SVM model: 0.71


In [17]:
print('Displaying accuracy of all the models::::::::\n')

print(f'Accuracy of Bag of words using SVM Model: {svm_bag_accuracy:.2f}')
print(f'Accuracy of TF-IDF model using SVM model: {svm_tf_idf_accuracy:.2f}')
print(f'Accuracy with Word2Vec using SVM Model: {accuracy_w2v:.2f}')
print(f'Accuracy with GloVe using SVM model: {accuracy_glove:.2f}')
print('\n')
print(f'The algorithm which gave me the highest accuracy is Word2vec with score of {accuracy_w2v:.2f}')

Displaying accuracy of all the models::::::::

Accuracy of Bag of words using SVM Model: 0.67
Accuracy of TF-IDF model using SVM model: 0.67
Accuracy with Word2Vec using SVM Model: 0.76
Accuracy with GloVe using SVM model: 0.71


The algorithm which gave me the highest accuracy is Word2vec with score of 0.76


In [18]:
#BONUS QUESTION
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=56)

# Create sentence embeddings
X_universal = np.array([embed([text]).numpy()[0] for text in X_train])
X_universal_test = np.array([embed([text]).numpy()[0] for text in X_test])

# Train the SVM
svm_universal = SVC()
svm_universal.fit(X_universal, y_train)

# Predict on test data using SVM model
y_pred_use = svm_universal.predict(X_universal_test)

#Reporting the accuracy
accuracy_use = accuracy_score(y_test, y_pred_use)
print(f'Accuracy with Universal Sentence Encoder using SVM model: {accuracy_use:.2f}')


Accuracy with Universal Sentence Encoder using SVM model: 0.70
