# Sentiment prediciting based on tweeter's datasets

In [78]:
import warnings
warnings.filterwarnings("ignore")

# EDA tools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing tools
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
from gensim.models import Word2Vec
import random

# training models tools
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ALFA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ALFA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ALFA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df_tr = pd.read_csv('twitter_training.csv', encoding='ISO-8859-1')
df_te = pd.read_csv('twitter_test.csv')
df_val = pd.read_csv('twitter_validation.csv')

In [3]:
df_tr.head()

Unnamed: 0,Tweet ID,entity,sentiment,Tweet content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
df_te.head()

Unnamed: 0,Tweet ID,entity,sentiment,Tweet content
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [5]:
df_val.head()

Unnamed: 0,Tweet ID,entity,sentiment,Tweet content
0,5328,Hearthstone,Negative,@BlizzardCS what’s going on with Hearthstone f...
1,7618,MaddenNFL,Negative,@EAMaddenNFL is there a reason OFFLINE franchi...
2,7108,johnson&johnson,Negative,Johnson & Johnson is about to enter phase 3 tr...
3,10008,PlayerUnknownsBattlegrounds(PUBG),Negative,How is banning #PUBG going to fix anything? Al...
4,49,Amazon,Neutral,I played this interesting quiz on Amazon - Try...


In [6]:
tr_cont = df_tr['Tweet content'].to_numpy()
te_cont = df_te['Tweet content'].to_numpy()
val_cont = df_te['Tweet content'].to_numpy()

In [7]:
def remove_specialChars(text):
    """This function removes all special characters from text."""
    temp = ''.join(letter for letter in text if letter.isalnum() or letter.isspace())
    return ''.join(c for c in temp if ord(c) < 128 or c.isspace())

In [8]:
def lemmatize_text(text):
    """This function lemmatizes the text."""
    lemmatizer = WordNetLemmatizer()
    words = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]
    return ' '.join(lemmatized_words)

In [9]:
def remove_stopwords(text):
    """This function converts all letters into lowercase and removes all English stop words from text."""
    text = text.lower()
    stop_words = set(stopwords.words('english'))
    stop_words.add('im') #customizing list of stop words
    word_tokens = word_tokenize(text)
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    return filtered_sentence

In [10]:
#Firing functions into process our data and convert them into lemmatized words
processed_content = []
for text in tr_cont:
    try:
        new_text = remove_specialChars(text)
        new_text = lemmatize_text(new_text)
        new_text = remove_stopwords(new_text)
        if new_text != '' and new_text != ' ':
            processed_content.append(new_text)
        else:
            processed_content.append(np.nan)
    except:
        processed_content.append(np.nan)
    
df_tr['new_content'] = np.array(processed_content)
df_tr

Unnamed: 0,Tweet ID,entity,sentiment,Tweet content,new_content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,"[get, borderlands, murder]"
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,"[come, border, kill]"
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,"[get, borderlands, kill]"
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,"[come, borderlands, murder]"
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,"[get, borderlands, 2, murder]"
...,...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...,"[realize, windows, partition, mac, like, 6, ye..."
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...,"[realize, mac, window, partition, 6, years, be..."
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...,"[realize, windows, partition, mac, 6, years, b..."
74680,9200,Nvidia,Positive,Just realized between the windows partition of...,"[realize, windows, partition, mac, like, 6, ye..."


In [11]:
df_tr.dropna(inplace=True)
df_tr.isna().sum()

Tweet ID         0
entity           0
sentiment        0
Tweet content    0
new_content      0
dtype: int64

In [12]:
#Firing functions into process our data and convert them into lemmatized words
processed_content = []
for text in te_cont:
    try:
        new_text = remove_specialChars(text)
        new_text = lemmatize_text(new_text)
        new_text = remove_stopwords(new_text)
        if new_text != '' and new_text != ' ':
            processed_content.append(new_text)
        else:
            processed_content.append(np.nan)
    except:
        processed_content.append(np.nan)
    
df_te['new_content'] = np.array(processed_content)
df_te

Unnamed: 0,Tweet ID,entity,sentiment,Tweet content,new_content
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...,"[mention, facebook, struggle, motivation, go, ..."
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...,"[bbc, news, amazon, boss, jeff, bezos, reject,..."
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...,"[microsoft, pay, word, function, poorly, samsu..."
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,...","[csgo, matchmaking, full, closet, hack, truly,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...,"[president, slap, americans, face, really, com..."
...,...,...,...,...,...
495,8055,Microsoft,Positive,special shoutouts to microsoft excel 2013,"[special, shoutouts, microsoft, excel, 2013]"
496,6787,Fortnite,Irrelevant,Dumb Lucky☘️ (Fortnite Montage) youtu.be/psW...,"[dumb, lucky, fortnite, montage, youtubepswjtn..."
497,3838,Cyberpunk2077,Positive,Dang there goes my birthday present but maybe ...,"[dang, go, birthday, present, maybe, better]"
498,2008,CallOfDuty,Irrelevant,It was ab fab seeing the 6 bungalows built in ...,"[ab, fab, see, 6, bungalows, build, walsden, l..."


In [13]:
#drop missing values
df_te.dropna(inplace=True)
df_te.isna().sum()

Tweet ID         0
entity           0
sentiment        0
Tweet content    0
new_content      0
dtype: int64

In [14]:
#Firing functions into process our data and convert them into lemmatized words
processed_content = []
for text in te_cont:
    try:
        new_text = remove_specialChars(text)
        new_text = lemmatize_text(new_text)
        new_text = remove_stopwords(new_text)
        if new_text != '' and new_text != ' ':
            processed_content.append(new_text)
        else:
            processed_content.append(np.nan)
    except:
        processed_content.append(np.nan)
    
df_val['new_content'] = np.array(processed_content)
df_val

Unnamed: 0,Tweet ID,entity,sentiment,Tweet content,new_content
0,5328,Hearthstone,Negative,@BlizzardCS what’s going on with Hearthstone f...,"[mention, facebook, struggle, motivation, go, ..."
1,7618,MaddenNFL,Negative,@EAMaddenNFL is there a reason OFFLINE franchi...,"[bbc, news, amazon, boss, jeff, bezos, reject,..."
2,7108,johnson&johnson,Negative,Johnson & Johnson is about to enter phase 3 tr...,"[microsoft, pay, word, function, poorly, samsu..."
3,10008,PlayerUnknownsBattlegrounds(PUBG),Negative,How is banning #PUBG going to fix anything? Al...,"[csgo, matchmaking, full, closet, hack, truly,..."
4,49,Amazon,Neutral,I played this interesting quiz on Amazon - Try...,"[president, slap, americans, face, really, com..."
...,...,...,...,...,...
495,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...,"[special, shoutouts, microsoft, excel, 2013]"
496,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,"[dumb, lucky, fortnite, montage, youtubepswjtn..."
497,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...,"[dang, go, birthday, present, maybe, better]"
498,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.,"[ab, fab, see, 6, bungalows, build, walsden, l..."


In [15]:
#drop missing values
df_val.dropna(inplace=True)
df_val.isna().sum()

Tweet ID         0
entity           0
sentiment        0
Tweet content    0
new_content      0
dtype: int64

In [25]:
def getVec(dataframe):
    """This function uses word embedding (word2vec) to extract vectors from words"""
    all_vectors = []
    sentiment = []
    index = -1
    for tweet in dataframe['new_content']:
        index+=1
        vectors = []
        array = []
        array.append(tweet)
        try:
            model = Word2Vec(array, vector_size=100, window=5, min_count=1, sg=0)
            senti = np.array(dataframe['sentiment'])[index]
            sentiment.append(senti)
        except:
            continue
        for word in tweet:
            vector = model.wv[word]
            vectors.append(np.array(vector).mean())                
        # using mean of vectors of each tweet
        all_vectors.append(vectors)
    new_df = pd.DataFrame(all_vectors)
    new_df['sentiment'] = sentiment
    return new_df

In [28]:
new_df = getVec(df_tr)

In [29]:
new_df1 = getVec(df_te)

In [30]:
new_df2 = getVec(df_val)

In [31]:
#convert sentiment categories into numeric values for train dataset
categories = new_df['sentiment'].unique()
category_to_number = {category: number for number, category in enumerate(categories)}
categories_list = new_df['sentiment'].to_numpy()
numeric_values = [category_to_number[category] for category in categories_list]
new_df['numeric_sentiment'] = numeric_values
new_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,155,156,157,158,159,160,161,162,sentiment,numeric_sentiment
0,-8e-06,0.000681,0.000157,,,,,,,,...,,,,,,,,,Positive,0
1,-8e-06,0.000681,0.000157,,,,,,,,...,,,,,,,,,Positive,0
2,-8e-06,0.000681,0.000157,,,,,,,,...,,,,,,,,,Positive,0
3,-8e-06,0.000681,0.000157,,,,,,,,...,,,,,,,,,Positive,0
4,0.000655,-8e-06,0.000681,0.000157,,,,,,,...,,,,,,,,,Positive,0


In [32]:
#convert sentiment categories into numeric values for test dataset
categories = new_df1['sentiment'].unique()
categories_list = new_df1['sentiment'].to_numpy()
numeric_values = [category_to_number[category] for category in categories_list]
new_df1['numeric_sentiment'] = numeric_values
new_df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,sentiment,numeric_sentiment
0,4e-06,-8e-06,0.000655,-0.000187,-0.000666,-0.000651,-0.000104,0.000101,0.000464,0.000681,...,,,,,,,,,Irrelevant,3
1,0.0011,-0.000123,-0.000273,0.00021,0.000464,0.000101,-0.000104,-0.000651,-0.000666,-0.000187,...,,,,,,,,,Neutral,1
2,-0.000651,-0.000666,-0.000187,0.000655,-8e-06,0.000681,0.000157,,,,...,,,,,,,,,Negative,2
3,-0.000104,-0.000651,-0.000666,-0.000187,0.000655,-8e-06,0.000681,0.000157,,,...,,,,,,,,,Negative,2
4,-0.000273,0.00021,0.000464,0.000101,-0.000103,-0.000651,-0.000666,-0.000187,0.000655,-8e-06,...,,,,,,,,,Neutral,1


In [33]:
#convert sentiment categories into numeric values for test dataset
categories = new_df2['sentiment'].unique()
categories_list = new_df1['sentiment'].to_numpy()
numeric_values = [category_to_number[category] for category in categories_list]
new_df2['numeric_sentiment'] = numeric_values
new_df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,sentiment,numeric_sentiment
0,4e-06,-8e-06,0.000655,-0.000187,-0.000666,-0.000651,-0.000104,0.000101,0.000464,0.000681,...,,,,,,,,,Negative,3
1,0.0011,-0.000123,-0.000273,0.00021,0.000464,0.000101,-0.000104,-0.000651,-0.000666,-0.000187,...,,,,,,,,,Negative,1
2,-0.000651,-0.000666,-0.000187,0.000655,-8e-06,0.000681,0.000157,,,,...,,,,,,,,,Negative,2
3,-0.000104,-0.000651,-0.000666,-0.000187,0.000655,-8e-06,0.000681,0.000157,,,...,,,,,,,,,Negative,2
4,-0.000273,0.00021,0.000464,0.000101,-0.000103,-0.000651,-0.000666,-0.000187,0.000655,-8e-06,...,,,,,,,,,Neutral,1


In [34]:
category_to_number

{'Positive': 0, 'Neutral': 1, 'Negative': 2, 'Irrelevant': 3}

In [37]:
# filling NaN values with 0
new_df.fillna(0, inplace=True)
new_df1.fillna(0, inplace=True)
new_df2.fillna(0, inplace=True)

In [53]:
# Spliting datasets
X_train = new_df.drop(['sentiment', 'numeric_sentiment'], axis=1)
y_train = new_df['numeric_sentiment']
X_test = new_df1.drop(['sentiment', 'numeric_sentiment'], axis=1)
y_test = new_df1['numeric_sentiment']
X_val = new_df2.drop(['sentiment', 'numeric_sentiment'], axis=1)
y_val = new_df2['numeric_sentiment']

In [54]:
# Using Zero padding
diff = len(X_train.columns)-len(X_val.columns)
for i in range(diff+1):
    zPadd = np.zeros([len(X_test)])
    X_test[40+i]=zPadd
    X_val[40+i]=zPadd

In [77]:
# Train a SVM model
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test) #Evaluate SVM
print("Accuracy of SVM: ", accuracy)

Accuracy of SVM:  0.530501002004008


In [87]:
# Define the parameter distribution for random search for GaussianNB
param_dist = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}

# Perform random search
clf_random_GNB = RandomizedSearchCV(GaussianNB(), param_distributions=param_dist, cv=5, n_iter=10)
clf_random_GNB.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=GaussianNB(),
                   param_distributions={'var_smoothing': [1e-09, 1e-08, 1e-07,
                                                          1e-06, 1e-05]})

In [89]:
# Evaluate GNB
accuracy = clf_random_GNB.score(X_test, y_test)
print("Accuracy of GNB: ", accuracy)

Accuracy of GNB:  0.473259104211305


In [124]:
# predicting target for X_val
y_pred = clf.predict(X_val)
pred_df = pd.DataFrame({'y_true': y_test, 'y_pred': y_pred}) #creating dataframe to compare real values with predicted values
pred_df

Unnamed: 0,y_true,y_pred
0,3,3
1,1,3
2,2,1
3,2,1
4,1,3
...,...,...
494,0,0
495,3,4
496,0,0
497,3,4


In [125]:
pred_df.to_csv('y_val-vs-y_pred.csv')


* SVM is better because it works well when classes are well separated. On the other hand, GaussianNB assumes that the features are conditionally independent given the class, which may not be true in many real-world text classification problems

* accuracy of SVM and the true values which were predicted were more than GNB so it predicted more than half of tweets' sentiments. However, there are some reasons which cause SVM predict better than GNB:
* Complexity of Decision Boundary
* Handling Non-Linearity
* Robustness to Outliers
* Parameter Sensitivity
* Data Size
* Imbalanced Datasets






