# Twitter Sentiment Analysis

In [1]:
#import all required packages
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import matplotlib.pyplot as plt
import re
import string
import ast
from tqdm import tqdm
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import tensorflow
from tensorflow import keras
import keras
from keras import backend as K
# from keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
# from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model
# from tensorflow.python.client import device_lib 

In [2]:
#Set the pre-processing parameters
# CLEAN: when True, the pre-processing cell re-cleans every tweet and writes data/cleaned.csv;
#        when False, it loads the previously saved data/cleaned.csv instead.
CLEAN = False
# EXTRACT: when True, the feature cell recomputes the frequency features and saves them to CSV;
#          when False, it loads the previously saved features from CSV instead.
EXTRACT = False

In [3]:
#load data (the file has no header row) and keep only the two columns used below
df = pd.read_csv('data/tweets.csv', header=None)[[0, 5]]
#column 0 becomes the label, column 5 the tweet text
df.columns = ['sentiment','text']
#turn the numeric labels into readable class names
df['sentiment'] = df['sentiment'].replace({0: 'Negative', 4: 'Positive'})

df.head()

Unnamed: 0,sentiment,text
0,Negative,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,Negative,is upset that he can't update his Facebook by ...
2,Negative,@Kenichan I dived many times for the ball. Man...
3,Negative,my whole body feels itchy and like its on fire
4,Negative,"@nationwideclass no, it's not behaving at all...."


In [4]:
# df = df.sample(frac=1).reset_index(drop=True)
# df = df[:10000]
# df.sentiment.value_counts()

## Text Pre-Processing

1. Remove links 
2. Remove mentions
3. Remove punctuation
4. Remove stopwords
5. Lemmatize
6. Stemming?
7. Lowercase
8. Strip whitespaces

In [5]:
# remove stopwords with spacy
def remove_stopwords(nlp, text):
    """Return `text` with stop words removed.

    The text is tokenized by calling `nlp(text)`. Each token's text is then
    looked up in `nlp.vocab` and kept only when that entry's `is_stop` flag
    is false. The surviving words are joined with single spaces.
    """
    #tokenize and collect the raw token strings
    words = [token.text for token in nlp(text)]

    #keep only the words whose vocabulary entry is not a stop word
    kept = [w for w in words if not nlp.vocab[w].is_stop]
    return ' '.join(kept)

# lemmatize tweet with spacy
def lemmatize(nlp,text):
    """Return `text` with every token replaced by its lemma.

    The text is tokenized by calling `nlp(text)`; each token's `lemma_`
    string is taken and the lemmas are joined with single spaces.
    """
    return ' '.join(token.lemma_ for token in nlp(text))

#Clean tweet
def clean_tweet(nlp, tweet):
    """Turn a raw tweet into a cleaned, lowercase string.

    Steps, in order: drop links, drop @mentions, drop the characters in
    `string.punctuation`, remove stop words, lemmatize, strip surrounding
    whitespace, lowercase.
    """
    #translation table that deletes every punctuation character
    punctuation_table = str.maketrans('', '', string.punctuation) #https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string

    #remove links first, then mentions
    text = re.sub(r"@\S+", "", re.sub(r"http\S+", "", tweet))
    #remove punctuation
    text = text.translate(punctuation_table)
    #remove stopwords, then lemmatize what is left
    text = lemmatize(nlp, remove_stopwords(nlp, text))
    #strip whitespaces and lowercase
    return text.strip().lower()

In [6]:
#Pre-Process
if CLEAN:
    #Load spacy model -> needed for stopword removal and lemmatizing
    nlp = spacy.load("en_core_web_sm")
    #Enable progress tracking
    tqdm.pandas()
    #Run pre-processing on the whole dataset
    df['clean'] = df.text.progress_apply(lambda x: clean_tweet(nlp,x))
    #Save pre-processed data
    df.to_csv('data/cleaned.csv')
# else load the already pre-processed dataset
else:
    df = pd.read_csv('data/cleaned.csv', index_col=0)
    #tweets that cleaned down to an empty string are read back from the CSV as NaN;
    #turn them back into '' so both branches leave a string-only 'clean' column
    df['clean'] = df['clean'].fillna('')

## Build Word Frequencies for Each Class

In [7]:
#Build frequency for a single class
def build_freq(df,text_col):
    """Count how often each lowercase, whitespace-separated word occurs in a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text.
    text_col : str
        Name of the column with the (already cleaned) text.

    Returns
    -------
    dict
        Maps word -> count, ordered from most to least frequent.

    Rows whose text is missing (None/NaN, e.g. an empty string that was
    round-tripped through a CSV file) are skipped instead of raising.
    """
    #initiate counter
    freq = Counter()
    #drop missing rows first: Counter.update() cannot iterate over a float NaN
    token_lists = df[text_col].dropna().astype(str).str.lower().str.split()
    for words in token_lists:
        freq.update(words)
    #most_common() with no argument returns every (word, count) pair sorted by count, descending
    return dict(freq.most_common())

#Build frequencies for each target class
def multiclass_freqs(df,target_col,text_col):
    """Build one word-frequency dict per distinct value of `target_col`.

    Classes are visited in the order given by `df[target_col].unique()`, and
    the returned list follows that same order. Each entry is the result of
    `build_freq` on the rows belonging to that class.
    """
    return [
        build_freq(df.loc[df[target_col] == label], text_col)
        for label in df[target_col].unique()
    ]

In [8]:
#Sample fewer tweets to run the tests, as pre-processing the whole 1.6 million tweets would take ages
SAMPLE_SIZE = 100000
#shuffle with a fixed seed (the same value the train/test split and the classifier use)
#so that re-running the notebook selects the same rows, then keep the first SAMPLE_SIZE
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df = df[:SAMPLE_SIZE]
df.sentiment.value_counts()

Positive    50058
Negative    49942
Name: sentiment, dtype: int64

In [9]:
#Get word frequencies for each sentiment class (one dict per unique value of 'sentiment')
freqs = multiclass_freqs(df,'sentiment','clean')

# Earlier per-class version, kept for reference:
# # positive frequency
# pos_freq = build_freq(df.loc[df.sentiment == 'Positive'], 'clean')
# # negative frequency 
# neg_freq = build_freq(df.loc[df.sentiment == 'Negative'], 'clean')

## Feature Extraction with Frequencies

In [10]:
def extract_features(tweet,freq_list):
    """Build the feature vector for one tweet from per-class word frequencies.

    Parameters
    ----------
    tweet : str
        Cleaned tweet text; words are separated by single spaces.
        A non-string value (e.g. NaN for an empty tweet read back from CSV)
        is treated as a tweet with no words.
    freq_list : list of dict
        One word -> count dictionary per class.

    Returns
    -------
    list
        `[1, f_0, f_1, ...]` where 1 is the bias term and `f_k` is the sum,
        over every word occurrence in the tweet, of that word's count in
        `freq_list[k]` (words missing from the dictionary contribute 0).
    """
    #split the tweet into words
    tweet_words = tweet.split(' ') if isinstance(tweet, str) else []
    #initiate the feature, bias = 1
    feature = [1]

    # loop over the given frequencies
    for freq in freq_list:
        #dict.get is a constant-time lookup; no need to rebuild and scan a key list per word
        feature.append(sum(freq.get(word, 0) for word in tweet_words))

    return feature

In [11]:
if EXTRACT:
    #Enable progress tracking
    tqdm.pandas()
    #Extract features on the whole dataset
    features = df.clean.progress_apply(lambda tweet: extract_features(tweet,freqs))
    df['features'] = features
    #save to the same path the else-branch reads from, so a later run with EXTRACT = False finds it
    df.to_csv('data/cleaned_with_features.csv')
else:
    df = pd.read_csv('data/cleaned_with_features.csv', index_col=0)
    #transform features from string to list (the CSV stores each list as its text representation)
    df.features = df.features.apply(ast.literal_eval)
    features = df.features

## Prepare Data for Model Training

In [12]:
#transform feature vector into a numpy array
#tolist() walks the values in order, so this does not depend on the Series index being 0..n-1
X = np.array(features.tolist())
#label encoding
y = np.array(df.sentiment.astype('category').cat.codes)
#split into train/test, train size 80k
train_X, test_X, train_y, test_y = train_test_split(X,y, test_size=0.2, random_state = 42)
#split test set into val and test, each 10k records
test_X, val_X, test_y, val_y = train_test_split(test_X,test_y, test_size=0.5, random_state = 42)
#print lengths of the datasets
len(train_X), len(val_X), len(test_X)

(80000, 10000, 10000)

## Train Models

### Logistic Regression

In [13]:
#fit a logistic regression on the training features (random_state fixed to 42)
model = LogisticRegression(solver='liblinear', random_state=42)
model.fit(train_X, train_y)

LogisticRegression(random_state=42, solver='liblinear')

In [14]:
#Make predictions
y_pred = model.predict(test_X)
# Accuracy:  how often is the classifier correct?
# Precision: of the tuples predicted positive, what share is actually positive?
# Recall:    of the actually positive tuples, what share is predicted positive?
for label, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(label, scorer(test_y, y_pred))

Accuracy: 0.7027
Precision: 0.6764452113891286
Recall: 0.7811877241929055


### Support Vector Machine

In [15]:
# #Create a svm Classifier
# clf = svm.SVC(kernel='linear', verbose=1) # Linear Kernel

# #Train the model using the training sets
# clf.fit(train_X, train_y)

# #Predict the response for test dataset
# y_pred = clf.predict(test_X)

# #Make predictions
# y_pred = model.predict(test_X)
# # Model Accuracy: how often is the classifier correct?
# print("Accuracy:",metrics.accuracy_score(test_y, y_pred))
# # Model Precision: what percentage of positive tuples are labeled as such?
# print("Precision:",metrics.precision_score(test_y, y_pred))
# # Model Recall: what percentage of positive tuples are labelled as such?
# print("Recall:",metrics.recall_score(test_y, y_pred))

### Artificial Neural Network

In [16]:
# Plot loss history
def plot_loss_history(history):
    """Plot training vs validation loss per epoch and return the history table.

    `history.history` is loaded into a DataFrame, the row index is exposed as
    an 'epoch' column, and the 'loss' and 'val_loss' columns are drawn as a
    plotly line chart. The DataFrame (including 'epoch') is returned.
    """
    #one row per epoch; the row position becomes an explicit 'epoch' column
    loss_history = pd.DataFrame(history.history)
    loss_history = loss_history.assign(epoch=loss_history.index.values)
    px.line(
        loss_history,
        x = 'epoch',
        y = ['loss','val_loss'],
        title = 'Train vs Validation Loss During Training',
    ).show()
    return loss_history

In [15]:
EPOCHS = 50
BATCH_SIZE = 128
VERBOSITY = 1
input_dim = train_X.shape[1]  # Number of features

#small fully-connected binary classifier: 16 -> 8 -> 1 (sigmoid)
model = Sequential()
#only the first layer declares the input size
model.add(Dense(16, input_dim=input_dim, activation='relu'))
#later layers take their input from the previous layer's output (16 units here),
#so they must not be given input_dim
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', 
                  optimizer='adam', 
                  metrics=['accuracy'])
# model.summary()

#NOTE(review): the inputs are raw frequency sums fed in unscaled, and the recorded first-epoch
#loss is about 50 - scaling the features before training may be worth trying
history = model.fit(train_X, train_y,
                    epochs=EPOCHS,
                    verbose=VERBOSITY,
                    validation_data=(val_X, val_y),
                    batch_size=BATCH_SIZE)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [17]:
#evaluate() is unpacked into (loss, accuracy); the model was compiled with metrics=['accuracy']
loss, accuracy = model.evaluate(train_X, train_y, verbose=False)
print("Train Accuracy: {:.4f}".format(accuracy))
#same evaluation on the held-out test split; loss/accuracy now hold the test values
loss, accuracy = model.evaluate(test_X, test_y, verbose=False)
print("Test Accuracy:  {:.4f}".format(accuracy))

Train Accuracy: 0.5708
Test Accuracy:  0.5763


In [34]:
#draw the loss curves; the returned per-epoch DataFrame is shown as the cell output
plot_loss_history(history)

Unnamed: 0,loss,accuracy,val_loss,val_accuracy,epoch
0,50.424183,0.579862,4.340445,0.5116,0
1,2.913967,0.598013,1.39529,0.6171,1
2,2.895701,0.6041,5.926442,0.5288,2
3,2.663509,0.614575,3.302673,0.5532,3
4,2.564348,0.620162,4.931434,0.5328,4
5,2.781494,0.62385,1.18615,0.6642,5
6,2.814181,0.62495,1.21974,0.6133,6
7,2.261082,0.6276,1.243598,0.6928,7
8,2.488367,0.63045,4.872571,0.5404,8
9,2.531886,0.629138,1.891949,0.5481,9
