# Natural Language Processing: Twitter Sentiment Analysis

## Import Libraries and Twitter Data

In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
with open('output.txt') as f:
    lines = f.readlines()

In [3]:
json_data = []
for line in lines:
    json_data.append(json.loads(line))

In [4]:
tweet_data = []
for tweet in json_data:
    tweet_data.append(tweet['text'])

## Data Cleaning

In [5]:
labels = []

def define_sentiment(text):
    if ':(' in text and ':)' in text:
        return 1 #Postive
    elif ':(' in text:
        return 0 #Negative
    elif ':)' in text:
        return 1 #Positive
    else:
        return 0 #Negative

In [6]:
for tweet in tweet_data:
    labels.append(define_sentiment(tweet))

In [7]:
import nltk 
stopwords = nltk.corpus.stopwords.words('english')
import string
punctuations = string.punctuation
stopwords.extend(['RT'])

import re
pattern = '[^A-Za-z0-9]+'
from nltk import word_tokenize

def remove_stopwords(text):
    return [word for word in text.split() if word not in stopwords]

def remove_punc(text):
    return " ".join([word for word in text if word not in punctuations])

def convert_lowercase(text):
    return ''.join([word.lower() for word in text])

def preprocess_text(text):
    clean_text = re.sub('@.*?($| )','', text) 
    clean_text = re.sub('https.*?($| )', '', clean_text)
    clean_text = re.sub(pattern, ' ', clean_text)
    clean_text = re.sub("\d+", "", clean_text)
    clean_text = re.sub(r'\b\w{1,2}\b', '', clean_text)
    clean_text = convert_lowercase(clean_text)
    clean_text = remove_stopwords(clean_text)
    clean_text = remove_punc(clean_text)
    return clean_text

In [8]:
tweet_data_clean = []
labels_clean = []

for i in range(0, len(tweet_data)):
    if preprocess_text(tweet_data[i]) == '':
        pass
    else:
        tweet_data_clean.append(preprocess_text(tweet_data[i]))
        labels_clean.append(labels[i])

In [9]:
len(tweet_data_clean)

6825

In [10]:
len(labels_clean)

6825

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix

tfidf = TfidfVectorizer(smooth_idf=True, use_idf=True)

In [13]:
tfidf_data = tfidf.fit_transform(tweet_data_clean)
#tfidf_data = tfidf_data.todense()

In [14]:
print("Number of negative comments:", labels_clean.count(0))
print("Number of positive comments:", labels_clean.count(1))

Number of negative comments: 3410
Number of positive comments: 3415


## Binary Classification Modelling

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(smooth_idf=True, use_idf=True)
tfidf_data = tfidf.fit_transform(tweet_data_clean)
tfidf_data = tfidf_data.todense()

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(tfidf_data, labels_clean, train_size=0.75)

### Naive Bayes

In [22]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score

In [18]:
from sklearn.naive_bayes import GaussianNB

In [19]:
nb = GaussianNB()
nb.fit(X_train, y_train)

GaussianNB()

In [20]:
y_pred = nb.predict(X_test)

In [25]:
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall Score:', recall_score(y_test, y_pred))
print('ROC-AUC-Score:', roc_auc_score(y_test, y_pred))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))

Accuracy:  0.7065026362038664
Precision: 0.6727107887579329
Recall Score: 0.8412698412698413
ROC-AUC-Score: 0.7018470418470419
Confusion Matrix: 
 [[464 361]
 [140 742]]


### Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression()

In [27]:
y_pred = logreg.predict(X_test)

In [28]:
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall Score:', recall_score(y_test, y_pred))
print('ROC-AUC-Score:', roc_auc_score(y_test, y_pred))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))

Accuracy:  0.7539543057996485
Precision: 0.7583892617449665
Recall Score: 0.7687074829931972
ROC-AUC-Score: 0.7534446505875076
Confusion Matrix: 
 [[609 216]
 [204 678]]


### Random Forest Classifier

In [29]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [30]:
y_pred = rfc.predict(X_test)

In [31]:
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall Score:', recall_score(y_test, y_pred))
print('ROC-AUC-Score:', roc_auc_score(y_test, y_pred))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))

Accuracy:  0.7586408904510837
Precision: 0.7225378787878788
Recall Score: 0.8650793650793651
ROC-AUC-Score: 0.754963924963925
Confusion Matrix: 
 [[532 293]
 [119 763]]
