# Natural Language Processing: Twitter Sentiment Analysis

## Import Libraries and Twitter Data

In [337]:
import numpy as np
import pandas as pd
import json

In [338]:
with open('output.txt') as f:
    lines = f.readlines()

In [339]:
json_data = []
for line in lines:
    json_data.append(json.loads(line))

In [340]:
tweet_data = []
for tweet in json_data:
    tweet_data.append(tweet['text'])

## Data Cleaning

In [342]:
labels = []

def define_sentiment(text):
    if ':(' in text and ':)' in text:
        return 1 #Postive
    elif ':(' in text:
        return 0 #Negative
    elif ':)' in text:
        return 1 #Positive
    else:
        return 0 #Negative

In [343]:
for tweet in tweet_data:
    labels.append(define_sentiment(tweet))

In [388]:
import nltk 
stopwords = nltk.corpus.stopwords.words('english')
import string
punctuations = string.punctuation
stopwords.extend(['RT'])

import re
pattern = '[^A-Za-z0-9]+'
from nltk import word_tokenize

def remove_stopwords(text):
    return [word for word in text.split() if word not in stopwords]

def remove_punc(text):
    return " ".join([word for word in text if word not in punctuations])

def convert_lowercase(text):
    return ''.join([word.lower() for word in text])

def preprocess_text(text):
    clean_text = re.sub('@.*?($| )','', text) 
    clean_text = re.sub('https.*?($| )', '', clean_text)
    clean_text = re.sub(pattern, ' ', clean_text)
    clean_text = re.sub("\d+", "", clean_text)
    clean_text = re.sub(r'\b\w{1,2}\b', '', clean_text)
    clean_text = convert_lowercase(clean_text)
    clean_text = remove_stopwords(clean_text)
    clean_text = remove_punc(clean_text)
    return clean_text

In [389]:
tweet_data_clean = []
labels_clean = []

for i in range(0, len(tweet_data)):
    if preprocess_text(tweet_data[i]) == '':
        pass
    else:
        tweet_data_clean.append(preprocess_text(tweet_data[i]))
        labels_clean.append(labels[i])

In [400]:
len(tweet_data_clean)

6825

In [401]:
len(labels_clean)

6825

In [407]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix

tfidf = TfidfVectorizer(smooth_idf=True, use_idf=True)

In [409]:
tfidf_data = tfidf.fit_transform(tweet_data_clean)
tfidf_data = tfidf_data.todense()

In [352]:
print("Number of negative comments:", labels_clean.count(0))
print("Number of positive comments:", labels_clean.count(1))

Number of negative comments: 3372
Number of positive comments: 3374


## Binary Classification Modelling

In [410]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(smooth_idf=True, use_idf=True)
tfidf_data = tfidf.fit_transform(tweet_data_clean)
tfidf_data = tfidf_data.todense()

In [411]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(tfidf_data, labels_clean, train_size=0.75)

### Naive Bayes

In [420]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [403]:
from sklearn.naive_bayes import GaussianNB

In [412]:
nb = GaussianNB()
nb.fit(X_train, y_train)

GaussianNB()

In [416]:
y_pred = nb.predict(X_test)

In [423]:
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))

Accuracy:  0.6760398359695372
Confusion Matrix: 
 [[424 401]
 [152 730]]


### Logistic Regression

In [425]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression()

In [426]:
y_pred = logreg.predict(X_test)

In [427]:
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))

Accuracy:  0.7492677211482133
Confusion Matrix: 
 [[593 232]
 [196 686]]


### Random Forest Classifier

In [428]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [429]:
y_pred = rfc.predict(X_test)

In [430]:
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))

Accuracy:  0.7387229056824839
Confusion Matrix: 
 [[514 311]
 [135 747]]
