In [36]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split # function for splitting data to train and test sets
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, confusion_matrix

import nltk
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier
from nltk.tokenize import TweetTokenizer
import matplotlib.pyplot as plt
'exec(%matplotlib inline)'
from subprocess import check_output

In [37]:
data = pd.read_csv(os.getcwd()+"/input/Sentiment_GOP.csv", encoding = "ISO-8859-1")
data = data[['text','sentiment']]

# Splitting the dataset into train and test set
train, test = train_test_split(data,test_size = 0.1)
# Removing neutral sentiments
train = train[train.sentiment != "Neutral"]
test = test[test.sentiment != "Neutral"]

train_pos = train[ train['sentiment'] == 'Positive']
train_pos = train_pos['text']
train_neg = train[ train['sentiment'] == 'Negative']
train_neg = train_neg['text']

In [38]:
tweets = []
stopwords_set = set(stopwords.words("english")) #stopwords are words like is, are, that
tknzr = TweetTokenizer()

for index, row in train.iterrows():
    words_filtered = [e.lower() for e in tknzr.tokenize(row.text) if len(e) >= 3] #gets rid of words that are shorter than 3
    words_cleaned = [word for word in words_filtered
        if 'http' not in word
        and not word.startswith('@')
        and not word.startswith('#')
        and word != 'RT']   #gets rid of words with http, @, #, or RT
    words_without_stopwords = [word for word in words_cleaned if not word in stopwords_set]  #gets rid of stopwords
    tweets.append((words_without_stopwords, row.sentiment))
    

test_pos = test[ test['sentiment'] == 'Positive']
test_pos = test_pos['text']
test_neg = test[ test['sentiment'] == 'Negative']
test_neg = test_neg['text']
test_set = test['text']
test_ans = test['sentiment']

In [39]:
# Extracting word features

#returns list of all words
def get_words_in_tweets(tweets):
    all = []
    for (words, sentiment) in tweets:
        all.extend(words)
    return all 

#
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)   
    #FreqDist is a class that is like dictionary with word as key and # of times it appears as value

    features = wordlist.keys()
    return features

w_features = get_word_features(get_words_in_tweets(tweets)) #All unique words in all tweets

def extract_features(document):
    document_words = set(document)  #document was list of words, now list of unique words
    features = {}
    for word in w_features:
        features['contains(%s)' % word] = (word in document_words) #adds word from all unique words into features if that word is in document
    return features


In [40]:
# Training the Naive Bayes classifier
training_set = nltk.classify.apply_features(extract_features,tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [41]:
def accuracyScore(correct, total):
    return correct/total      #How likely that u answer right

def sensitivityScore(truePos, totalPos):
    return truePos/totalPos    #How likely detect positive when is positive
    
def specificityScore(trueNeg, totalNeg):
    return trueNeg/totalNeg   #How likely detect negative when is negative
    
def precisionScore(truePos, guessedPos):
    return truePos/guessedPos #How likely is positive when detect positive

def f1Score(sensitivity, precision):
    return 2*sensitivity*precision/(sensitivity+precision)  
    #weighted avg of recall and precision, useful if harm of falsePos and falseNeg differs


In [43]:
test_pred = [classifier.classify(extract_features(tknzr.tokenize(obj))) for obj in test_set]   

In [45]:
cm = confusion_matrix(test_ans, test_pred, labels=["Positive", "Negative"])
tp, fn, fp, tn = cm.ravel()

In [46]:
print(cm)

[[122 114]
 [ 57 743]]


In [47]:
print(tn, fp, fn, tp)

743 57 114 122


In [48]:
print(accuracy_score(test_ans, test_pred))
print(recall_score(test_ans, test_pred, labels=["Positive", "Negative"], pos_label = "Positive"))
print(precision_score(test_ans, test_pred, labels=["Positive", "Negative"], pos_label = "Positive"))
print(f1_score(test_ans, test_pred, labels=["Positive", "Negative"], pos_label = "Positive"))

0.834942084942085
0.5169491525423728
0.6815642458100558
0.5879518072289157


In [49]:
accuracy = accuracyScore(tp+tn, tp+tn+fp+fn)
sensitivity = sensitivityScore(tp,tp+fn)
specificity = specificityScore(tn, tn+fp)
precision = precisionScore(tp, tp+fp)
f1 = f1Score(sensitivity, precision)

print(accuracy)
print(sensitivity)
print(specificity)
print(precision)
print(f1)

0.834942084942085
0.5169491525423728
0.92875
0.6815642458100558
0.5879518072289157
