In [90]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split # function for splitting data to train and test sets

import nltk
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier
from nltk.tokenize import TweetTokenizer
import matplotlib.pyplot as plt
'exec(%matplotlib inline)'
from subprocess import check_output

In [91]:
data = pd.read_csv(os.getcwd()+"/input/Sentiment_GOP.csv", encoding = "ISO-8859-1")
data = data[['text','sentiment']]

# Splitting the dataset into train and test set
train, test = train_test_split(data,test_size = 0.1)
# Removing neutral sentiments
train = train[train.sentiment != "Neutral"]
test = test[test.sentiment != "Neutral"]

train_pos = train[ train['sentiment'] == 'Positive']
train_pos = train_pos['text']
train_neg = train[ train['sentiment'] == 'Negative']
train_neg = train_neg['text']

In [92]:
tweets = []
stopwords_set = set(stopwords.words("english")) #stopwords are words like is, are, that
tknzr = TweetTokenizer()

for index, row in train.iterrows():
    words_filtered = [e.lower() for e in tknzr.tokenize(row.text) if len(e) >= 3] #gets rid of words that are shorter than 3
    words_cleaned = [word for word in words_filtered
        if 'http' not in word
        and not word.startswith('@')
        and not word.startswith('#')
        and word != 'RT']   #gets rid of words with http, @, #, or RT
    words_without_stopwords = [word for word in words_cleaned if not word in stopwords_set]  #gets rid of stopwords
    tweets.append((words_without_stopwords, row.sentiment))
    

test_pos = test[ test['sentiment'] == 'Positive']
test_pos = test_pos['text']
test_neg = test[ test['sentiment'] == 'Negative']
test_neg = test_neg['text']
test_set = test['text']
test_ans = test['sentiment']

In [93]:
# Extracting word features

#returns list of all words
def get_words_in_tweets(tweets):
    all = []
    for (words, sentiment) in tweets:
        all.extend(words)
    return all 

#
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)   
    #FreqDist is a class that is like dictionary with word as key and # of times it appears as value

    features = wordlist.keys()
    return features

w_features = get_word_features(get_words_in_tweets(tweets)) #All unique words in all tweets

def extract_features(document):
    document_words = set(document)  #document was list of words, now list of unique words
    features = {}
    for word in w_features:
        features['contains(%s)' % word] = (word in document_words) #adds word from all unique words into features if that word is in document
    return features


In [94]:
# Training the Naive Bayes classifier
training_set = nltk.classify.apply_features(extract_features,tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [95]:
def accuracyScore(correct, total):
    return correct/total      #How likely that u answer right

def sensitivityScore(truePos, totalPos):
    return truePos/totalPos    #How likely detect positive when is positive
    
def specificityScore(trueNeg, totalNeg):
    return trueNeg/totalNeg   #How likely detect negative when is negative
    
def precisionScore(truePos, guessedPos):
    return truePos/guessedPos #How likely is positive when detect positive

def f1Score(sensitivity, precision):
    return 2*sensitivity*precision/(sensitivity+precision)  
    #weighted avg of recall and precision, useful if harm of falsePos and falseNeg differs


In [16]:
trueNeg = 0
truePos = 0
falseNeg = 0
falsePos = 0

#Detecting Negatives
wrongNeg = []
count = 0
for obj in test_neg: 
    processedObj = extract_features(tknzr.tokenize(obj))
    res =  classifier.classify(processedObj)
    if(res == 'Negative'): 
        trueNeg+=1
    else:
        falsePos+=1
        if(count<20):
            count+=1
            wrongNeg.append(processedObj)
        
#Detecting Positives
wrongPos = []
count = 0
for obj in test_pos:
    processedObj = extract_features(tknzr.tokenize(obj))
    res =  classifier.classify(processedObj)
    if(res == 'Positive'): 
        truePos+=1
    else:
        falseNeg+=1
        if(count<20):
            count+=1
            wrongPos.append(processedObj)

In [97]:
trueNeg = 0
truePos = 0
falseNeg = 0
falsePos = 0

wrongNeg = []
wrongPos = []
for obj, ans in zip(test_set, test_ans): 
    res =  classifier.classify(extract_features(tknzr.tokenize(obj)))
    if(ans == 'Negative'):
        if(res == 'Negative'): 
            trueNeg+=1
        else:
            falsePos+=1
            if(len(wrongNeg)<10):
                wrongNeg.append([obj, ans])        
    else:
        if(res == 'Positive'): 
            truePos+=1
        else:
            falseNeg+=1
            if(len(wrongPos)<10):
                wrongPos.append([obj, ans])


In [98]:
print(str(truePos)+" "+ str(falseNeg))       
print(str(falsePos)+ " "+ str(trueNeg))  

accuracy = accuracyScore(truePos+trueNeg, truePos+trueNeg+falsePos+falseNeg)
sensitivity = sensitivityScore(truePos,truePos+falseNeg)
specificity = specificityScore(trueNeg, trueNeg+falsePos)
precision = precisionScore(truePos, truePos+falsePos)
f1 = f1Score(sensitivity, precision)

print(accuracy)
print(sensitivity)
print(specificity)
print(precision)
print(f1)

139 89
36 807
0.8832866479925303
0.6096491228070176
0.9572953736654805
0.7942857142857143
0.6898263027295285


In [99]:
for obj in wrongNeg:
    print(obj)
    print("\n")


RT @larryelder: Trump should have said, "Megyn, ask these nine candidates, if they plan to support ME when I win the nomination."
#GOPDebatÛ_


RT @RWSurferGirl: Ask Trump a legitimate question. Look at Wallace's face when Trump nails it. _Ùà¼_Ùàü #GOPDebate  #GOPDebates


RT @larryelder: Trump should have said, "Megyn, ask these nine candidates, if they plan to support ME when I win the nomination."
#GOPDebatÛ_


Me reading my family's comments about how great the #GOPDebate was http://t.co/gIaGjPygXZ


RT @RWSurferGirl: Ask Trump a legitimate question. Look at Wallace's face when Trump nails it. _Ùà¼_Ùàü #GOPDebate  #GOPDebates


RT @RWSurferGirl: I think Cruz and Trump need to band together and expose this set up job, and get rid of Bush and Rubio, _Ùà¼_Ùàü #GOPDebate  #GÛ_


What I'm reading says Donald Trump was front runner &amp; star of #GOPDebate! Like how is this possible? Wasn't he boo'ed? #WasntAbletoWatch


RT @RWSurferGirl: You would never know @realDonaldTrump  is the 

In [100]:
for obj in wrongPos:
    print(obj)
    print("\n")

RT @ElizabethND04: Wow. Awesome Dr. Carson race answer! @RealBenCarson #GOPDebate #FOXNEWSDEBATE #FoxNews #FoxDebate @FoxNews #FNC  https:/Û_


@realDonaldTrump lives to fight another day. Odds seem to ever be in his favor. #GOPDebate #hungergames #DonaldTrump http://t.co/eG6X6onkKT


@notaxation Thank you for your thoughts on the #GOPDebate. How long would you think some of the round 1 candidates will hang around?


@MikeHuckabeeGOP says he wants 2 legalize prostitution &amp; drugs &amp; tax the hell out of them? That's what I've been saying for yrs. #GOPDebate


I like senator #GOPDebate #Carson2016


'It wasn't too late for Wisconsin, it's not too late for America. Oh, and pass the cheese'. Scott Walker. #GOPDebates


Sen. Marco #Rubio (R-#Florida) is #GOPDebate winner according to @morningmoneyben: http://t.co/YjQ4hEDs4P


After watching the #GOPdebates I've made my decision! I'm voting for @megynkelly OMG! Where has she been all my life ? She is so HOT


RT @ericstonestreet: Trum