In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
from keras.preprocessing.text import text_to_word_sequence

stop_words = pd.read_csv('../data/stopwords.csv')['words'].values

def remove_stop_words(text):
    word_tokens = text_to_word_sequence(text) 
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    return ' '.join(filtered_sentence)

print(stop_words)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


['only' 'y' 'by' 'am' 'most' 'me' 'same' 'these' 'so' 'some' 'why' 'down'
 'had' 'd' 'at' 'having' 'those' 'has' 'few' 'theirs' "you've" 'more' 'i'
 'than' 'through' 'be' 'what' 'where' 'myself' 'which' 'doing' 'ours'
 'will' 'in' 'both' 'do' 'it' 'o' 'on' 'yours' 'once' 'ourselves' 'here'
 'about' "it's" 'my' 'for' 'her' 'then' 'after' "should've" 'from' 'each'
 'when' 'does' 'now' 'off' 'don' 'are' 'we' 'itself' 'should' 'his'
 'between' 'our' 'were' 'under' 'other' 'all' 'she' 'won' 'been' "you're"
 'how' 'did' 'yourself' 'they' 'into' 'there' 've' 'such' 't' 's' 'and'
 'over' 'to' 'just' 'was' 'being' 'because' 'if' 'who' 'further' 'the'
 'any' "that'll" 'themselves' 'as' 'again' "you'd" 'until' 'he' 'him'
 'this' 'or' 'of' 'below' 'an' "she's" 'weren' 'm' 'their' 'ma' 'up' 'll'
 'whom' 'hers' 'can' 'you' 'them' 'very' 'a' 'herself' 'before' 'too'
 'himself' 'during' 're' 'out' 'its' 'above' 'own' 'have' 'while'
 'yourselves' 'that' 'with' "you'll" 'is' 'your']


In [3]:
remove_stop_words("testing to see if this will not works hooray")

'testing see not works hooray'

In [4]:
data = pd.read_csv('../data/twitter-airline-sentiment.csv')

In [5]:
data['text'] = list(map(remove_stop_words, data['text'].values))
data

Unnamed: 0,text,pos,neg
0,said,0.0,0.0
1,plus youve added commercials experience tacky,1.0,0.0
2,didnt today must mean need take another trip,0.0,0.0
3,really aggressive blast obnoxious entertainmen...,0.0,1.0
4,really big bad thing,0.0,1.0
5,seriously would pay 30 flight seats didnt playing,0.0,1.0
6,yes,1.0,0.0
7,really missed prime opportunity men without ha...,0.0,0.0
8,well,1.0,0.0
9,amazing,1.0,0.0


In [6]:
classifier = GaussianNB()
# classifier = RandomForestClassifier()
vectorizer = TfidfVectorizer()
# vectorizer = CountVectorizer()

In [7]:
train_x, test_x, train_y, test_y = train_test_split(
            np.array(data.text), 
            np.array(data.pos),
            test_size=0.2, 
            random_state=3945)

In [8]:
tfidf_train_x = vectorizer.fit_transform(train_x)
classifier.fit(tfidf_train_x.toarray(), train_y)

GaussianNB(priors=None)

In [9]:
tfidf_test_x = vectorizer.transform(test_x)
scores = cross_val_score(classifier, tfidf_test_x.toarray(), test_y, cv=5)
acc = scores.mean()
print("Accuracy: %0.2f percent" % (acc *100))

Accuracy: 75.06 percent


In [10]:
mess = [
    'woah that', 
    'fuck that',
    'i think it was fine but that shit was horrible!',
    'meg how are you?',
    'i really did not like this!',
    'i hated this',
    "that's not really nice",
    "i'm not into you",
    "i want to try things out",
    "i really think this will work out great!",
    "this was great!",
    "this was not great!",
    "i didn't get how it worked? what is wrong?",
    "your service is subpar",
    "service",
    "good",
    "bad",    
    "hate",
    "love",

]
t = vectorizer.transform(list(map(remove_stop_words, mess))).toarray();
output = classifier.predict(t)
proba = classifier.predict_proba(t)

for i ,m in enumerate(mess):
    print(m, ' == ', output[i], ' hmm ', proba[i])

woah that  ==  1.0  hmm  [0. 1.]
fuck that  ==  0.0  hmm  [1. 0.]
i think it was fine but that shit was horrible!  ==  1.0  hmm  [0. 1.]
meg how are you?  ==  1.0  hmm  [0. 1.]
i really did not like this!  ==  1.0  hmm  [0. 1.]
i hated this  ==  1.0  hmm  [0. 1.]
that's not really nice  ==  1.0  hmm  [0. 1.]
i'm not into you  ==  1.0  hmm  [0. 1.]
i want to try things out  ==  1.0  hmm  [0. 1.]
i really think this will work out great!  ==  1.0  hmm  [0. 1.]
this was great!  ==  1.0  hmm  [0. 1.]
this was not great!  ==  1.0  hmm  [0. 1.]
i didn't get how it worked? what is wrong?  ==  1.0  hmm  [0. 1.]
your service is subpar  ==  0.0  hmm  [1. 0.]
service  ==  1.0  hmm  [0. 1.]
good  ==  1.0  hmm  [0. 1.]
bad  ==  1.0  hmm  [0. 1.]
hate  ==  1.0  hmm  [0. 1.]
love  ==  1.0  hmm  [0. 1.]
