In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score

In [3]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
 
stop_words = set(stopwords.words('english')) 

def remove_stop_words(text):
    word_tokens = word_tokenize(text) 
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    return ' '.join(filtered_sentence)

[nltk_data] Downloading package stopwords to C:\Users\Mike del
[nltk_data]     Castillo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Mike del
[nltk_data]     Castillo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
print(stop_words)

whitelist = [
    'wouldn',
    "needn't",
    'couldn',
    'didn',
    'aren',
    "isn't", "shan't", "haven't",
    "wouldn't",
    'hasn',
    'shan',
    "mightn't",
    "hadn't",
    'wasn',
    'shouldn',
    "shouldn't",
    'not', "won't"
    "hasn't",
    "aren't",
    'haven',
    'mustn',
    "doesn't",
]

print(whitelist)

print([])

{'for', 'at', 'too', 'most', 'if', 'this', "you're", 'our', 'wouldn', 'only', 'don', 'been', 'y', "needn't", 'once', 'same', 'theirs', 'ain', 'or', 'your', 'all', 'very', 'me', 'needn', 'than', 'how', 'were', 'doing', 'over', 'she', 're', 'couldn', 'doesn', 'didn', 'between', 't', 'is', "it's", 'him', 'which', 'ourselves', 'other', 'why', 'his', 'has', 'aren', 'the', 'its', 'by', 'few', "isn't", "shan't", "haven't", 'there', 'out', "wouldn't", 'have', 'you', 'hasn', 'each', 'shan', 'about', 'should', 'below', 'o', 'after', 'an', 'any', 'ours', 'when', 'themselves', "mightn't", 'itself', 'down', "hadn't", 'shouldn', 'my', 'wasn', 'being', 'herself', 'm', 'no', 'can', 'mightn', "mustn't", 'weren', 'they', 'just', 'where', 'again', 'hadn', 'what', 'yourself', "you've", 'against', 'won', 'until', 'a', 'having', 'further', 'them', "doesn't", 'himself', 'then', 'hers', 'd', 'and', 'isn', 'it', 'had', 'now', 'more', 'whom', 'll', "couldn't", 'myself', 's', 'we', 'because', 'nor', "should've",

In [13]:
remove_stop_words("testing to see if this works hooray")

'testing see works hooray'

In [14]:
data = pd.read_csv('data/twitter-airline-sentiment.csv')

In [15]:
data['text'] = list(map(remove_stop_words, data['text'].values))
data.head()

Unnamed: 0,text,pos,neg
0,said,0.0,0.0
1,plus youve added commercials experience tacky,1.0,0.0
2,didnt today must mean need take another trip,0.0,0.0
3,really aggressive blast obnoxious entertainmen...,0.0,1.0
4,really big bad thing,0.0,1.0


In [16]:
# classifier = GaussianNB()
classifier = RandomForestClassifier()
# vectorizer = TfidfVectorizer()
vectorizer = CountVectorizer()

In [17]:
train_x, test_x, train_y, test_y = train_test_split(
            np.array(data.text), 
            np.array(data.pos),
            test_size=0.25, 
            random_state=2574)

In [18]:
tfidf_train_x = vectorizer.fit_transform(train_x)
classifier.fit(tfidf_train_x.toarray(), train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [19]:
tfidf_test_x = vectorizer.transform(test_x)
scores = cross_val_score(classifier, tfidf_test_x.toarray(), test_y, cv=5)
acc = scores.mean()
print("Previous accuracy: %0.2f percent" % (acc *100))

Previous accuracy: 87.17 percent


In [22]:
mess = [
    'woah that', 
    'fuck that',
    'i think it was fine but that shit was horrible!',
    'meg how are you?',
    'i really did not like this!',
    'i hated this',
    "that's not really nice",
    "i'm not into you",
    "i want to try things out",
    "i really think this will work out great!",
    "this was great!",
    "this was not great!",
    "i didn't get how it worked? what is wrong?",
    "your service is subpar",
    "service",
    "good",
    "bad",    
    "hate",
    "love",

]
t = vectorizer.transform(list(map(remove_stop_words, mess))).toarray();
output = classifier.predict(t)

for i ,m in enumerate(mess):
    print(m, ' == ', output[i])

woah that  ==  0.0
fuck that  ==  0.0
i think it was fine but that shit was horrible!  ==  0.0
meg how are you?  ==  0.0
i really did not like this!  ==  0.0
i hated this  ==  0.0
that's not really nice  ==  1.0
i'm not into you  ==  0.0
i want to try things out  ==  0.0
i really think this will work out great!  ==  0.0
this was great!  ==  1.0
this was not great!  ==  1.0
i didn't get how it worked? what is wrong?  ==  1.0
your service is subpar  ==  0.0
service  ==  0.0
good  ==  0.0
bad  ==  0.0
hate  ==  0.0
love  ==  1.0
