In [None]:
import pandas as pd
import numpy as np


# Load the labelled tweets and pull out the two columns the models use:
# the tweet text (feature) and the brand/product emotion column (target).
df = pd.read_csv('tweets.csv')
target = df['is_there_an_emotion_directed_at_a_brand_or_product']
text = df['tweet_text']

# Build the row mask once and apply it to both series so they stay aligned.
# A row is kept only when it has both a tweet and a label: a missing tweet
# cannot be vectorized, and a missing label gives the classifier nothing to
# learn from.
has_text_and_label = text.notna() & target.notna()
fixed_text = text[has_text_and_label]
fixed_target = target[has_text_and_label]

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

In [15]:
# Pipeline 1: unigram bag-of-words counts feeding a multinomial Naive Bayes
# classifier (no bi-grams).
p1 = Pipeline(steps=[('counts', CountVectorizer()),
                ('multinomialnb', MultinomialNB())])

# Fit on the full dataset so p1 can be used for predictions and vocabulary
# inspection in later cells; cross_val_score below trains its own clones
# and does not depend on this fit.
p1.fit(fixed_text, fixed_target)

# 10-fold cross-validation using the estimator's default score.
# cross_val_score is already imported in the notebook's import cell.
scores = cross_val_score(p1, fixed_text, fixed_target, cv=10)
print(scores)
print(scores.mean())

[ 0.67472527  0.64395604  0.61538462  0.62747253  0.65604396  0.69450549
  0.69856986  0.67656766  0.66593164  0.64939361]
0.660255067391


In [14]:
# Pipeline 2: same model as Pipeline 1, but the vectorizer counts both
# uni-grams and bi-grams (ngram_range=(1, 2)).
p2 = Pipeline(steps=[('counts', CountVectorizer(ngram_range=(1, 2))),
                ('multinomialnb', MultinomialNB())])

# Fit on the full dataset so p2 can be used for predictions and vocabulary
# inspection in later cells; cross_val_score below trains its own clones
# and does not depend on this fit.
p2.fit(fixed_text, fixed_target)

# 10-fold cross-validation using the estimator's default score.
# cross_val_score is already imported in the notebook's import cell.
scores = cross_val_score(p2, fixed_text, fixed_target, cv=10)
print(scores)
print(scores.mean())


[ 0.68351648  0.66593407  0.65384615  0.64725275  0.68021978  0.69120879
  0.73267327  0.70517052  0.68026461  0.64829107]
0.678837748442


In [17]:
# Pipeline 3: same model again, but the vectorizer counts every n-gram from
# uni-grams up to 4-grams (ngram_range=(1, 4)) -- not just bi-grams.
p3 = Pipeline(steps=[('counts', CountVectorizer(ngram_range=(1, 4))),
                ('multinomialnb', MultinomialNB())])

# Fit on the full dataset so p3 can be used for predictions and vocabulary
# inspection in later cells; cross_val_score below trains its own clones
# and does not depend on this fit.
p3.fit(fixed_text, fixed_target)

# 10-fold cross-validation using the estimator's default score.
# cross_val_score is already imported in the notebook's import cell.
scores = cross_val_score(p3, fixed_text, fixed_target, cv=10)
print(scores)
print(scores.mean())


[ 0.67252747  0.66153846  0.65824176  0.65384615  0.67582418  0.66483516
  0.71287129  0.7029703   0.66593164  0.65159868]
0.672018509071


In [8]:
# Sanity check: run the same example tweet through each fitted pipeline and
# print the label each one predicts.
sample_tweet = ["I love my iphone!"]
for fitted_pipeline in (p1, p2, p3):
    print(fitted_pipeline.predict(sample_tweet))

['Positive emotion']
['Positive emotion']


In [18]:
# Report how many distinct terms each pipeline's fitted 'counts' step
# learned. For p2 and p3 these are n-gram terms, not only single words.
for fitted_pipeline in (p1, p2, p3):
    learned_vocabulary = fitted_pipeline.named_steps['counts'].vocabulary_
    print(len(learned_vocabulary))

9706
59614
216515
