In [2]:
import pandas as pd
import numpy as np


#### 1. LOAD THE DATA

In [3]:

df = pd.read_csv('datasets/tweets.csv')

print(df.head())

# df['name of column']
target = df['is_there_an_emotion_directed_at_a_brand_or_product']
text = df['tweet_text']
print("Target")
print(target[0:5])
print("******")
print("Text")
print(text[0:5])

# target =  positive or negative
# 0    Negative emotion
# 1    Positive emotion

# text = tweet text
# 0    .@wesley83 I have a 3G iPhone. After 3 hrs twe...
# 1    @jessedee Know about @fludapp ? Awesome iPad/i...


                                          tweet_text  \
0  .@wesley83 I have a 3G iPhone. After 3 hrs twe...   
1  @jessedee Know about @fludapp ? Awesome iPad/i...   
2  @swonderlin Can not wait for #iPad 2 also. The...   
3  @sxsw I hope this year's festival isn't as cra...   
4  @sxtxstate great stuff on Fri #SXSW: Marissa M...   

  emotion_in_tweet_is_directed_at  \
0                          iPhone   
1              iPad or iPhone App   
2                            iPad   
3              iPad or iPhone App   
4                          Google   

  is_there_an_emotion_directed_at_a_brand_or_product  
0                                   Negative emotion  
1                                   Positive emotion  
2                                   Positive emotion  
3                                   Negative emotion  
4                                   Positive emotion  
Target
0    Negative emotion
1    Positive emotion
2    Positive emotion
3    Negative emotion
4    Positive e

####  2. CLEANING THE DATA

In [4]:

fixed_text = text[pd.notnull(text)]
fixed_target = target[pd.notnull(text)]
#[pd.notnull] = removing fields that are empty(null)


#### 3. FEATURE EXTRACTION

In [46]:

# class sklearn.feature_extraction.text.CountVectorizer
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# "Convert a collection of text documents to a matrix of token counts"
# Structuring the data into an uniform set so algorithm can learn from it

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
count_vect.fit(fixed_text)

vocabs = count_vect.vocabulary_
print(sorted(vocabs.items(), key=lambda x: x[1]))
print("*****************************")
print("3g index:", count_vect.vocabulary_.get(u'3g'))
print("*****************************")
print("I love my iphone!!!:\n",count_vect.transform(["I love my iphone!!!"]))
print("*****************************")
counts = count_vect.transform(fixed_text)
print(counts)

*****************************
3g index: 168
*****************************
I love my iphone!!!:
   (0, 4573)	1
  (0, 5169)	1
  (0, 5699)	1
*****************************
  (0, 168)	1
  (0, 430)	1
  (0, 774)	2
  (0, 2291)	1
  (0, 3981)	1
  (0, 4210)	1
  (0, 4573)	1
  (0, 4610)	1
  (0, 5766)	1
  (0, 6478)	1
  (0, 7232)	1
  (0, 8076)	1
  (0, 8323)	1
  (0, 8702)	1
  (0, 8920)	1
  (0, 9062)	1
  (0, 9303)	1
  (0, 9373)	1
  (1, 313)	1
  (1, 527)	1
  (1, 644)	1
  (1, 677)	1
  (1, 774)	1
  (1, 876)	1
  (1, 2386)	1
  :	:
  (9090, 5801)	1
  (9090, 5967)	1
  (9090, 7903)	1
  (9090, 8323)	1
  (9090, 8562)	1
  (9090, 8578)	1
  (9090, 8602)	1
  (9090, 8616)	1
  (9090, 8666)	1
  (9090, 9158)	1
  (9090, 9357)	1
  (9090, 9371)	1
  (9090, 9402)	1
  (9090, 9623)	1
  (9091, 774)	1
  (9091, 1618)	1
  (9091, 3741)	1
  (9091, 4374)	1
  (9091, 5057)	1
  (9091, 5435)	1
  (9091, 5974)	1
  (9091, 7294)	1
  (9091, 8323)	1
  (9091, 8539)	1
  (9091, 9701)	1


#### 4. CLASSIFICATION

In [47]:
# TWEETS SENTIMENT: classification(postive, negative or neutral) and scale(1-5)
# naïve Bayes classifier

from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

######### Training with 100% of dataset #########
nb.fit(counts, fixed_target)
# .fit = trains algorithm by passing in feature vector and target vector

print(nb.predict(count_vect.transform(["iphone sucks"])))
print(nb.predict(count_vect.transform(["iphone cost too much!!!"])))


print("*********************")
predictions = nb.predict(counts)
correct = sum(predictions == fixed_target)
incorrect = sum(predictions != fixed_target)
accuracy = correct/(correct + incorrect)
print("Accuracy using 100% of dataset:",accuracy*100,"%")


########## Training with 70% of dataset #########

# TEST/SPLIT: 70% for training
nb.fit(counts[0:6000], fixed_target[0:6000])

# 30% for testing
predictions = nb.predict(counts[6000:9092])
correct = sum(predictions == fixed_target[6000:9092])
incorrect = sum(predictions != fixed_target[6000:9092])
accuracy = correct/(correct + incorrect)
print("Accuracy using 70% of dataset:",accuracy*100,"%")

######### Getting Baselines
from sklearn.dummy import DummyClassifier
nb = DummyClassifier(strategy='most_frequent')
nb.fit(counts[0:6000], fixed_target[0:6000])

predictions = nb.predict(counts[6000:9092])
correct = sum(predictions == fixed_target[6000:9092])
incorrect = sum(predictions != fixed_target[6000:9092])
accuracy = correct/(correct + incorrect)
print("Accuracy using dummy classifier(Baselines):",accuracy*100,"%")



['Negative emotion']
['Negative emotion']
*********************
Accuracy using 100% of dataset: 79.5094588649 %
Accuracy using 70% of dataset: 66.3971539457 %
Accuracy using dummy classifier(Baselines): 61.1254851229 %


#### 5. CROSS VALIDATION

In [48]:
from sklearn import cross_validation

nb = MultinomialNB()
scores = cross_validation.cross_val_score(nb, counts, fixed_target, cv=10)
# [cv=10]: 10 folds
print(scores)
print("10 folds Cross Validation Bayes classifier:",scores.mean())

[ 0.65824176  0.63076923  0.60659341  0.60879121  0.64395604  0.68901099
  0.70077008  0.66886689  0.65270121  0.62183021]
10 folds Cross Validation Bayes classifier: 0.648153102333


In [49]:
nb = DummyClassifier(strategy='most_frequent')
scores = cross_validation.cross_val_score(nb, counts, fixed_target, cv=10)
# [cv=10]: 10 folds
print(scores)
print("10 folds Cross Validation Dummy classifier:", scores.mean())

[ 0.59230769  0.59230769  0.59230769  0.59230769  0.59230769  0.59230769
  0.5929593   0.5929593   0.59316428  0.59316428]
10 folds Cross Validation Dummy classifier: 0.592609330138


#### 6. PIPELINING
Pipelining streamlines the training process into a simplified one(Feature selection + classification). When the model is ready, it is used for prediction.

In [50]:
from sklearn.pipeline import Pipeline

p = Pipeline(steps=[('Feature Extraction', CountVectorizer()),
                ('Classificaiton', MultinomialNB())])

p.fit(fixed_text, fixed_target)
print(p.predict(["I love my iphone!"]))

['Positive emotion']


##### ngram
Tweaking algorithm by adding ngram to improve accuracy. Ngram refers to the N of words. In sentence "I love the new iphone", 1-grams means "I", "love", "the", "new", "iphone". 2-grams mean "I love", "love the", "the new", "new iphone". 

In [51]:
v = CountVectorizer(ngram_range=(1, 2))
from pprint import pprint
pprint(v.fit(["I love my iphone"]).vocabulary_)

{'iphone': 0, 'love': 1, 'love my': 2, 'my': 3, 'my iphone': 4}


In [54]:
p = Pipeline(steps=[('Feature Extraction', CountVectorizer(ngram_range=(1, 2))),
                ('Classificaiton', MultinomialNB())])




p.fit(fixed_text, fixed_target)

print(p.named_steps['Feature Extraction'].vocabulary_.get(u'iphone'))


25533


In [53]:
scores = cross_validation.cross_val_score(p, fixed_text, fixed_target, cv=10)
print(scores)
print("10 folds Cross Validation with unigram & bigram:", scores.mean())

[ 0.68351648  0.66593407  0.65384615  0.64725275  0.68021978  0.69120879
  0.73267327  0.70517052  0.68026461  0.64829107]
10 folds Cross Validation with unigram & bigram: 0.678837748442


##### Feature Selection : Having most relevant features will increase the accuracy of prediction. Feature selection also reduces overfitting by removing irrelevant data from the dataset, and reduces the training time due to having less data to train from.
SelectKBest() Selects k highest scoring features

In [65]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

k_score = 20000
p = Pipeline(steps=[('counts', CountVectorizer(ngram_range=(1, 2))),
                ('feature_selection', SelectKBest(chi2, k=k_score)),
                ('multinomialnb', MultinomialNB())])

p.fit(fixed_text, fixed_target)

from sklearn import cross_validation

scores = cross_validation.cross_val_score(p, fixed_text, fixed_target, cv=10)
print(scores)
print("When kscore is %d, the prediciton rate: %r" % (k_score, scores.mean()))

[ 0.67582418  0.66153846  0.65274725  0.66703297  0.68131868  0.68681319
  0.71067107  0.7029703   0.67364939  0.64829107]
When kscore is 20000, the prediciton rate: 0.67608565524761888


##### Grid Search
Automating the fine-tuning process to find the best parameters for training the model

In [66]:
p = Pipeline(steps=[('counts', CountVectorizer()),
                ('feature_selection', SelectKBest(chi2)),
                ('multinomialnb', MultinomialNB())])

from sklearn.grid_search import GridSearchCV

# BELOW ARE Parameters
parameters = {
    'counts__max_df': (0.5, 0.75, 1.0),
    'counts__min_df': (1, 2, 3),
    'counts__ngram_range': ((1,1), (1,2)),
#    'feature_selection__k': (1000, 10000, 100000)
    }

grid_search = GridSearchCV(p, parameters, n_jobs=1, verbose=1, cv=10)

grid_search.fit(fixed_text, fixed_target)

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))



Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:  1.6min finished


Best score: 0.605
Best parameters set:
	counts__max_df: 0.5
	counts__min_df: 3
	counts__ngram_range: (1, 1)
