In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
import seaborn as sns
import sklearn
import os
import sys

# Notebook-wide pandas display settings.
# Fully-qualified option names are used on purpose: pandas resolves short
# names by pattern matching, and in newer releases 'max_columns' and
# 'precision' also match styler.* options, which raises an OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 100)
# Render every float shown in a frame in fixed-point form with five decimals.
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('display.precision', 5)

In [3]:
# Decision Tree Classifier
from sklearn import datasets
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier

# Pull the bundled iris data and name its feature matrix / label vector.
dataset = datasets.load_iris()
features, expected = dataset.data, dataset.target

# Grow a CART tree with default settings; fit() returns the estimator itself.
model = DecisionTreeClassifier().fit(features, expected)
print(model)

# Predict on the very rows the tree was trained on, so the summaries below
# describe how well the tree reproduces its training data, not how it
# generalises to unseen samples.
predicted = model.predict(features)

# Per-class precision/recall report first, then the confusion matrix.
for summarise in (metrics.classification_report, metrics.confusion_matrix):
    print(summarise(expected, predicted))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best')
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        50
          1       1.00      1.00      1.00        50
          2       1.00      1.00      1.00        50

avg / total       1.00      1.00      1.00       150

[[50  0  0]
 [ 0 50  0]
 [ 0  0 50]]


## Twitter data

In [5]:
# Show what sits in the current working directory, to confirm the CSV name.
os.listdir(os.curdir)

['.ipynb_checkpoints',
 '1377884607_tweet_product_company.csv',
 'Initial Introduction.ipynb',
 'Untitled.ipynb']

In [4]:
# Read the labelled tweets into a DataFrame and preview the first rows.
tweets_csv = '1377884607_tweet_product_company.csv'
twitter_data = pd.read_csv(tweets_csv)
twitter_data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upg...",iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its de...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative emotion
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conference...",Google,Positive emotion


In [5]:
# Split the frame into the label Series and the tweet-text Series.
# Missing entries in either column are replaced by the empty string.
label_column = 'is_there_an_emotion_directed_at_a_brand_or_product'
target = twitter_data[label_column].fillna('')
text = twitter_data['tweet_text'].fillna('')

### Preprocess

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler

# Bag-of-words counts over the tweets, with English stop words dropped.
cv = CountVectorizer(stop_words='english')

# fit_transform learns the vocabulary and builds the count matrix in one
# call; it yields the same matrix as fit(text) followed by transform(text).
counts = cv.fit_transform(text)

# Handy checks while exploring:
#   cv.vocabulary_.get(u'love')            -> feature index of a token
#   cv.transform(["I love my iphone!!!"])  -> count vector for a new tweet
# A custom tokenizer can be supplied via token_pattern, e.g.
#   CountVectorizer(token_pattern=u'(?u)\\b\\w\\w+\\b')

In [7]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

# Multinomial naive Bayes trained on the raw token counts.
# (The variable is called `gnb`, but the model is a MultinomialNB, not a
# Gaussian one.)
gnb = MultinomialNB().fit(counts, target)

# Classify one hand-written tweet, encoded with the fitted vectorizer `cv`.
sample_counts = cv.transform(['I love my iphone'])
gnb.predict(sample_counts)

array(['Positive emotion'], 
      dtype='|S34')

In [8]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Reload the raw tweets so this cell starts from the file on disk.
df = pd.read_csv('1377884607_tweet_product_company.csv')
target = df['is_there_an_emotion_directed_at_a_brand_or_product']
text = df['tweet_text']

# Keep only rows that actually have tweet text. The same mask is applied to
# the labels so text and target stay aligned row for row.
has_text = text.notnull()
fixed_text = text[has_text]
fixed_target = target[has_text]

# Learn the vocabulary and build the document-term count matrix in one pass.
count_vect = CountVectorizer()
counts = count_vect.fit_transform(fixed_text)

nb = MultinomialNB()
nb.fit(counts, fixed_target)

# Number of tweets labelled correctly. The model is scored on the same rows
# it was fitted on, so this is an optimistic (training-set) figure.
predictions = nb.predict(counts)
# print() as a function works on Python 2 and 3; the vectorised .sum()
# replaces the slow builtin sum() over a boolean Series.
print((predictions == fixed_target).sum())

7229


In [9]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# cross_val_score lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Fall back to
# the old location so the cell still runs on legacy installs.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# Reload the raw tweets so this cell starts from the file on disk.
df = pd.read_csv('1377884607_tweet_product_company.csv')
target = df['is_there_an_emotion_directed_at_a_brand_or_product']
text = df['tweet_text']

# Drop rows without tweet text, using one mask for both text and labels.
has_text = text.notnull()
fixed_text = text[has_text]
fixed_target = target[has_text]

# ngram_range=(1, 2) counts single words and pairs of adjacent words.
p = Pipeline(steps=[('counts', CountVectorizer(ngram_range=(1, 2))),
                    ('multinomialnb', MultinomialNB())])

# Fit on the full data so `p` is usable for ad-hoc predictions afterwards;
# cross_val_score below fits its own clones of the pipeline per fold.
p.fit(fixed_text, fixed_target)

# 10-fold cross-validated accuracy: per-fold scores, then their mean.
scores = cross_val_score(p, fixed_text, fixed_target, cv=10)
print(scores)
print(scores.mean())

[ 0.68351648  0.66593407  0.65384615  0.64725275  0.68021978  0.69120879
  0.73267327  0.70517052  0.68026461  0.64829107]
0.678837748442


In [10]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# cross_val_score lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Fall back to
# the old location so the cell still runs on legacy installs.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# Unigram + bigram counts, reduced to the 10000 features with the highest
# chi-squared score against the labels, then multinomial naive Bayes.
p = Pipeline(steps=[('counts', CountVectorizer(ngram_range=(1, 2))),
                    ('feature_selection', SelectKBest(chi2, k=10000)),
                    ('multinomialnb', MultinomialNB())])

# Fit on the full data so `p` is usable for ad-hoc predictions afterwards;
# cross_val_score below fits its own clones of the pipeline per fold.
p.fit(fixed_text, fixed_target)

# 10-fold cross-validated accuracy: per-fold scores, then their mean.
scores = cross_val_score(p, fixed_text, fixed_target, cv=10)
print(scores)
print(scores.mean())

[ 0.67032967  0.66813187  0.62087912  0.64285714  0.64945055  0.67912088
  0.67876788  0.6809681   0.66041896  0.63947078]
0.659039495078


In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# cross_val_score lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Fall back to
# the old location so the cell still runs on legacy installs.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# Same pipeline as the bigram experiment, but counting word n-grams of
# length 1 through 4 before keeping the 10000 best chi-squared features.
p = Pipeline(steps=[('counts', CountVectorizer(ngram_range=(1, 4))),
                    ('feature_selection', SelectKBest(chi2, k=10000)),
                    ('multinomialnb', MultinomialNB())])

# Fit on the full data so `p` is usable for ad-hoc predictions afterwards;
# cross_val_score below fits its own clones of the pipeline per fold.
p.fit(fixed_text, fixed_target)

# 10-fold cross-validated accuracy: per-fold scores, then their mean.
scores = cross_val_score(p, fixed_text, fixed_target, cv=10)
print(scores)
print(scores.mean())

[ 0.65824176  0.63296703  0.5956044   0.61868132  0.64505495  0.62637363
  0.65346535  0.65126513  0.6615215   0.62844542]
0.63716204739


In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18; the
# old sklearn.grid_search module was removed in 0.20. Fall back to the old
# location so the cell still runs on legacy installs.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV


# Counts -> chi-squared feature selection -> multinomial naive Bayes.
# NOTE(review): SelectKBest is created without k, so it runs at the library
# default (10 per the scikit-learn docs) while the k grid below stays
# commented out -- confirm that such a small feature set is intended.
p = Pipeline(steps=[('counts', CountVectorizer()),
                    ('feature_selection', SelectKBest(chi2)),
                    ('multinomialnb', MultinomialNB())])

# Search space: keys are '<step name>__<parameter>' of the pipeline above.
parameters = {
    'counts__max_df': (0.5, 0.75, 1.0),
    'counts__min_df': (1, 2, 3),
    'counts__ngram_range': ((1,1), (1,2)),
#    'feature_selection__k': (1000, 10000, 100000)
    }

# Exhaustive search over every combination, each scored with 10-fold CV.
grid_search = GridSearchCV(p, parameters, n_jobs=1, verbose=1, cv=10)
grid_search.fit(fixed_text, fixed_target)

# Report the best CV score and the winning value of each searched parameter.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed: 92.0min
