# Intro to scikit-learn

Notes from a workshop given by Lukas Biewald, CEO of CrowdFlower.

In [None]:
import pandas as pd
import numpy as np

# Read the tweets CSV (relative path, resolved against the working directory)
# into a DataFrame, then preview the first rows as the cell's output.
tweets_csv = 'tweets.csv'
df = pd.read_csv(tweets_csv)
df.head()

In [None]:
# Pull out the two columns used throughout the notebook:
# the label column and the raw tweet text.
target = df['is_there_an_emotion_directed_at_a_brand_or_product']
text = df['tweet_text']

# Show the first eight rows of each Series under its own heading.
for heading, series in (("Tweets:", text), ("Sentiments:", target)):
    print(heading)
    print(series[0:8])


## Dealing with Missing Data

Notice the __`NaN`__ value on line six of the head of the tweets data. That tweet has no text, yet it was still classified as __"No emotion toward brand or product"__; a row with missing text isn't useful for classifying anything. Let's drop those rows from both the text and the labels here:

In [None]:
# Boolean mask of rows whose tweet text is present. The same mask filters both
# Series, so text and labels stay aligned row-for-row.
has_text = text.notnull()
fixed_text = text[has_text]
fixed_target = target[has_text]

<img src= "http://scikit-learn.org/stable/_static/ml_map.png">
Source: http://scikit-learn.org/stable/tutorial/machine_learning_map/



In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the cleaned tweets (fit returns the vectorizer
# itself), then encode every tweet as a row of token counts.
count_vect = CountVectorizer().fit(fixed_text)
counts = count_vect.transform(fixed_text)

# Column index assigned to the token "iphone" (None if it is not in the vocabulary).
iphone_column = count_vect.vocabulary_.get(u'iphone')
print(iphone_column)

# Encoding of a single made-up tweet using the fitted vocabulary.
sample_counts = count_vect.transform(["I love my iphone!!!"])
print(sample_counts)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on every tweet
# (fit returns the classifier itself).
nb = MultinomialNB().fit(counts, fixed_target)

# Predict on the very same rows the model was trained on and count how many
# predictions match the labels.
predictions = nb.predict(counts)
is_correct = predictions == fixed_target
print(is_correct.sum())

In [None]:
# Hold-out evaluation: train on the first 6000 rows, test on the rest.
# NOTE(review): 9092 is a hard-coded end row - presumably the number of tweets
# left after dropping the null ones; confirm against the data.
train_rows = slice(0, 6000)
test_rows = slice(6000, 9092)

nb = MultinomialNB()
nb.fit(counts[train_rows], fixed_target[train_rows])

# Number of held-out tweets whose predicted label matches the true label.
predictions = nb.predict(counts[test_rows])
is_correct = predictions == fixed_target[test_rows]
print(is_correct.sum())

In [None]:
# cross_val_score lives in sklearn.model_selection (scikit-learn >= 0.18).
# The sklearn.cross_validation module used previously was removed in 0.20,
# so importing it fails on current installs.
from sklearn.model_selection import cross_val_score

nb = MultinomialNB()

# 10-fold cross-validation: the classifier is refit on each training split and
# scored on the matching held-out split, giving one score per fold.
scores = cross_val_score(nb, counts, fixed_target, cv=10)
print(scores)
print(scores.mean())

In [None]:
from sklearn.dummy import DummyClassifier
# cross_val_score lives in sklearn.model_selection (scikit-learn >= 0.18).
# The sklearn.cross_validation module used previously was removed in 0.20.
from sklearn.model_selection import cross_val_score

# Baseline that ignores the features and always predicts the most frequent
# training label; any real model should beat these numbers.
nb_dummy = DummyClassifier(strategy='most_frequent')

# Same fixed split as the Naive Bayes hold-out test: train on the first 6000
# rows, evaluate on rows 6000-9091.
nb_dummy.fit(counts[0:6000], fixed_target[0:6000])

predictions = nb_dummy.predict(counts[6000:9092])
print(sum(predictions == fixed_target[6000:9092]))

# 10-fold cross-validated score of the baseline, for comparison with the
# cross-validated classifiers.
scores = cross_val_score(nb_dummy, counts, fixed_target, cv=10)
print(scores)
print(scores.mean())

In [None]:
from sklearn.pipeline import Pipeline

# Chain vectorizing and classification so raw tweet strings can be passed
# straight to fit() and predict().
p = Pipeline([
    ('counts', CountVectorizer()),
    ('multinomialnb', MultinomialNB()),
])
p.fit(fixed_text, fixed_target)

# Classify one made-up tweet end to end.
sample_tweets = ["I love my iphone!"]
print(p.predict(sample_tweets))

In [None]:
# Same pipeline, but the vectorizer now emits single words and two-word
# sequences (ngram_range=(1, 2)).
p = Pipeline([
    ('counts', CountVectorizer(ngram_range=(1, 2))),
    ('multinomialnb', MultinomialNB()),
])
p.fit(fixed_text, fixed_target)

# Inspect the fitted vocabulary: look up one two-word entry (None if absent),
# then report how many entries the vocabulary holds in total.
vocabulary = p.named_steps['counts'].vocabulary_
print(vocabulary.get(u'garage sale'))
print(len(vocabulary))

In [None]:
# Rebuild and refit the unigram + bigram pipeline so this cell runs on its own.
p = Pipeline([
    ('counts', CountVectorizer(ngram_range=(1, 2))),
    ('multinomialnb', MultinomialNB()),
])
p.fit(fixed_text, fixed_target)

# Classify one made-up tweet with the bigram-aware model.
sample_tweets = ["I love my iphone!"]
print(p.predict(sample_tweets))

In [None]:
# cross_val_score lives in sklearn.model_selection (scikit-learn >= 0.18).
# The sklearn.cross_validation module used previously was removed in 0.20.
from sklearn.model_selection import cross_val_score

# Unigram + bigram counts feeding a multinomial Naive Bayes classifier.
p = Pipeline(steps=[('counts', CountVectorizer(ngram_range=(1, 2))),
                ('multinomialnb', MultinomialNB())])

# Leaves `p` fitted on the full data set. The cross-validation below refits the
# pipeline on each training split, so the vocabulary is learned per fold.
p.fit(fixed_text, fixed_target)

# 10-fold cross-validation on the raw text: one score per fold, then the mean.
scores = cross_val_score(p, fixed_text, fixed_target, cv=10)
print(scores)
print(scores.mean())

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# cross_val_score lives in sklearn.model_selection (scikit-learn >= 0.18).
# The sklearn.cross_validation module used previously was removed in 0.20.
from sklearn.model_selection import cross_val_score

# Insert a feature-selection step between vectorizing and classifying: keep
# only the 10000 unigram/bigram count features ranked highest by chi2.
p = Pipeline(steps=[('counts', CountVectorizer(ngram_range=(1, 2))),
                ('feature_selection', SelectKBest(chi2, k=10000)),
                ('multinomialnb', MultinomialNB())])

# Leaves `p` fitted on the full data set. The cross-validation below refits the
# whole pipeline (vocabulary and feature selection included) on each fold.
p.fit(fixed_text, fixed_target)

# 10-fold cross-validation on the raw text: one score per fold, then the mean.
scores = cross_val_score(p, fixed_text, fixed_target, cv=10)
print(scores)
print(scores.mean())

In [None]:
# GridSearchCV lives in sklearn.model_selection (scikit-learn >= 0.18).
# The sklearn.grid_search module used previously was removed in 0.20,
# so importing it fails on current installs.
from sklearn.model_selection import GridSearchCV

# Pipeline whose hyper-parameters are tuned below. Every step is created with
# its default settings; the grid overrides the vectorizer options per candidate.
p = Pipeline(steps=[('counts', CountVectorizer()),
                ('feature_selection', SelectKBest(chi2)),
                ('multinomialnb', MultinomialNB())])

# Keys follow the "<step name>__<parameter>" convention so each value is routed
# to the matching pipeline step.
parameters = {
    'counts__max_df': (0.5, 0.75, 1.0),
    'counts__min_df': (1, 2, 3),
    'counts__ngram_range': ((1,1), (1,2)),
    # The k grid is disabled, so SelectKBest keeps its default k for every
    # candidate. Re-enable the next line to tune it as well.
#    'feature_selection__k': (1000, 10000, 100000)
    }

# Exhaustive search over every parameter combination, each one scored with
# 10-fold cross-validation in a single process (n_jobs=1).
grid_search = GridSearchCV(p, parameters, n_jobs=1, verbose=1, cv=10)

grid_search.fit(fixed_text, fixed_target)

# Report the best cross-validated score and the grid values that produced it.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))