In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Execute the project's helper script so the names it defines become available
# in this notebook's namespace.
# NOTE(review): presumably this is where load_file, prepareText,
# getTargetVariable and quadratic_weighted_kappa come from -- later cells call
# them without defining them here; confirm against scripts/helper.py.
%run scripts/helper.py

In [3]:
# Load the train and test CSVs (in that order) through the helper's load_file,
# passing index_col='id' for both.
# NOTE(review): index_col is presumably forwarded to pandas.read_csv -- confirm
# in scripts/helper.py.
crowd_train, crowd_test = (
    load_file(csv_path, index_col='id')
    for csv_path in ('./data/train.csv/train.csv', './data/test.csv/test.csv')
)

In [4]:
# Run the helper's text preparation over both splits, train first and test
# second, so the two inputs to the model are built the same way.
# NOTE(review): the exact output of prepareText (e.g. one string per row) is
# defined in scripts/helper.py -- confirm there.
traindata, testdata = map(prepareText, (crowd_train, crowd_test))

In [5]:
# Target values for the training rows; used as `y` in the cross-validation and
# grid-search cells below.
# NOTE(review): which column getTargetVariable extracts is defined in
# scripts/helper.py -- confirm there.
y = getTargetVariable(crowd_train)

In [6]:
from sklearn.metrics import make_scorer

# Wrap the helper's quadratic weighted kappa metric as a scikit-learn scorer so
# it can be passed as `scoring=` to model-selection utilities. Higher kappa is
# better, hence greater_is_better=True.
kappa_scorer = make_scorer(
    score_func=quadratic_weighted_kappa,
    greater_is_better=True,
)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

# Baseline text classifier: TF-IDF features fed into multinomial naive Bayes.
#
# Vectorizer settings:
#   * word-level unigrams and bigrams, English stop words removed,
#     accents stripped via unicode normalisation;
#   * token_pattern r'\w{1,}' also keeps single-character tokens
#     (scikit-learn's default pattern requires at least two word characters);
#   * terms seen in fewer than 3 documents are dropped (min_df=3);
#   * use_idf / smooth_idf / sublinear_tf are real booleans -- the vectorizer
#     documents them as bool, and recent scikit-learn releases reject integer
#     stand-ins such as 1 when the estimator is fitted.
#
# The steps are given as a list of (name, estimator) pairs, the documented
# form; the step names 'vec' and 'clf' are what the grid-search parameter
# names (vec__*, clf__*) refer to.
pipeline = Pipeline([
    ('vec', TfidfVectorizer(
        min_df=3,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 2),
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
        stop_words='english',
    )),
    ('clf', MultinomialNB()),
])

In [10]:
# cross_val_score moved to sklearn.model_selection in scikit-learn 0.18 and the
# old sklearn.cross_validation module was removed in 0.20; try the current
# location first and fall back so the cell runs on either vintage.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
from scipy.stats import sem

# Baseline: 3-fold cross-validated quadratic weighted kappa of the untuned
# pipeline, one score per fold, evaluated serially (n_jobs=1).
scores = cross_val_score(
    pipeline,
    traindata,
    y,
    cv=3,
    scoring=kappa_scorer,
    n_jobs=1,
)

In [11]:
# Summarise the per-fold kappa scores: (mean, standard error of the mean).
np.mean(scores), sem(scores)

(0.048054550070833958, 0.0036351935006655907)

In [14]:
# GridSearchCV moved to sklearn.model_selection in scikit-learn 0.18 and the
# old sklearn.grid_search module was removed in 0.20; try the current location
# first and fall back so the cell runs on either vintage.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# Search space over the pipeline's steps ('vec' = TF-IDF, 'clf' = naive Bayes).
# 2 * 2 * 2 * 2 * 4 = 64 candidate settings.
parameters = {
    'vec__min_df': [1, 2],
    'vec__max_df': [0.8, 1.0],
    'vec__ngram_range': [(1, 1), (1, 2)],
    'vec__use_idf': [True, False],
    # Smoothing strengths 0.001, 0.01, 0.1 and 1.0.
    'clf__alpha': np.logspace(-3, 0, 4),
}

# cv=3 is spelled out so the fold count matches the baseline cross-validation
# and does not depend on the library default, which differs between
# scikit-learn versions. refit=False: only the scores and the best setting are
# wanted here, not a final model refitted on all of the training data.
gs = GridSearchCV(
    pipeline,
    parameters,
    scoring=kappa_scorer,
    cv=3,
    refit=False,
    verbose=2,
)

# The trailing semicolon suppresses the estimator repr without assigning to
# `_`, which IPython uses for the last cell output.
gs.fit(traindata, y);

Fitting 3 folds for each of 64 candidates, totalling 192 fits
[CV] vec__max_df=0.8, vec__ngram_range=(1, 1), vec__min_df=1, vec__use_idf=True, clf__alpha=0.001 
[CV]  vec__max_df=0.8, vec__ngram_range=(1, 1), vec__min_df=1, vec__use_idf=True, clf__alpha=0.001 -   6.1s
[CV] vec__max_df=0.8, vec__ngram_range=(1, 1), vec__min_df=1, vec__use_idf=True, clf__alpha=0.001 
[CV]  vec__max_df=0.8, vec__ngram_range=(1, 1), vec__min_df=1, vec__use_idf=True, clf__alpha=0.001 -   4.9s
[CV] vec__max_df=0.8, vec__ngram_range=(1, 1), vec__min_df=1, vec__use_idf=True, clf__alpha=0.001 
[CV]  vec__max_df=0.8, vec__ngram_range=(1, 1), vec__min_df=1, vec__use_idf=True, clf__alpha=0.001 -   4.3s
[CV] vec__max_df=0.8, vec__ngram_range=(1, 1), vec__min_df=1, vec__use_idf=False, clf__alpha=0.001 
[CV]  vec__max_df=0.8, vec__ngram_range=(1, 1), vec__min_df=1, vec__use_idf=False, clf__alpha=0.001 -   3.9s
[CV] vec__max_df=0.8, vec__ngram_range=(1, 1), vec__min_df=1, vec__use_idf=False, clf__alpha=0.001 
[CV]  ve

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    6.1s
[Parallel(n_jobs=1)]: Done  41 jobs       | elapsed:  4.5min



[CV] vec__max_df=1.0, vec__ngram_range=(1, 1), vec__min_df=2, vec__use_idf=False, clf__alpha=0.001 
[CV]  vec__max_df=1.0, vec__ngram_range=(1, 1), vec__min_df=2, vec__use_idf=False, clf__alpha=0.001 -   3.8s
[CV] vec__max_df=1.0, vec__ngram_range=(1, 2), vec__min_df=2, vec__use_idf=True, clf__alpha=0.001 
[CV]  vec__max_df=1.0, vec__ngram_range=(1, 2), vec__min_df=2, vec__use_idf=True, clf__alpha=0.001 -   7.8s
[CV] vec__max_df=1.0, vec__ngram_range=(1, 2), vec__min_df=2, vec__use_idf=True, clf__alpha=0.001 
[CV]  vec__max_df=1.0, vec__ngram_range=(1, 2), vec__min_df=2, vec__use_idf=True, clf__alpha=0.001 -   7.3s
[CV] vec__max_df=1.0, vec__ngram_range=(1, 2), vec__min_df=2, vec__use_idf=True, clf__alpha=0.001 
[CV]  vec__max_df=1.0, vec__ngram_range=(1, 2), vec__min_df=2, vec__use_idf=True, clf__alpha=0.001 -   7.8s
[CV] vec__max_df=1.0, vec__ngram_range=(1, 2), vec__min_df=2, vec__use_idf=False, clf__alpha=0.001 
[CV]  vec__max_df=1.0, vec__ngram_range=(1, 2), vec__min_df=2, vec__u

[Parallel(n_jobs=1)]: Done 162 jobs       | elapsed: 27.1min
[Parallel(n_jobs=1)]: Done 192 out of 192 | elapsed: 33.9min finished





In [15]:
# Best mean cross-validated score (quadratic weighted kappa, via kappa_scorer)
# found by the grid search.
gs.best_score_

0.40330414479371296

In [16]:
# Parameter combination that achieved the best cross-validated score in the
# grid search.
gs.best_params_

{'clf__alpha': 0.01,
 'vec__max_df': 0.8,
 'vec__min_df': 2,
 'vec__ngram_range': (1, 2),
 'vec__use_idf': True}