In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
%run scripts/helper.py
%run scripts/features.py

In [3]:
crowd_train = load_file('./data/train.csv/train.csv', index_col='id')
crowd_test = load_file('./data/test.csv/test.csv', index_col='id')

In [4]:
traindata = prepareText(crowd_train)
testdata = prepareText(crowd_test)
y = crowd_train.median_relevance.values

In [5]:
# lets take a look at some sample training data
traindata[0]

'bridal shower decorations Accent Pillow with Heart Design - Red/Black Red satin accent pillow embroidered with a heart in black thread. 8" x 8".'

In [6]:
# lets take a look at sample test data
testdata[0]

'electric griddle Star-Max 48 in Electric Griddle '

In [7]:
from sklearn.metrics import make_scorer

# Weighted kappa scorer
kappa_scorer = make_scorer(quadratic_weighted_kappa, greater_is_better=True)

In [23]:
from sklearn.cross_validation import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(y, 3, train_size=7000, random_state=0)
train_index, test_index = next(iter(sss))

In [24]:
Xt = np.asarray(traindata)[train_index]
yt = np.asarray(y)[train_index]

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import cross_val_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB

In [41]:
countvect_char = TfidfVectorizer(min_df=3, max_features=None, 
            strip_accents='unicode', analyzer='char',
            ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

countvect_word = TfidfVectorizer(min_df=3, max_features=None, 
            strip_accents='unicode', analyzer='word',
            ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

clf1 = MultinomialNB(alpha=.01)
clf2 = SVC(C=10.0)

ft = FeatureStacker([('chars', countvect_char), ('words', countvect_word)])

In [42]:
pipeline = Pipeline([
        ('vect', ft),
        ('classifier', clf1)
    ])

In [22]:
scores = cross_val_score(pipeline, Xt, 
                         yt, cv=2, scoring=kappa_scorer,
                         n_jobs=1)
print scores

[ 0.30866723  0.29436923]


In [43]:
pipeline1 = Pipeline([
        ('vect', ft),
        ('svd', TruncatedSVD(n_components=140)),
        ('scl', StandardScaler()),
        ('classifier', clf2)
    ])

In [51]:
scores = cross_val_score(pipeline1, Xt, 
                         yt, cv=2, scoring=kappa_scorer,
                         n_jobs=1)
print scores

[ 0.35911812  0.38454006]


In [29]:
from sklearn.cross_validation import train_test_split

In [30]:
Xtrain, Xvalidation, ytrain, yvalidation = train_test_split(traindata, y, test_size=0.2, random_state=0)

In [44]:
pipeline.fit(traindata, y)

Pipeline(steps=[('vect', FeatureStacker(transformer_list=[('chars', TfidfVectorizer(analyzer='char', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 2), norm...  vocabulary=None))])), ('classifier', MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True))])

In [45]:
pipeline1.fit(traindata, y)

Pipeline(steps=[('vect', FeatureStacker(transformer_list=[('chars', TfidfVectorizer(analyzer='char', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 2), norm...f', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

In [33]:
first_model_predict = pipeline.predict(Xvalidation)
second_model_predict = pipeline1.predict(Xvalidation)

In [34]:
print 'First model individual score %0.4f ' %(quadratic_weighted_kappa(yvalidation, first_model_predict))

First model individual score 0.4457 


In [35]:
print 'Second model individual score %0.4f ' %(quadratic_weighted_kappa(yvalidation, second_model_predict))

Second model individual score 0.5342 


In [36]:
print 'Average of two models score %0.4f ' %(quadratic_weighted_kappa(yvalidation, (first_model_predict + second_model_predict) / 2))

Average of two models score 0.5573 


In [46]:
# prediction on test data set
pred1 = pipeline.predict(testdata)

In [47]:
pred2 = pipeline1.predict(testdata)

In [48]:
avg_pred = (pred1 + pred2) / 2

In [49]:
# submission
make_submission(crowd_test.index.values.astype(int), avg_pred, 'ensemble1.csv')