In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Execute the helper script inside this notebook's namespace.
# NOTE(review): load_file and quadratic_weighted_kappa are used in later
# cells but never defined in the notebook itself - presumably they are
# provided by this script; verify against scripts/helper.py.
%run scripts/helper.py

In [3]:
# Load the train and test CSV files through the helper loader, passing
# index_col='id' for both; the results are unpacked in (train, test) order.
crowd_train, crowd_test = [
    load_file(csv_path, index_col='id')
    for csv_path in ('./data/train.csv/train.csv', './data/test.csv/test.csv')
]

In [4]:
# Replace missing (NaN) values with the empty string in both frames.
# The filled copy is re-bound to the same name instead of mutating the frame
# with inplace=True, and the axis argument is dropped because it is
# redundant when every missing cell is filled with the same scalar.
crowd_train = crowd_train.fillna('')
crowd_test = crowd_test.fillna('')

In [5]:
def combine_text_fields(frame):
    """Return a list with one '<query> <title> <description>' string per row.

    Reads the 'query', 'product_title' and 'product_description' columns of
    ``frame`` and joins them with single spaces using %s formatting, so any
    non-string value is coerced the same way the formatting operator does.
    The result has one entry per row, in row order.
    """
    rows = zip(frame['query'], frame['product_title'], frame['product_description'])
    return ['%s %s %s' % fields for fields in rows]


# Same text construction for both frames, defined once instead of a
# copy-pasted lambda per frame.
traindata = combine_text_fields(crowd_train)
testdata = combine_text_fields(crowd_test)

In [6]:
# Target values: the median_relevance column of the training frame,
# extracted as an array via .values.
y = crowd_train['median_relevance'].values

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [51]:
# TF-IDF vectoriser over word unigrams and bigrams with English stop words
# removed and accents stripped. Tokens are runs of one or more word
# characters (token_pattern). min_df / max_df bound the document frequency
# of the terms that are kept.
# The three on/off options are passed as real booleans rather than the
# integer 1: they are documented as bool, and True behaves the same as 1
# wherever 1 was accepted.
tfv = TfidfVectorizer(
    min_df=3,
    max_df=0.8,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)

In [52]:
# Fit the vectoriser on the training text and vectorise it in one pass
# (fit_transform) rather than fitting and then transforming the same list,
# which processes traindata twice. The test text is then transformed with
# the same fitted vectoriser.
# NOTE(review): the vectoriser is fitted on all of traindata before the
# train/validation split performed in a later cell, so validation rows
# contribute to the learned vocabulary and IDF weights - confirm this is
# acceptable for the validation scores reported below.
X = tfv.fit_transform(traindata)
X_test = tfv.transform(testdata)

In [53]:
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score

In [54]:
# Hold out 20% of the rows (test_size=0.2) as a validation set; the fixed
# random_state keeps the split repeatable between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
Xt, Xv, yt, yv = split_parts

In [55]:
# Show the shapes of the training / validation features and targets.
# The four shapes are formatted into one string and passed to print(...)
# so the line parses on both Python 2 and Python 3 and emits the same
# space-separated text as the original print statement.
print('%s %s %s %s' % (Xt.shape, Xv.shape, yt.shape, yv.shape))

(8126, 44460) (2032, 44460) (8126,) (2032,)


In [56]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

In [57]:
# Reduce the TF-IDF features to 140 components. The decomposition is fitted
# on the training split only and then applied unchanged to the validation
# split.
# random_state is fixed so the reduced features, and every score computed
# from them, are repeatable from run to run. Without a seed the default
# solver is randomised, so previously printed scores may differ slightly.
svd = TruncatedSVD(n_components=140, random_state=0)
Xt_svd = svd.fit_transform(Xt)
Xv_svd = svd.transform(Xv)

In [58]:
# Standardise the SVD components: the scaler is fitted on the training
# split only, then the same fitted scaler is applied to both splits.
scl = StandardScaler()
scl.fit(Xt_svd)
Xt_svd_scl = scl.transform(Xt_svd)
Xv_svd_scl = scl.transform(Xv_svd)

In [59]:
from sklearn.metrics import make_scorer
# Wrap quadratic_weighted_kappa in a scorer object so it can be passed as
# the `scoring` argument of cross_val_score; higher values count as better.
kappa_scorer = make_scorer(
    quadratic_weighted_kappa,
    greater_is_better=True,
)

In [60]:
# cross validation
from sklearn.cross_validation import ShuffleSplit
from sklearn.svm import SVC

In [61]:
# Two seeded random shuffle splits of the training rows, each holding out
# 10% of them (test_size=.1) for scoring.
n_train_rows = Xt_svd_scl.shape[0]
cv = ShuffleSplit(n_train_rows, n_iter=2, test_size=.1, random_state=1724)

# Support vector classifier evaluated with the kappa scorer on each split.
svc = SVC(C=10.0, gamma=.01)
test_scores = cross_val_score(
    svc, Xt_svd_scl, yt,
    cv=cv, scoring=kappa_scorer, n_jobs=1,
)

In [62]:
# Summarise the cross-validation scores (minimum, mean, maximum).
# print is called with a single pre-formatted string so the line parses on
# both Python 2 and Python 3 and produces identical text.
print('min score %0.3f, mean score %0.3f and max score %0.3f'
      % (test_scores.min(), test_scores.mean(), test_scores.max()))

min score 0.472, mean score 0.501 and max score 0.529


In [63]:
from sklearn.ensemble import RandomForestClassifier

In [64]:
# Random forest with 100 trees, scored with the same splits and kappa scorer
# used for the SVC so the two models are compared on identical folds.
# random_state is fixed so the forest, and therefore the reported scores,
# are repeatable; without a seed each run grows different trees.
rf = RandomForestClassifier(n_estimators=100, random_state=0)
test_scores = cross_val_score(rf, Xt_svd_scl, yt, cv=cv, scoring=kappa_scorer, n_jobs=1)

In [65]:
# Summarise the cross-validation scores (minimum, mean, maximum).
# print is called with a single pre-formatted string so the line parses on
# both Python 2 and Python 3 and produces identical text.
print('min score %0.3f, mean score %0.3f and max score %0.3f'
      % (test_scores.min(), test_scores.mean(), test_scores.max()))

min score 0.218, mean score 0.259 and max score 0.299


In [None]:
# NOTE(review): unexecuted scratch cell. No variable named `crowd` is
# assigned in the cells above (only crowd_train / crowd_test), so unless
# scripts/helper.py defines it this raises NameError on Restart & Run All.
# Confirm the intent and either complete or remove this cell.
crowd