In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Execute the project helper script inside this notebook's namespace.
# NOTE(review): `load_file` and `quadratic_weighted_kappa` are called below but
# never imported in this notebook, so they presumably come from this script - confirm.
%run scripts/helper.py

In [3]:
# Read the train and test CSVs through the project helper; `index_col='id'`
# is forwarded so that (presumably) the `id` column becomes the frame index.
crowd_train = load_file(
    './data/train.csv/train.csv',
    index_col='id',
)
crowd_test = load_file(
    './data/test.csv/test.csv',
    index_col='id',
)

In [4]:
# Replace every missing (NaN) entry with an empty string, in place, in both
# frames, so the string formatting below never sees a NaN.
for frame in (crowd_train, crowd_test):
    frame.fillna('', inplace=True, axis=1)

In [5]:
def _join_text_fields(row):
    """Return the row's query, product title and product description joined by single spaces."""
    return '%s %s %s' % (row['query'], row['product_title'], row['product_description'])


# One combined text document per row, for the train and test frames respectively.
traindata = list(crowd_train.apply(_join_text_fields, axis=1))
testdata = list(crowd_test.apply(_join_text_fields, axis=1))

In [6]:
# Training target: the `median_relevance` column as a NumPy array.
y = crowd_train['median_relevance'].values

### Train a support vector machine

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
# Word-level TF-IDF over unigrams and bigrams with English stop words removed.
# The boolean options are passed as real booleans: recent scikit-learn releases
# validate parameter types and reject the integer 1 where a bool is required.
tfv = TfidfVectorizer(
    min_df=1,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',   # any run of one or more word characters is a token
    ngram_range=(1, 2),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,         # use 1 + log(tf) instead of the raw term frequency
    stop_words='english',
)

In [None]:
# Learn the vocabulary and IDF weights from the training documents only, then
# project both document sets into that TF-IDF space.
# NOTE(review): this cell has no execution count while the vectorizer cell above
# was re-run later (In [41]); re-run this cell and everything downstream so the
# recorded outputs reflect the current vectorizer settings.
X = tfv.fit_transform(traindata)
X_test = tfv.transform(testdata)

In [10]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

In [11]:
# Reduce the sparse TF-IDF matrix to 140 dense components.
# TruncatedSVD's default solver is randomized, so the seed is fixed to make the
# features (and every prediction derived from them) reproducible across runs.
svd = TruncatedSVD(n_components=140, random_state=42)
X_svd = svd.fit_transform(X)
# The test set is only transformed with the components learned on the training set.
X_test_svd = svd.transform(X_test)

In [12]:
# Standardise the SVD components using statistics estimated on the training
# set, and apply the same scaling to the test set.
scl = StandardScaler().fit(X_svd)
X_svd_scl = scl.transform(X_svd)
X_test_svd_scl = scl.transform(X_test_svd)

In [13]:
from sklearn.svm import SVC

In [14]:
# RBF-kernel support vector classifier ('rbf' is the default kernel, spelled
# out here for readability).
svc = SVC(
    C=10.0,
    kernel='rbf',
    gamma=0.01,
)

In [15]:
# Fit the SVM on the scaled SVD features of the training set; the fitted
# estimator is the cell's displayed output.
svc.fit(X_svd_scl, y)

SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
  gamma=0.01, kernel='rbf', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)

### Train a Multinomial NB classifier

In [16]:
from sklearn.naive_bayes import MultinomialNB

In [33]:
# Multinomial naive Bayes trained directly on the sparse TF-IDF matrix.
classifier = MultinomialNB(alpha=0.1)
classifier.fit(X, y);  # trailing ';' keeps the cell output empty, as before

In [34]:
# Agreement between the labels and the classifier's predictions on the same
# rows it was fitted on - a training-set score, not a held-out estimate.
train_pred = classifier.predict(X)
quadratic_weighted_kappa(y, train_pred)

0.7186444813827411

### Ensemble their predictions

In [19]:
# SVM predictions for the test set, from the scaled SVD features.
svc_pred = svc.predict(X_test_svd_scl)

In [35]:
# Naive Bayes predictions for the test set, from the TF-IDF matrix.
nb_predict = classifier.predict(X_test)

In [37]:
# Show the first ten predictions of each model side by side.
svc_pred[:10], nb_predict[:10]

(array([4, 3, 3, 2, 4, 4, 4, 4, 4, 2], dtype=int64),
 array([4, 4, 3, 3, 4, 4, 4, 4, 4, 4], dtype=int64))

In [38]:
# Average the two models' integer labels. Explicit floor division keeps the
# result an integer label on every Python version: plain `/` only did that
# under Python 2 and yields floats such as 3.5 under Python 3. Ties between
# adjacent labels therefore resolve to the lower label, matching the recorded
# output below.
ensemble_predict = (svc_pred + nb_predict) // 2

In [39]:
# Show the first ten ensembled predictions.
ensemble_predict[:10]

array([4, 3, 3, 2, 4, 4, 4, 4, 4, 3], dtype=int64)

In [40]:
# Write the ensembled predictions, keyed by the integer test ids taken from the
# test frame's index, to the submission CSV (without the DataFrame index).
test_ids = crowd_test.index.values.astype(int)
submission = pd.DataFrame({"id": test_ids, "prediction": ensemble_predict})
submission.to_csv("./submissions/ensembleNBAndSVCoptimized.csv", index=False)