In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [37]:
%run scripts/helper.py

### Loading training and test set

In [38]:
crowd_train = load_file('./data/train.csv/train.csv', index_col='id')
crowd_test = load_file('./data/test.csv/test.csv', index_col='id')

In [39]:
crowd_train.fillna('', inplace=True)
crowd_test.fillna('', inplace=True)

### Structure of the training and test set

In [40]:
crowd_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10158 entries, 1 to 32668
Data columns (total 5 columns):
query                  10158 non-null object
product_title          10158 non-null object
product_description    10158 non-null object
median_relevance       10158 non-null int64
relevance_variance     10158 non-null float64
dtypes: float64(1), int64(1), object(3)
memory usage: 357.1+ KB


<b>There are some missing values for product description in the training set.</b>

In [41]:
crowd_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22513 entries, 3 to 32671
Data columns (total 3 columns):
query                  22513 non-null object
product_title          22513 non-null object
product_description    22513 non-null object
dtypes: object(3)
memory usage: 439.7+ KB


<b>There are some missing values for product description in the test set.</b>

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [108]:
tfv = TfidfVectorizer(min_df=3, max_features=None,
                     strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
                     ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,
                     stop_words = 'english')

In [44]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [109]:
# lets create term-frequency and inverse document frequency
traindata = list(crowd_train.apply(lambda x:'%s %s %s' % ((x['query']),(x['product_title']), (x['product_description'])),axis=1))
testdata = list(crowd_test.apply(lambda x:'%s %s %s' % ((x['query']),(x['product_title']), (x['product_description'])),axis=1))

In [110]:
y = crowd_train.median_relevance.values

In [111]:
# fit TFIDF
tfv.fit(traindata)
X = tfv.transform(traindata)
X_test = tfv.transform(testdata)

In [112]:
print X.shape, X_test.shape

(10158, 84419) (22513, 84419)


In [113]:
from sklearn.cross_validation import train_test_split
Xt, Xv, yt, yv = train_test_split(X, y, test_size=0.2, random_state=0)

In [114]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

In [115]:
# initialize SVD
svd = TruncatedSVD(n_components=160)
Xt_svd = svd.fit_transform(Xt)
Xv_svd = svd.transform(Xv)

In [116]:
print 'Shape of training set ', Xt_svd.shape,  ' and test set ', Xv_svd.shape

Shape of training set  (8126, 160)  and test set  (2032, 160)


In [117]:
# initialize the standard scaler
scl = StandardScaler()
Xt_svd = scl.fit_transform(Xt_svd)
Xv_svd = scl.transform(Xv_svd)

In [118]:
from sklearn.svm import SVC
# from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

In [130]:
# model of choice here is
svm_model = SVC(C=10.0, gamma=0.01)
# logreg = LogisticRegression(fit_intercept=True)
gbm = GradientBoostingClassifier(learning_rate=0.05, n_estimators=500, max_depth=8)

In [120]:
from sklearn import decomposition, pipeline, metrics, grid_search

In [121]:
# Kappa Scorer 
kappa_scorer = metrics.make_scorer(quadratic_weighted_kappa, greater_is_better = True)

In [131]:
# train a model
# logreg.fit(Xt, yt)
svm_model.fit(Xt_svd, yt)

SVC(C=5.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.01,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [106]:
# Gradient Boosting Machine Model
# gbm.fit(Xt, yt)

In [132]:
print 'weighted kappa score for svm on  training set is %0.3f' %(quadratic_weighted_kappa(yt, svm_model.predict(Xt_svd)))

weighted kappa score for svm on  training set is 0.811


In [133]:
print 'weighted kappa score for svm on test set is %0.3f' %(quadratic_weighted_kappa(yv, svm_model.predict(Xv_svd)))

weighted kappa score for svm on test set is 0.548


In [28]:
svm_predict = svm_model.predict(Xv)
gbm_predict = gbm.predict(Xv)

In [29]:
print 'weighted kappa score for ensemble on test set is %0.3f' %(quadratic_weighted_kappa(yv, (svm_predict + gbm_predict) / 2))

weighted kappa score for ensemble on test set is 0.573


In [70]:
X = svd.fit_transform(X)
X = scl.fit_transform(X)

In [71]:
svm_model.fit(X, y)

SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
  gamma=0.01, kernel='rbf', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)

In [31]:
X_test = svd.transform(X_test)
X_test = scl.transform(X_test)

In [None]:
preds = svm_model.predict(X_test)

In [33]:
# Create your first submission file
submission = pd.DataFrame({"id": crowd_test.index.values.astype(int), "prediction": preds})
submission.to_csv("./submissions/ensembleSVMAndGBM.csv", index=False)