In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Execute the project helper script inside this notebook's namespace.
# load_file and quadratic_weighted_kappa are used below without being defined
# in any cell, so they presumably come from this script -- TODO confirm.
%run scripts/helper.py

In [3]:
# Load the training and test CSV files, passing 'id' as the index column.
crowd_train, crowd_test = (
    load_file('./data/train.csv/train.csv', index_col='id'),
    load_file('./data/test.csv/test.csv', index_col='id'),
)

In [4]:
# Column names, non-null counts and dtypes of the training frame.
crowd_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10158 entries, 1 to 32668
Data columns (total 5 columns):
query                  10158 non-null object
product_title          10158 non-null object
product_description    7714 non-null object
median_relevance       10158 non-null int64
relevance_variance     10158 non-null float64
dtypes: float64(1), int64(1), object(3)
memory usage: 357.1+ KB


In [5]:
# Column names, non-null counts and dtypes of the test frame.
crowd_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22513 entries, 3 to 32671
Data columns (total 3 columns):
query                  22513 non-null object
product_title          22513 non-null object
product_description    17086 non-null object
dtypes: object(3)
memory usage: 439.7+ KB


In [6]:
# Replace missing values (np.nan) with the empty string, modifying both frames
# in place. According to the info() summaries, product_description is the only
# column with missing entries.
crowd_train.fillna(value='', axis=1, inplace=True)
crowd_test.fillna(value='', axis=1, inplace=True)

In [7]:
# List the column names available in the training frame (Python 2 print statement).
print 'features in the training set ', list(crowd_train.columns)

features in the training set  ['query', 'product_title', 'product_description', 'median_relevance', 'relevance_variance']


### Relevance Variance is the standard deviation of the ratings for a search query

In [8]:
# Summary statistics of the relevance_variance column.
crowd_train['relevance_variance'].describe()

count    10158.000000
mean         0.377863
std          0.389707
min          0.000000
25%          0.000000
50%          0.471000
75%          0.471000
max          1.470000
Name: relevance_variance, dtype: float64

In [9]:
# Count the training examples whose relevance_variance is below the mean
# relevance_variance.
# NOTE: despite its name, mean_relevance holds the mean of relevance_variance.
mean_relevance = crowd_train['relevance_variance'].mean()
examples_less_than_mean_relevance = crowd_train[
    crowd_train['relevance_variance'] < mean_relevance
].shape[0]
examples_less_than_mean_relevance

4439

In [10]:
# Share (in percent) of training examples whose relevance variance is below the
# mean relevance variance. The count is converted to float first so that the
# division is not an integer division.
(float(examples_less_than_mean_relevance) / crowd_train.shape[0]) * 100

43.69954715495176

### Around 44% of training examples have a relevance variance below the mean relevance variance

In [11]:
# Keep only the low-variance examples as the training set: the raters gave
# similar scores to these queries.
train = crowd_train.loc[crowd_train['relevance_variance'] < mean_relevance]

In [12]:
def _join_text_fields(row):
    """Concatenate query, product title and product description of one row,
    separated by single spaces."""
    return '%s %s %s' % (row['query'], row['product_title'], row['product_description'])


# One text document per example, for the training subset and for the test set.
traindata = train.apply(_join_text_fields, axis=1).tolist()
testdata = crowd_test.apply(_join_text_fields, axis=1).tolist()

In [13]:
# Target variable: the median relevance of each selected training example,
# as a NumPy array.
y = train['median_relevance'].values

### Let us convert the text into TF-IDF vectors

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# Word-level TF-IDF over unigrams and bigrams.
#   - min_df=3 drops terms that occur in fewer than three documents
#   - accents are stripped and English stop words removed
#   - token_pattern keeps every run of one or more word characters as a token
#   - sublinear_tf replaces the raw term frequency with 1 + log(tf)
# The on/off options are passed as real booleans (they were the integer 1):
# that is the documented type, and scikit-learn releases that validate
# constructor parameters reject integers for these flags.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)

In [50]:
# Learn the vocabulary and IDF weights from the training text and vectorise it
# in a single pass.
# NOTE(review): the vectoriser is fit before the train/validation split made
# further down, so validation rows also contribute to the vocabulary and IDF
# weights -- confirm this is acceptable.
X = tfv.fit_transform(traindata)
# Vectorise the test text with the vocabulary learned from the training text.
X_test = tfv.transform(testdata)

### Let's first split our dataset for cross-validation

In [51]:
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score

In [52]:
# Hold out 20% of the selected examples for validation; the seed is fixed so
# the split is repeatable.
Xt, Xv, yt, yv = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)

In [53]:
# Shapes of the training/validation feature matrices and target vectors
# (Python 2 print statement).
print Xt.shape, Xv.shape, yt.shape, yv.shape

(3551, 19070) (888, 19070) (3551,) (888,)


### We have amassed 19070 features, which calls for some dimensionality reduction

In [54]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

In [55]:
# Reduce the TF-IDF features to 140 components. The projection is learned on
# the training split only and then applied unchanged to the validation split.
# random_state is pinned because TruncatedSVD's default solver is randomized:
# without a seed the components, and every score computed from them, differ
# slightly from run to run.
svd = TruncatedSVD(n_components=140, random_state=0)
Xt_svd = svd.fit_transform(Xt)
Xv_svd = svd.transform(Xv)

In [56]:
# Scale features: the scaler is fit on the reduced training split, then the
# same scaling is applied to both the training and validation splits.
scl = StandardScaler().fit(Xt_svd)
Xt_svd_scl = scl.transform(Xt_svd)
Xv_svd_scl = scl.transform(Xv_svd)

In [57]:
from sklearn.metrics import make_scorer

In [58]:
# Wrap the quadratic weighted kappa helper as a scikit-learn scorer;
# a larger kappa counts as a better score.
kappa_scorer = make_scorer(
    quadratic_weighted_kappa,
    greater_is_better=True,
)

In [59]:
# cross validation
from sklearn.cross_validation import ShuffleSplit
from sklearn.svm import SVC

In [60]:
# Support vector classifier with fixed C and gamma, evaluated with the kappa
# scorer on three seeded random splits that each hold out 10% of the
# training portion.
svc = SVC(C=6.0, gamma=.01)
cv = ShuffleSplit(
    Xt_svd_scl.shape[0],
    n_iter=3,
    test_size=.1,
    random_state=1724,
)

test_scores = cross_val_score(
    svc, Xt_svd_scl, yt,
    scoring=kappa_scorer, cv=cv, n_jobs=1,
)

In [61]:
# Summarise the spread of the cross-validation kappa scores. The message is
# formatted into a single string first, so the parenthesised print behaves the
# same as the print statement.
print('min score %0.3f, mean score %0.3f and max score %0.3f' % (
    test_scores.min(), test_scores.mean(), test_scores.max()))

min score 0.523, mean score 0.576 and max score 0.650


In [63]:
# Fit the classifier with these parameters on the whole (reduced and scaled)
# training split.
svc.fit(Xt_svd_scl, yt)

SVC(C=6.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.01,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [64]:
# Quadratic weighted kappa on the held-out validation split (Xv / yv), which
# the classifier was not fit on. The printed label says "test score", but this
# is not the competition test set.
print('test score is %0.3f ' % quadratic_weighted_kappa(yv, svc.predict(Xv_svd_scl)))

test score is 0.655 


In [67]:
######## Training model on full dataset #############

## preprocessing
# Re-fit the SVD on the TF-IDF matrix of every selected training example.
X_full = svd.fit_transform(X)
# Build the reduced test matrix from the raw test text instead of from X_test.
# Feeding X_test back into svd.transform made the cell consume its own output:
# a second run passed already-reduced data to the SVD and failed on the
# feature-count mismatch. tfv is the vectoriser fitted earlier and is not
# refit in between, so the first-run result is unchanged.
X_test = svd.transform(tfv.transform(testdata))

In [68]:
## scaling
# Re-fit the scaler on the reduced full training matrix, then apply the same
# scaling to the reduced test matrix. Both names are rebound to the scaled data.
X_full = scl.fit(X_full).transform(X_full)
X_test = scl.transform(X_test)

In [69]:
# Train the final model on every selected training example (X_full, y).
svc.fit(X_full, y)

SVC(C=6.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.01,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [71]:
# Predict labels for the preprocessed test matrix with the final model.
preds = svc.predict(X_test)

In [72]:
# Write the submission file: one row per test id with its predicted label.
test_ids = crowd_test.index.values.astype(int)
submission = pd.DataFrame({"id": test_ids, "prediction": preds})
submission.to_csv("./submissions/onlyConfidentRelevanceExamples.csv", index=False)