In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Execute the helper script and pull its names into the notebook namespace.
# The cells below call load_file, prepareText, getTargetVariable and
# quadratic_weighted_kappa without defining them, so they are presumably
# provided by this script -- confirm against scripts/helper.py.
%run scripts/helper.py

In [3]:
# Read the raw train/test files through the helper loader, indexed by `id`.
train_csv = './data/train.csv/train.csv'
test_csv = './data/test.csv/test.csv'

crowd_train = load_file(train_csv, index_col='id')
crowd_test = load_file(test_csv, index_col='id')

In [4]:
# Turn each raw frame into the text representation produced by the helper
# (train first, then test -- same order as before).
traindata, testdata = prepareText(crowd_train), prepareText(crowd_test)

In [5]:
# Target labels for the training rows, as returned by the helper.
# NOTE(review): presumably the `median_relevance` column -- confirm in helper.py.
y = getTargetVariable(crowd_train)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Bag-of-words counts over word unigrams and bigrams, with English stop words
# removed, accents stripped, and a minimum document frequency of 3.
cv = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    stop_words='english',
    strip_accents='unicode',
    min_df=3,
    max_features=None,
)

In [10]:
# Learn the vocabulary from the training text only, then encode the test text
# with that same vocabulary (the vectorizer is NOT fit on the test data).
# NOTE(review): the fit uses all of `traindata`, i.e. it happens before the
# train/validation split further down, so validation documents contribute to
# the vocabulary and to the min_df filtering.
X = cv.fit_transform(traindata)
X_test = cv.transform(testdata)

In [11]:
from sklearn.metrics import make_scorer

# Wrap the helper's quadratic weighted kappa as a scikit-learn scorer object;
# greater_is_better=True marks higher kappa as the better score.
kappa_scorer = make_scorer(quadratic_weighted_kappa, greater_is_better=True)

In [12]:
from sklearn.cross_validation import train_test_split

In [13]:
# Hold out 20% of the training rows for validation; the fixed random_state
# keeps the split repeatable between runs.
Xt, Xv, yt, yv = train_test_split(X, y, test_size=0.2, random_state=0)

In [14]:
# Sanity check: shapes of the fit / hold-out features and labels.
print('%s %s %s %s' % (Xt.shape, Xv.shape, yt.shape, yv.shape))

(8126, 44460) (2032, 44460) (8126,) (2032,)


In [15]:
from sklearn.decomposition import TruncatedSVD

In [16]:
# Reduce the sparse count matrix to 140 dense components. The projection is
# learned on the fit split only and then applied unchanged to the hold-out split.
# random_state pins TruncatedSVD's randomized solver so the downstream scores
# are repeatable across runs (the train/validation split is already seeded).
# NOTE: Xt/Xv are overwritten in place, so this cell must not be re-run without
# re-running the split first.
svd = TruncatedSVD(n_components=140, random_state=0)
Xt = svd.fit_transform(Xt)
Xv = svd.transform(Xv)

In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
# Standardise the SVD components using statistics from the fit split only,
# then apply the same scaling to both splits.
scl = StandardScaler().fit(Xt)
Xt, Xv = scl.transform(Xt), scl.transform(Xv)

In [19]:
from sklearn.svm import SVC

In [20]:
# Support vector classifier on the scaled SVD features (default kernel).
# The fit call stays last so the cell still displays the fitted estimator.
svc = SVC(
    C=10.0,
    gamma=0.01,
)
svc.fit(Xt, yt)

SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
  gamma=0.01, kernel='rbf', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)

In [21]:
# Quadratic weighted kappa of the model on the rows it was fit on.
fit_split_kappa = quadratic_weighted_kappa(yt, svc.predict(Xt))
print('score on training set %0.3f ' % fit_split_kappa)

score on training set 0.670 


In [22]:
# Quadratic weighted kappa on the 20% hold-out split (labelled "test set" in
# the message; this is not the competition test file).
holdout_kappa = quadratic_weighted_kappa(yv, svc.predict(Xv))
print('score on test set %0.3f ' % holdout_kappa)

score on test set 0.257 


In [23]:
from sklearn.metrics import confusion_matrix

In [24]:
# Confusion matrix on the hold-out split: first argument is the true labels,
# second is the model's predictions. Left as the last expression so the cell
# displays the resulting array.
confusion_matrix(yv, svc.predict(Xv))

array([[  31,   20,    9,  105],
       [  13,   44,   26,  191],
       [  11,   36,   51,  278],
       [  18,   41,   56, 1102]])

In [30]:
# The confusion matrix shows that most misclassified samples are predicted as class 4 (the last column dominates every row).

### Train two separate models and ensemble them if they are found to be uncorrelated

In [31]:
# Mean of the `relevance_variance` column over all training rows; the cells
# that follow use it as the threshold separating the two training subsets.
mean_relevance_variation = crowd_train.relevance_variance.mean()

In [32]:
# Subset 1: rows whose relevance variance is strictly below the mean.
below_mean_mask = crowd_train.relevance_variance < mean_relevance_variation
train_data_1 = crowd_train[below_mean_mask]
train_response_1 = train_data_1['median_relevance'].values

In [33]:
# Subset 2: rows whose relevance variance is at or above the mean.
at_or_above_mean_mask = crowd_train.relevance_variance >= mean_relevance_variation
train_data_2 = crowd_train[at_or_above_mean_mask]
train_response_2 = train_data_2['median_relevance'].values

In [35]:
# Sanity check: frame and label shapes for both subsets.
subset_shapes = (train_data_1.shape, train_response_1.shape,
                 train_data_2.shape, train_response_2.shape)
print('%s %s %s %s' % subset_shapes)

 (4439, 5) (4439,) (5719, 5) (5719,)


In [38]:
# Build the text representation for each subset with the same helper used on
# the full training frame (subset 1 first, then subset 2).
train_data_text_1, train_data_text_2 = prepareText(train_data_1), prepareText(train_data_2)

In [39]:
# Number of prepared documents in each subset.
print('%s %s' % (len(train_data_text_1), len(train_data_text_2)))

4439 5719


In [57]:
# Positional fit / hold-out split for each subset: everything before the cutoff
# is used for fitting, the remainder for evaluation.
# NOTE(review): rows are split by position without shuffling -- confirm that
# the row order carries no signal.
cutoff_1 = 3000
train_data_train_1, train_data_test_1 = train_data_text_1[:cutoff_1], train_data_text_1[cutoff_1:]
y_train_1, y_test_1 = train_response_1[:cutoff_1], train_response_1[cutoff_1:]

cutoff_2 = 4000
train_data_train_2, train_data_test_2 = train_data_text_2[:cutoff_2], train_data_text_2[cutoff_2:]
y_train_2, y_test_2 = train_response_2[:cutoff_2], train_response_2[cutoff_2:]

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [46]:
# TF-IDF over word unigrams and bigrams, with English stop words removed,
# accents stripped, a minimum document frequency of 3, smoothed IDF and
# sublinear term-frequency scaling.
tfv = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    stop_words='english',
    strip_accents='unicode',
    min_df=3,
    max_features=None,
    use_idf=1,
    smooth_idf=1,
    sublinear_tf=1,
)

In [47]:
# Fit the shared `tfv` vectorizer on subset 1's fitting text and encode that
# subset's hold-out text with the vocabulary/IDF weights just learned.
Xt_1 = tfv.fit_transform(train_data_train_1)
Xv_1 = tfv.transform(train_data_test_1)

In [97]:
# NOTE: this refits the same `tfv` instance on subset 2, replacing whatever
# vocabulary and IDF weights it held before. Xt_2/Xv_2 therefore live in a
# different feature space from Xt_1/Xv_1 and the two are not interchangeable
# as model inputs.
Xt_2 = tfv.fit_transform(train_data_train_2)
Xv_2 = tfv.transform(train_data_test_2)

In [49]:
# Project subset 1's TF-IDF matrix onto 140 components; the projection is
# learned on the fitting rows and reused for the hold-out rows.
# random_state pins the randomized solver so model 1's scores are repeatable.
# NOTE: Xt_1/Xv_1 are overwritten in place, so do not re-run this cell without
# re-running the TF-IDF cell for subset 1 first.
svd1 = TruncatedSVD(n_components=140, random_state=0)

Xt_1 = svd1.fit_transform(Xt_1)
Xv_1 = svd1.transform(Xv_1)

In [98]:
# Project subset 2's TF-IDF matrix onto 140 components; the projection is
# learned on the fitting rows and reused for the hold-out rows.
# random_state pins the randomized solver so model 2's scores are repeatable.
# NOTE: Xt_2/Xv_2 are overwritten in place, so do not re-run this cell without
# re-running the TF-IDF cell for subset 2 first.
svd2 = TruncatedSVD(n_components=140, random_state=0)

Xt_2 = svd2.fit_transform(Xt_2)
Xv_2 = svd2.transform(Xv_2)

In [54]:
# Standardise subset 1's SVD components with statistics from its fitting rows.
# Note that the name `scl` is shared with the other scaling cells.
scl = StandardScaler().fit(Xt_1)
Xt_1, Xv_1 = scl.transform(Xt_1), scl.transform(Xv_1)

In [99]:
# Standardise subset 2's SVD components with statistics from its fitting rows.
# This rebinds `scl`, so any scaler previously stored under that name is lost.
scl = StandardScaler().fit(Xt_2)
Xt_2, Xv_2 = scl.transform(Xt_2), scl.transform(Xv_2)

In [56]:
from sklearn.svm import SVC

In [60]:
# Model 1: support vector classifier fit on subset 1's scaled SVD features.
# The fit call stays last so the cell still displays the fitted estimator.
svc1 = SVC(
    C=10.0,
    gamma=.01,
)
svc1.fit(Xt_1, y_train_1)

SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
  gamma=0.01, kernel='rbf', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)

In [100]:
# Model 2: support vector classifier fit on subset 2's scaled SVD features,
# with a smaller C than model 1.
# The fit call stays last so the cell still displays the fitted estimator.
svc2 = SVC(
    C=6.0,
    gamma=0.01,
)
svc2.fit(Xt_2, y_train_2)

SVC(C=6.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.01,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [62]:
# Kappa of model 1 on the rows it was fit on.
fit_kappa_1 = quadratic_weighted_kappa(y_train_1, svc1.predict(Xt_1))
print('training score on first dataset %0.4f ' % fit_kappa_1)

training score on first dataset 0.9656 


In [101]:
# Kappa of model 2 on the rows it was fit on.
fit_kappa_2 = quadratic_weighted_kappa(y_train_2, svc2.predict(Xt_2))
print('training score on second dataset %0.4f ' % fit_kappa_2)

training score on second dataset 0.8687 


In [78]:
# Kappa of model 1 on subset 1's held-out rows.
holdout_kappa_1 = quadratic_weighted_kappa(y_test_1, svc1.predict(Xv_1))
print('test score on first dataset %0.4f ' % holdout_kappa_1)

test score on first dataset 0.5940 


In [102]:
# Kappa of model 2 on subset 2's held-out rows.
holdout_kappa_2 = quadratic_weighted_kappa(y_test_2, svc2.predict(Xv_2))
print('test score on second dataset %0.4f ' % holdout_kappa_2)

test score on second dataset 0.3243 


In [103]:
ensemble_pred_1 = (svc1.predict(Xv_1) + svc2.predict(Xv_1)) / 2
print 'ensemble prediction on first dataset %0.4f ' %(quadratic_weighted_kappa(y_test_1, ensemble_pred_1))

ensemble prediction on first dataset 0.4050 


In [104]:
ensemble_pred_2 = (svc1.predict(Xv_2) + svc2.predict(Xv_2)) / 2
print 'ensemble prediction on second dataset %0.4f ' %(quadratic_weighted_kappa(y_test_2, ensemble_pred_2))

ensemble prediction on second dataset 0.2573 
