In [1]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [1]:
# Execute the project's helper and model scripts via IPython's %run magic.
# NOTE(review): names used below without a local definition (load_file, ssSplit,
# prepareText, most_common, the build_*/…_predictions functions) presumably come
# from these two scripts — confirm.
%run scripts/helper.py
%run scripts/models.py

In [3]:
# Read the training CSV; index_col='id' is forwarded to load_file.
crowd_train = load_file(
    './data/train.csv/train.csv',
    index_col='id'
)

In [4]:
# Target vector: the raw values of the median_relevance column.
y = crowd_train['median_relevance'].values

### Stratified shuffle split to get a smaller dataset with similar class frequencies

In [5]:
# Split the labels into index sets for a 6000-row training part and the remainder.
# NOTE(review): no seed is passed here — confirm ssSplit fixes its random state,
# otherwise the split (and every score below) changes from run to run.
train_idx, test_idx = ssSplit(y, train_size=6000)

In [6]:
# Feature frames: rows picked by position for each side of the split.
X_small_train = crowd_train.iloc[train_idx]
X_small_validation = crowd_train.iloc[test_idx]

# Label arrays: the entries of y at the same positions.
y_small_train = y[train_idx]
y_small_validation = y[test_idx]

### SVM model on unprocessed data, analyzing character similarity with a 5-gram model and truncated SVD with 400 components

In [7]:
# Pass both splits through prepareText; the results are the "unprocessed" inputs
# handed to the models in the following cells.
X_small_train_unprocessed = prepareText(X_small_train)
X_small_validation_unprocessed = prepareText(X_small_validation)

### SVM model on unprocessed data, analyzing word similarity with a 2-gram model and truncated SVD with 200 components

In [8]:
# Fit the word-level non-linear model; the three extra return values are kept
# because the prediction helper needs them alongside the model.
model_word, tfv_svm, svd_svm, scl_svm = build_non_linear_model(
    X_small_train_unprocessed,
    y_small_train,
    'word'
)

In [9]:
# Predict on the validation inputs with the fitted model and its companion objects.
non_linear_SVM_word_preds = non_linear_model_predictions(
    model_word,
    tfv_svm,
    svd_svm,
    scl_svm,
    X_small_validation_unprocessed
)

In [10]:
# NOTE: the value reported is the quadratic weighted kappa, although the label says "accuracy".
# Parenthesised so the cell prints the same text on Python 2 and also runs on Python 3.
print('Validation set accuracy %0.4f ' % quadratic_weighted_kappa(y_small_validation, non_linear_SVM_word_preds))

Validation set accuracy 0.5749 


### Linear SVM model with chi2 as selection criteria

In [11]:
# Fit the linear model on the unprocessed text; the two extra return values
# are passed to the prediction helper later.
linear_model, tfv_linear, select_linear = build_linear_model(
    X_small_train_unprocessed,
    y_small_train,
    'word'
)

In [12]:
# Validation predictions from the linear model fitted on unprocessed text.
linear_SVM_preds = linear_model_predictions(
    linear_model,
    tfv_linear,
    select_linear,
    X_small_validation_unprocessed
)

In [13]:
# NOTE: the value reported is the quadratic weighted kappa, although the label says "accuracy".
# Parenthesised so the cell prints the same text on Python 2 and also runs on Python 3.
print('Validation set accuracy %0.4f ' % quadratic_weighted_kappa(y_small_validation, linear_SVM_preds))

Validation set accuracy 0.4107 


### K-nearest-neighbors (KNN) model on unprocessed data

In [14]:
# Fit the KNN model on the unprocessed text; 'distance' and 'word' are passed
# straight through to build_knn_model.
knn_model_word, tfv_knn, svd_knn = build_knn_model(
    X_small_train_unprocessed,
    y_small_train,
    'distance',
    'word'
)

In [15]:
# Validation predictions from the KNN model fitted on unprocessed text.
knn_word_preds = knn_model_predictions(
    knn_model_word,
    tfv_knn,
    svd_knn,
    X_small_validation_unprocessed
)

In [16]:
# NOTE: the value reported is the quadratic weighted kappa, although the label says "accuracy".
# Parenthesised so the cell prints the same text on Python 2 and also runs on Python 3.
print('Validation set accuracy %0.4f ' % quadratic_weighted_kappa(y_small_validation, knn_word_preds))

Validation set accuracy 0.5203 


### Let's see how correlated these predictions are among each other

In [17]:
# Pairwise correlation between the three sets of validation predictions
# (one column per model); the resulting table is the cell's displayed output.
pd.DataFrame(dict(
    non_lin_word=non_linear_SVM_word_preds,
    lin_pred=linear_SVM_preds,
    knn_pred_word=knn_word_preds
)).corr()

Unnamed: 0,knn_pred_word,lin_pred,non_lin_word
knn_pred_word,1.0,0.584736,0.736249
lin_pred,0.584736,1.0,0.560435
non_lin_word,0.736249,0.560435,1.0


### Let's take an ensemble of all three strategies

In [27]:
# Averaging (disabled alternative; the "most common" vote in the next cell is
# what ensemble_unprocessed is actually built from)
# ensemble_unprocessed = (non_linear_SVM_word_preds
#                         + linear_SVM_preds + knn_word_preds) / 3

In [30]:
# Most common: for every validation row, hand the three models' predictions
# to most_common and keep whatever label it returns.
ensemble_unprocessed = [
    most_common([knn_word_preds[i], non_linear_SVM_word_preds[i], linear_SVM_preds[i]])
    for i in range(len(knn_word_preds))
]

In [32]:
# NOTE: the value reported is the quadratic weighted kappa, although the label says "accuracy".
# Parenthesised so the cell prints the same text on Python 2 and also runs on Python 3.
print('Validation set accuracy %0.4f ' % quadratic_weighted_kappa(y_small_validation, ensemble_unprocessed))

Validation set accuracy 0.5524 


### Models applied after processing text

In [33]:
# Pass both splits through tweak_text; the results are the "processed" inputs
# used by every model in the remaining sections.
X_small_train_processed = tweak_text(X_small_train)
X_small_validation_processed = tweak_text(X_small_validation)

In [34]:
# Fit the stopwords-tweak model on the processed text; the three extra return
# values are reused at prediction time.
tweaked_model, tfv_tweaked, svd_tweaked, scl_tweaked = build_stopwords_tweak_model(
    X_small_train_processed,
    y_small_train
)

In [35]:
# Validation predictions for the tweaked model; it goes through the same
# prediction helper as the earlier non-linear model.
tweaked_predictions = non_linear_model_predictions(
    tweaked_model,
    tfv_tweaked,
    svd_tweaked,
    scl_tweaked,
    X_small_validation_processed
)

In [36]:
# NOTE: the value reported is the quadratic weighted kappa, although the label says "accuracy".
# Parenthesised so the cell prints the same text on Python 2 and also runs on Python 3.
print('Validation set accuracy %0.4f ' % quadratic_weighted_kappa(y_small_validation, tweaked_predictions))

Validation set accuracy 0.5683 


### KNN on processed data

In [37]:
knn_model_distance, tfv_knn, svd_knn = build_knn_model(X_small_train_processed, y_small_train, 'distance','word')

In [38]:
knn_predictions_distance = knn_model_predictions(knn_model_distance, tfv_knn, svd_knn, X_small_validation_processed)

In [39]:
# NOTE: the value reported is the quadratic weighted kappa, although the label says "accuracy".
# Parenthesised so the cell prints the same text on Python 2 and also runs on Python 3.
print('Validation set accuracy %0.4f ' % quadratic_weighted_kappa(y_small_validation, knn_predictions_distance))

Validation set accuracy 0.4981 


### Linear model on processed data

In [41]:
linear_model_proc, tfv_linear, select_linear = build_linear_model(X_small_train_processed, y_small_train, 'word')

In [42]:
linear_SVM_preds_proc = linear_model_predictions(linear_model_proc, tfv_linear, select_linear, X_small_validation_processed)

In [43]:
# NOTE: the value reported is the quadratic weighted kappa, although the label says "accuracy".
# Parenthesised so the cell prints the same text on Python 2 and also runs on Python 3.
print('Validation set accuracy %0.4f ' % quadratic_weighted_kappa(y_small_validation, linear_SVM_preds_proc))

Validation set accuracy 0.4114 


In [44]:
# Pairwise correlation between the three processed-text models' validation
# predictions; the resulting table is the cell's displayed output.
pd.DataFrame(dict(
    non_linear_proc=tweaked_predictions,
    knn_distance_proc=knn_predictions_distance,
    linear_svm_proc=linear_SVM_preds_proc
)).corr()

Unnamed: 0,knn_distance_proc,linear_svm_proc,non_linear_proc
knn_distance_proc,1.0,0.533003,0.71791
linear_svm_proc,0.533003,1.0,0.532926
non_linear_proc,0.71791,0.532926,1.0


In [304]:
# Averaging (disabled alternative; the "most common" vote in the next cell is
# what ensemble_processed is actually built from)
# ensemble_processed = (knn_predictions_distance + linear_SVM_preds_proc
#                       + tweaked_predictions) / 3

In [45]:
# Most common: for every validation row, hand the three processed-text models'
# predictions to most_common and keep whatever label it returns.
ensemble_processed = [
    most_common([knn_predictions_distance[i], linear_SVM_preds_proc[i], tweaked_predictions[i]])
    for i in range(len(knn_predictions_distance))
]

In [46]:
# Show the first ten combined labels as a quick sanity check.
ensemble_processed[:10]

[3, 4, 4, 4, 1, 4, 4, 4, 4, 4]

In [47]:
# NOTE: the value reported is the quadratic weighted kappa, although the label says "accuracy".
# Parenthesised so the cell prints the same text on Python 2 and also runs on Python 3.
print('Validation set accuracy %0.4f ' % quadratic_weighted_kappa(y_small_validation, ensemble_processed))

Validation set accuracy 0.5554 


## Ensemble of unprocessed and processed results

In [306]:
# Averaging (disabled alternative to the "most common" vote in the next cell).
# NOTE: ensemble_processed and ensemble_unprocessed are built as plain Python
# lists, so `+` would concatenate them and `/ 2` would fail; both would have to
# be converted to arrays before this line could be re-enabled.
# ensemble_all = (ensemble_processed + ensemble_unprocessed) / 2

In [50]:
# Most common: for every validation row, hand the predictions of all six models
# (three on unprocessed text, three on processed text) to most_common and keep
# the label it returns. Built in one comprehension; no temporary is left behind.
ensemble_all = [
    most_common([
        non_linear_SVM_word_preds[i], linear_SVM_preds[i], knn_word_preds[i],
        tweaked_predictions[i], knn_predictions_distance[i], linear_SVM_preds_proc[i]
    ])
    for i in range(len(knn_word_preds))
]

In [51]:
# Show the first ten combined labels as a quick sanity check.
ensemble_all[:10]

[3, 4, 4, 4, 1, 4, 4, 4, 4, 4]

In [52]:
# NOTE: the value reported is the quadratic weighted kappa, although the label says "accuracy".
# Parenthesised so the cell prints the same text on Python 2 and also runs on Python 3.
print('Validation set accuracy %0.4f ' % quadratic_weighted_kappa(y_small_validation, ensemble_all))

Validation set accuracy 0.5711 
