In [1]:
# Import Libraries
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [32]:
# Load external scripts into the notebook namespace with %run.
# The helper names used in later cells (load_file, ssSplit, tweak_text, TFIDF,
# build_linear_model, stack, concat_examples, keyword_counter, ...) are not
# defined in this notebook, so they presumably come from these scripts.
# NOTE(review): query_features.py is run from the notebook directory while the
# other two scripts live under scripts/ — confirm that path is intentional.
%run scripts/helper.py
%run scripts/model_train_plus_test.py
%run query_features.py

In [5]:
# Load training and test data downloaded from Kaggle.
# NOTE(review): the second argument (None) is handed to load_file unchanged;
# its meaning is defined in the helper scripts — confirm there.
crowd_train = load_file('./data/train.csv/train.csv', None)
crowd_test = load_file('./data/test.csv/test.csv', None)

In [6]:
# Store the response variable: the values of the median_relevance column
# of the training data. Later cells index it with train_index / test_index.
target = crowd_train.median_relevance.values

In [7]:
# Take a stratified split of the dataset to get a sample
# that would have same class frequency as the crowd_train dataset.
# train_size=8000 is the requested size of the training part and
# random_state=44 is fixed so the split can be repeated.
# NOTE(review): presumably test_index holds the remaining rows — confirm in ssSplit.

train_index, test_index = ssSplit(target, train_size=8000, random_state=44)

In [8]:
# Select the rows of crowd_train for each side of the split by position:
# training_data is used to fit the models, testing_data serves as the
# held-out validation sample.
training_data, testing_data = crowd_train.iloc[train_index], crowd_train.iloc[test_index]

In [9]:
# Run both samples through the same tweak_text preprocessing helper
# (training sample first, then the validation sample).
Xt, Xv = tweak_text(training_data), tweak_text(testing_data)

In [12]:
# Labels for the training (yt) and validation (yv) samples, picked with the
# same index arrays that were used to select the rows.
yt, yv = target[train_index], target[test_index]

## 2-fold Stacking

* Split the train set in 2 parts: train_a and train_b
* Fit a first-stage model on train_a and create predictions for train_b
* Fit the same model on train_b and create predictions for train_a
* Finally fit the model on the entire train set and create predictions for the test set.
* Now train a second-stage stacker model on the probabilities from the first-stage model(s).

In [10]:
# Fit the text vectoriser on the processed training text.
# Xfitted is the transformed training matrix; tfv is the fitted transformer,
# reused later through tfv.transform on the validation text.
# NOTE(review): the original note describes this as a bag-of-words model with
# ngram range = (1, 2); that configuration lives inside TFIDF — confirm there.
Xfitted, tfv = TFIDF(Xt, None)

In [83]:
# Number of rows in the training matrix; used to cut it into two halves.
train_data_len = Xfitted.shape[0]

In [84]:
# Split the dataset into two parts (fold a = first half, fold b = the rest).
# Floor division keeps the slice bound an integer: under true division
# (Python 3, or `from __future__ import division`) `train_data_len/2` is a
# float and slicing with it raises TypeError. On Python 2 `//` gives the same
# value as the old integer `/`, so the split point is unchanged there.
train_half = train_data_len // 2

Xfitted_a = Xfitted[:train_half]
Xfitted_b = Xfitted[train_half:]

# Labels are cut at the same position so they stay row-aligned with the features.
ya = yt[:train_half]
yb = yt[train_half:]

In [85]:
# Train a linear model on train_a.
# build_linear_model returns the fitted model and a second object (select_a)
# that is applied to other data through .transform before predicting.
# NOTE(review): select_a is presumably a feature selector — confirm in the helper.
linear_model_a, select_a = build_linear_model(Xfitted_a, ya)

In [86]:
# Transform train_b with the transformer that was fitted alongside
# linear_model_a, so the columns match what that model was trained on.
Xfitted_b_transformed = select_a.transform(Xfitted_b)

In [87]:
# Generate predictions for train_b: probabilities from the model trained on
# train_a, i.e. on rows this model has not been fitted on.
Xfitted_b_preds = linear_model_a.predict_proba(Xfitted_b_transformed)

In [88]:
# Train a linear model on train_b (the mirror image of the train_a model).
# select_b is the companion transformer returned with the model.
linear_model_b, select_b = build_linear_model(Xfitted_b, yb)

In [89]:
# Transform train_a with the transformer that was fitted alongside
# linear_model_b, so the columns match what that model was trained on.
Xfitted_a_transformed = select_b.transform(Xfitted_a)

In [90]:
# Generate predictions for train_a: probabilities from the model trained on
# train_b, i.e. on rows this model has not been fitted on.
Xfitted_a_preds = linear_model_b.predict_proba(Xfitted_a_transformed)

In [91]:
# Combine the predicted probabilities for train_a with the train_a features.
# NOTE(review): stack is a project helper, presumably a column-wise
# concatenation — confirm.
features_a = stack([Xfitted_a_preds, Xfitted_a])

In [92]:
# Combine the predicted probabilities for train_b with the train_b features,
# in the same [predictions, features] order used for train_a.
features_b = stack([Xfitted_b_preds, Xfitted_b])

In [93]:
# Put the two halves back together, a first and then b, which is the order in
# which yt was cut into ya and yb.
# NOTE(review): concat_examples is presumably a row-wise concatenation — confirm.
features_first_stage = concat_examples([features_a, features_b])

In [96]:
# Let's train a linear model on this combined feature set (fold predictions
# plus the original features), using the labels of the full training sample.
linear_model, select = build_linear_model(features_first_stage, yt)

In [99]:
# Generate probabilities for the features.
# NOTE(review): these are in-sample predictions — linear_model was fitted on
# these same rows (features_first_stage, yt) — so they may look better than
# the probabilities the model produces for unseen rows.
features_first_stage_selected = select.transform(features_first_stage)
first_stage_probabilities = linear_model.predict_proba(features_first_stage_selected)

### Train second stage classifier

In [100]:
# Building blocks of the second-stage model; they are chained into a Pipeline
# in a later cell.

# Reduce the stacked feature matrix to 200 components.
# NOTE(review): random_state=None with the randomized algorithm means the
# decomposition may differ between runs — consider fixing a seed.
svd = TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0)

# Centre and scale the reduced components before the SVM.
scl = StandardScaler(copy=True, with_mean=True, with_std=True)

# RBF-kernel support vector classifier; probability=False, so only hard
# class predictions (predict) are available from it.
# NOTE(review): gamma=0.0 is presumably only accepted by older scikit-learn
# releases — confirm against the installed version.
clf = SVC(C=10.0, kernel='rbf', degree=3, 
        gamma=0.0, coef0=0.0, shrinking=True, probability=False, 
        tol=0.001, cache_size=200, class_weight=None, 
        verbose=False, max_iter=-1, random_state=None)

# Extra features computed from the training sample only.
keywords = keyword_counter(training_data)
# Alternative kept for reference (uses the full training file instead):
# keywords = keyword_counter(crowd_train)

In [101]:
# Stack previous stage predictions with features generated for
# second stage model.
# The [probabilities, keywords, text features] order is repeated when the
# validation features are built, so the column layout is the same in both.
features_second_stage = stack([first_stage_probabilities, keywords, Xfitted])

In [102]:
# Shape of the stacked second-stage feature matrix: (rows, columns)
features_second_stage.shape

(8000, 40027)

In [103]:
from sklearn.pipeline import Pipeline

In [104]:
# Chain dimensionality reduction, scaling and the SVM into a single estimator
# so the same fitted steps are applied, in this order, at prediction time.
pipeline = Pipeline([('svd', svd), ('scl', scl), ('clf', clf)])

In [105]:
# Fit the second-stage pipeline on the stacked training features and labels.
pipeline.fit(features_second_stage, yt)

Pipeline(steps=[('svd', TruncatedSVD(algorithm='randomized', n_components=200, n_iter=5,
       random_state=None, tol=0.0)), ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

### Prediction on the validation set

In [106]:
# Vectorise the validation text with the transformer fitted on the training
# text (transform only — nothing is refitted on validation data).
Xv_fitted = tfv.transform(Xv) 

In [109]:
# Length of the processed validation text; used to cut the validation
# matrix into two halves below.
Xv_data_len = len(Xv)

In [116]:
# Split the validation set into two parts, mirroring the training-time split.
# Floor division keeps the slice bound an integer: under true division
# (Python 3, or `from __future__ import division`) `Xv_data_len/2` is a float
# and slicing with it raises TypeError. On Python 2 `//` gives the same value
# as the old integer `/`, so the split point is unchanged there.
val_half = Xv_data_len // 2

Xv_fitted_a = Xv_fitted[:val_half]
Xv_fitted_b = Xv_fitted[val_half:]

# Labels are cut at the same position so they stay row-aligned with the features.
yv_a = yv[:val_half]
yv_b = yv[val_half:]

In [118]:
# Score the second validation half with the model trained on train_a.
# NOTE(review): the stacking outline in the markdown above says the test-set
# predictions should come from a model fitted on the entire train set; here
# each validation half is scored by one of the half-trained models instead —
# confirm this is intended.
Xv_fitted_b_transformed = select_a.transform(Xv_fitted_b)
Xv_fitted_b_preds = linear_model_a.predict_proba(Xv_fitted_b_transformed)

In [119]:
# Score the first validation half with the model trained on train_b
# (same pairing of halves and models as on the training side).
Xv_fitted_a_transformed = select_b.transform(Xv_fitted_a)
Xv_fitted_a_preds = linear_model_b.predict_proba(Xv_fitted_a_transformed)

In [120]:
# Combine each validation half's predicted probabilities with its features,
# in the same [predictions, features] order used for the training halves.
features_test_a = stack([Xv_fitted_a_preds, Xv_fitted_a])
features_test_b = stack([Xv_fitted_b_preds, Xv_fitted_b])

In [121]:
# Rejoin the halves, a first and then b, which is the order in which the
# validation matrix was cut.
features_test = concat_examples([features_test_a, features_test_b])

In [123]:
# Apply the transformer returned together with the combined linear model.
features_test_transformed = select.transform(features_test)

In [124]:
# First-stage probabilities for the validation rows.
first_stage_test_preds = linear_model.predict_proba(features_test_transformed)

### Second stage

In [125]:
# Keyword features for the validation sample, from the same helper that
# produced the training keyword features.
keywords_test = keyword_counter(testing_data)

In [131]:
# Stack the validation inputs in the same [probabilities, keywords, text
# features] order as the training matrix the pipeline was fitted on.
features_second_stage_test = stack([first_stage_test_preds, keywords_test, Xv_fitted])

In [None]:
# Predict relevance for the validation rows with the fitted second-stage
# pipeline. The argument is the stacked validation feature matrix; the
# previous `fea` was an undefined name and raised NameError.
second_stage_preds = pipeline.predict(features_second_stage_test)