In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lr.text_processing.util import pre_process_nli_df
from lr.text_processing.util import get_corpus
from lr.training.util import get_ternary_label, filter_df_by_label
from lr.training.language_representation import Tfidf
from lr.training.util import get_ternary_label
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

### Loading data

In [3]:
train_path = "data/snli/train.csv"
dev_path = "data/snli/dev.csv"

train = pd.read_csv(train_path)
dev = pd.read_csv(dev_path)
train = filter_df_by_label(train.dropna()).reset_index(drop=True)
dev = filter_df_by_label(dev.dropna()).reset_index(drop=True)

train = train.head(10000)
dev = dev.head(1000)
pre_process_nli_df(train)
pre_process_nli_df(dev)


print(train.shape)
print(dev.shape)

(10000, 3)
(1000, 3)


### Params

In [4]:
param_grid = {'n_estimators': range(10,30, 5),
                  'max_depth': range(2,31),
                  "reg_alpha":np.arange(0.05, 1.05, 0.05),
                  "reg_gamma":np.arange(0.05, 1.05, 0.05),
                  "learning_rate":np.arange(0.05, 1.05, 0.05),
                  "subsample":np.arange(0.05, 1.05, 0.05),
                  "colsample_bytree":np.arange(0.05, 1.05, 0.05)}


hyperparams = {"RepresentationFunction": Tfidf,
               "cv":3,
               "random_state": 123,
               "verbose":1,
               "n_jobs":1,
               "n_iter":2,
               "max_features": None,
               "label_translation": get_ternary_label,
               "param_grid": param_grid}

## Get Features

In [6]:
repr_ = Tfidf(hyperparams)
train_corpus = get_corpus(train)
repr_.fit(train_corpus)
X = repr_.transform(train_corpus)
y = get_ternary_label(train)
print(X.shape)
print(y.shape)

(10000, 5867)
(10000,)


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=123)

In [8]:
xg_cls = xgb.XGBClassifier(objective="multi:softprob",
                           n_estimatores=10,
                           seed=123)

xg_cls.fit(X_train, y_train)
y_pred = xg_cls.predict(X_test)
untuned_acc = np.mean(y_pred == y_test)

In [9]:
gbm = xgb.XGBClassifier(objective="multi:softprob")
params = hyperparams["param_grid"]


randomized_cv = RandomizedSearchCV(param_distributions=hyperparams["param_grid"],
                                   estimator=gbm,
                                   scoring="accuracy",
                                   n_iter=hyperparams["n_iter"],
                                   cv=hyperparams["cv"],
                                   verbose=hyperparams["verbose"],
                                   random_state=hyperparams["random_state"])


# Fit randomized_mse to the data
randomized_cv.fit(X,y)
y_pred = randomized_cv.predict(X_test)
tuned_acc = np.mean(y_pred == y_test)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   12.2s finished


In [10]:
print(randomized_cv.best_score_)
print(randomized_cv.best_params_)

0.5147999503009689
{'subsample': 0.9500000000000001, 'reg_gamma': 0.2, 'reg_alpha': 0.2, 'n_estimators': 20, 'max_depth': 9, 'learning_rate': 0.4, 'colsample_bytree': 0.45}


In [11]:
ratio = untuned_acc / tuned_acc
print("untuned acc: {:.3f}".format(untuned_acc))
print("tuned acc: {:.3f}".format(tuned_acc))
print("tunning gain = {:.2%}".format(ratio))

untuned acc: 0.499
tuned acc: 0.711
tunning gain = 70.25%
