In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lr.text_processing.util import pre_process_nli_df
from lr.text_processing.util import get_corpus
from lr.training.util import get_ternary_label, filter_df_by_label
from lr.text_processing.transformations.wordnet import path_base_transformation
from lr.text_processing.transformations.wordnet import path_base_transformation_p
from lr.text_processing.transformations.wordnet import path_base_transformation_h
from lr.training.language_representation import Tfidf
from lr.training.util import get_ternary_label
from lr.models.logistic_regression import LRWrapper
from lr.stats.h_testing import LIMts_test
from IPython.display import display, HTML 
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

### Loading data

In [2]:
train_path = "data/snli/train.csv"
dev_path = "data/snli/dev.csv"

train_path_mod = "data/snli/train_p_h_syn_noun.csv"
dev_path_mod = "data/snli/dev_p_h_syn_noun.csv"

train_trans = lambda df: path_base_transformation(df, train_path_mod)
dev_trans = lambda df: path_base_transformation(df, dev_path_mod)

# train_trans = lambda df: path_base_transformation_p(df, train_path_mod)
# dev_trans = lambda df: path_base_transformation_p(df, dev_path_mod)

# train_trans = lambda df: path_base_transformation_h(df, train_path_mod)
# dev_trans = lambda df: path_base_transformation_h(df, dev_path_mod)





train = pd.read_csv(train_path)
dev = pd.read_csv(dev_path)
train = filter_df_by_label(train.dropna()).reset_index(drop=True)
dev = filter_df_by_label(dev.dropna()).reset_index(drop=True)

train = train.head(10000)
dev = dev.head(1000)
pre_process_nli_df(train)
pre_process_nli_df(dev)
dev_t = dev_trans(dev)


print(train.shape)
print(dev.shape)

(10000, 3)
(1000, 3)


### Params

In [3]:
max_features = 500

param_grid = {"C": np.linspace(0, 3, 500),
              "penalty":["l2"]}


hyperparams = {"RepresentationFunction": Tfidf,
               "cv":5,
               "solver": 'lbfgs',
               "random_state": None,
               "verbose":False,
               "n_jobs":1,
               "n_iter":2,
               "max_features": max_features,
               "label_translation": get_ternary_label,
               "param_grid": param_grid}

## Get Features

In [4]:
repr_ = Tfidf(hyperparams)
train_corpus = get_corpus(train)
repr_.fit(train_corpus) 
X = repr_.transform(train_corpus)
y = get_ternary_label(train)

In [5]:
print(X.shape)
print(y.shape)

(10000, 500)
(10000,)


In [6]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=123)

In [8]:
xg_cl = xgb.XGBClassifier(objective="multi:softprob",
                          n_estimators=15,
                          seed=123)

In [9]:
xg_cl.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=None, n_estimators=15,
              n_jobs=1, nthread=None, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=123, silent=True, subsample=1)

In [10]:
from sklearn.metrics import classification_report


y_pred = xg_cl.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.44      0.53      0.48       665
           0       0.50      0.34      0.41       665
           1       0.45      0.50      0.48       670

    accuracy                           0.46      2000
   macro avg       0.46      0.46      0.46      2000
weighted avg       0.46      0.46      0.46      2000



In [11]:
xg_cl.score(X_test, y_test)

0.459

In [12]:
from sklearn.tree import DecisionTreeClassifier
dt_clf_4 = DecisionTreeClassifier(max_depth=5)
dt_clf_4.fit(X_train, y_train)

# Predict the labels of the test set: y_pred_4
y_pred_4 = dt_clf_4.predict(X_test)

# Compute the accuracy of the predictions: accuracy
accuracy = float(np.sum(y_pred_4==y_test))/y_test.shape[0]
print("accuracy:", accuracy)

accuracy: 0.443


In [13]:
train_dmatrix = xgb.DMatrix(data=X_train, label=y_train)

In [16]:
params = {"objective":"multi:softprob", "max_depth":4, 'num_class':3}

In [17]:
cv_results = xgb.cv(dtrain=train_dmatrix, params=params, nfold=4,
                    num_boost_round=10, metrics="error", as_pandas=True)

XGBoostError: b'[22:18:14] /workspace/src/objective/multiclass_obj.cu:110: SoftmaxMultiClassObj: label must be in [0, num_class).\n\nStack trace returned 10 entries:\n[bt] (0) /usr/local/lib/python3.6/dist-packages/xgboost/./lib/libxgboost.so(dmlc::StackTrace(unsigned long)+0x47) [0x7fe337f8dfc7]\n[bt] (1) /usr/local/lib/python3.6/dist-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x1d) [0x7fe337f8e42d]\n[bt] (2) /usr/local/lib/python3.6/dist-packages/xgboost/./lib/libxgboost.so(xgboost::obj::SoftmaxMultiClassObj::GetGradient(xgboost::HostDeviceVector<float> const&, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*)+0x8db) [0x7fe338175b1b]\n[bt] (3) /usr/local/lib/python3.6/dist-packages/xgboost/./lib/libxgboost.so(xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*)+0x37d) [0x7fe33806723d]\n[bt] (4) /usr/local/lib/python3.6/dist-packages/xgboost/./lib/libxgboost.so(XGBoosterUpdateOneIter+0x35) [0x7fe337f86535]\n[bt] (5) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7fe396a3cdae]\n[bt] (6) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x22f) [0x7fe396a3c71f]\n[bt] (7) /usr/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2b4) [0x7fe396c505c4]\n[bt] (8) /usr/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0x11c33) [0x7fe396c50c33]\n[bt] (9) /usr/bin/python3(_PyObject_FastCallKeywords+0x19c) [0x5aa6ec]\n\n'