In [1]:
import pandas as pd
import numpy as np
import os
import scipy

from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder, scale, normalize, MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score, auc, roc_curve

In [2]:
BASE_PATH = os.path.join(os.getcwd(), os.pardir)
DATA_PATH = os.path.join(BASE_PATH, 'data')

In [3]:
X_num = scipy.sparse.load_npz(os.path.join(DATA_PATH, 'training_feats_num.npz'))
X_cat = scipy.sparse.load_npz(os.path.join(DATA_PATH, 'training_feats_cat.npz'))

In [4]:
y = pd.read_csv(os.path.join(DATA_PATH, 'training_labels.tsv'), sep='\t', index_col=0, header=None)

In [5]:
# Convert labels to numerical values
encoder = LabelEncoder()
y = encoder.fit_transform(y)

  y = column_or_1d(y, warn=True)


In [6]:
# 2 = medium, 1 = low, 0 = high
y

array([2, 1, 0, ..., 1, 1, 1])

In [7]:
seed = 667

In [8]:
X_test = scipy.sparse.load_npz(os.path.join(DATA_PATH, 'test_feats.npz'))

In [9]:
num_index = X_num.shape[1]
cat_index = X_cat.shape[1]

In [10]:
X = scipy.sparse.load_npz(os.path.join(DATA_PATH, 'training_feats.npz'))

In [11]:
X

<49308x35522 sparse matrix of type '<class 'numpy.float64'>'
	with 2626763 stored elements in Compressed Sparse Row format>

# Stacking
First let's try two different base classifiers trained on different data. One for numerical, and one for categorical. Since our results with logistic regression from milestone 2 were decent, let's try it with two logistic regression methods first. 

We will use a logistic regression model for the meta-classifier

In [12]:
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingCVClassifier, StackingClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline

In [13]:
metrics = ['accuracy', 'neg_log_loss']

In [28]:
pipe1 = make_pipeline(ColumnSelector(cols=(range(0, num_index))),
                      LogisticRegression(max_iter=5000, solver='saga', penalty='none'))
pipe2 = make_pipeline(ColumnSelector(cols=(range(num_index, cat_index))),
                      LogisticRegression(max_iter=5000, solver='saga', penalty='none'))

sclf = StackingCVClassifier(classifiers=[pipe1, pipe2], 
                            meta_classifier=LogisticRegression(max_iter=5000, solver='saga', penalty='none'),
                            use_probas=False,
                            random_state=seed)

prob_scores = cross_validate(sclf, X, y, scoring=metrics, cv=5, return_train_score=True)



In [29]:
prob_scores['test_accuracy'].mean()

0.6945728921401108

In [30]:
prob_scores['test_neg_log_loss'].mean() * -1

0.7795002488221615

In [31]:
prob_scores['train_accuracy'].mean()

0.6945728889906816

In [32]:
prob_scores['train_neg_log_loss'].mean() * -1

0.7328898688840402

In [66]:
naive_sclf = StackingClassifier(classifiers=[pipe1, pipe2], 
                                meta_classifier=LogisticRegression(solver='saga', penalty='none'),
                                use_probas=True)

naive_sclf.fit(X, y)

StackingClassifier(average_probas=False,
                   classifiers=[Pipeline(memory=None,
                                         steps=[('columnselector',
                                                 ColumnSelector(cols=range(0, 12),
                                                                drop_axis=False)),
                                                ('logisticregression',
                                                 LogisticRegression(C=1.0,
                                                                    class_weight=None,
                                                                    dual=False,
                                                                    fit_intercept=True,
                                                                    intercept_scaling=1,
                                                                    l1_ratio=None,
                                                                    max_iter=5000,
                

Now let's try using probabilities as the meta features

In [34]:
pipe1 = make_pipeline(ColumnSelector(cols=(range(0, num_index))),
                      LogisticRegression(max_iter=5000, solver='saga', penalty='none'))
pipe2 = make_pipeline(ColumnSelector(cols=(range(num_index, cat_index))),
                      LogisticRegression(max_iter=5000, solver='saga', penalty='none'))

sclf = StackingCVClassifier(classifiers=[pipe1, pipe2], 
                            meta_classifier=LogisticRegression(max_iter=5000, solver='saga', penalty='l1'),
                            use_probas=True,
                            random_state=seed)

prob_scores = cross_validate(sclf, X, y, scoring=metrics, cv=5, return_train_score=True)



In [35]:
prob_scores['test_accuracy'].mean()

0.7153402606512413

In [36]:
prob_scores['test_neg_log_loss'].mean() * -1

0.6615378082036653

In [37]:
prob_scores['train_accuracy'].mean()

0.8170732890192512

In [38]:
prob_scores['train_neg_log_loss'].mean() * -1

0.4941857597255641

Let's try adding l1 regularization

In [19]:
pipe1 = make_pipeline(ColumnSelector(cols=(range(0, num_index))),
                      LogisticRegression(solver='saga', penalty='l1'))
pipe2 = make_pipeline(ColumnSelector(cols=(range(num_index, cat_index))),
                      LogisticRegression(solver='saga', penalty='l1'))

sclf = StackingCVClassifier(classifiers=[pipe1, pipe2], 
                            meta_classifier=LogisticRegression(solver='saga', penalty='l1'),
                            use_probas=True,
                            random_state=seed)

prob_scores = cross_validate(sclf, X, y, scoring=metrics, cv=5, return_train_score=True)



In [20]:
prob_scores['test_accuracy'].mean()

0.7198628570469621

In [21]:
prob_scores['test_neg_log_loss'].mean() * -1

0.6434958057546839

In [22]:
prob_scores['train_accuracy'].mean()

0.746151738012778

In [23]:
prob_scores['train_neg_log_loss'].mean() * -1

0.5921988670667642

Now let's try adding l2 regularization

In [39]:
pipe1 = make_pipeline(ColumnSelector(cols=(range(0, num_index))),
                      LogisticRegression(max_iter=5000))
pipe2 = make_pipeline(ColumnSelector(cols=(range(num_index, cat_index))),
                      LogisticRegression(max_iter=5000))

sclf = StackingCVClassifier(classifiers=[pipe1, pipe2], 
                            meta_classifier=LogisticRegression(max_iter=5000), 
                            random_state=seed)

scores = cross_validate(sclf, X, y, scoring=metrics, cv=5, return_train_score=True)

In [40]:
scores['test_accuracy'].mean()

0.697026841829888

In [41]:
scores['test_neg_log_loss'].mean() * -1

0.7665314960724455

In [42]:
prob_scores['train_accuracy'].mean()

0.8170732890192512

In [43]:
prob_scores['train_neg_log_loss'].mean() * -1

0.4941857597255641

This is actually better than our baseline logistic regression model, so perhaps we can improve further on this with some optimizations. Now let's see if we get better results for using probabilities in the meta classifier.

In [44]:
pipe1 = make_pipeline(ColumnSelector(cols=(range(0, num_index))),
                      LogisticRegression(max_iter=5000))
pipe2 = make_pipeline(ColumnSelector(cols=(range(num_index, cat_index))),
                      LogisticRegression(max_iter=5000))

sclf = StackingCVClassifier(classifiers=[pipe1, pipe2], 
                            meta_classifier=LogisticRegression(max_iter=5000),
                            use_probas=True,
                            random_state=seed)

prob_scores = cross_validate(sclf, X, y, scoring=metrics, cv=5, return_train_score=True)

In [45]:
prob_scores['test_accuracy'].mean()

0.7254806462022477

In [46]:
prob_scores['test_neg_log_loss'].mean() * -1

0.6229847874937653

In [47]:
prob_scores['train_accuracy'].mean()

0.8132047437638462

In [48]:
prob_scores['train_neg_log_loss'].mean() * -1

0.47514005804015386

# Improvements

## Data Modifications
First let's try doing feature selection on the categorical features

In [14]:
kbest = SelectKBest(mutual_info_classif, k=1000).fit(X_cat, y)
X_cat_new = kbest.transform(X_cat)

In [15]:
support = kbest.get_support()

In [16]:
support = np.nonzero(support)[0].tolist()

In [17]:
new_support = [x + num_index for x in support]

Now let's try training the l2 logistic regression model the feature selected features

In [49]:
pipe1 = make_pipeline(ColumnSelector(cols=(range(0, num_index))),
                      LogisticRegression(max_iter=5000))
pipe2 = make_pipeline(ColumnSelector(cols=(new_support)),
                      LogisticRegression(max_iter=5000))

sclf = StackingCVClassifier(classifiers=[pipe1, pipe2], 
                            meta_classifier=LogisticRegression(max_iter=5000),
                            use_probas=True,
                            random_state=seed)

prob_scores = cross_validate(sclf, X, y, scoring=metrics, cv=5, return_train_score=True)

In [50]:
prob_scores['test_accuracy'].mean()

0.7268597590877421

In [51]:
prob_scores['test_neg_log_loss'].mean() * -1

0.6257758349640075

In [52]:
prob_scores['train_accuracy'].mean()

0.7380648191100505

In [53]:
prob_scores['train_neg_log_loss'].mean() * -1

0.6065777936441414

The log loss has gone down again, now let's try standardizing all the features

In [18]:
from sklearn.preprocessing import RobustScaler

In [19]:
enc = RobustScaler(with_centering=False)
X_new_normalized = enc.fit_transform(X)

In [54]:
pipe1 = make_pipeline(ColumnSelector(cols=(range(0, num_index))),
                      LogisticRegression(max_iter=5000))
pipe2 = make_pipeline(ColumnSelector(cols=(new_support)),
                      LogisticRegression(max_iter=5000))

sclf = StackingCVClassifier(classifiers=[pipe1, pipe2], 
                            meta_classifier=LogisticRegression(max_iter=5000),
                            use_probas=True,
                            random_state=seed)

prob_scores = cross_validate(sclf, X_new_normalized, y, scoring=metrics, cv=5, return_train_score=True)

In [55]:
prob_scores['test_accuracy'].mean()

0.7269611645679446

In [56]:
prob_scores['test_neg_log_loss'].mean() * -1

0.6254488292172622

In [57]:
prob_scores['train_accuracy'].mean()

0.7378062412287074

In [58]:
prob_scores['train_neg_log_loss'].mean() * -1

0.6062556736728788

The log loss goes down slightly. Now we performed grid search to find optimal parameters

## Grid Search

In [None]:
# This was run on a compute cluster 
pipe1 = make_pipeline(ColumnSelector(cols=(range(0, num_index))),
                      LogisticRegression(max_iter=5000, solver='saga'))
pipe2 = make_pipeline(ColumnSelector(cols=(new_support)),
                      LogisticRegression(max_iter=5000, solver='saga'))

params = {'pipeline-1__logisticregression__C': [1e-2, 1e-1, 1, 1e1, 1e2],
          'pipeline-1__logisticregression__penalty': ['l1', 'l2'],
          'pipeline-2__logisticregression__C': [1e-2, 1e-1, 1, 1e1, 1e2],
          'pipeline-2__logisticregression__penalty': ['l1', 'l2'],
          'meta_classifier__C': [1e-2, 1e-1, 1, 1e1, 1e2],
          'meta_classifier__penalty': ['l1', 'l2']}

sclf = StackingCVClassifier(classifiers=[pipe1, pipe2], 
                            meta_classifier=LogisticRegression(max_iter=5000, solver='saga'),
                            use_probas=True,
                            random_state=seed)

grid = GridSearchCV(sclf, params, cv=5, verbose=2, refit='neg_log_loss', scoring=metrics, n_jobs=-1)
grid.fit(X_new_normalized, y)

We found the best parameters to be:

Numerical model: C = 100, l1

Categorical model: C = 10, l2

Meta classifier: C = 100, l1

In [89]:
pipe1 = make_pipeline(ColumnSelector(cols=(range(0, num_index))),
                      LogisticRegression(max_iter=5000, C=100, solver='saga', penalty='l1'))
pipe2 = make_pipeline(ColumnSelector(cols=(new_support)),
                      LogisticRegression(max_iter=5000, C=10, penalty='l2'))

sclf = StackingCVClassifier(classifiers=[pipe1, pipe2], 
                            meta_classifier=LogisticRegression(max_iter=5000, C=100, solver='saga', penalty='l1'),
                            use_probas=True,
                            random_state=seed)

prob_scores = cross_validate(sclf, X_new_normalized, y, scoring=metrics, cv=5, return_train_score=True)

In [90]:
prob_scores['test_accuracy'].mean()

0.7259877126781384

In [91]:
prob_scores['test_neg_log_loss'].mean() * -1

0.6261126349101211

In [92]:
prob_scores['train_accuracy'].mean()

0.7374310493847532

In [93]:
prob_scores['train_neg_log_loss'].mean() * -1

0.6070221036728812

In [94]:
pipe1 = make_pipeline(ColumnSelector(cols=(range(0, num_index))),
                      LogisticRegression(max_iter=5000, C=100, solver='saga', penalty='l1'))
pipe2 = make_pipeline(ColumnSelector(cols=(new_support)),
                      LogisticRegression(max_iter=5000, C=10, penalty='l2'))

sclf = StackingCVClassifier(classifiers=[pipe1, pipe2], 
                            meta_classifier=LogisticRegression(max_iter=5000, C=100, solver='saga', penalty='l1'),
                            use_probas=True,
                            random_state=seed)

opt_sclf.fit(X_new_normalized, y)

StackingClassifier(average_probas=False,
                   classifiers=[Pipeline(memory=None,
                                         steps=[('columnselector',
                                                 ColumnSelector(cols=range(0, 12),
                                                                drop_axis=False)),
                                                ('logisticregression',
                                                 LogisticRegression(C=10,
                                                                    class_weight=None,
                                                                    dual=False,
                                                                    fit_intercept=True,
                                                                    intercept_scaling=1,
                                                                    l1_ratio=None,
                                                                    max_iter=5000,
                 

# Generation of test data
Let's first generate the test results for our unoptimized model

In [68]:
scores_unopt = naive_sclf.predict_proba(X_test)

In [69]:
scores_unopt

array([[0.05372743, 0.6882577 , 0.25801487],
       [0.01973129, 0.89014241, 0.0901263 ],
       [0.01782019, 0.86816979, 0.11401003],
       ...,
       [0.03007987, 0.79178372, 0.17813641],
       [0.46199034, 0.14522478, 0.39278488],
       [0.0357455 , 0.79116652, 0.17308798]])

In [70]:
scores_unopt[:,0]

array([0.05372743, 0.01973129, 0.01782019, ..., 0.03007987, 0.46199034,
       0.0357455 ])

In [63]:
test_ids = pd.read_csv(os.path.join(DATA_PATH, 'test_ids.tsv'), sep='\t', header=None, index_col=None)

In [71]:
test_out_unopt = pd.DataFrame({'listing_id': test_ids[0].values, 'high': scores_unopt[:,0], 'medium': scores_unopt[:,2], 
                               'low': scores_unopt[:,1]})

In [72]:
test_out_unopt.to_csv('stacking_unopt_test_out.tsv', index=False)

Now let's generate it for the optimized model


In [25]:
X_test_new_normalized = enc.transform(X_test)

In [95]:
scores_opt = opt_sclf.predict_proba(X_test_new_normalized)

In [96]:
scores_opt

array([[0.05661287, 0.67820566, 0.26518147],
       [0.02674066, 0.86736963, 0.10588971],
       [0.0303675 , 0.81779823, 0.15183427],
       ...,
       [0.03199778, 0.78380622, 0.184196  ],
       [0.39368618, 0.16587651, 0.44043731],
       [0.04342653, 0.76499966, 0.1915738 ]])

In [97]:
test_out_opt = pd.DataFrame({'listing_id': test_ids[0].values, 'high': scores_opt[:,0], 'medium': scores_opt[:,2], 
                             'low': scores_opt[:,1]})

In [98]:
test_out_opt.to_csv('stack_opt_test_out.tsv', index=False)