In [1]:
import os

import joblib
import numpy as np
import pandas as pd
import scipy
import scipy.sparse

from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder, scale, normalize, MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score, auc, roc_curve
from sklearn.linear_model import LogisticRegression

In [2]:
# Input files are read from a `data` directory one level above the current working directory.
BASE_PATH = os.path.join(os.getcwd(), os.pardir)
DATA_PATH = os.path.join(os.getcwd(), os.pardir, 'data')

In [3]:
# Load the training feature matrix from its SciPy sparse .npz file
X = scipy.sparse.load_npz(os.path.join(DATA_PATH, 'training_feats.npz'))

In [4]:
# Inspect the loaded training matrix (shape, dtype, number of stored elements)
X

<49308x35522 sparse matrix of type '<class 'numpy.float64'>'
	with 2626763 stored elements in Compressed Sparse Row format>

In [5]:
# Labels file is tab-separated with no header row; its first column is used as the index
y = pd.read_csv(
    os.path.join(DATA_PATH, 'training_labels.tsv'),
    sep='\t',
    header=None,
    index_col=0,
)

In [6]:
# Peek at the first few raw (string) labels
y.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
10,medium
10000,low
100004,high
100007,low
100013,low


In [7]:
# Convert labels to numerical values.
# The labels were loaded as a one-column DataFrame; flatten them to 1-D first,
# because LabelEncoder expects a 1-D array (passing a column vector only works
# through an implicit conversion that emits a warning).
encoder = LabelEncoder()
y = encoder.fit_transform(np.ravel(y))

  y = column_or_1d(y, warn=True)


In [8]:
# Show the encoded label array.
# 2 = medium, 1 = low, 0 = high
y

array([2, 1, 0, ..., 1, 1, 1])

In [9]:
# Fixed seed, passed as random_state to every LogisticRegression in this notebook
seed = 667

In [10]:
# Load the test feature matrix from its SciPy sparse .npz file
X_test = scipy.sparse.load_npz(os.path.join(DATA_PATH, 'test_feats.npz'))

# 5-fold CV

We will first perform 5-fold CV on all of the features

In [11]:
# Baseline estimator: plain logistic regression without any regularization penalty.
# NOTE(review): newer scikit-learn releases spell this `penalty=None`; confirm against
# the installed version before changing it.
clf = LogisticRegression(
    penalty='none',
    solver='saga',
    random_state=seed,
)

In [12]:
# Scoring metrics passed as `scoring` to cross_validate / GridSearchCV below
metrics = ['accuracy', 'neg_log_loss']

In [13]:
# 5-fold cross-validation of the baseline on the full feature set;
# train-fold scores are kept as well as the held-out ones.
scores = cross_validate(
    clf,
    X,
    y,
    cv=5,
    scoring=metrics,
    return_train_score=True,
    n_jobs=3,
)

In [14]:
# Mean held-out accuracy over the 5 CV folds
np.mean(scores['test_accuracy'])

0.6945728921401108

In [15]:
# Mean held-out log loss over the 5 CV folds (the scorer reports it negated)
-np.mean(scores['test_neg_log_loss'])

0.7787824784646624

In [58]:
# Fit the unpenalized baseline on the full training set; this fitted model is
# used later in the notebook to predict probabilities for the raw test features.
clf.fit(X, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='none',
                   random_state=667, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

# Feature selection
Let's try to reduce the number of features through feature selection. Let's use the features' mutual information values to select the top 1000 features

In [16]:
# Keep the 1000 features with the highest mutual information with the label.
# NOTE(review): the selector is fitted on all of X / y before cross-validation, so
# the held-out folds below have already influenced which features were chosen.
kbest = SelectKBest(mutual_info_classif, k=1000)
X_new = kbest.fit_transform(X, y)

In [17]:
# Inspect the reduced training matrix produced by the feature selector
X_new

<49308x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 2136010 stored elements in Compressed Sparse Row format>

Now let's train a model using 5-fold cross validation

In [18]:
# Same unpenalized configuration as the baseline, for the reduced feature set
clf_kbest = LogisticRegression(
    penalty='none',
    solver='saga',
    random_state=seed,
)

In [19]:
# Scoring metrics for cross-validation (identical to the list defined earlier in the notebook)
metrics = ['accuracy', 'neg_log_loss']

In [20]:
# 5-fold cross-validation on the selected features
scores_kbest = cross_validate(
    clf_kbest,
    X_new,
    y,
    cv=5,
    scoring=metrics,
)



In [21]:
# Mean held-out accuracy over the 5 CV folds (selected features)
np.mean(scores_kbest['test_accuracy'])

0.6945728921401108

In [22]:
# Mean held-out log loss over the 5 CV folds (the scorer reports it negated)
-np.mean(scores_kbest['test_neg_log_loss'])

0.778782562720842

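We see that keeping only the top 1000 features by mutual information gives essentially the same accuracy and log loss as using all 35 522 features, so we can continue with the much smaller feature set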

# Improving the model/data
## Data modifications
Let's try normalizing all the features to see if that helps

In [23]:
from sklearn.preprocessing import StandardScaler

In [24]:
# Scale each selected feature without centering (with_mean=False).
# `enc` is the fitted scaler; it is reused later to transform the test features.
enc = StandardScaler(with_mean=False).fit(X_new)
X_new_normalized = enc.transform(X_new)

In [25]:
# Unpenalized logistic regression for the scaled feature set
clf_normalized = LogisticRegression(
    penalty='none',
    solver='saga',
    random_state=seed,
)

In [26]:
# 5-fold cross-validation on the selected and scaled features
scores_normalized = cross_validate(
    clf_normalized,
    X_new_normalized,
    y,
    cv=5,
    scoring=metrics,
)



In [27]:
# Mean held-out accuracy over the 5 CV folds (scaled features)
np.mean(scores_normalized['test_accuracy'])

0.7061125696666528

In [28]:
# Mean held-out log loss over the 5 CV folds (the scorer reports it negated)
-np.mean(scores_normalized['test_neg_log_loss'])

0.6943403092015097

The log loss dropped from about 0.779 to about 0.694, so normalization helped.

## Model modifications
We increase the number of iterations, since it does not seem like the solver has converged. We now use 5000 iterations instead of the default 100

In [29]:
# Same unpenalized model, but with the iteration cap raised from the default to 5000.
# This rebinds `clf_normalized`, replacing the estimator defined earlier.
clf_normalized = LogisticRegression(
    penalty='none',
    solver='saga',
    max_iter=5000,
    random_state=seed,
)

In [30]:
# 5-fold cross-validation with the higher iteration cap, run on all available cores;
# train-fold scores are kept so they can be compared against the held-out scores.
# This rebinds `scores_normalized`, replacing the earlier result.
scores_normalized = cross_validate(
    clf_normalized,
    X_new_normalized,
    y,
    cv=5,
    scoring=metrics,
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  6.6min finished


In [31]:
# Mean held-out accuracy over the 5 CV folds
np.mean(scores_normalized['test_accuracy'])

0.7251966870014394

In [32]:
# Mean training-fold accuracy over the 5 CV folds
np.mean(scores_normalized['train_accuracy'])

0.7386630948351504

In [33]:
# Mean held-out log loss over the 5 CV folds (the scorer reports it negated)
-np.mean(scores_normalized['test_neg_log_loss'])

0.6445146699697663

In [34]:
# Mean training-fold log loss over the 5 CV folds (the scorer reports it negated)
-np.mean(scores_normalized['train_neg_log_loss'])

0.6050492386571408

We can see that the model is slightly overfitting, so let's try regularization

In [35]:
# Elastic-net regularized model (l1_ratio=0.2), keeping C at its default
clf_regularized = LogisticRegression(
    penalty='elasticnet',
    l1_ratio=0.2,
    solver='saga',
    max_iter=5000,
    random_state=seed,
)

In [36]:
# 5-fold cross-validation of the regularized model, keeping train-fold scores too
scores_regularized = cross_validate(
    clf_regularized,
    X_new_normalized,
    y,
    cv=5,
    scoring=metrics,
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 29.3min finished


In [37]:
# Mean held-out accuracy over the 5 CV folds (regularized model)
np.mean(scores_regularized['test_accuracy'])

0.7258457083988634

In [38]:
# Mean training-fold accuracy over the 5 CV folds (regularized model)
np.mean(scores_regularized['train_accuracy'])

0.7372333086475048

In [39]:
# Mean held-out log loss over the 5 CV folds (the scorer reports it negated)
-np.mean(scores_regularized['test_neg_log_loss'])

0.6428499477195719

In [40]:
# Mean training-fold log loss over the 5 CV folds (the scorer reports it negated)
-np.mean(scores_regularized['train_neg_log_loss'])

0.6136797969730944

### Hyperparameter optimization
Now we can optimize the model's parameters using grid search. We keep the elastic-net penalty and optimize C, the inverse regularization strength, together with `l1_ratio`, the mix between the L1 and L2 penalties. This will also help reduce overfitting. 

In [None]:
# Search grid: 5 values of C x 4 elastic-net mixing ratios (penalty type is fixed)
params = {
    'C': [1e-2, 1e-1, 1, 1e1, 1e2],
    'penalty': ['elasticnet'],
    'l1_ratio': [0.2, 0.4, 0.6, 0.8],
}

In [None]:
# 5-fold grid search over `params`, scored on both metrics; the final model is
# refit using the parameter combination with the best neg_log_loss.
clf_cv_normalized = GridSearchCV(
    LogisticRegression(solver='saga', random_state=seed, max_iter=5000),
    params,
    scoring=metrics,
    refit='neg_log_loss',
    cv=5,
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# We run this outside of the notebook, since it crashes if we try to run within the jupyter notebook.
# To keep "Restart & Run All" usable, reuse the saved search result ('lr_cv.model')
# when it is present and only fit (then save) when it is missing.
# Delete 'lr_cv.model' to force the search to be recomputed, e.g. after changing `params`.
# NOTE: joblib.load unpickles the file -- only load model files you produced yourself.
if os.path.exists('lr_cv.model'):
    cv_out = joblib.load('lr_cv.model')
else:
    cv_out = clf_cv_normalized.fit(X_new_normalized, y)
    joblib.dump(cv_out, 'lr_cv.model')

In [41]:
import joblib

# Load the saved grid-search result (presumably the search that was fitted outside
# the notebook) from the current working directory and display its configuration.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load 'lr_cv.model' if it comes from a trusted source.
cv_out = joblib.load('lr_cv.model')
cv_out



GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=5000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=667, solver='saga',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=10,
             param_grid={'C': [0.01, 0.1, 1, 10.0, 100.0],
                         'l1_ratio': [0.2, 0.4, 0.6, 0.8],
                         'penalty': ['elasticnet']},
             pre_dispatch='2*n_jobs', refit='neg_log_loss',
             return_train_score=True, scoring=['accuracy', 'neg_log_loss'],
             verbose=2)

In [42]:
# Show the estimator that the grid search refit with its best parameter combination
cv_out.best_estimator_

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=0.2, max_iter=5000,
                   multi_class='auto', n_jobs=None, penalty='elasticnet',
                   random_state=667, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

Let's retrain a model with these parameters

In [48]:
# Build the tuned model from the hyperparameters selected by the grid search
# (cv_out.best_params_ holds the winning C / l1_ratio / penalty) instead of
# copying the values by hand, so this cell cannot drift from the search result.
clf_opt = LogisticRegression(solver='saga', random_state=seed, max_iter=5000, **cv_out.best_params_)

In [49]:
# 5-fold cross-validation of the tuned model, keeping train-fold scores too
scores_opt = cross_validate(
    clf_opt,
    X_new_normalized,
    y,
    cv=5,
    scoring=metrics,
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 34.9min finished


In [81]:
# Fit the tuned model on the full (selected + scaled) training set; this fitted
# model is used later in the notebook to predict probabilities for the test set.
clf_opt.fit(X_new_normalized, y)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=0.2, max_iter=5000,
                   multi_class='auto', n_jobs=None, penalty='elasticnet',
                   random_state=667, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [51]:
# Mean held-out accuracy over the 5 CV folds (tuned model)
np.mean(scores_opt['test_accuracy'])

0.725663175243983

In [50]:
# Log loss (negated so it reads as a positive loss, like the other log-loss cells)
scores_opt['test_neg_log_loss'].mean() * -1

0.6416982970856153

# Generation of test data

Now let's generate the classification results for our test data. We must first perform feature selection and normalization

In [52]:
# Apply the feature selector that was fitted on the training data to the test features
X_test_new = kbest.transform(X_test)

In [53]:
# Scale with the StandardScaler fitted on the training data
# (`enc` is the scaler, not the label encoder `encoder`)
X_test_norm = enc.transform(X_test_new)

In [83]:
# Class probabilities from the baseline model, which was fitted on the full,
# unselected and unscaled feature matrix -- hence the raw X_test here.
scores_unopt = clf.predict_proba(X_test)

In [84]:
# Inspect the predicted probabilities (one row per test sample, one column per class)
scores_unopt

array([[0.15290084, 0.6064138 , 0.24068536],
       [0.15788868, 0.59741287, 0.24469845],
       [0.11654313, 0.67550538, 0.20795148],
       ...,
       [0.19259308, 0.5373594 , 0.27004752],
       [0.21689081, 0.4978739 , 0.28523529],
       [0.15783627, 0.59748981, 0.24467392]])

In [85]:
# First probability column (written out as 'high' in the output table below)
scores_unopt[:,0]

array([0.15290084, 0.15788868, 0.11654313, ..., 0.19259308, 0.21689081,
       0.15783627])

In [86]:
# Test ids file: tab-separated, no header row, default integer index
test_ids = pd.read_csv(
    os.path.join(DATA_PATH, 'test_ids.tsv'),
    sep='\t',
    index_col=None,
    header=None,
)

In [87]:
# Output table for the baseline model. Probability columns are picked by the label
# encoding noted earlier in the notebook (0 = high, 1 = low, 2 = medium).
test_out_unopt = pd.DataFrame({
    'listing_id': test_ids[0].values,
    'high': scores_unopt[:, 0],
    'medium': scores_unopt[:, 2],
    'low': scores_unopt[:, 1],
})

In [88]:
# Write the baseline predictions as a tab-separated file without the row index.
# `index` is documented as a bool, so pass False explicitly instead of relying on None being falsy.
test_out_unopt.to_csv('lr_unopt_test_out.tsv', sep='\t', index=False)

Now to generate the test results for the optimized model

In [89]:
# Class probabilities from the tuned model on the selected + scaled test features.
# NOTE: this rebinds `scores_opt`, which until now held the cross_validate result dict
# for the tuned model; the CV readout cells cannot be re-run after this cell.
scores_opt = clf_opt.predict_proba(X_test_norm)

In [90]:
# Inspect the predicted probabilities (one row per test sample, one column per class)
scores_opt

array([[0.08624902, 0.61548469, 0.29826629],
       [0.02730607, 0.89625713, 0.07643679],
       [0.04577804, 0.79244459, 0.16177737],
       ...,
       [0.03493086, 0.73672566, 0.22834348],
       [0.21765117, 0.40609722, 0.3762516 ],
       [0.05429806, 0.75328473, 0.19241721]])

In [91]:
# Output table for the tuned model. Probability columns are picked by the label
# encoding noted earlier in the notebook (0 = high, 1 = low, 2 = medium).
test_out_opt = pd.DataFrame({
    'listing_id': test_ids[0].values,
    'high': scores_opt[:, 0],
    'medium': scores_opt[:, 2],
    'low': scores_opt[:, 1],
})

In [93]:
# Write the tuned-model predictions as a tab-separated file without the row index.
# `index` is documented as a bool, so pass False explicitly instead of relying on None being falsy.
test_out_opt.to_csv('lr_opt_test_out.tsv', sep='\t', index=False)