In [130]:
# STANDARD LIBRARIES
import pandas as pd
import numpy as np
import pickle

# VISUALS
import matplotlib.pyplot as plt
import seaborn as sns

# FEATURE ENGINEERING AND PREPROCESSING
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

# MODELING
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

# METRICS
from sklearn.metrics import confusion_matrix

# Setup Features and Preprocessing

In [98]:
main = pd.read_csv("../data/clean-data/main-engineered.csv")
main.drop(columns=["Unnamed: 0"], inplace=True)

In [99]:
# DROPPING ROWS WHERE THE TARGET HAS ONLY 1 VALUE
# TO HELP WITH PREDICTION MODEL
y_least_pop_classes = main.groupby("label_zip").filter(lambda x: len(x) == 1).index

print(main.shape)
main.drop(y_least_pop_classes, inplace=True)

(97515, 101)


In [100]:
X = main.drop(columns=[
    "label_zip",
#     "label_street",
#     "zipcode",
#     "zip_street"
    "zip_num_street"
])
y = main["label_zip"]

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33,
                                                    random_state=42)

ss = StandardScaler()
Z = ss.fit_transform(X)
Z_train =  ss.fit_transform(X_train)
Z_test = ss.transform(X_test)

In [101]:
y.value_counts()

15    9071
23    8478
27    7688
22    7015
21    6674
18    6666
19    6164
26    5578
16    5354
11    4572
14    4391
17    4044
13    3734
20    3069
9     2400
25    2284
10    2276
12    1412
8     1357
7     1192
6      934
31     858
1      676
4      475
2      441
34     231
3      224
30     215
28      18
32      13
33       8
Name: label_zip, dtype: int64

# Model 

In [102]:
logreg_pipe = Pipeline([
    ("ss", StandardScaler()),
    ("logreg", LogisticRegression())
])

logreg_pipe.get_params()

{'memory': None,
 'steps': [('ss', StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('logreg',
   LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=100,
                      multi_class='auto', n_jobs=None, penalty='l2',
                      random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                      warm_start=False))],
 'verbose': False,
 'ss': StandardScaler(copy=True, with_mean=True, with_std=True),
 'logreg': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 'ss__copy': True,
 'ss__with_mean': True,
 'ss__with_std': True,
 'logreg__C': 1.0,
 'logreg__class_weight': None,
 'logre

In [104]:
params = {
    "logreg__C": [0.75, 0.85, 1],
    "logreg__solver": ["liblinear"],
    "logreg__penalty": ["l1"],
    "logreg__verbose": [10]
}

gs = GridSearchCV(
    logreg_pipe,
    params,
    cv=5
)

In [106]:
gs.fit(Z_train, y_train)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('ss',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('logreg',
                                        LogisticRegression(C=1.0,
                                                           class_weight=None,
                                                           dual=False,
                                                           fit_intercept=True,
                                                           intercept_scaling=1,
                                                           l1_ratio=None,
                                                           max_iter=100,
                                                           multi_class='auto',
              

In [107]:
gs.best_params_
gs.best_score_
gs.cv_results_
print(gs.score(Z_train, y_train))
print(gs.score(Z_test, y_test))
gs.predict(Z_test)

0.9999234689972908
0.999875695329252


array([16, 15, 21, ..., 22, 14, 15])

In [120]:
# DEFINE DICTIONARY TO STORE MODEL ITERATION RESULTS
model_iterations = {1: {}}

In [121]:
def model_params(dct, key):
    dct[key]["coefs"] = gs.best_estimator_[1].coef_
    dct[key]["base_params"] = params
    dct[key]["best_params"] = gs.best_params_
    dct[key]["best_score"] = gs.best_score_
    dct[key]["cv_results"] = gs.cv_results_
    dct[key]["train_score"] = gs.score(Z_train, y_train)
    dct[key]["test_score"] = gs.score(Z_test, y_test)
    dct[key]["preds"] = gs.predict(Z_test)
    
    return dct
    
model_params(model_iterations, 1)

{1: {'coefs': array([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]),
  'base_params': {'logreg__C': [0.5, 1, 1.5],
   'logreg__solver': ['liblinear'],
   'logreg__penalty': ['l1'],
   'logreg__verbose': [10]},
  'best_params': {'logreg__C': 1,
   'logreg__penalty': 'l1',
   'logreg__solver': 'liblinear',
   'logreg__verbose': 10},
  'best_score': 0.9998775494583597,
  'cv_results': {'mean_fit_time': array([39.08477397, 38.01121726, 35.29328146]),
   'std_fit_time': array([3.0133276 , 3.63011006, 1.95155827]),
   'mean_score_time': array([0.00783367, 0.00659447, 0.00660791]),
   'std_score_time': array([0.00083163, 0.00025564, 0.00023773]),
   'param_logreg__C': masked_array(data=[0.5, 1, 1.5],
                mask=[False, False, False],
          fill_value='?',
               dtype=object)

In [133]:
sel_ = SelectFromModel(LogisticRegression(C=1, penalty="l1", solver="liblinear"))
sel_.fit(Z_train, y_train)

SelectFromModel(estimator=LogisticRegression(C=1, class_weight=None, dual=False,
                                             fit_intercept=True,
                                             intercept_scaling=1, l1_ratio=None,
                                             max_iter=100, multi_class='auto',
                                             n_jobs=None, penalty='l1',
                                             random_state=None,
                                             solver='liblinear', tol=0.0001,
                                             verbose=0, warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [143]:
X_train.columns[(sel_.estimator_.coef_ == 0).ravel().tolist()]

IndexError: boolean index did not match indexed array along dimension 0; dimension is 99 but corresponding boolean dimension is 3069

In [137]:
selected_feat = X_train.columns[(sel_.get_support())]
print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
# print('features with coefficients shrank to zero: {}'.format(np.sum(sel_.estimator_.coef_ == 0)))

removed_feats = X_train.columns[(sel_.estimator_.coef_ == 0).ravel().tolist()]
removed_feats

total features: 99
selected features: 40


IndexError: boolean index did not match indexed array along dimension 0; dimension is 99 but corresponding boolean dimension is 3069

# X

In [70]:
model_iterations = {1: {}}
model_iterations[1]["best_params"] = gs.best_params_
model_iterations[1]["best_snore"] = gs.best_params_
model_iterations[1]["cv_results"] = gs.cv_results_
model_iterations[1]["train_score"] = gs.score(Z_train, y_train)
model_iterations[1]["test_score"] = gs.score(Z_test, y_test)

model_preds = gs.predict(Z_test)
model_cm = confusion_matrix(y_test, model_preds)
model_cm_df = pd.DataFrame(model_cm)

TypeError: 'function' object is not subscriptable

In [26]:
model_iterations = {2: {}}

model_iterations[2]["best_params"] = gs.best_params_
model_iterations[2]["best_snore"] = gs.best_params_
model_iterations[2]["cv_results"] = gs.cv_results_
model_iterations[2]["train_score"] = gs.score(Z_train, y_train)
model_iterations[2]["test_score"] = gs.score(Z_test, y_test)

model_preds = gs.predict(Z_test)
model_iterations[2]["preds"] = gs.predict(Z_test)
model_cm = confusion_matrix(y_test, model_preds)
model_cm_df2 = pd.DataFrame(model_cm)
cm_df = pd.concat([model_cm_df, model_cm_df2],
                  axis=1,
                  sort=False)

0.09646412163640342
0.09352311415562574
