In [62]:
# STANDARD LIBRARIES
import pandas as pd
import numpy as np
import pickle

# VISUALS
import matplotlib.pyplot as plt
import seaborn as sns

# FEATURE ENGINEERING AND PREPROCESSING
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

# MODELING
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2

# METRICS
from sklearn.metrics import confusion_matrix

# Setup Features and Preprocessing

In [63]:
# Load the engineered dataset and discard the "Unnamed: 0" column
# (presumably an index column written by an earlier to_csv call -- confirm).
main = (
    pd.read_csv("../data/clean-data/main-engineered.csv")
    .drop(columns=["Unnamed: 0"])
)

In [64]:
# Remove every row whose "label_zip" value occurs exactly once in the data.
# Index labels of those rows are collected first, then dropped.
y_least_pop_classes = main.index[
    (main["label_zip"].map(main["label_zip"].value_counts()) == 1).to_numpy()
]

# The shape is printed *before* the drop, so it reports the pre-drop size.
print(main.shape)
main = main.drop(index=y_least_pop_classes)

(97515, 101)


In [65]:
# Feature matrix: every column except the target and "zip_num_street".
# Exclusions tried previously and left disabled:
#     "label_street", "zipcode", "zip_street"
X = main.drop(columns=["label_zip", "zip_num_street"])
y = main["label_zip"]

# Hold out 33% of the rows for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Z is the standardised *full* feature matrix (its scaler sees every row,
# test rows included), so it uses a throwaway scaler of its own.
Z = StandardScaler().fit_transform(X)

# `ss` is fitted on the training split only and then applied to the test split.
ss = StandardScaler()
Z_train = ss.fit_transform(X_train)
Z_test = ss.transform(X_test)

In [66]:
# y.value_counts()

# Model 

## Logistic Regression

In [12]:
# Grid-search an L1-penalised logistic regression wrapped in a scaling pipeline.
logreg_pipe = Pipeline([
    ("ss", StandardScaler()),
    ("logreg", LogisticRegression())
])

# Only C is actually searched; solver, penalty and verbosity are pinned to a
# single value each.
params = {
    "logreg__C": [0.75, 0.85, 1, 1.1],
    "logreg__solver": ["liblinear"],
    "logreg__penalty": ["l1"],
    "logreg__verbose": [10]
}

gs = GridSearchCV(
    logreg_pipe,
    params,
    cv=5
)

# NOTE(review): Z_train has already been standardised with `ss`, and the
# pipeline's own "ss" step standardises it again inside every CV fold.
# The input is left unchanged here because later cells score and predict
# with Z_test; switching to X_train would require updating those cells too.
gs = gs.fit(Z_train, y_train)

# SAVE NEWEST ITERATION OF THE FITTED SEARCH TO FILE
# The context manager guarantees the handle is closed even if pickling fails.
filename = "../assets/models/logreg_pipe_gs"
with open(filename, "wb") as outfile:
    pickle.dump(gs, outfile)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

In [13]:
#UNCOMMENT TO RESET DICT
# model_iterations = {1: {}}

# DEFINE DICTIONARY TO STORE MODEL ITERATION RESULTS
def model_params(dct, key):
    """Record the current grid-search results under ``dct[key]``.

    The function reads the notebook-level names ``gs``, ``params``,
    ``Z_train``, ``y_train``, ``Z_test`` and ``y_test``; they must exist
    before it is called.

    Parameters
    ----------
    dct : dict
        Results dictionary, mutated in place.
    key : hashable
        Iteration identifier. If ``dct[key]`` does not exist yet it is
        created as an empty dict; an existing entry is updated, keeping any
        keys this function does not write.

    Returns
    -------
    dict
        The same ``dct`` object that was passed in.
    """
    # Create the slot on demand so callers need not pre-seed `{key: {}}`.
    entry = dct.setdefault(key, {})

    # Index 1 of the best estimator is its second pipeline step.
    entry["coefs"] = gs.best_estimator_[1].coef_
    entry["base_params"] = params
    entry["best_params"] = gs.best_params_
    entry["best_score"] = gs.best_score_
    entry["cv_results"] = gs.cv_results_
    entry["train_score"] = gs.score(Z_train, y_train)
    entry["test_score"] = gs.score(Z_test, y_test)
    entry["preds"] = gs.predict(Z_test)

    return dct

In [14]:
# On a fresh kernel the results dictionary does not exist yet; create it with
# an empty slot for iteration 1 instead of failing with NameError. An existing
# dictionary (from earlier iterations in this session) is left untouched.
try:
    model_iterations
except NameError:
    model_iterations = {1: {}}

model_params(model_iterations, 1)

# SAVE UPDATED DICT TO FILE
# The context manager guarantees the handle is closed even if pickling fails.
filename = "../assets/models/model_iterations"
with open(filename, "wb") as outfile:
    pickle.dump(model_iterations, outfile)

In [15]:
# Echo the stored iteration results as this cell's output.
model_iterations

{1: {'coefs': array([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]),
  'base_params': {'logreg__C': [0.75, 0.85, 1, 1.1],
   'logreg__solver': ['liblinear'],
   'logreg__penalty': ['l1'],
   'logreg__verbose': [10]},
  'best_params': {'logreg__C': 0.85,
   'logreg__penalty': 'l1',
   'logreg__solver': 'liblinear',
   'logreg__verbose': 10},
  'best_score': 0.9998775494583597,
  'cv_results': {'mean_fit_time': array([38.76960969, 38.05905571, 38.22778983, 37.08115921]),
   'std_fit_time': array([2.69931988, 2.3123963 , 3.36814968, 2.29430517]),
   'mean_score_time': array([0.00686274, 0.00754266, 0.00722566, 0.00812421]),
   'std_score_time': array([0.00056993, 0.00179684, 0.00057026, 0.00273595]),
   'param_logreg__C': masked_array(data=[0.75, 0.85, 1, 1.1],
                mask=[False, Fal

## Transfer from SelectFromModel

In [28]:
# L1-regularised feature selection, following
# https://towardsdatascience.com/feature-selection-using-regularisation-a3678b71e499
# The selector wraps a liblinear logistic regression with an L1 penalty and
# C=0.85, and is fitted on the standardised training split.
sel_ = SelectFromModel(
    estimator=LogisticRegression(penalty="l1", solver="liblinear", C=0.85)
)
sel_.fit(Z_train, y_train)

SelectFromModel(estimator=LogisticRegression(C=0.85, class_weight=None,
                                             dual=False, fit_intercept=True,
                                             intercept_scaling=1, l1_ratio=None,
                                             max_iter=100, multi_class='auto',
                                             n_jobs=None, penalty='l1',
                                             random_state=None,
                                             solver='liblinear', tol=0.0001,
                                             verbose=0, warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [59]:
# Boolean mask over the training columns: True where the selector kept the
# feature. Both lists below are derived from this single mask, so neither one
# depends on a variable left over from a previous run of the cell.
support_mask = sel_.get_support()

# THE NON-ZERO'D FEATURES VIA LASSO
selected_features = X_train.columns[support_mask]

# LIST OF THE REMOVED FEATURES (the complement of the selection)
removed_features = list(X_train.columns[~support_mask])

print("Total Features: %d" % (X_train.shape[1]))
print("Total Selected Features: %d" % (len(selected_features)))
# NOTE(review): the original author tagged the next line "NOPE" without
# explanation -- confirm that the label accurately describes this count.
print("Features with Zero'd Coefs from Lasso: %d" % (len(removed_features)))

# NOTE(review): the selector was fitted on Z_train (standardised) but is
# applied to the unscaled X_train / X_test here -- confirm this is intended.
X_train_selected = sel_.transform(X_train)
X_test_selected = sel_.transform(X_test)
print(X_train_selected.shape, X_test_selected.shape)

Total Features: 99
Total Selected Features: 40
Features with Zero'd Coefs from Lasso: 59


## Transfer from SelectKBest

# X

In [70]:
# Start a fresh results dictionary and record iteration 1 of the grid search.
model_iterations = {1: {}}
model_iterations[1]["best_params"] = gs.best_params_
# Fixed: this entry used the key "best_snore" and stored gs.best_params_;
# it now holds the best cross-validation score under "best_score".
model_iterations[1]["best_score"] = gs.best_score_
model_iterations[1]["cv_results"] = gs.cv_results_
model_iterations[1]["train_score"] = gs.score(Z_train, y_train)
model_iterations[1]["test_score"] = gs.score(Z_test, y_test)

# Confusion matrix of the test-split predictions, wrapped in a DataFrame.
model_preds = gs.predict(Z_test)
model_cm = confusion_matrix(y_test, model_preds)
model_cm_df = pd.DataFrame(model_cm)

TypeError: 'function' object is not subscriptable

In [26]:
# Record iteration 2 alongside the existing entries. Rebinding the whole
# dictionary to {2: {}} would discard the results stored for iteration 1.
model_iterations[2] = {}

model_iterations[2]["best_params"] = gs.best_params_
# Fixed: this entry used the key "best_snore" and stored gs.best_params_;
# it now holds the best cross-validation score under "best_score".
model_iterations[2]["best_score"] = gs.best_score_
model_iterations[2]["cv_results"] = gs.cv_results_
model_iterations[2]["train_score"] = gs.score(Z_train, y_train)
model_iterations[2]["test_score"] = gs.score(Z_test, y_test)

# Predict once and reuse the result for both the stored predictions and the
# confusion matrix.
model_preds = gs.predict(Z_test)
model_iterations[2]["preds"] = model_preds
model_cm = confusion_matrix(y_test, model_preds)
model_cm_df2 = pd.DataFrame(model_cm)

# Place this confusion matrix side by side with the one from the previous
# iteration cell (model_cm_df).
cm_df = pd.concat([model_cm_df, model_cm_df2],
                  axis=1,
                  sort=False)

0.09646412163640342
0.09352311415562574
