In [22]:
# STANDARD LIBRARIES
import pandas as pd
import numpy as np
import pickle

# VISUALS
import matplotlib.pyplot as plt
import seaborn as sns

# FEATURE ENGINEERING AND PREPROCESSING
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

# MODELING
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2

# METRICS
from sklearn import metrics
from sklearn.metrics import confusion_matrix

# Setup Features and Preprocessing

In [2]:
main = pd.read_csv("../data/clean-data/main-engineered.csv")
main.drop(columns=["Unnamed: 0"], inplace=True)

In [3]:
# DROPPING ROWS WHERE THE TARGET HAS ONLY 1 VALUE
# TO HELP WITH PREDICTION MODEL
y_least_pop_classes = main.groupby("label_zip").filter(lambda x: len(x) == 1).index

print(main.shape)
main.drop(y_least_pop_classes, inplace=True)

(97515, 124)


In [22]:
# main["label_zip_street"] = main["zip_street"].astype("category")
# main["label_zip_street"] = main["label_zip_street"].cat.codes

In [4]:
X = main.drop(columns=[
    "label_zip",
#     "label_street",
#     "zipcode",
    "zip_street",
    "zipcode",
    "pw_coords",
    "req_latitude",
    "req_longitude",
    "zip_num_street"
])
y = main["label_zip"]
# y = main["label_zip_street"]
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33,
                                                    random_state=42)

ss = StandardScaler()
Z = ss.fit_transform(X)
Z_train =  ss.fit_transform(X_train)
Z_test = ss.transform(X_test)

In [66]:
# y.value_counts()

# Model 

## Logistic Regression

In [None]:
# logreg1 = LogisticRegression(solver="liblinear", penalty="l1")
# logreg1.fit(Z_train, y_train)

In [5]:
# READ IN MODELS AND OTHER DATA
gs = pd.read_pickle("../assets/models/logreg_pipe_gs")
model_iterations = pd.read_pickle("../assets/models/model_iterations")

In [8]:
logreg_pipe = Pipeline([
    ("ss", StandardScaler()),
    ("logreg", LogisticRegression())
])

logreg_pipe.get_params()

params = {
    "logreg__C": [0.75, 0.85, 1, 1.1],
    "logreg__solver": ["liblinear"],
    "logreg__penalty": ["l1"],
    "logreg__verbose": [10]
}

gs = GridSearchCV(
    logreg_pipe,
    params,
    cv=5
)

gs = gs.fit(Z_train, y_train);

# SAVE NEWEST ITERATIONS OF MODEL WITH PARAMS TO FILE
# filename = "../assets/models/logreg_pipe_gs"
# outfile = open(filename, "wb")
# pickle.dump(gs, outfile)
# outfile.close()

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

In [43]:
#UNCOMMENT TO RESET DICT
# model_iterations = {1: {}}

# DEFINE DICTIONARY TO STORE MODEL ITERATION RESULTS
def model_params(dct, key):
    dct.update({key: {"coefs",
                      "base_params",
                      "best_params",
                      "best_score",
                      "cv_results",
                      "train_score",
                      "test_score",
                      "preds"}})
    dct[key]["coefs"] = gs.best_estimator_[1].coef_
    dct[key]["base_params"] = params
    dct[key]["best_params"] = gs.best_params_
    dct[key]["best_score"] = gs.best_score_
    dct[key]["cv_results"] = gs.cv_results_
    dct[key]["train_score"] = gs.score(Z_train, y_train)
    dct[key]["test_score"] = gs.score(Z_test, y_test)
    dct[key]["preds"] = gs.predict(Z_test)
    
    
    return dct

In [44]:
model_params(model_iterations, 2)

# SAVE UPDATED DICT TO FILE
# filename = "../assets/models/model_iterations"
# outfile = open(filename, "wb")
# pickle.dump(model_iterations, outfile)
# outfile.close()

# model_iterations

TypeError: 'set' object does not support item assignment

In [21]:
# GENERATE PREDICTIONS
preds = gs.predict(Z_test)

model_metrics = metrics.classification_report(y_test, preds)
filename = "../assets/models/model_metrics"
outfile = open(filename, "wb")
pickle.dump(model_metrics, outfile)
outfile.close()

print(model_metrics)

## Transfer from SelectFromModel

In [60]:
# https://towardsdatascience.com/feature-selection-using-regularisation-a3678b71e499
# THE PARAMS SET FOR THE LOGISTIC REGRESSION BELOW 
# ARE FROM THE GRIDSEARCH'D PARAMS ABOVE
sel_ = SelectFromModel(LogisticRegression(C=.85, penalty="l1", solver="liblinear"))
sel_.fit(Z, y)

filename = "../assets/models/sel_.csv"
outfile = open(filename, "wb")
pickle.dump(sel_, outfile)
outfile.close()

In [61]:
# THE NON-ZERO'D FEATURES VIA LASSO
selected_features = X.columns[sel_.get_support()]

# CREATE LIST OF THE REMOVED FEATURES
removed_features = []
for col in list(X.columns):
    if col not in list(selected_features):
        removed_features.append(col)

print("Total Features: %d" % (X.shape[1]))
print("Total Selected Features: %d" % (len(selected_features)))
# NOPE:
print("Features with Zero'd Coefs from Lasso: %d" % (len(removed_features)))

# X_train_selected = sel_.transform(X_train)
# X_test_selected = sel_.transform(X_test)
# print(X_train_selected.shape, X_test_selected.shape)

Z_selected = sel_.transform(Z)
print(Z_selected.shape)

Total Features: 100
Total Selected Features: 40
Features with Zero'd Coefs from Lasso: 60
(97512, 40)


In [63]:
X_sel = X[selected_features]
X_sel["zipcode"] = X_sel["zipcode"].astype(str)
X_sel["zipcode"] = X_sel["zipcode"].apply(lambda x: "0" + x[:4]) 

X_sel.to_csv("../assets/models/X_sel.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


# X

In [70]:
model_iterations = {1: {}}
model_iterations[1]["best_params"] = gs.best_params_
model_iterations[1]["best_snore"] = gs.best_params_
model_iterations[1]["cv_results"] = gs.cv_results_
model_iterations[1]["train_score"] = gs.score(Z_train, y_train)
model_iterations[1]["test_score"] = gs.score(Z_test, y_test)

model_preds = gs.predict(Z_test)
model_cm = confusion_matrix(y_test, model_preds)
model_cm_df = pd.DataFrame(model_cm)

TypeError: 'function' object is not subscriptable

In [26]:
model_iterations = {2: {}}

model_iterations[2]["best_params"] = gs.best_params_
model_iterations[2]["best_snore"] = gs.best_params_
model_iterations[2]["cv_results"] = gs.cv_results_
model_iterations[2]["train_score"] = gs.score(Z_train, y_train)
model_iterations[2]["test_score"] = gs.score(Z_test, y_test)

model_preds = gs.predict(Z_test)
model_iterations[2]["preds"] = gs.predict(Z_test)
model_cm = confusion_matrix(y_test, model_preds)
model_cm_df2 = pd.DataFrame(model_cm)
cm_df = pd.concat([model_cm_df, model_cm_df2],
                  axis=1,
                  sort=False)

0.09646412163640342
0.09352311415562574
