In [1]:
# STANDARD LIBRARIES
import pandas as pd
import numpy as np
import pickle

# VISUALS
import matplotlib.pyplot as plt
import seaborn as sns

# FEATURE ENGINEERING AND PREPROCESSING
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

# MODELING
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# METRICS

In [2]:
main = pd.read_csv("../data/clean-data/main-engineered.csv")
main.drop(columns=["Unnamed: 0"], inplace=True)

In [4]:
X = main.drop(columns=[
    "label",
    "zipcode",
    "zip_street",
    "zip_num_street"
])
y = main["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42)

# ss = StandardScaler()
# 
# Z = ss.fit_transform(X)
# Z_train =  ss.fit_transform(X_train)
# Z_test = ss.transform(X_test)

In [14]:
y.value_counts()

15    9071
23    8478
27    7688
22    7015
21    6674
18    6666
19    6164
26    5578
16    5354
11    4572
14    4391
17    4044
13    3734
20    3069
9     2400
25    2284
10    2276
12    1412
8     1357
7     1192
6      934
31     858
1      676
4      475
2      441
34     231
3      224
30     215
28      18
32      13
33       8
24       1
29       1
5        1
Name: label, dtype: int64

In [6]:
logreg_pipe = Pipeline([
    ("ss", StandardScaler()),
    ("logreg", LogisticRegression())
])

logreg_pipe.get_params()

{'memory': None,
 'steps': [('ss', StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('logreg',
   LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=100,
                      multi_class='auto', n_jobs=None, penalty='l2',
                      random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                      warm_start=False))],
 'verbose': False,
 'ss': StandardScaler(copy=True, with_mean=True, with_std=True),
 'logreg': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 'ss__copy': True,
 'ss__with_mean': True,
 'ss__with_std': True,
 'logreg__C': 1.0,
 'logreg__class_weight': None,
 'logre

In [27]:
params ={
    "logreg__C": [0.01, 0.1, 1],
    "logreg__penalty": ["l1", "l2"],
    "logreg__verbose": [100]
}

In [28]:
gs = GridSearchCV(
    logreg_pipe,
    params,
    cv=5
)

In [15]:
gs.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('ss',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('logreg',
                                        LogisticRegression(C=1.0,
                                                           class_weight=None,
                                                           dual=False,
                                                           fit_intercept=True,
                                                           intercept_scaling=1,
                                                           l1_ratio=None,
                                                           max_iter=100,
                                                           multi_class='auto',
              

In [16]:
gs.best_score_

0.9995487871319664

In [17]:
gs.cv_results_

{'mean_fit_time': array([15.05849953]),
 'std_fit_time': array([0.31939253]),
 'mean_score_time': array([0.03423085]),
 'std_score_time': array([0.0002497]),
 'params': [{}],
 'split0_test_score': array([0.9994531]),
 'split1_test_score': array([0.9995898]),
 'split2_test_score': array([0.99965817]),
 'split3_test_score': array([0.99972653]),
 'split4_test_score': array([0.99931633]),
 'mean_test_score': array([0.99954879]),
 'std_test_score': array([0.00014726]),
 'rank_test_score': array([1], dtype=int32)}

In [22]:
gs.classes_

array([ 1,  2,  3,  4,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34])

In [24]:
gs.predict(X_test)

array([14,  2, 19, ..., 17, 27, 14])

In [25]:
gs.score(X_train, y_train)

0.9998769415882739

In [26]:
gs.score(X_test, y_test)

0.9995487919931089