In [8]:
# STANDARD LIBRARIES
import pandas as pd
import numpy as np
import pickle

# VISUALS
import matplotlib.pyplot as plt
import seaborn as sns

# FEATURE ENGINEERING AND PREPROCESSING
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import NearestNeighbors, NearestCentroid

# MODELING
from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# METRICS
from sklearn.metrics import silhouette_score

In [2]:
main = pd.read_csv("../data/clean-data/main-engineered.csv", low_memory=False)
main.drop(columns="Unnamed: 0", inplace=True)

In [4]:
print(main.shape)
print(main.isna().sum().sum())
main.head()

(97515, 364)
0


Unnamed: 0,zipcode,lu_category,lu_category_0,lu_category_1,lu_category_2,lu_category_3,lu_category_4,lu_category_5,lu_category_6,lu_category_7,...,req_source_category_3_total_street,req_source_category_4_total_street,req_source_category_5_total_street,req_source_category_0_total_zip,req_source_category_1_total_zip,req_source_category_2_total_zip,req_source_category_3_total_zip,req_source_category_4_total_zip,req_source_category_5_total_zip,label
0,2108,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,2108,6,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,2108,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,2108,6,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,2108,6,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [5]:
Z = main.drop(columns=[
    "label",
    "zipcode",
    "zip_street",
    "zip_num_street"
])
X = main.drop(columns=[
    "label",
    "zipcode",
    "zip_street",
    "zip_num_street"
])
y = main["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42)

ss = StandardScaler()

Z = ss.fit_transform(X)
Z_train =  ss.fit_transform(X_train)
Z_test = ss.transform(X_test)

# KMeans

**To help calculate an appropriate distance to model on, we will use `NearestNeighbors` to return the distances:**

In [15]:
def nearest_nb(x):
    nb = NearestNeighbors(n_neighbors=5, algorithm="auto", metric="euclidean").fit(x)
    distances, indices = nb.kneighbors(x)
    
    return distances, indices
# https://stackoverflow.com/questions/51305370/calculating-average-distance-of-nearest-neighbours-in-pandas-dataframe

In [17]:
distances, indices = nearest_nb(Z)

In [20]:
filename = "../assets/variables/main-nb-distances"
outfile = open(filename, "wb")
pickle.dump(distances, outfile)
outfile.close()

filename = "../assets/variables/mian-nb-indices"
outfile = open(filename, "wb")
pickle.dump(indices, outfile)
outfile.close()

In [18]:
distances

array([[0.        , 3.15943838, 4.30531862, 4.31911052, 4.31953967],
       [0.        , 0.20814598, 0.24234787, 0.24731217, 0.26001041],
       [0.        , 0.540545  , 0.98072736, 0.98192214, 0.98283186],
       ...,
       [0.        , 0.91112699, 0.91114136, 1.00055038, 1.00055142],
       [0.        , 0.03384385, 0.04859839, 0.0528762 , 0.06012846],
       [0.        , 2.33483045, 2.3482439 , 2.69906231, 2.70111563]])

In [19]:
indices

array([[    0, 15506, 15792, 15794, 15664],
       [    1, 15507,     3, 15702, 16477],
       [    2, 16479, 16473, 15510,     9],
       ...,
       [97512, 95798, 95805, 95978, 94107],
       [97513, 90539, 91107, 91829, 91292],
       [97514, 97031, 91728, 96992, 96991]])

# DBSCAN

In [None]:
dbscan = DBSCAN(eps=0.65,
                min_samples=10)
dbscan.fit(Z)

In [32]:
set(dbscan.labels_)

{-1, 0}

In [33]:
silhouette_score(Z, dbscan.labels_)

-0.06998326870094727

In [34]:
test_df = main
test_df["cluster"] = dbscan.labels_
test_df["cluster"].value_counts(normalize=True)

-1    0.941176
 0    0.058824
Name: cluster, dtype: float64

# PCA

In [47]:
data_cols = pd.read_pickle("../assets/variables/data_cols")
pw_cols = pd.read_pickle("../assets/variables/pw_cols")
fire_cols = pd.read_pickle("../assets/variables/fire_cols")
requests_cols = pd.read_pickle("../assets/variables/requests_cols")

data_main_df = main[data_cols]
pw_main_df = main[pw_cols]
fire_main_df = main[fire_cols]
requests_main_df = main[requests_cols]

main_scaler = StandardScaler()
data_main_sc = main_scaler.fit_transform(data_main_df)
pw_main_sc = main_scaler.fit_transform(pw_main_df)
fire_main_sc = main_scaler.fit_transform(fire_main_df)
requests_main_sc = main_scaler.fit_transform(requests_main_df)

data_main_sc_df = pd.DataFrame(data_main_sc)
pw_main_sc_df = pd.DataFrame(pw_main_sc)
fire_main_sc_df = pd.DataFrame(fire_main_sc)
requests_main_sc_df = pd.DataFrame(requests_main_sc)

## Data Cols

In [60]:
pca_data = PCA(n_components=5,
          random_state=42)
pca_data.fit(data_main_sc_df)

var_exp = pca_data.explained_variance_ratio_
cum_var_exp = np.cumsum(var_exp)

print(f"Explained Variance (first 20 components): {np.round(var_exp[:20], 3)}")
print("")
print(f"Cumulative Variance (first 20 components): {np.round(cum_var_exp[:20], 3)}")

Explained Variance (first 20 components): [0.387 0.18  0.096 0.07  0.06 ]

Cumulative Variance (first 20 components): [0.387 0.567 0.663 0.733 0.794]


## PW Violations Cols

In [58]:
pca_pw = PCA(n_components=5,
          random_state=42)
pca_pw.fit(pw_main_sc_df)

var_exp = pca_pw.explained_variance_ratio_
cum_var_exp = np.cumsum(var_exp)

print(f"Explained Variance (first 20 components): {np.round(var_exp[:20], 3)}")
print("")
print(f"Cumulative Variance (first 20 components): {np.round(cum_var_exp[:20], 3)}")

Explained Variance (first 20 components): [0.276 0.128 0.09  0.073 0.063]

Cumulative Variance (first 20 components): [0.276 0.404 0.494 0.568 0.631]


## Fire Cols

In [62]:
pca_fire = PCA(n_components=3,
          random_state=42)
pca_fire.fit(fire_main_sc_df)

var_exp = pca_fire.explained_variance_ratio_
cum_var_exp = np.cumsum(var_exp)

print(f"Explained Variance (first 5 components): {np.round(var_exp[:5], 3)}")
print("")
print(f"Cumulative Variance (first 5 components): {np.round(cum_var_exp[:5], 3)}")

Explained Variance (first 5 components): [0.442 0.243 0.199]

Cumulative Variance (first 5 components): [0.442 0.685 0.884]


## Requests Cols

In [63]:
pca_requests = PCA(n_components=5,
          random_state=42)
pca_requests.fit(requests_main_sc_df)

var_exp = pca_requests.explained_variance_ratio_
cum_var_exp = np.cumsum(var_exp)

print(f"Explained Variance (first 20 components): {np.round(var_exp[:20], 3)}")
print("")
print(f"Cumulative Variance (first 20 components): {np.round(cum_var_exp[:20], 3)}")

Explained Variance (first 20 components): [0.503 0.125 0.054 0.048 0.041]

Cumulative Variance (first 20 components): [0.503 0.628 0.683 0.731 0.772]


## Entire Main DataFrame

In [111]:
# BASIC MOST MODEL
pca_main = PCA(n_components=5,
               random_state=42)
pca_main.fit(X_sc)

var_exp = pca_main.explained_variance_ratio_
cum_var_exp = np.cumsum(var_exp)

print(f"Explained Variance (first 20 components): {np.round(var_exp[:40], 3)}")
print("")
print(f"Cumulative Variance (first 20 components): {np.round(cum_var_exp[:40], 3)}")

Explained Variance (first 20 components): [0.373 0.123 0.062 0.049 0.045]

Cumulative Variance (first 20 components): [0.373 0.495 0.557 0.606 0.651]


## For Supervised Learning Models

In [113]:
X = main.drop(columns="label")
y = main["label"]

In [126]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y,
                                                    random_state=42)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [115]:
ss = StandardScaler()
Z_train =  ss.fit_transform(X_train)
Z_test = ss.transform(X_test)

In [116]:
pca_pipe = Pipeline([
    ("ss", StandardScaler()),
    ("pca", PCA()),
    ("logreg", LogisticRegression())
])

In [118]:
pca_pipe.get_params();

In [124]:
params = {
    "pca__n_components": np.arange(1, 10, 1), # THERE ARE 118 COLUMNS
    "pca__random_state": [41],
    "logreg__penalty": ["l1", "l2"],
    "logreg__C": [0.01, 0.1, 1],
    "logreg__verbose": [100]
}

gs = GridSearchCV(
    pca_pipe,
    params,
    cv=5
)

In [120]:
Z_train

array([[-0.525148  , -0.19466292,  0.22358794, ..., -0.93738389,
        -0.90012905, -0.54744332],
       [-0.54675903, -0.38849968, -0.36165419, ..., -0.23583556,
         0.21829002, -0.54744332],
       [-0.42789837, -0.35706568, -0.50535326, ...,  0.37441538,
         0.22622207,  0.43013404],
       ...,
       [ 0.45815381, -0.60143096, -0.86313219, ..., -1.12958891,
        -0.99531365, -0.54744332],
       [-0.43870388, -0.03608859,  0.51004289, ...,  1.38349174,
         0.22622207, -0.54744332],
       [-0.25501014,  1.83146452,  2.3431829 , ...,  0.31675387,
         1.51518022, -0.54744332]])

In [123]:
gs.fit(Z_train, y_train)

ValueError: n_splits=5 cannot be greater than the number of members in each class.