## Import dependencies and tools, plust review data

In [45]:
# Supervised Learning Dependencies
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
# Unsupervised
import hvplot.pandas
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
# Neural Network
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf

In [24]:
# Import combined csv made in postgreSQL as DataFrame
df = pd.read_csv("cust_serv_combined.csv").set_index('cust')
df

Unnamed: 0_level_0,monthly_bill,cycle,qty,size,commodity,pickups,bin_amount,tax_body,biz_type
cust,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
98,888.98,CA,1,4.00,Garbage,1,813.98,C,MINI MART
98,888.98,CA,1,4.00,Recycle,2,75.00,C,MINI MART
98,888.98,CA,1,4.00,Recycle,2,75.00,C,MINI MART
218,965.40,CA,1,4.00,Recycle,1,75.00,COUNTY,WINERY
218,965.40,CA,1,4.00,Garbage,1,890.40,COUNTY,WINERY
...,...,...,...,...,...,...,...,...,...
31675,134.57,CA,1,0.48,Compost,1,0.00,COUNTY,LODGING-FW
31676,1456.25,CA,1,2.00,Garbage,1,482.56,COUNTY,WINERY
31676,1456.25,CA,1,6.00,Recycle,1,973.69,COUNTY,WINERY
31679,65.25,CA,1,0.32,Garbage,1,65.25,S,WINERY


In [25]:
# Determine number of unique values in each column, and check for dtypes and information while I'm at it
df.nunique()

monthly_bill    234
cycle             2
qty              23
size              7
commodity         3
pickups           6
bin_amount      134
tax_body          4
biz_type         40
dtype: int64

In [26]:
df.dtypes

monthly_bill    float64
cycle            object
qty               int64
size            float64
commodity        object
pickups           int64
bin_amount      float64
tax_body         object
biz_type         object
dtype: object

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1817 entries, 98 to 31679
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   monthly_bill  1817 non-null   float64
 1   cycle         1817 non-null   object 
 2   qty           1817 non-null   int64  
 3   size          1817 non-null   float64
 4   commodity     1817 non-null   object 
 5   pickups       1817 non-null   int64  
 6   bin_amount    1817 non-null   float64
 7   tax_body      1817 non-null   object 
 8   biz_type      1817 non-null   object 
dtypes: float64(3), int64(2), object(4)
memory usage: 142.0+ KB


In [28]:
# Drop cycle column because it won't likely tell us much
df.drop(columns=['cycle'], inplace=True)
df

Unnamed: 0_level_0,monthly_bill,qty,size,commodity,pickups,bin_amount,tax_body,biz_type
cust,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
98,888.98,1,4.00,Garbage,1,813.98,C,MINI MART
98,888.98,1,4.00,Recycle,2,75.00,C,MINI MART
98,888.98,1,4.00,Recycle,2,75.00,C,MINI MART
218,965.40,1,4.00,Recycle,1,75.00,COUNTY,WINERY
218,965.40,1,4.00,Garbage,1,890.40,COUNTY,WINERY
...,...,...,...,...,...,...,...,...
31675,134.57,1,0.48,Compost,1,0.00,COUNTY,LODGING-FW
31676,1456.25,1,2.00,Garbage,1,482.56,COUNTY,WINERY
31676,1456.25,1,6.00,Recycle,1,973.69,COUNTY,WINERY
31679,65.25,1,0.32,Garbage,1,65.25,S,WINERY


## Unsupervised Model

I am pretty sure I know what I am planning to do with the data, but just in case I'm going to see what relationships the data has

In [29]:
# Starting with unsupervised learning to see what relationships there may be with the data
# Use get_dummies() to create variables for text features.
X = pd.get_dummies(df, columns= ["commodity", "tax_body", "biz_type"])
X.head()

Unnamed: 0_level_0,monthly_bill,qty,size,pickups,bin_amount,commodity_Compost,commodity_Garbage,commodity_Recycle,tax_body_C,tax_body_COUNTY,...,biz_type_POLICEFIRE,biz_type_POSTOFFICE,biz_type_PRESCHOOL,biz_type_RESTAURANT,biz_type_RETAIL,biz_type_SCHOOL,biz_type_STRIP MALL,biz_type_UTILITY,biz_type_VINEYARD,biz_type_WINERY
cust,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
98,888.98,1,4.0,1,813.98,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
98,888.98,1,4.0,2,75.0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
98,888.98,1,4.0,2,75.0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
218,965.4,1,4.0,1,75.0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
218,965.4,1,4.0,1,890.4,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [30]:
X = StandardScaler().fit_transform(X)
X

array([[-0.25473682, -0.11242994,  1.35896341, ..., -0.12956818,
        -0.19107287, -0.66660042],
       [-0.25473682, -0.11242994,  1.35896341, ..., -0.12956818,
        -0.19107287, -0.66660042],
       [-0.25473682, -0.11242994,  1.35896341, ..., -0.12956818,
        -0.19107287, -0.66660042],
       ...,
       [ 0.10009513, -0.11242994,  2.47059618, ..., -0.12956818,
        -0.19107287,  1.50014907],
       [-0.76998657, -0.11242994, -0.68644088, ..., -0.12956818,
        -0.19107287,  1.50014907],
       [-0.76998657, -0.11242994, -0.59751026, ..., -0.12956818,
        -0.19107287,  1.50014907]])

In [55]:
pca = PCA(n_components = 3)
df_pca=pca.fit_transform(X)

In [56]:
# Create a DataFrame with the three principal components.
pcs_df= pd.DataFrame(data=df_pca, columns=["PC 1", "PC 2", "PC 3"], index = df.index)
pcs_df.head(10)

Unnamed: 0_level_0,PC 1,PC 2,PC 3
cust,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
98,1.260685,1.257017,-0.70621
98,0.392196,0.952089,0.708526
98,0.392196,0.952089,0.708526
218,-0.247907,-2.613957,0.435492
218,1.260386,-2.183019,-0.996956
218,-1.101496,-2.230029,0.630257
218,-1.101496,-2.230029,0.630257
218,-1.001973,-1.577576,0.138301
218,-1.101496,-2.230029,0.630257
345,1.241232,-2.181816,-1.013549


In [57]:
inertia = []
k = list(range(1, 11))
# Looking for the best K
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)
# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [58]:
# Initialize the K-means model
model = KMeans(n_clusters=4, random_state=4)

# Fit the model
model.fit(pcs_df)

# Predict clusters
predictions = model.predict(pcs_df)
print(predictions)

[0 0 0 ... 1 0 1]


In [59]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
clustered_df = pd.concat([df, pcs_df], axis = 1)
#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
clustered_df["Class"] = model.labels_

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

(1817, 12)


Unnamed: 0_level_0,monthly_bill,qty,size,commodity,pickups,bin_amount,tax_body,biz_type,PC 1,PC 2,PC 3,Class
cust,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
98,888.98,1,4.0,Garbage,1,813.98,C,MINI MART,1.260685,1.257017,-0.70621,0
98,888.98,1,4.0,Recycle,2,75.0,C,MINI MART,0.392196,0.952089,0.708526,0
98,888.98,1,4.0,Recycle,2,75.0,C,MINI MART,0.392196,0.952089,0.708526,0
218,965.4,1,4.0,Recycle,1,75.0,COUNTY,WINERY,-0.247907,-2.613957,0.435492,1
218,965.4,1,4.0,Garbage,1,890.4,COUNTY,WINERY,1.260386,-2.183019,-0.996956,1
218,965.4,1,0.48,Recycle,1,0.0,COUNTY,WINERY,-1.101496,-2.230029,0.630257,1
218,965.4,1,0.48,Recycle,1,0.0,COUNTY,WINERY,-1.101496,-2.230029,0.630257,1
218,965.4,1,0.48,Compost,1,0.0,COUNTY,WINERY,-1.001973,-1.577576,0.138301,1
218,965.4,1,0.48,Recycle,1,0.0,COUNTY,WINERY,-1.101496,-2.230029,0.630257,1
345,890.4,1,4.0,Garbage,1,890.4,COUNTY,WINERY,1.241232,-2.181816,-1.013549,1


In [60]:
# Creating a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Class",
    symbol="Class",
    hover_name = "biz_type",
    hover_data = ["commodity"],
    width=800)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [61]:
# Create a table with tradable cryptocurrencies.
clustered_df.hvplot.table(columns=['monthly_bill', 'qty', 'size', 'commodity', 'pickups', 'bin_amount', 'tax_body', 'biz_type', 'Class'], sortable=True, selectable=True)

## Supervised Models

#### Balanced Forest Classifier

In [64]:
df = pd.read_csv("cust_serv_combined.csv").set_index('cust')
df.drop(columns=['cycle'], inplace=True)
# Create our features
X = df.drop("biz_type", axis=1)
X = pd.get_dummies(X)

# Create our target
y = df['biz_type']
X.head(15)

Unnamed: 0_level_0,monthly_bill,qty,size,pickups,bin_amount,commodity_Compost,commodity_Garbage,commodity_Recycle,tax_body_C,tax_body_COUNTY,tax_body_S,tax_body_Y
cust,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
98,888.98,1,4.0,1,813.98,0,1,0,1,0,0,0
98,888.98,1,4.0,2,75.0,0,0,1,1,0,0,0
98,888.98,1,4.0,2,75.0,0,0,1,1,0,0,0
218,965.4,1,4.0,1,75.0,0,0,1,0,1,0,0
218,965.4,1,4.0,1,890.4,0,1,0,0,1,0,0
218,965.4,1,0.48,1,0.0,0,0,1,0,1,0,0
218,965.4,1,0.48,1,0.0,0,0,1,0,1,0,0
218,965.4,1,0.48,1,0.0,1,0,0,0,1,0,0
218,965.4,1,0.48,1,0.0,0,0,1,0,1,0,0
345,890.4,1,4.0,1,890.4,0,1,0,0,1,0,0


In [65]:
X.describe()

Unnamed: 0,monthly_bill,qty,size,pickups,bin_amount,commodity_Compost,commodity_Garbage,commodity_Recycle,tax_body_C,tax_body_COUNTY,tax_body_S,tax_body_Y
count,1817.0,1817.0,1817.0,1817.0,1817.0,1817.0,1817.0,1817.0,1817.0,1817.0,1817.0,1817.0
mean,1296.22787,2.783709,1.555014,1.296643,274.506687,0.187122,0.326362,0.486516,0.222895,0.405614,0.280132,0.091359
std,1599.140549,15.869443,1.799651,0.718352,602.846274,0.390117,0.469011,0.499956,0.416303,0.491146,0.449187,0.288199
min,0.0,1.0,0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,195.75,1.0,0.48,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,699.2,1.0,0.48,1.0,32.62,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1772.84,1.0,3.5,1.0,103.93,0.0,1.0,1.0,0.0,1.0,1.0,0.0
max,8471.2,236.0,6.0,5.0,4634.19,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [66]:
# Check the balance of our target values
y.value_counts()

WINERY        559
MULTIFAMIL    187
RESTAURANT    164
OFFICE BLD     86
LODGING        70
LODGING-FW     66
GROCERY        65
VINEYARD       64
SCHOOL         43
RETAIL         40
CHURCH         39
MEDICAL        39
MEMBER         37
UTILITY        30
MOBILEHOME     29
COMMUNITY      27
CONTRACTOR     21
HOME IMPRO     20
MIXED USE      20
AUTO           19
BAKERY         19
POSTOFFICE     15
PARK           15
NRSG HOME      15
STRIP MALL     14
DELI/CAFE      12
POLICEFIRE     11
CATERING       11
FOURPLEX       10
HOA            10
PRESCHOOL       9
GROUP HOME      7
INDUSTRIAL      7
HOSPITAL        7
BAR NO FW       6
MINI MART       6
ENTERTAINM      6
DUPLEX          5
LANDSCAPER      4
FLORIST         3
Name: biz_type, dtype: int64

In [67]:
# Create the target variables
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [68]:
# Check to make sure the balances match
print(Counter(y_train))
print (Counter(y_test))

Counter({'WINERY': 416, 'MULTIFAMIL': 147, 'RESTAURANT': 120, 'OFFICE BLD': 62, 'LODGING': 53, 'GROCERY': 49, 'VINEYARD': 49, 'LODGING-FW': 45, 'SCHOOL': 33, 'MEDICAL': 30, 'MEMBER': 29, 'RETAIL': 29, 'UTILITY': 26, 'CHURCH': 24, 'MOBILEHOME': 23, 'COMMUNITY': 22, 'MIXED USE': 16, 'AUTO': 16, 'CONTRACTOR': 16, 'HOME IMPRO': 16, 'PARK': 12, 'POSTOFFICE': 12, 'DELI/CAFE': 11, 'BAKERY': 11, 'STRIP MALL': 10, 'NRSG HOME': 9, 'CATERING': 8, 'PRESCHOOL': 8, 'HOA': 8, 'POLICEFIRE': 8, 'HOSPITAL': 7, 'FOURPLEX': 6, 'INDUSTRIAL': 6, 'GROUP HOME': 5, 'ENTERTAINM': 5, 'MINI MART': 4, 'BAR NO FW': 4, 'DUPLEX': 3, 'LANDSCAPER': 3, 'FLORIST': 1})
Counter({'WINERY': 143, 'RESTAURANT': 44, 'MULTIFAMIL': 40, 'OFFICE BLD': 24, 'LODGING-FW': 21, 'LODGING': 17, 'GROCERY': 16, 'VINEYARD': 15, 'CHURCH': 15, 'RETAIL': 11, 'SCHOOL': 10, 'MEDICAL': 9, 'MEMBER': 8, 'BAKERY': 8, 'NRSG HOME': 6, 'MOBILEHOME': 6, 'COMMUNITY': 5, 'CONTRACTOR': 5, 'FOURPLEX': 4, 'MIXED USE': 4, 'HOME IMPRO': 4, 'STRIP MALL': 4, 'UTI

In [69]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

# Fit the model
brfc.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [70]:
# Calculated the balanced accuracy score
y_pred = brfc.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.19883994210917288

In [71]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  4,  1, ...,  0,  0,  0],
       [ 0,  0,  1, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  2,  0],
       [ 2,  0,  0, ...,  0, 16,  4]], dtype=int64)

In [72]:
# Display as DataFrame
cm_df = pd.DataFrame(confusion_matrix(y_test,y_pred), index=["Actual_high_risk", "Acutal_low_risk"], columns=["Predicted_high_risk", "Predicted_low_risk"])
cm_df

ValueError: Shape of passed values is (40, 40), indices imply (2, 2)

In [73]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       AUTO       0.00      0.00      0.95      0.00      0.00      0.00         3
     BAKERY       0.33      0.50      0.98      0.40      0.70      0.47         8
  BAR NO FW       0.05      0.50      0.96      0.10      0.69      0.46         2
   CATERING       0.12      0.33      0.98      0.18      0.57      0.31         3
     CHURCH       0.25      0.07      0.99      0.11      0.26      0.06        15
  COMMUNITY       0.00      0.00      0.96      0.00      0.00      0.00         5
 CONTRACTOR       0.00      0.00      0.97      0.00      0.00      0.00         5
  DELI/CAFE       0.14      1.00      0.99      0.25      0.99      0.99         1
     DUPLEX       0.29      1.00      0.99      0.44      0.99      0.99         2
 ENTERTAINM       0.00      0.00      0.95      0.00      0.00      0.00         1
    FLORIST       0.00      0.00      0.95      0.00      0.00      0.00         2
   

In [74]:
# List the features sorted in descending order by feature importance
features = sorted(zip(brfc.feature_importances_, X.columns), reverse=True)
for feature in features:
    print(f"{feature[1]}: ({feature[0]})")

monthly_bill: (0.32635456990529493)
bin_amount: (0.1297499172407895)
size: (0.09288964383493918)
qty: (0.08421254218171614)
tax_body_S: (0.06059900431716218)
tax_body_COUNTY: (0.04887697709402879)
tax_body_C: (0.04816035553534498)
commodity_Recycle: (0.04803975336885489)
pickups: (0.04607247595004977)
commodity_Garbage: (0.04275914787494485)
commodity_Compost: (0.03993308200376157)
tax_body_Y: (0.03235253069311321)


#### Easy Ensemble AdaBoost Classifier

In [75]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)

# Fit the model
eec.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [76]:
# Calculated the balanced accuracy score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.10186202686202686

In [77]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       AUTO       0.00      0.00      0.89      0.00      0.00      0.00         3
     BAKERY       0.00      0.00      1.00      0.00      0.00      0.00         8
  BAR NO FW       0.02      0.50      0.87      0.03      0.66      0.42         2
   CATERING       0.10      0.33      0.98      0.15      0.57      0.31         3
     CHURCH       0.50      0.20      0.99      0.29      0.45      0.18        15
  COMMUNITY       0.12      0.20      0.98      0.15      0.44      0.18         5
 CONTRACTOR       0.00      0.00      1.00      0.00      0.00      0.00         5
  DELI/CAFE       0.00      0.00      1.00      0.00      0.00      0.00         1
     DUPLEX       0.25      1.00      0.99      0.40      0.99      0.99         2
 ENTERTAINM       0.00      0.00      0.98      0.00      0.00      0.00         1
    FLORIST       0.00      0.00      0.89      0.00      0.00      0.00         2
   

#### Naive Oversampling

In [78]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
# Resample targets
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'MULTIFAMIL': 416,
         'UTILITY': 416,
         'WINERY': 416,
         'DUPLEX': 416,
         'MEMBER': 416,
         'OFFICE BLD': 416,
         'RETAIL': 416,
         'CATERING': 416,
         'LODGING': 416,
         'MIXED USE': 416,
         'GROCERY': 416,
         'SCHOOL': 416,
         'PRESCHOOL': 416,
         'RESTAURANT': 416,
         'VINEYARD': 416,
         'CHURCH': 416,
         'MOBILEHOME': 416,
         'MEDICAL': 416,
         'FOURPLEX': 416,
         'AUTO': 416,
         'GROUP HOME': 416,
         'MINI MART': 416,
         'PARK': 416,
         'COMMUNITY': 416,
         'HOA': 416,
         'LODGING-FW': 416,
         'ENTERTAINM': 416,
         'INDUSTRIAL': 416,
         'CONTRACTOR': 416,
         'POSTOFFICE': 416,
         'DELI/CAFE': 416,
         'BAR NO FW': 416,
         'BAKERY': 416,
         'POLICEFIRE': 416,
         'HOME IMPRO': 416,
         'STRIP MALL': 416,
         'HOSPITAL': 416,
         'NRSG HOME': 416,
         '

In [79]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='lbfgs', random_state=1)

# fit the model
logreg.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [80]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = logreg.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.021367521367521364

In [81]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [82]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       AUTO       0.00      0.00      1.00      0.00      0.00      0.00         3
     BAKERY       0.00      0.00      1.00      0.00      0.00      0.00         8
  BAR NO FW       0.00      0.00      1.00      0.00      0.00      0.00         2
   CATERING       0.00      0.00      1.00      0.00      0.00      0.00         3
     CHURCH       0.00      0.00      1.00      0.00      0.00      0.00        15
  COMMUNITY       0.00      0.00      1.00      0.00      0.00      0.00         5
 CONTRACTOR       0.00      0.00      1.00      0.00      0.00      0.00         5
  DELI/CAFE       0.00      0.00      1.00      0.00      0.00      0.00         1
     DUPLEX       0.00      0.00      1.00      0.00      0.00      0.00         2
 ENTERTAINM       0.00      0.00      1.00      0.00      0.00      0.00         1
    FLORIST       0.00      0.00      0.88      0.00      0.00      0.00         2
   

#### SMOTE Oversampling

In [84]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1, sampling_strategy='auto')
# Resample the targets
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
Counter(y_resampled)

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 4, n_neighbors = 6