In [1]:
import warnings
# Ignore every warning from this point on. Note that this also hides
# deprecation and data-conversion warnings emitted by the cells below.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
# Location of the input CSV, given relative to the current working directory
file_path = Path('ca_alcohol.csv')

In [5]:
# Read the CSV at file_path into a DataFrame, then show it as the cell output
df = pd.read_csv(file_path)
df

Unnamed: 0,county_fips_id,race_eth_code,race_eth_name,geotype,geoname,county_name,num_people_qrt,tot_people,pct_of_total,per_capita_income,...,manufacturing_2015,government_2015,recreation_2015,nonspecialized_2015,low_education_2015,low_employment_cnty,pop_loss_2010,retirement_dest_2015,persistent_poverty_2013,persistent_child_poverty_2013
0,6001,1,AIAN,CD,Alameda,Alameda,159,247,64,56261,...,0,0,0,1,0,0,0,0,0,0
1,6001,3,AfricanAm,CD,Alameda,Alameda,2674,4516,59,56261,...,0,0,0,1,0,0,0,0,0,0
2,6001,2,Asian,CD,Alameda,Alameda,14243,22822,62,56261,...,0,0,0,1,0,0,0,0,0,0
3,6001,4,Latino,CD,Alameda,Alameda,5689,8092,70,56261,...,0,0,0,1,0,0,0,0,0,0
4,6001,7,Multiple,CD,Alameda,Alameda,2490,4047,61,56261,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83080,6115,7,Multiple,PL,Wheatland city,Yuba,71,153,46,34989,...,0,1,0,0,1,1,0,0,0,1
83081,6115,5,NHOPI,PL,Wheatland city,Yuba,0,4,0,34989,...,0,1,0,0,1,1,0,0,0,1
83082,6115,8,Other,PL,Wheatland city,Yuba,0,7,0,34989,...,0,1,0,0,1,1,0,0,0,1
83083,6115,9,Total,PL,Wheatland city,Yuba,1583,3456,45,34989,...,0,1,0,0,1,1,0,0,0,1


In [6]:
# Inspect the dtype pandas inferred for each column of the raw frame
df.dtypes

county_fips_id                     int64
race_eth_code                      int64
race_eth_name                     object
geotype                           object
geoname                           object
county_name                       object
num_people_qrt                     int64
tot_people                         int64
pct_of_total                       int64
per_capita_income                  int64
total_income                       int64
year_2011                          int64
unemp_2011                       float64
year_2012                          int64
unemp_2012                       float64
year_2013                          int64
unemp_2013                       float64
year_2014                          int64
unemp_2014                       float64
year_2015                          int64
unemp_2015                       float64
year_2016                          int64
unemp_2016                       float64
year_2017                          int64
unemp_2017      

In [56]:
# Columns removed from the frame before the target label is built
dropped_columns = [
    "county_fips_id",
    "race_eth_code",
    "geotype",
    "num_people_qrt",
    "tot_people",
    "total_income",
    "geoname",
    "year_2011",
    "year_2012",
    "year_2013",
    "year_2014",
    "year_2015",
    "year_2016",
    "year_2017",
    "year_2018",
    "year_2019",
    "median_house_pct",
    "metro_status",
    "economic_types",
    "farming_2015",
    "mining_2015",
    "manufacturing_2015",
    "government_2015",
    "recreation_2015",
    "nonspecialized_2015",
    "low_education_2015",
    "low_employment_cnty",
    "pop_loss_2010",
    "retirement_dest_2015",
    "persistent_poverty_2013",
    "persistent_child_poverty_2013",
    "economic_type_label",
]

# Work on a copy of the raw frame: discard rows containing any missing value,
# then strip the columns listed above.
df_dropna = df.copy().dropna()
df_drop_columns = df_dropna.drop(columns=dropped_columns)

In [57]:
# Display the frame that remains after dropping rows with NaNs and the unused columns
df_drop_columns

Unnamed: 0,race_eth_name,county_name,pct_of_total,per_capita_income,unemp_2011,unemp_2012,unemp_2013,unemp_2014,unemp_2015,unemp_2016,unemp_2017,unemp_2018,unemp_2019,median_house_price
0,AIAN,Alameda,64,56261,10.1,8.7,7.2,5.8,4.7,4.3,3.7,3.1,2.9,107589
1,AfricanAm,Alameda,59,56261,10.1,8.7,7.2,5.8,4.7,4.3,3.7,3.1,2.9,107589
2,Asian,Alameda,62,56261,10.1,8.7,7.2,5.8,4.7,4.3,3.7,3.1,2.9,107589
3,Latino,Alameda,70,56261,10.1,8.7,7.2,5.8,4.7,4.3,3.7,3.1,2.9,107589
4,Multiple,Alameda,61,56261,10.1,8.7,7.2,5.8,4.7,4.3,3.7,3.1,2.9,107589
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83080,Multiple,Yuba,46,34989,17.0,15.4,13.2,11.2,9.3,8.6,7.4,6.5,6.1,56607
83081,NHOPI,Yuba,0,34989,17.0,15.4,13.2,11.2,9.3,8.6,7.4,6.5,6.1,56607
83082,Other,Yuba,0,34989,17.0,15.4,13.2,11.2,9.3,8.6,7.4,6.5,6.1,56607
83083,Total,Yuba,45,34989,17.0,15.4,13.2,11.2,9.3,8.6,7.4,6.5,6.1,56607


In [59]:
# Turn pct_of_total into a two-level categorical label, split at PCT_CUTOFF:
# values at or below the cut-off are 'low_percentile', values above it are
# 'high_percentile'.
PCT_CUTOFF = 50.00
race_percentile = [
    (df_drop_columns['pct_of_total'] <= PCT_CUTOFF),
    (df_drop_columns['pct_of_total'] > PCT_CUTOFF)
    ]
values = ['low_percentile', 'high_percentile']

# np.select falls back to `default` (the integer 0 unless overridden) for rows
# that match no condition. An integer default has no common dtype with the
# string labels, which newer NumPy versions reject with a TypeError, so a
# string default is supplied. Rows only reach it if pct_of_total is missing.
df_drop_columns['race_percent_qtr'] = np.select(race_percentile, values, default='unknown')


In [60]:
# Display the frame again, now including the race_percent_qtr label column
df_drop_columns

Unnamed: 0,race_eth_name,county_name,pct_of_total,per_capita_income,unemp_2011,unemp_2012,unemp_2013,unemp_2014,unemp_2015,unemp_2016,unemp_2017,unemp_2018,unemp_2019,median_house_price,race_percent_qtr
0,AIAN,Alameda,64,56261,10.1,8.7,7.2,5.8,4.7,4.3,3.7,3.1,2.9,107589,high_percentile
1,AfricanAm,Alameda,59,56261,10.1,8.7,7.2,5.8,4.7,4.3,3.7,3.1,2.9,107589,high_percentile
2,Asian,Alameda,62,56261,10.1,8.7,7.2,5.8,4.7,4.3,3.7,3.1,2.9,107589,high_percentile
3,Latino,Alameda,70,56261,10.1,8.7,7.2,5.8,4.7,4.3,3.7,3.1,2.9,107589,high_percentile
4,Multiple,Alameda,61,56261,10.1,8.7,7.2,5.8,4.7,4.3,3.7,3.1,2.9,107589,high_percentile
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83080,Multiple,Yuba,46,34989,17.0,15.4,13.2,11.2,9.3,8.6,7.4,6.5,6.1,56607,low_percentile
83081,NHOPI,Yuba,0,34989,17.0,15.4,13.2,11.2,9.3,8.6,7.4,6.5,6.1,56607,low_percentile
83082,Other,Yuba,0,34989,17.0,15.4,13.2,11.2,9.3,8.6,7.4,6.5,6.1,56607,low_percentile
83083,Total,Yuba,45,34989,17.0,15.4,13.2,11.2,9.3,8.6,7.4,6.5,6.1,56607,low_percentile


In [75]:
# pct_of_total is the column race_percent_qtr was derived from, so remove it
# from the modelling frame.
df_clean = df_drop_columns.drop("pct_of_total", axis="columns")

In [76]:
# Separate the target from the features
# Target: the percentile label, kept as a single-column DataFrame
y = pd.DataFrame(df_clean['race_percent_qtr'])
# Features: one-hot encode the remaining text columns. The dummy dtype is pinned
# to uint8 because pandas 2.0 changed the default to bool, and bool columns are
# left out of the numeric summary produced by X.describe().
X = pd.get_dummies(df_clean.drop(columns="race_percent_qtr"), dtype="uint8")

In [77]:
# Summary statistics for every numeric feature column
X.describe()

Unnamed: 0,per_capita_income,unemp_2011,unemp_2012,unemp_2013,unemp_2014,unemp_2015,unemp_2016,unemp_2017,unemp_2018,unemp_2019,...,county_name_Sonoma,county_name_Stanislaus,county_name_Sutter,county_name_Tehama,county_name_Trinity,county_name_Tulare,county_name_Tuolumne,county_name_Ventura,county_name_Yolo,county_name_Yuba
count,83085.0,83085.0,83085.0,83085.0,83085.0,83085.0,83085.0,83085.0,83085.0,83085.0,...,83085.0,83085.0,83085.0,83085.0,83085.0,83085.0,83085.0,83085.0,83085.0,83085.0
mean,49638.104459,12.000077,10.655881,9.220537,7.752823,6.430342,5.719745,5.017152,4.457504,4.232614,...,0.014684,0.01401,0.00343,0.00272,0.001324,0.01348,0.002732,0.020846,0.006403,0.003105
std,13765.207643,2.683645,2.617569,2.444816,2.329851,2.291345,2.279096,1.979751,1.891821,1.847125,...,0.120284,0.117531,0.058468,0.052084,0.036362,0.11532,0.052199,0.14287,0.079763,0.055639
min,30437.0,7.3,6.3,5.2,4.2,3.4,3.0,2.7,2.3,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,37905.0,10.2,8.9,7.5,6.2,5.0,4.5,3.8,3.2,3.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,49400.0,12.2,10.9,9.8,8.2,6.6,5.3,4.8,4.3,4.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,54423.0,12.9,11.4,9.8,8.3,6.6,5.9,5.0,4.6,4.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,98626.0,28.9,27.4,25.0,24.0,24.5,24.1,19.5,18.9,18.3,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [78]:
# Count how many rows fall into each target class
y.loc[:, 'race_percent_qtr'].value_counts()

low_percentile     44918
high_percentile    38167
Name: race_percent_qtr, dtype: int64

In [79]:
from sklearn.model_selection import train_test_split

# Stratify on the target so both partitions keep the same class proportions;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)
y_train.shape

(62313, 1)

In [80]:
# Balanced Random Forest Classifier
from imblearn.ensemble import BalancedRandomForestClassifier
rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
# y_train is a single-column DataFrame; flatten it to 1-D so the estimator
# receives labels of shape (n_samples,) instead of a column vector, which
# would otherwise raise a DataConversionWarning.
rf_model = rf_model.fit(X_train, y_train.values.ravel())

In [81]:
# Predict on the held-out set and compute the balanced accuracy score
y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.6667861428305063

In [82]:
# Confusion matrix for the balanced random forest predictions
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[6173, 3369],
       [3519, 7711]])

In [83]:
# Build the imbalanced classification report for the random forest, then print it
rf_report = classification_report_imbalanced(y_test, y_pred)
print(rf_report)

                       pre       rec       spe        f1       geo       iba       sup

high_percentile       0.64      0.65      0.69      0.64      0.67      0.44      9542
 low_percentile       0.70      0.69      0.65      0.69      0.67      0.45     11230

    avg / total       0.67      0.67      0.67      0.67      0.67      0.44     20772



In [84]:
# Pair each importance score with its feature name and rank them, largest first
importance_by_feature = list(zip(rf_model.feature_importances_, X.columns))
importance_by_feature.sort(reverse=True)
importance_by_feature

[(0.1247223925527769, 'unemp_2016'),
 (0.08640038737115972, 'unemp_2013'),
 (0.0861450494863796, 'per_capita_income'),
 (0.07523086774072574, 'unemp_2012'),
 (0.06797421553468136, 'unemp_2017'),
 (0.06791541604530586, 'unemp_2019'),
 (0.06542135809389314, 'unemp_2011'),
 (0.06247394789729797, 'unemp_2014'),
 (0.061847586281680855, 'unemp_2015'),
 (0.05566025755423161, 'county_name_Los Angeles'),
 (0.05068497681661299, 'unemp_2018'),
 (0.04888857515359423, 'median_house_price'),
 (0.02599089622943103, 'county_name_San Francisco'),
 (0.009213054298276365, 'race_eth_name_AfricanAm'),
 (0.009062746879823428, 'race_eth_name_White'),
 (0.008114380988909323, 'county_name_Riverside'),
 (0.008114208865660328, 'race_eth_name_Latino'),
 (0.007638343045913815, 'race_eth_name_Other'),
 (0.0069275254037862435, 'race_eth_name_NHOPI'),
 (0.0068322532009267205, 'race_eth_name_Asian'),
 (0.006696251972999954, 'race_eth_name_Multiple'),
 (0.006049489441246271, 'race_eth_name_AIAN'),
 (0.00581487355701333

In [85]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
ee_model = EasyEnsembleClassifier(n_estimators=100, random_state=1)
# y_train is a single-column DataFrame; flatten it to 1-D so the estimator
# receives labels of shape (n_samples,) instead of a column vector, which
# would otherwise raise a DataConversionWarning.
ee_model = ee_model.fit(X_train, y_train.values.ravel())

In [86]:
# Predict on the held-out set with the easy ensemble and compute balanced accuracy
y_pred_ee = ee_model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred_ee)

0.6677314223866253

In [87]:
# Confusion matrix for the easy ensemble predictions
confusion_matrix(y_true=y_test, y_pred=y_pred_ee)

array([[6163, 3379],
       [3486, 7744]])

In [88]:
# Build the imbalanced classification report for the easy ensemble, then print it
ee_report = classification_report_imbalanced(y_test, y_pred_ee)
print(ee_report)

                       pre       rec       spe        f1       geo       iba       sup

high_percentile       0.64      0.65      0.69      0.64      0.67      0.44      9542
 low_percentile       0.70      0.69      0.65      0.69      0.67      0.45     11230

    avg / total       0.67      0.67      0.67      0.67      0.67      0.45     20772

