In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler

# Read the CSV

In [4]:
pricing_out_df = pd.read_csv('pricing_out.csv')
pricing_out_df.head()

Unnamed: 0.1,Unnamed: 0,year,county,Description,GDP_Change,House_price,Income,interest_rate,lending_limit,periods,suggest_payment,suggest_income,priced_out
0,0,2010,Adams,"Agriculture, forestry, fishing and hunting",31.9,186966.95,52785,0.0025,0.25,360,788.260203,37836.48974,N
1,1,2010,Adams,"Mining, quarrying, and oil and gas extraction",1.4,186966.95,52785,0.0025,0.25,360,788.260203,37836.48974,N
2,2,2010,Adams,Manufacturing,-3.5,186966.95,52785,0.0025,0.25,360,788.260203,37836.48974,N
3,3,2010,Adams,"Educational services, health care, and social ...",6.5,186966.95,52785,0.0025,0.25,360,788.260203,37836.48974,N
4,4,2010,Adams,"Arts, entertainment, recreation, accommodation...",2.5,186966.95,52785,0.0025,0.25,360,788.260203,37836.48974,N


In [5]:
pricing_out_df.drop(columns=['Unnamed: 0'], inplace=True)
pricing_out_df.head()

Unnamed: 0,year,county,Description,GDP_Change,House_price,Income,interest_rate,lending_limit,periods,suggest_payment,suggest_income,priced_out
0,2010,Adams,"Agriculture, forestry, fishing and hunting",31.9,186966.95,52785,0.0025,0.25,360,788.260203,37836.48974,N
1,2010,Adams,"Mining, quarrying, and oil and gas extraction",1.4,186966.95,52785,0.0025,0.25,360,788.260203,37836.48974,N
2,2010,Adams,Manufacturing,-3.5,186966.95,52785,0.0025,0.25,360,788.260203,37836.48974,N
3,2010,Adams,"Educational services, health care, and social ...",6.5,186966.95,52785,0.0025,0.25,360,788.260203,37836.48974,N
4,2010,Adams,"Arts, entertainment, recreation, accommodation...",2.5,186966.95,52785,0.0025,0.25,360,788.260203,37836.48974,N


In [6]:
#drop rows with string values in GDP_Change column
pricing_out_df = pricing_out_df[~pricing_out_df['GDP_Change'].isin(['(D)', '(NM)'])]

In [7]:
# Change House_price to an integer
pricing_out_df['House_price'] = pricing_out_df['House_price'].astype(int)

# Split the Data into Training and Testing

In [8]:
#Define target variable
target= 'priced_out'

# Create our features
X = pricing_out_df.drop(columns='priced_out')
X = pd.get_dummies(X)


# Create our target
y = pricing_out_df.loc[:, target].copy()

In [9]:
X.describe()

Unnamed: 0,year,House_price,Income,interest_rate,lending_limit,periods,suggest_payment,suggest_income,county_Adams,county_Alamosa,...,GDP_Change_9.5,GDP_Change_9.6,GDP_Change_9.7,GDP_Change_9.8,GDP_Change_9.9,GDP_Change_92.8,GDP_Change_937.8,GDP_Change_98.2,GDP_Change_989.1,GDP_Change_99.3
count,1829.0,1829.0,1829.0,1829.0,1829.0,1829.0,1829.0,1829.0,1829.0,1829.0,...,1829.0,1829.0,1829.0,1829.0,1829.0,1829.0,1829.0,1829.0,1829.0,1829.0
mean,2013.054128,209293.217059,52622.390377,0.0025,0.25,360.0,882.390676,42354.75243,0.019683,0.022963,...,0.001093,0.00164,0.003827,0.002187,0.00164,0.000547,0.000547,0.000547,0.000547,0.000547
std,2.012766,129327.088424,15236.321562,8.24219e-18,0.0,0.0,545.248211,26171.914119,0.138946,0.149828,...,0.033059,0.040478,0.061763,0.046727,0.040478,0.023383,0.023383,0.023383,0.023383,0.023383
min,2010.0,33937.0,26075.0,0.0025,0.25,360.0,143.081242,6867.899595,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2011.0,110758.0,41730.0,0.0025,0.25,360.0,466.96125,22414.13999,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2013.0,188749.0,48934.0,0.0025,0.25,360.0,795.773515,38197.12871,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2015.0,275615.0,61501.0,0.0025,0.25,360.0,1162.005539,55776.26585,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2016.0,778772.0,112399.0,0.0025,0.25,360.0,3283.3381,157600.2288,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Check the balance of our target values
y.value_counts()

N    1180
Y     649
Name: priced_out, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### Easy Ensemble AdaBoost Classifier

In [12]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier 

# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

eec_model = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec_model.fit(X_train, y_train)
eec_y_pred = eec_model.predict(X_test)

In [13]:
# Calculated the balanced accuracy score

balanced_accuracy_score(y_test, eec_y_pred)

1.0

In [14]:
# Display the confusion matrix
# Calculating the confusion matrix.
eec_cm = confusion_matrix(y_test, eec_y_pred)

# Create a DataFrame from the confusion matrix.
eec_cm_df = pd.DataFrame(
    eec_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

eec_cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,283,0
Actual 1,0,175


In [15]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, eec_y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          N       1.00      1.00      1.00      1.00      1.00      1.00       283
          Y       1.00      1.00      1.00      1.00      1.00      1.00       175

avg / total       1.00      1.00      1.00      1.00      1.00      1.00       458

