# KC House Data

Classifer models to predict price_gt_1M

In [5]:
# To auto-reload modules in jupyter notebook (so that changes in files *.py doesn't require manual reloading):
# https://stackoverflow.com/questions/5364050/reloading-submodules-in-ipython
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


> Import libraries and magic command for inline plotting

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



import warnings
warnings.filterwarnings("ignore")

In [7]:
%matplotlib inline

In [None]:
!conda list | findstr "pandas-profiling"

In [None]:
!conda list | findstr "pydantic-settings"

In [None]:
!conda list | findstr "skimpy"

In [None]:
# pip install sweetviz

In [None]:
# pip show pandas-profiling

In [None]:
# !pip install pydantic-settings

In [None]:
# !pip install skimpy

In [None]:
# pip install category_encoders

### Task 3 - EDA

In [8]:
# Read CSV file into pandas dataframe
housing_df = pd.read_csv("kc_house_data_classification.csv")

# Data Summary 
print("Number if rows and columns:", housing_df.shape)
print("\nData types:")
print(housing_df.dtypes)
print("\nSummary stats:")
print(housing_df.describe())

# Missing Values
missing_values = housing_df.isnull().sum()
print("\nMissing values:")
print(missing_values)

# Check if price_gt_1M is binary 
print(housing_df['price_gt_1M'].unique())

Number if rows and columns: (21613, 19)

Data types:
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
price_gt_1M        int64
dtype: object

Summary stats:
           bedrooms     bathrooms   sqft_living      sqft_lot        floors  \
count  21613.000000  21613.000000  21613.000000  2.161300e+04  21613.000000   
mean       3.370842      2.114757   2079.899736  1.510697e+04      1.494309   
std        0.930062      0.770163    918.440897  4.142051e+04      0.539989   
min        0.000000      0.000000    290.000000  5.200000e+02      1.000000   
25%        3.000000      1.750000   1427.000000

In [None]:
import sweetviz

In [None]:
report = sweetviz.analyze(housing_df)

In [None]:
report.show_html("output/sweetviz_report.html")

### Manual EDA

In [None]:
# Distribution of Target Variable
custom_color = ["#FF69B4", "#6A0DAD"]
# Set style
sns.set(style="whitegrid")

plt.figure(figsize=(8,6))
sns.countplot(x=housing_df['price_gt_1M'], palette=custom_color)
plt.title("Distribution of House Prices")
plt.xlabel("Price") 
plt.ylabel("Frequency")
plt.xticks([0, 1], ['Price <= 1M', 'Price > 1M']) # x-axis labels 
plt.show()


In [None]:
# Custom color palette
# Hot pink and dark purple
custom_color_scatter = ["#FF69B4", "#6A0DAD"]
custom_palette = ["#6A0DAD"]

# Set color palette
# sns.set_palette(custom_color)
               
# Set plot style
sns.set(style="whitegrid")

# Identify numerical and categorical columns
numerical_cols = housing_df.select_dtypes(include=np.number).columns.tolist()
categorical_cols = housing_df.select_dtypes(include=['object', 'category']).columns.tolist()

# Exclude target variable price_gt_1M from numerical columns 
target_variable = "price_gt_1M"
if target_variable in numerical_cols:
    numerical_cols.remove(target_variable)

# Feature distributions
for col in numerical_cols:
    plt.figure(figsize=(6, 4))
    if col in ['zipcode', 'lat', 'long', 'sqft_lot', 'floors', 'yr_renovated', 'sqft_lot15']:
        print(f"Not showing distribution plot for {col} because it's not suitable for this kind of plot")
        continue
    if col in ['waterfront', 'view', 'condition']:
        sns.countplot(x=col, data=housing_df, palette="Purples")
        plt.title(f"Distribution of {col}")
        plt.xlabel(col)
        plt.ylabel("Count")
        plt.show()
    else:
        sns.histplot(housing_df[col], bins=20, kde=True, color=custom_palette[0])
        plt.title(f"Distribution of {col}")
        plt.xlabel(col)
        plt.ylabel("Frequency")
        plt.show()

# Scatter plot
plt.figure(figsize=(8, 6))
sns.scatterplot(x='long', y='lat', hue='price_gt_1M', data=housing_df, alpha=0.5, palette=custom_color_scatter)
plt.title("Geographic Distribution of Properties")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.show()

# Bar plot
plt.figure(figsize=(16, 6))
sns.countplot(x='zipcode', data=housing_df)
plt.title("Distribution of Properties by Zipcode")
plt.xlabel("Zipcode")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()      


In [None]:
# Correlation Matrix
correlation_matrix = housing_df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="Purples", fmt=".2f")
plt.title("Correlation Matrix")
plt.show()


In [None]:
housing_df.shape

In [None]:
housing_df.columns

In [None]:
housing_df.isna().sum()

In [None]:
# cols_to_keep = [
#    'price_gt_1M', 'sqft_living', 'grade', 'sqft_above', 'sqft_living15', 'bathrooms',
#                   'floors', 'yr_renovated', 'lat', 'bedrooms', 'yr_built', 'sqft_basement', 'long'
# ]

> We need to find and remove any duplicate rows. If we don't remove them it could affect model performance, like overfitting. 

In [9]:
# Check for duplicate rows
duplicates = housing_df.duplicated()
print(f"Number of duplicate rows: {duplicates.sum()}")

# Remove duplicate rows
housing_df = housing_df.drop_duplicates()

# Verify that duplicates are removed
print(f"Number of rows after removing duplicates: {housing_df.shape[0]}")

Number of duplicate rows: 184
Number of rows after removing duplicates: 21429


In [10]:
X = housing_df.iloc[:, 0:18]
y = housing_df.iloc[:, 18]

In [11]:
X.shape

(21429, 18)

In [12]:
y.shape

(21429,)

In [None]:
### Task 4

In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
clf_model = LogisticRegression(penalty='l2', C=1, solver='saga', max_iter=2000)

In [15]:
from sklearn.model_selection import cross_val_score

# Cross validation
cross_val_score(clf_model, X, y, cv=5, scoring='accuracy').mean()

0.9456344310612665

> A higher mean accuracy score indicates that the model performs better. Cross validation is useful for assessing the general performance of the model.


>This function returns the proportion of each unique value. The zero category value will be the same as your null model accuracy.

In [16]:
y.value_counts(normalize=True)

price_gt_1M
0    0.930655
1    0.069345
Name: proportion, dtype: float64

In [20]:
from sklearn.preprocessing import OneHotEncoder

housing_df["yr_renovated_binary"] = (housing_df["yr_renovated"] > 0).astype(int)
oe_style = OneHotEncoder()
oe_results = oe_style.fit_transform(housing_df[["yr_renovated_binary"]])
result_df = pd.DataFrame(oe_results.toarray(), columns=oe_style.categories_[0])

result_df.head


<bound method NDFrame.head of          0    1
0      1.0  0.0
1      0.0  1.0
2      1.0  0.0
3      1.0  0.0
4      1.0  0.0
...    ...  ...
21424  1.0  0.0
21425  1.0  0.0
21426  1.0  0.0
21427  1.0  0.0
21428  1.0  0.0

[21429 rows x 2 columns]>

In [21]:
# Confirm categories in array
oe_style.categories_

[array([0, 1])]

> This code drops yr_renovated and price_gt_1M. We don't need yr_renovated because we created yr_renovated_binary. We don't want our target variable in our feature matrix. 

### Data preprocessing - Variable type list

In [22]:
# Define categorical and numeric columns
all_cols = X.columns.tolist()

categorical_cols = ['floors', 'yr_renovated_binary', 'bedrooms']
print(categorical_cols)

numeric_cols = [col for col in all_cols if col not in categorical_cols]
numeric_cols

['floors', 'yr_renovated_binary', 'bedrooms']


['bathrooms',
 'sqft_living',
 'sqft_lot',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'zipcode',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15']

In [23]:
# Convert categorical columns to category type
for col in categorical_cols:
    housing_df[col] = housing_df[col].astype('category')

> Because we are doing regularize logistic regression we need to rescale our numeric variables so the units of measurement doesn't affect our model fit. Then we need to combine transformer into a prepocessor step, create a classifier model and a pipeline. 

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Create transformer objects
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine transformers into a preprocessor step
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)])

# Classifier model
# clf_model = LogisticRegression(penalty='l2', C=1, solver='saga', max_iter=500)

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', clf_model)])

In [25]:
# Display pipeline
from sklearn import set_config

set_config(display='diagram')
clf

### Machine Learning Pipeline for Binary Classification with Feature Engineering

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.metrics import accuracy_score, confusion_matrix
import category_encoders as ce

# Convert column names to strings
housing_df.columns = housing_df.columns.astype(str)

# Partition dataset
X = housing_df.drop(columns=['price_gt_1M'])  # Exclude target variable
y = housing_df['price_gt_1M']

# Splits data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=73)

# Target encoding for categorical columns
target_encoder = ce.TargetEncoder(cols=categorical_cols)
X_train_encoded = target_encoder.fit_transform(X_train, y_train)
X_test_encoded = target_encoder.transform(X_test)

clf.fit(X_train_encoded, y_train)

print("Training score: %.3f" % clf.score(X_train_encoded, y_train))
print("Test score: %.3f" % clf.score(X_test_encoded, y_test))


Training score: 0.960
Test score: 0.963


#### Model 0: Null Model 

In [27]:
# Null model
from sklearn.dummy import DummyClassifier

# Create a dummy classifier predicting the most frequent class
null_model = DummyClassifier(strategy='most_frequent')

# Fit the null model
null_model.fit(X_train, y_train)

# Predict on training and test data
y_train_pred_null = null_model.predict(X_train)
y_test_pred_null = null_model.predict(X_test)

# Evaluate the null model
train_accuracy_null = accuracy_score(y_train, y_train_pred_null)
test_accuracy_null = accuracy_score(y_test, y_test_pred_null)

train_conf_matrix_null = confusion_matrix(y_train, y_train_pred_null)
test_conf_matrix_null = confusion_matrix(y_test, y_test_pred_null)

# Print accuracy 
print("Null Model - Training score: {:.3f}".format(train_accuracy_null))
print("Null Model - Test score: {:.3f}".format(test_accuracy_null))

# Print confusion matrix
print("Null Model - Training Confusion Matrix:")
print(train_conf_matrix_null)
print("\nNull Model - Test Confusion Matrix:")
print(test_conf_matrix_null)


Null Model - Training score: 0.929
Null Model - Test score: 0.935
Null Model - Training Confusion Matrix:
[[15934     0]
 [ 1209     0]]

Null Model - Test Confusion Matrix:
[[4009    0]
 [ 277    0]]


#### Model 1: Ridge regression with C=1.0

In [48]:
from sklearn.metrics import confusion_matrix

# Classifier
clf_model_ridge = LogisticRegression(penalty='l2', C=1.0, solver='saga', max_iter=2000)

# Append classifier to preprocessing pipeline
clf_ridge = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', clf_model_ridge)])

# Fit model on training data
clf_ridge.fit(X_train_encoded, y_train)

print("Training score: %.3f" % clf_ridge.score(X_train_encoded, y_train))
print("Test score: %.3f" % clf_ridge.score(X_test_encoded, y_test))

# Make predictions on test data
y_test_pred_ridge = clf_ridge.predict(X_test_encoded)

# Calculate confusion matrix for test data
conf_matrix_test_ridge = confusion_matrix(y_test, y_test_pred_ridge)
print("Confusion Matrix for Ridge Model (Test Set):")
print(conf_matrix_test_ridge)

# Print prediction scores (probabilities) for training and test data
y_train_proba_ridge = clf_ridge.predict_proba(X_train_encoded)
y_test_proba_ridge = clf_ridge.predict_proba(X_test_encoded)

print("Prediction Scores (Probabilities) for Ridge Model (Training Set):")
print(y_train_proba_ridge)
print("Prediction Scores (Probabilities) for Ridge Model (Test Set):")
print(y_test_proba_ridge)

Training score: 0.960
Test score: 0.963
Confusion Matrix for Ridge Model (Test Set):
[[3952   57]
 [ 101  176]]
Prediction Scores (Probabilities) for Ridge Model (Training Set):
[[9.99747238e-01 2.52761594e-04]
 [9.92719017e-01 7.28098314e-03]
 [9.99330701e-01 6.69299449e-04]
 ...
 [9.98536245e-01 1.46375547e-03]
 [9.98465922e-01 1.53407786e-03]
 [9.95039836e-01 4.96016424e-03]]
Prediction Scores (Probabilities) for Ridge Model (Test Set):
[[0.99141678 0.00858322]
 [0.99871364 0.00128636]
 [0.99599659 0.00400341]
 ...
 [0.1044451  0.8955549 ]
 [0.98152131 0.01847869]
 [0.9989776  0.0010224 ]]


In [39]:
# Cross validation
cross_val_score(clf_model_ridge, X, y, cv=5, scoring='accuracy').mean()

0.9456344310612665

#### Model 2: Lasso regression with C=1.0

In [49]:
# Classifier
clf_model_lasso_C1 = LogisticRegression(penalty='l1', C=1.0, solver='saga', max_iter=2000)

# Append classifier to preprocessing pipeline
clf_lasso = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', clf_model_lasso_C1)])

# Fit model on training data
clf_lasso.fit(X_train_encoded, y_train)

print(f"Training score: {clf_lasso.score(X_train_encoded, y_train):.3f}")
print(f"Test score: {clf_lasso.score(X_test_encoded, y_test):.3f}")

# Make predictions on training and test data
y_train_pred_lasso = clf_lasso.predict(X_train_encoded)
y_test_pred_lasso = clf_lasso.predict(X_test_encoded)

# Calculate confusion matrix for training and test data
conf_matrix_train_lasso = confusion_matrix(y_train, y_train_pred_lasso)
conf_matrix_test_lasso = confusion_matrix(y_test, y_test_pred_lasso)

print("Confusion Matrix for Lasso Model (Training Set):")
print(conf_matrix_train_lasso)
print("Confusion Matrix for Lasso Model (Test Set):")
print(conf_matrix_test_lasso)

# Print prediction scores (probabilities) for training and test data
y_train_proba_lasso = clf_lasso.predict_proba(X_train_encoded)
y_test_proba_lasso = clf_lasso.predict_proba(X_test_encoded)

print("Prediction Scores (Probabilities) for Lasso Model (Training Set):")
print(y_train_proba_lasso)
print("Prediction Scores (Probabilities) for Lasso Model (Test Set):")
print(y_test_proba_lasso)

Training score: 0.960
Test score: 0.963
Confusion Matrix for Lasso Model (Training Set):
[[15728   206]
 [  478   731]]
Confusion Matrix for Lasso Model (Test Set):
[[3951   58]
 [ 101  176]]
Prediction Scores (Probabilities) for Lasso Model (Training Set):
[[9.99739522e-01 2.60477648e-04]
 [9.92553420e-01 7.44657960e-03]
 [9.99351318e-01 6.48681824e-04]
 ...
 [9.98595132e-01 1.40486829e-03]
 [9.98456378e-01 1.54362157e-03]
 [9.94925789e-01 5.07421101e-03]]
Prediction Scores (Probabilities) for Lasso Model (Test Set):
[[0.99180918 0.00819082]
 [0.99868569 0.00131431]
 [0.99589554 0.00410446]
 ...
 [0.10363629 0.89636371]
 [0.98149702 0.01850298]
 [0.99894983 0.00105017]]


In [40]:
# Cross validation
cross_val_score(clf_model_lasso_C1, X, y, cv=5, scoring='accuracy').mean()

0.9456344310612665

#### Model 3: Lasso regression with C=0.01

In [47]:
# Classifier
clf_model_lasso_C01 = LogisticRegression(penalty='l1', C=0.01, solver='saga', max_iter=2000)

# Append classifier to preprocessing pipeline
clf_lasso_C01 = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', clf_model_lasso_C01)])

# Fit model on training data
clf_lasso_C01.fit(X_train_encoded, y_train)

print("Training score: %.3f" % clf_lasso.score(X_train_encoded, y_train))
print("Test score: %.3f" % clf_lasso.score(X_test_encoded, y_test))

# Make predictions on training and test data
y_train_pred_lasso_C01 = clf_lasso_C01.predict(X_train_encoded)
y_test_pred_lasso_C01 = clf_lasso_C01.predict(X_test_encoded)

# Calculate confusion matrix for training and test data
conf_matrix_train_lasso_C01 = confusion_matrix(y_train, y_train_pred_lasso_C01)
conf_matrix_test_lasso_C01 = confusion_matrix(y_test, y_test_pred_lasso_C01)

print("Confusion Matrix for Lasso Model with C=0.01 (Training Set):")
print(conf_matrix_train_lasso_C01)
print("Confusion Matrix for Lasso Model with C=0.01 (Test Set):")
print(conf_matrix_test_lasso_C01)

# Print prediction scores (probabilities) for test data
y_test_proba_lasso_C01 = clf_lasso_C01.predict_proba(X_test_encoded)
print("Prediction Scores (Probabilities) for Lasso Model with C=0.01 (Test Set):")
print(y_test_proba_lasso_C01)

# Print prediction scores (probabilities) for training data
y_train_proba_lasso_C01 = clf_lasso_C01.predict_proba(X_train_encoded)
print("Prediction Scores (Probabilities) for Lasso Model with C=0.01 (Training Set):")
print(y_train_proba_lasso_C01)

Training score: 0.960
Test score: 0.963
Confusion Matrix for Lasso Model with C=0.01 (Training Set):
[[15803   131]
 [  588   621]]
Confusion Matrix for Lasso Model with C=0.01 (Test Set):
[[3976   33]
 [ 119  158]]
Prediction Scores (Probabilities) for Lasso Model with C=0.01 (Test Set):
[[0.98858524 0.01141476]
 [0.99525397 0.00474603]
 [0.98234507 0.01765493]
 ...
 [0.226201   0.773799  ]
 [0.97562559 0.02437441]
 [0.98552912 0.01447088]]
Prediction Scores (Probabilities) for Lasso Model with C=0.01 (Training Set):
[[0.99847139 0.00152861]
 [0.97020213 0.02979787]
 [0.99592027 0.00407973]
 ...
 [0.99585178 0.00414822]
 [0.99339204 0.00660796]
 [0.9867584  0.0132416 ]]


In [41]:
# Cross validation
cross_val_score(clf_model_lasso_C01, X, y, cv=5, scoring='accuracy').mean()

0.9456344310612665

In [33]:
# Lasso regression with C=0.01
lasso_model_3 = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')

# Create pipeline for preprocessing and classifier
lasso_pipeline_3 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lasso_model_3)
])

# Fit the model
lasso_pipeline_3.fit(X_train_encoded, y_train)

# Predict on training and test data
y_train_pred_lasso_3 = lasso_pipeline_3.predict(X_train)
y_test_pred_lasso_3 = lasso_pipeline_3.predict(X_test)

# Convert continuous predictions to binary labels
threshold = 0.5  # Define a threshold for converting continuous predictions to binary labels
y_train_pred_lasso_3_binary = (y_train_pred_lasso_3 > threshold).astype(int)
y_test_pred_lasso_3_binary = (y_test_pred_lasso_3 > threshold).astype(int)

# Evaluate model
train_accuracy_lasso_3 = accuracy_score(y_train, y_train_pred_lasso_3_binary)
test_accuracy_lasso_3 = accuracy_score(y_test, y_test_pred_lasso_3_binary)

train_conf_matrix_lasso_3 = confusion_matrix(y_train, y_train_pred_lasso_3_binary)
test_conf_matrix_lasso_3 = confusion_matrix(y_test, y_test_pred_lasso_3_binary)

# Print accuracy 
print("\nModel 3: Lasso regression with C=0.01")
print(f"Training score: {train_accuracy_lasso_3:.3f}")
print(f"Test score: {test_accuracy_lasso_3:.3f}")

# Print confusion matrix
print("Training Confusion Matrix:")
print(train_conf_matrix_lasso_3)
print("\nTest Confusion Matrix:")
print(test_conf_matrix_lasso_3)




Model 3: Lasso regression with C=0.01
Training score: 0.956
Test score: 0.958
Training Confusion Matrix:
[[15621   313]
 [  439   770]]

Test Confusion Matrix:
[[3918   91]
 [  87  190]]


#### Model 4: Lasso regression with optimal C value

In [34]:
# Lasso regression with optimal C value
from sklearn.linear_model import LogisticRegressionCV

# Lasso regression with optimal C value using LogisticRegressionCV
lasso_model_cv = LogisticRegressionCV(cv=5, penalty='l1', solver='liblinear')

# Create pipeline for preprocessing and classifier
lasso_pipeline_cv = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lasso_model_cv)
])

# Fit the model
lasso_pipeline_cv.fit(X_train_encoded, y_train)

# Predict on training and test data
y_train_pred_lasso_cv = lasso_pipeline_cv.predict(X_train)
y_test_pred_lasso_cv = lasso_pipeline_cv.predict(X_test)

# Convert continuous predictions to binary labels
threshold = 0.5  # Define a threshold for converting continuous predictions to binary labels
y_train_pred_lasso_cv_binary = (y_train_pred_lasso_cv > threshold).astype(int)
y_test_pred_lasso_cv_binary = (y_test_pred_lasso_cv > threshold).astype(int)

# Evaluate model
train_accuracy_lasso_cv = accuracy_score(y_train, y_train_pred_lasso_cv_binary)
test_accuracy_lasso_cv = accuracy_score(y_test, y_test_pred_lasso_cv_binary)

train_conf_matrix_lasso_cv = confusion_matrix(y_train, y_train_pred_lasso_cv_binary)
test_conf_matrix_lasso_cv = confusion_matrix(y_test, y_test_pred_lasso_cv_binary)

# Print accuracy 
print("\nModel 4: Lasso regression with optimal C value")
print(f"Training score: {train_accuracy_lasso_cv:.3f}")
print(f"Test score: {test_accuracy_lasso_cv:.3f}")

# Print confusion matrix
print("Training Confusion Matrix:")
print(train_conf_matrix_lasso_cv)
print("\nTest Confusion Matrix:")
print(test_conf_matrix_lasso_cv)

# Print optimal value of C
print(f"Optimal value of C: {lasso_model_cv.C_}")


Model 4: Lasso regression with optimal C value
Training score: 0.959
Test score: 0.962
Training Confusion Matrix:
[[15634   300]
 [  398   811]]

Test Confusion Matrix:
[[3924   85]
 [  77  200]]
Optimal value of C: [0.04641589]


In [42]:
cross_val_score(lasso_model_cv, X, y, cv=5, scoring='accuracy').mean()

0.9572074175996201

#### Model 5: Simple decision tree

In [35]:
from sklearn.tree import DecisionTreeClassifier

# Simple decision tree model
decision_tree_model = DecisionTreeClassifier(random_state=42)

# Create pipeline for preprocessing and classifier
decision_tree_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', decision_tree_model)
])

# Fit the model
decision_tree_pipeline.fit(X_train, y_train)

# Predict on training and test data
y_train_pred_tree = decision_tree_pipeline.predict(X_train)
y_test_pred_tree = decision_tree_pipeline.predict(X_test)

# Convert continuous predictions to binary labels
threshold = 0.5  # Define a threshold for converting continuous predictions to binary labels
y_train_pred_tree_binary = (y_train_pred_tree > threshold).astype(int)
y_test_pred_tree_binary = (y_test_pred_tree > threshold).astype(int)

# Evaluate model
train_accuracy_tree = accuracy_score(y_train, y_train_pred_tree_binary)
test_accuracy_tree = accuracy_score(y_test, y_test_pred_tree_binary)

train_conf_matrix_tree = confusion_matrix(y_train, y_train_pred_tree_binary)
test_conf_matrix_tree = confusion_matrix(y_test, y_test_pred_tree_binary)

# Print accuracy 
print("\nSimple Decision Tree Model")
print(f"Training score: {train_accuracy_tree:.3f}")
print(f"Test score: {test_accuracy_tree:.3f}")

# Print confusion matrix
print("Training Confusion Matrix:")
print(train_conf_matrix_tree)
print("\nTest Confusion Matrix:")
print(test_conf_matrix_tree)


Simple Decision Tree Model
Training score: 1.000
Test score: 0.959
Training Confusion Matrix:
[[15934     0]
 [    2  1207]]

Test Confusion Matrix:
[[3907  102]
 [  72  205]]


In [43]:
cross_val_score(decision_tree_model, X, y, cv=5, scoring='accuracy').mean()

0.9578139784846705

**Hacker Extra:**

In [None]:
from sklearn.tree import plot_tree

# Display the fitted decision tree
plt.figure(figsize=(20,10))
plot_tree(decision_tree_model, feature_names=X_train.columns, class_names=["<= 1M", "> 1M"], filled=True, fontsize=10)
plt.show()

### Task 6: Error exploration