In [1]:
#Import dependencies and other necessary items
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

In [3]:
# #Load the data

file_path = Path("Resources/MechaCar_mpg.csv")
covid_data_df = pd.read_csv(file_path)
covid_data_df.head()
# OR 
# covid_data_df = pd.read_json()

Unnamed: 0,vehicle_length,vehicle_weight,spoiler_angle,ground_clearance,AWD,mpg
0,14.697095,6407.94647,48.789983,14.640983,1,49.04918
1,12.534206,5182.080571,90.0,14.366679,1,36.766063
2,20.0,8337.981208,78.632323,12.253711,0,80.0
3,13.428485,9419.670939,55.939032,12.989359,1,18.941489
4,15.44998,3772.666826,26.128164,15.103963,1,63.824568


# Clean the data and prepare it for the machine learning model

In [34]:
covid_data_df['good_or_bad'] = covid_data_df['mpg'].apply(lambda x: 'good' if x > 40 else 'bad')
covid_data_df.head()

Unnamed: 0,vehicle_length,vehicle_weight,spoiler_angle,ground_clearance,AWD,mpg,good_or_bad
0,14.697095,6407.94647,48.789983,14.640983,1,49.04918,good
1,12.534206,5182.080571,90.0,14.366679,1,36.766063,bad
2,20.0,8337.981208,78.632323,12.253711,0,80.0,good
3,13.428485,9419.670939,55.939032,12.989359,1,18.941489,bad
4,15.44998,3772.666826,26.128164,15.103963,1,63.824568,good


# Split the data into training and testing

In [35]:
#Ensure all data types are numerical for the machine learning model to process

In [36]:
#Create features
X = covid_data_df.drop(['good_or_bad'], axis=1)

#Create targets
y = covid_data_df['good_or_bad']

In [37]:
#Check balance of the target values
y.value_counts()

good    31
bad     19
Name: good_or_bad, dtype: int64

In [38]:
#Import train test split and split the data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,stratify=y)

# Use Logistic Regression Model to train and predict

In [39]:
# # Train the Logistic Regression model using the training data

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver="lbfgs", random_state=1)
model.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [40]:
#Predict using the trained data

y_pred = model.predict(X_test)

In [41]:
#Compare the prediction data vs the actual data from y_test

compare_df = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
print(compare_df)

   Prediction Actual
35       good   good
16       good   good
41       good   good
48        bad   good
12        bad    bad
32        bad   good
21        bad    bad
18       good   good
25        bad    bad
22        bad    bad
27       good   good
7        good   good
24        bad    bad


# Showcase results of the machine learning model

In [42]:
# Calculated the balanced accuracy score

from sklearn.metrics import balanced_accuracy_score, confusion_matrix

acc_score = balanced_accuracy_score(y_test, y_pred)

print(f"The balanced accuracy score for the Logistic Regression Model is : {acc_score}")

The balanced accuracy score for the Logistic Regression Model is : 0.875


In [43]:
# Display the confusion matrix

cm = pd.DataFrame(confusion_matrix(y_test, y_pred), index = ["Actual Bad", "Actual Good"], columns = ["Pred Bad", "Pred Good"])
print(cm)

             Pred Bad  Pred Good
Actual Bad          5          0
Actual Good         2          6
