In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

# Read the dataset.
# A raw string is used because the Windows path contains "\D" and "\d", which
# are invalid escape sequences in an ordinary string literal (SyntaxWarning on
# recent Python versions). The resulting path is unchanged.
data = pd.read_csv(r"C:\Data\data.homework_3.csv")

# Select relevant columns
data = data[['Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders', 'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg', 'MSRP']]

# Lowercase column names and replace spaces with underscores
data.columns = data.columns.str.lower().str.replace(' ', '_')

# Fill missing values with 0
data = data.fillna(0)

# Create binary target variable 'above_average':
# 1 when the row's msrp is strictly greater than the mean msrp, else 0.
data['above_average'] = (data['msrp'] > data['msrp'].mean()).astype(int)

# Split the data into train/validation/test sets.
# 20% is held out for test; 25% of the remaining 80% (i.e. 20% of the total)
# becomes validation, leaving 60% for training.
df_full_train, df_test = train_test_split(data, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Define numerical and categorical columns.
# NOTE(review): 'year' is listed as categorical but is presumably integer-valued,
# in which case DictVectorizer keeps it as a single numeric feature rather than
# one-hot encoding it -- confirm this is intended.
numerical = ['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']
categorical = ['make', 'model', 'year', 'transmission_type', 'vehicle_style']

Question 1: ROC AUC feature importance

In [2]:
def calculate_auc(df_train, column):
    """Return the ROC AUC obtained by using one column as the score.

    Parameters
    ----------
    df_train : DataFrame holding an 'above_average' target column and `column`.
    column : name of the column whose values are used as the prediction score.

    Returns
    -------
    The AUC of the column against 'above_average'. When that AUC is below 0.5
    the column is negated and the AUC is recomputed, so a feature that ranks
    the target in reverse is scored by its strength rather than its direction.
    """
    target = df_train['above_average']
    feature = df_train[column]
    direct_auc = roc_auc_score(target, feature)
    # Only recompute with the sign flipped when the direct score is below chance.
    return roc_auc_score(target, -feature) if direct_auc < 0.5 else direct_auc

# Score every numerical feature on the training split, keyed by feature name.
auc_scores = dict(
    (feature_name, calculate_auc(df_train, feature_name))
    for feature_name in numerical
)
# The feature with the highest AUC wins; on ties the earliest entry is kept.
best_numerical_variable = max(auc_scores, key=lambda feature_name: auc_scores[feature_name])
print(f'Answer for Question 1: {best_numerical_variable}')

Answer for Question 1: engine_hp


Question 2: Training the model

In [3]:
# Feature set: all categorical columns followed by all numerical ones.
columns = categorical + numerical

# Fit the vectorizer on the training records only, then build the design matrix.
dv = DictVectorizer(sparse=False)
train_dicts = df_train[columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000).fit(
    X_train, df_train['above_average']
)

# Evaluate AUC on the validation dataset, reusing the vectorizer fitted above.
val_dicts = df_val[columns].to_dict(orient='records')
X_val = dv.transform(val_dicts)
# Column 1 of predict_proba is the probability of the positive class.
y_pred = model.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(df_val['above_average'], y_pred)
print(f'Answer for Question 2: {val_auc:.3f}')

Answer for Question 2: 0.980


Question 3: Precision and Recall

In [4]:
# Candidate decision thresholds, starting at 0 in steps of 0.01
# (the stop value 1.01 is exclusive).
thresholds = np.arange(0, 1.01, 0.01)
precision_scores = []
recall_scores = []

# The ground-truth masks do not depend on the threshold, so build them once
# instead of on every loop iteration.
actual_positive = df_val['above_average'] == 1
actual_negative = df_val['above_average'] == 0

for threshold in thresholds:
    y_pred_bin = (y_pred >= threshold).astype(int)
    tp = ((y_pred_bin == 1) & actual_positive).sum()
    fp = ((y_pred_bin == 1) & actual_negative).sum()
    fn = ((y_pred_bin == 0) & actual_positive).sum()
    # With no predicted (or no actual) positives the ratio is undefined; record 0.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    precision_scores.append(precision)
    recall_scores.append(recall)

# Find the threshold where precision and recall intersect: look for the first
# position where the sign of (precision - recall) changes between neighbouring
# thresholds, and report the threshold just after that change.
gap = np.array(precision_scores) - np.array(recall_scores)
intersection = np.argwhere(np.diff(np.sign(gap)) != 0).reshape(-1) + 1
if intersection.size > 0:
    optimal_threshold = thresholds[intersection[0]]
else:
    # The curves never cross on this grid; indexing intersection[0] would raise
    # IndexError, so fall back to the threshold where they are closest.
    optimal_threshold = thresholds[np.argmin(np.abs(gap))]
print(f'Answer for Question 3: {optimal_threshold:.2f}')

Answer for Question 3: 0.49


Question 4: F1 score

In [5]:
def _f1_from(precision_value, recall_value):
    """Return the F1 score for one precision/recall pair, or 0 when their sum is not positive."""
    total = precision_value + recall_value
    return 2 * (precision_value * recall_value) / total if total > 0 else 0


# One F1 value per threshold, in the same order as `thresholds`.
f1_scores = [
    _f1_from(precision_value, recall_value)
    for precision_value, recall_value in zip(precision_scores, recall_scores)
]
max_f1_score = max(f1_scores)
# list.index returns the first occurrence, so ties resolve to the lowest threshold.
best_f1_position = f1_scores.index(max_f1_score)
optimal_threshold_f1 = thresholds[best_f1_position]
print(f'Answer for Question 4: {optimal_threshold_f1:.2f}')

Answer for Question 4: 0.51


Question 5: 5-Fold CV

In [6]:
# 5-fold cross-validation over the full training set with C=1.0.
# Fold-local names are used on purpose: rebinding df_train, df_val, dv, model or
# y_pred here would overwrite the train/validation split and the fitted model
# that the earlier cells rely on, making those cells give different results if
# they are re-run after this one.
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
scores = []

for train_idx, val_idx in kfold.split(df_full_train):
    fold_train = df_full_train.iloc[train_idx]
    fold_val = df_full_train.iloc[val_idx]

    # Fit the vectorizer on this fold's training part only.
    fold_dv = DictVectorizer(sparse=False)
    X_fold_train = fold_dv.fit_transform(fold_train[columns].to_dict(orient='records'))
    fold_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
    fold_model.fit(X_fold_train, fold_train['above_average'])

    X_fold_val = fold_dv.transform(fold_val[columns].to_dict(orient='records'))
    fold_pred = fold_model.predict_proba(X_fold_val)[:, 1]

    scores.append(roc_auc_score(fold_val['above_average'], fold_pred))

# Spread of the per-fold AUCs, rounded to three decimals.
std_deviation = round(np.std(scores), 3)
print(f'Answer for Question 5: {std_deviation}')

Answer for Question 5: 0.002


Question 6: Hyperparameter Tuning

In [7]:
# Regularisation strengths to compare, in ascending order.
C_values = [0.01, 0.1, 0.5, 10]
mean_scores = []
std_scores = []

# The splitter is the same for every C. With an integer random_state each call
# to split() yields the same folds, so it can be built once outside the loop.
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

for C in C_values:
    scores = []

    # Fold-local names are used so the loop does not overwrite the notebook-wide
    # df_train, df_val, dv, model and y_pred that earlier cells depend on.
    for train_idx, val_idx in kfold.split(df_full_train):
        fold_train = df_full_train.iloc[train_idx]
        fold_val = df_full_train.iloc[val_idx]

        fold_dv = DictVectorizer(sparse=False)
        X_fold_train = fold_dv.fit_transform(fold_train[columns].to_dict(orient='records'))
        fold_model = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
        fold_model.fit(X_fold_train, fold_train['above_average'])

        X_fold_val = fold_dv.transform(fold_val[columns].to_dict(orient='records'))
        fold_pred = fold_model.predict_proba(X_fold_val)[:, 1]

        scores.append(roc_auc_score(fold_val['above_average'], fold_pred))

    # Summarise this C by the mean and spread of its per-fold AUCs (3 decimals).
    mean_auc = round(np.mean(scores), 3)
    std_auc = round(np.std(scores), 3)

    mean_scores.append(mean_auc)
    std_scores.append(std_auc)

# Pick the C with the highest rounded mean AUC. list.index returns the first
# match, so ties on the mean resolve to the earliest (smallest) C in C_values.
best_mean_score = max(mean_scores)
best_std_score = std_scores[mean_scores.index(best_mean_score)]
best_C = C_values[mean_scores.index(best_mean_score)]
print(f'Answer for Question 6: {best_C}')

Answer for Question 6: 10
