In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the accident records and preview the first five rows.
accs = pd.read_csv(filepath_or_buffer='accidents1000.csv')
accs.head(5)

Unnamed: 0,RushHour,WRK_ZONE,WKDY,INT_HWY,LGTCON_day,LEVEL,SPD_LIM,SUR_COND_dry,TRAF_WAY_two_way,WEATHER_adverse,MAX_SEV
0,1,0,1,1,0,1,70,0,0,1,no-injury
1,1,0,1,0,0,0,70,0,0,1,no-injury
2,1,0,1,0,0,0,65,0,0,1,non-fatal
3,1,0,1,0,0,0,55,0,1,0,non-fatal
4,1,0,0,0,0,0,35,0,0,1,no-injury


In [5]:
# Class balance of the target: how many accidents fall in each severity level.
accs.loc[:, 'MAX_SEV'].value_counts()

MAX_SEV
non-fatal    499
no-injury    492
fatal          8
Name: count, dtype: int64

In [7]:
# Count missing values per column before modelling.
accs.isnull().sum(axis=0)

RushHour            0
WRK_ZONE            0
WKDY                0
INT_HWY             0
LGTCON_day          0
LEVEL               0
SPD_LIM             0
SUR_COND_dry        0
TRAF_WAY_two_way    0
WEATHER_adverse     0
MAX_SEV             0
dtype: int64

##### [Problem 1] Discriminant Analysis, Neural Network, Random Forest, and SVM Models


In [10]:
from sklearn.model_selection import train_test_split

##### [1.1] Generate training and holdout partitions on the data set. Use 1/3 of the data in the holdout.

In [13]:
# Separate the severity label from the predictor columns.
y = accs['MAX_SEV']
X = accs.drop(columns='MAX_SEV')

# Hold out one third of the rows. Stratifying on y keeps the class mix
# (including the rare 'fatal' class) the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    stratify=y,
    random_state=42,
)

##### [1.2] Fit a discriminant analysis model

In [16]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler

In [18]:
# Learn the scaling statistics from the training partition only, then apply the
# same transformation to both partitions so the holdout set does not influence it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
# Train linear discriminant analysis on the scaled training partition
# (fit() returns the estimator, so construction and fitting are chained).
lda = LinearDiscriminantAnalysis().fit(X_train_scaled, y_train)

# Predicted severity class for every holdout row.
y_pred = lda.predict(X_test_scaled)

In [22]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Holdout confusion matrix for discriminant analysis, labelled with the class
# names in the order the fitted model stores them (rows: actual, columns: predicted).
cm_da = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=lda.classes_)
cm_df_da = pd.DataFrame(data=cm_da, index=lda.classes_, columns=lda.classes_)
print("Confusion Matrix:", cm_df_da, sep="\n")

# Share of holdout rows classified correctly.
acc_score = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nAccuracy of Discriminant Analysis: {acc_score:.3f}")

Confusion Matrix:
           fatal  no-injury  non-fatal
fatal          0          1          2
no-injury      0         73         91
non-fatal      1         72         93

Accuracy of Discriminant Analysis: 0.498


In [24]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the holdout partition for the LDA model.
# labels= is passed together with target_names so each row is tied to the class
# it names. Without it the report infers the classes from y_test and y_pred, and
# a class absent from both would no longer match target_names in length, making
# the call fail.
# zero_division=0 states explicitly how a class that is never predicted is scored,
# instead of relying on the library default plus globally suppressed warnings.
print("Classification Report of Discriminant Analysis:")
print(classification_report(
    y_test,
    y_pred,
    labels=lda.classes_,
    target_names=lda.classes_,
    digits=3,
    zero_division=0,
))

Classification Report of Discriminant Analysis:
              precision    recall  f1-score   support

       fatal      0.000     0.000     0.000         3
   no-injury      0.500     0.445     0.471       164
   non-fatal      0.500     0.560     0.528       166

    accuracy                          0.498       333
   macro avg      0.333     0.335     0.333       333
weighted avg      0.495     0.498     0.495       333



##### [1.3] Fit a neural network model, applying a training grid to tune the parameters. 

In [27]:
from sklearn.model_selection import GridSearchCV

In [29]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder

In [31]:
# Integer-encode the severity labels.
# NOTE(review): y_encode is not referenced by any later cell visible here — the
# models below are fitted on the original string labels (y_train). Confirm
# whether this encoding is still needed.
label_encode = LabelEncoder().fit(y)
y_encode = label_encode.transform(y)

In [33]:
# Candidate MLP settings for the grid search: three layer layouts, two
# activations and two L2 penalties; solver and iteration cap are held fixed.
param_grid = dict(
    hidden_layer_sizes=[(50,), (100,), (50, 50)],
    activation=['relu', 'tanh'],
    alpha=[0.0001, 0.001],
    solver=['adam'],
    max_iter=[300],
)

In [35]:
# Seeded MLP wrapped in a 5-fold, accuracy-scored grid search (run in a single process).
mlp = MLPClassifier(random_state=42)
mlp_grid = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=1,
)

In [37]:
# Run the grid search on the scaled training partition.
mlp_grid.fit(X=X_train_scaled, y=y_train)

In [39]:
# Keep the MLP configuration that scored best in the grid search for holdout evaluation.
best_mlp = mlp_grid.best_estimator_

In [41]:
# Holdout predictions from the tuned MLP (replaces the previous model's y_pred).
y_pred = best_mlp.predict(X=X_test_scaled)

In [43]:
# Holdout confusion matrix for the tuned MLP (rows: actual class, columns: predicted class).
cm_mlp = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=best_mlp.classes_)
cm_df_mlp = pd.DataFrame(data=cm_mlp, index=best_mlp.classes_, columns=best_mlp.classes_)
print("Confusion Matrix:", cm_df_mlp, sep="\n")

# Share of holdout rows classified correctly.
acc_score_mlp = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nAccuracy of MLP: {acc_score_mlp:.3f}")

Confusion Matrix:
           fatal  no-injury  non-fatal
fatal          0          1          2
no-injury      0         91         73
non-fatal      0         98         68

Accuracy of MLP: 0.477


In [45]:
# Per-class precision / recall / F1 on the holdout partition for the tuned MLP.
# labels= is passed together with target_names so each row is tied to the class
# it names. Without it the report infers the classes from y_test and y_pred, and
# a class absent from both would no longer match target_names in length, making
# the call fail.
# zero_division=0 states explicitly how a class that is never predicted is scored,
# instead of relying on the library default plus globally suppressed warnings.
print("Classification Report of MLPClassifier:")
print(classification_report(
    y_test,
    y_pred,
    labels=best_mlp.classes_,
    target_names=best_mlp.classes_,
    digits=3,
    zero_division=0,
))

Classification Report of MLPClassifier:
              precision    recall  f1-score   support

       fatal      0.000     0.000     0.000         3
   no-injury      0.479     0.555     0.514       164
   non-fatal      0.476     0.410     0.440       166

    accuracy                          0.477       333
   macro avg      0.318     0.322     0.318       333
weighted avg      0.473     0.477     0.473       333



##### [1.4] Fit a random forest model, applying a training grid to tune the parameters.

In [48]:
from sklearn.ensemble import RandomForestClassifier

# Candidate random-forest settings for the grid search: ensemble size, tree
# depth (None leaves depth uncapped) and the two minimum-sample thresholds.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

In [50]:
# Tune a seeded random forest with a 5-fold, accuracy-scored grid search using
# all available cores, fitted on the scaled training partition.
rf = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
rf_grid.fit(X_train_scaled, y_train)

In [51]:
# Keep the best-scoring random forest found by the grid search; the bare name displays it.
best_rf = rf_grid.best_estimator_
best_rf

In [52]:
# Holdout predictions from the tuned random forest (replaces the previous model's y_pred).
y_pred = best_rf.predict(X=X_test_scaled)

In [53]:
# Holdout confusion matrix for the tuned random forest (rows: actual class, columns: predicted class).
cm_rf = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=best_rf.classes_)
cm_df_rf = pd.DataFrame(data=cm_rf, index=best_rf.classes_, columns=best_rf.classes_)
print("Confusion Matrix:", cm_df_rf, sep="\n")

# Share of holdout rows classified correctly.
acc_score_rf = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nAccuracy of RF: {acc_score_rf:.3f}")

Confusion Matrix:
           fatal  no-injury  non-fatal
fatal          0          1          2
no-injury      0         89         75
non-fatal      0         83         83

Accuracy of RF: 0.517


In [54]:
# Per-class precision / recall / F1 on the holdout partition for the tuned random forest.
# labels= is passed together with target_names so each row is tied to the class
# it names. Without it the report infers the classes from y_test and y_pred, and
# a class absent from both would no longer match target_names in length, making
# the call fail.
# zero_division=0 states explicitly how a class that is never predicted is scored,
# instead of relying on the library default plus globally suppressed warnings.
print("Classification Report of Random Forest:")
print(classification_report(
    y_test,
    y_pred,
    labels=best_rf.classes_,
    target_names=best_rf.classes_,
    digits=3,
    zero_division=0,
))

Classification Report of Random Forest:
              precision    recall  f1-score   support

       fatal      0.000     0.000     0.000         3
   no-injury      0.514     0.543     0.528       164
   non-fatal      0.519     0.500     0.509       166

    accuracy                          0.517       333
   macro avg      0.344     0.348     0.346       333
weighted avg      0.512     0.517     0.514       333



##### [1.5] Fit a support vector machine model, applying a training grid to tune the parameters.

In [61]:
from sklearn.svm import SVC

In [63]:
# Tune the SVM over C and kernel with 5-fold cross-validation; gamma is held at
# 'auto' rather than searched, and the estimator's default score is used for ranking.
svm_grid = GridSearchCV(
    estimator=SVC(random_state=42, gamma='auto'),
    param_grid={
        'C': [1, 10, 20],
        'kernel': ['linear', 'rbf'],
    },
    cv=5,
)

# Run the search on the scaled training partition.
svm_grid.fit(X_train_scaled, y_train)

# Raw per-candidate cross-validation results (timings, parameters, scores).
svm_grid.cv_results_

{'mean_fit_time': array([0.00758214, 0.0045794 , 0.00595517, 0.00450211, 0.00848012,
        0.0049264 ]),
 'std_fit_time': array([0.00276171, 0.00052539, 0.00032244, 0.00024445, 0.00045274,
        0.00040146]),
 'mean_score_time': array([0.00149565, 0.00206289, 0.00069304, 0.00158215, 0.00066156,
        0.0015305 ]),
 'std_score_time': array([4.74472343e-04, 2.44809033e-04, 2.24252414e-05, 2.34679466e-05,
        8.20714187e-06, 1.15327579e-05]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value=999999),
 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'linear'},
  {'C': 1, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'}],


In [64]:
# svm_grid was already fitted on (X_train_scaled, y_train) in the cell that
# defines it, so the search is not run a second time here: refitting with the
# same data and settings would repeat every cross-validation fit without adding
# information. The bare expression still displays the fitted search object,
# which is what fit() returned.
svm_grid

In [66]:
# Tabulate the cross-validation results (one row per parameter combination) and preview them.
df = pd.DataFrame(data=svm_grid.cv_results_)
df.head(n=5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003309,0.000538,0.000706,7.4e-05,1,linear,"{'C': 1, 'kernel': 'linear'}",0.507463,0.481203,0.511278,0.466165,0.451128,0.483447,0.023236,4
1,0.003598,0.000104,0.001705,8.4e-05,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.522388,0.526316,0.556391,0.526316,0.488722,0.524026,0.02148,1
2,0.005909,0.000333,0.000665,1.1e-05,10,linear,"{'C': 10, 'kernel': 'linear'}",0.507463,0.481203,0.511278,0.466165,0.451128,0.483447,0.023236,4
3,0.004447,0.000179,0.001553,1e-05,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.507463,0.488722,0.556391,0.503759,0.496241,0.510515,0.023825,3
4,0.008507,0.000438,0.000661,2e-06,20,linear,"{'C': 20, 'kernel': 'linear'}",0.507463,0.481203,0.511278,0.466165,0.451128,0.483447,0.023236,4


In [69]:
# Rank the parameter combinations by mean cross-validation score, best first,
# showing only C, the kernel and that score.
df.sort_values(by='mean_test_score', ascending=False)[['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
1,1,rbf,0.524026
5,20,rbf,0.512041
3,10,rbf,0.510515
0,1,linear,0.483447
2,10,linear,0.483447
4,20,linear,0.483447


In [71]:
# Per the ranking above, the best combination is C=1 with the rbf kernel
# (mean cross-validation score 0.524026). Keep that estimator and display it.
best_svm = svm_grid.best_estimator_
best_svm

In [73]:
# Holdout predictions from the tuned SVM (replaces the previous model's y_pred).
y_pred = best_svm.predict(X=X_test_scaled)

In [75]:
# Holdout confusion matrix for the tuned SVM (rows: actual class, columns: predicted class).
cm_svm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=best_svm.classes_)
cm_df_svm = pd.DataFrame(data=cm_svm, index=best_svm.classes_, columns=best_svm.classes_)
print("Confusion Matrix:", cm_df_svm, sep="\n")

# Share of holdout rows classified correctly.
acc_score_svm = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nAccuracy of SVM: {acc_score_svm:.3f}")

Confusion Matrix:
           fatal  no-injury  non-fatal
fatal          0          1          2
no-injury      0         99         65
non-fatal      0         96         70

Accuracy of SVM: 0.508


In [77]:
# Per-class precision / recall / F1 on the holdout partition for the tuned SVM.
# labels= is passed together with target_names so each row is tied to the class
# it names. Without it the report infers the classes from y_test and y_pred, and
# a class absent from both would no longer match target_names in length, making
# the call fail.
# zero_division=0 states explicitly how a class that is never predicted is scored,
# instead of relying on the library default plus globally suppressed warnings.
print("Classification Report of SVM:")
print(classification_report(
    y_test,
    y_pred,
    labels=best_svm.classes_,
    target_names=best_svm.classes_,
    digits=3,
    zero_division=0,
))

Classification Report of SVM:
              precision    recall  f1-score   support

       fatal      0.000     0.000     0.000         3
   no-injury      0.505     0.604     0.550       164
   non-fatal      0.511     0.422     0.462       166

    accuracy                          0.508       333
   macro avg      0.339     0.342     0.337       333
weighted avg      0.503     0.508     0.501       333



##### [Problem 2]: XGBoost tree model

In [80]:
# Load the restaurant data set and preview the first five rows.
res = pd.read_csv(filepath_or_buffer='restaurantdata.csv')
res.head(5)

Unnamed: 0,Location,Cuisine,Rating,Seating.Capacity,Average.Meal.Price,Marketing.Budget,Social.Media.Followers,Chef.Experience.Years,Number.of.Reviews,Avg.Review.Length,Ambience.Score,Service.Quality.Score,Parking.Availability,Weekend.Reservations,Weekday.Reservations,Revenue
0,Rural,Japanese,4.0,38,73.98,2224,23406,13,185,161.924906,1.3,7.0,Yes,13,4,638945.52
1,Downtown,Mexican,3.2,76,28.11,4416,42741,8,533,148.759717,2.6,3.4,Yes,48,6,490207.83
2,Rural,Italian,4.7,48,48.29,2796,37285,18,853,56.849189,5.3,6.7,No,27,14,541368.62
3,Rural,Italian,4.4,34,51.55,1167,15214,13,82,205.433265,4.6,2.8,Yes,9,17,404556.8
4,Downtown,Japanese,4.9,88,75.98,3639,40171,9,78,241.681584,8.6,2.1,No,37,26,1491046.35


In [82]:
# Revenue is the regression target; every other column is a candidate predictor.
# Note: X and y are rebound here and no longer refer to the accident data.
y = res['Revenue']
X = res.drop(columns='Revenue')

In [84]:
# One-hot encode the text columns into indicator (dummy) columns; drop_first=True
# omits the first level of each so the indicators are not redundant.
X = pd.get_dummies(data=X, drop_first=True)

In [86]:
# Hold out one third of the restaurants for evaluation (seeded for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=1/3,
)

##### [2.2] Fit an XGBoost tree model, applying a training grid to tune the parameters.

In [89]:
from xgboost import XGBRegressor

In [91]:
# Candidate XGBoost settings for the grid search: number of boosting rounds,
# tree depth, learning rate and row subsampling fraction.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 6],
    learning_rate=[0.05, 0.1],
    subsample=[0.8, 1.0],
)

In [93]:
# Parallelise across grid-search candidates (GridSearchCV n_jobs=-1) and keep
# each XGBoost fit single-threaded. Requesting all cores at both levels makes
# every search worker start its own full set of XGBoost threads, which
# oversubscribes the CPU and slows the search down.
xgb = XGBRegressor(random_state=42, tree_method='hist', n_jobs=1)

# 3-fold search ranked by negative mean squared error (higher is better).
xgb_grid = GridSearchCV(xgb, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

xgb_grid.fit(X_train, y_train)

In [94]:
# Keep the XGBoost configuration that scored best in the grid search for holdout evaluation.
best_xgb = xgb_grid.best_estimator_

In [95]:
# Predicted revenue for the holdout restaurants from the tuned XGBoost model.
y_pred = best_xgb.predict(X=X_test)

In [96]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Holdout error metrics for the tuned XGBoost regressor, computed in one pass
# over the three scoring functions.
mse_xgb, mae_xgb, r2_xgb = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(
    f"Mean Squared Error (MSE): {mse_xgb:,.2f}",
    f"Mean Absolute Error (MAE): {mae_xgb:,.2f}",
    f"R^2 Score: {r2_xgb:.3f}",
    sep="\n",
)

Mean Squared Error (MSE): 29,283,814.78
Mean Absolute Error (MAE): 4,154.81
R^2 Score: 1.000


##### [2.3] Fit a random forest model, applying a training grid to tune the parameters. 

In [102]:
# Candidate random-forest regressor settings for the grid search: ensemble size,
# tree depth (None leaves depth uncapped) and the two minimum-sample thresholds.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

In [104]:
from sklearn.ensemble import RandomForestRegressor

# Tune a seeded random-forest regressor with a 5-fold grid search ranked by
# negative mean squared error, using all available cores.
rf_reg = RandomForestRegressor(random_state=42)
rf_reg_grid = GridSearchCV(
    estimator=rf_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
rf_reg_grid.fit(X_train, y_train)

In [105]:
# Keep the random-forest regressor that scored best in the grid search for holdout evaluation.
best_reg_rf = rf_reg_grid.best_estimator_

In [106]:
# Predicted revenue for the holdout restaurants from the tuned random forest.
y_pred = best_reg_rf.predict(X=X_test)

In [107]:
# Holdout error metrics for the tuned random-forest regressor, computed in one
# pass over the three scoring functions.
mse_reg_rf, mae_reg_rf, r2_reg_rf = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(
    f"Mean Squared Error (MSE): {mse_reg_rf:,.2f}",
    f"Mean Absolute Error (MAE): {mae_reg_rf:,.2f}",
    f"R^2 Score: {r2_reg_rf:.3f}",
    sep="\n",
)

Mean Squared Error (MSE): 63,670,296.33
Mean Absolute Error (MAE): 6,216.94
R^2 Score: 0.999
