In [54]:
# Debug switch. When True, later cells hold out 20% of the training data,
# train on the remaining rows only, and print evaluation metrics.
# It must be False before the notebook is submitted.
is_debug = False

## Imports

In [55]:
# DO NOT CHANGE THESE LINES.
import os
import json
import pandas as pd
import warnings
from sklearn.linear_model import LogisticRegression
from feature_engine.encoding import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, average_precision_score
import pandas as pd
from sklearn.linear_model import LogisticRegression
warnings.filterwarnings('ignore')

## Paths

In [56]:
# DO NOT CHANGE THESE LINES.
# All locations are derived from the parent of the current working directory,
# so the notebook has to be started from a sub-folder of the project root.
ROOT_DIR = os.path.dirname(os.getcwd())
MODEL_INPUTS_OUTPUTS = os.path.join(ROOT_DIR, 'model_inputs_outputs/')
# Inputs: the schema JSON plus the training / testing data folders.
INPUT_DIR = os.path.join(MODEL_INPUTS_OUTPUTS, "inputs")
INPUT_SCHEMA_DIR = os.path.join(INPUT_DIR, "schema")
DATA_DIR = os.path.join(INPUT_DIR, "data")
TRAIN_DIR = os.path.join(DATA_DIR, "training")
TEST_DIR = os.path.join(DATA_DIR, "testing")
# Outputs: every fitted artifact is written below model/artifacts.
MODEL_PATH = os.path.join(MODEL_INPUTS_OUTPUTS, "model")
MODEL_ARTIFACTS_PATH = os.path.join(MODEL_PATH, "artifacts")
OHE_ENCODER_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'ohe.joblib')
LABEL_ENCODER_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'label_encoder.joblib')
PREDICTOR_DIR_PATH = os.path.join(MODEL_ARTIFACTS_PATH, "predictor")
PREDICTOR_FILE_PATH = os.path.join(PREDICTOR_DIR_PATH, "predictor.joblib")
IMPUTATION_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'imputation.joblib')
# Create the artifact folders up front so the later dump() calls have a target.
if not os.path.exists(MODEL_ARTIFACTS_PATH):
    os.makedirs(MODEL_ARTIFACTS_PATH)
if not os.path.exists(PREDICTOR_DIR_PATH):
    os.makedirs(PREDICTOR_DIR_PATH)

### Reading the schema
The schema contains metadata about the datasets. We will use it to get the type of each feature (NUMERIC or CATEGORICAL) as well as the names of the id and target columns; this information is needed in the preprocessing stage.

In [57]:
# Locate the schema file. The suffix check includes the dot (matching the
# '.csv' lookup used for the training data) and the listing is sorted so the
# choice is deterministic when several files are present.
schema_files = sorted(f for f in os.listdir(INPUT_SCHEMA_DIR) if f.endswith('.json'))
if not schema_files:
    # Fail with an explicit message instead of a bare IndexError.
    raise FileNotFoundError(f"No .json schema file found in {INPUT_SCHEMA_DIR}")
file_name = schema_files[0]
schema_path = os.path.join(INPUT_SCHEMA_DIR, file_name)
with open(schema_path, "r", encoding="utf-8") as file:
    schema = json.load(file)
features = schema['features']

# Split the feature names by declared type; everything that is not
# CATEGORICAL is treated as numeric.
categorical_features = [f['name'] for f in features if f['dataType'] == 'CATEGORICAL']
numeric_features = [f['name'] for f in features if f['dataType'] != 'CATEGORICAL']

# Names of the id and target columns, which are excluded from encoding later.
id_feature = schema['id']['name']
target_feature = schema['target']['name']

### Reading training data

In [58]:
# Load the first CSV file found in the training folder into a DataFrame.
csv_files = [name for name in os.listdir(TRAIN_DIR) if name.endswith('.csv')]
file_name = csv_files[0]
file_path = os.path.join(TRAIN_DIR, file_name)
df = pd.read_csv(file_path)
# Preview the first rows.
df.head()

Unnamed: 0,u_id,fatals,a_ct,a_ped_f,a_pedal_f,a_roll,a_hr,a_polpur,month,day,...,owner,deaths,numoccs,impact1,deformed,ve_forms,ve_total,weather,lgt_cond,driver_factor
0,32083,1,Single-Vehicle Crash,Other Crash,Other Crash,Other Crash,No - Hit and Run,Other Crash,10,2,...,Driver (in this crash) Was Registered Owner,1,1.0,Clockpoint 12,Disabling damage,1,1,Clear,Dark - not lighted,other
1,55073,1,Single-Vehicle Crash,Other Crash,Other Crash,Other Crash,No - Hit and Run,Other Crash,6,21,...,Driver (in this crash) Not Registered Owner (o...,1,1.0,Clockpoint 1,Disabling damage,1,1,Clear,Daylight,speeding_driver_involved
2,7458,1,Single-Vehicle Crash,Other Crash,Other Crash,Other Crash,No - Hit and Run,Other Crash,7,14,...,Driver (in this crash) Was Registered Owner,0,1.0,Clockpoint 12,,1,1,Clear,Daylight,other
3,5685,1,Single-Vehicle Crash,Other Crash,Other Crash,Other Crash,No - Hit and Run,Other Crash,9,15,...,Driver (in this crash) Not Registered Owner (o...,0,1.0,Clockpoint 12,Functional damage,1,1,Clear,Dark - not lighted,other
4,9245,1,Single-Vehicle Crash,Other Crash,Other Crash,Other Crash,No - Hit and Run,Other Crash,9,28,...,Driver (in this crash) Was Registered Owner,1,1.0,Clockpoint 9,Disabling damage,1,1,Clear,Dark - not lighted,drunk_driver_involved


## Data Preprocessing
Data preprocessing is very important before training the model, as the data may contain missing values in some cells. Moreover, most of the learning algorithms cannot work with categorical data, thus the data has to be encoded.

In this section we will impute the missing values and encode the categorical features. Afterwards the data will be ready to train the model.

##### Imputing missing data
> In this section we impute missing numeric values with a k-nearest-neighbours imputer (k = 5) and missing categorical values with the mode. The mode is often not an optimal value, so you can try your own imputation and compare the results.

In [59]:
# Imputing missing data
# columns_with_missing_values = df.columns[df.isna().any()]
# imputaion_values = {}
# for column in columns_with_missing_values:
#     mode = df[column].mode()[0]
#     df[column].fillna(mode, inplace=True)
#     imputaion_values[column] = mode

# path = dump(imputaion_values, IMPUTATION_FILE)

# Imputers used only by this cell.
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

# Only columns that actually contain missing values in the training data are imputed.
numerical_cols_with_missing = [col for col in numeric_features if df[col].isna().any()]
categorical_cols_with_missing = [col for col in categorical_features if df[col].isna().any()]

knn_imputer = KNNImputer(n_neighbors=5)
categorical_imputer = SimpleImputer(strategy='most_frequent')  # Mode imputation

# scikit-learn rejects zero-column input, so an imputer is fitted only when
# there is at least one column of its type to fill.
# NOTE(review): an imputer skipped here is saved unfitted; confirm that the
# prediction code copes with that case.
if numerical_cols_with_missing:
    df[numerical_cols_with_missing] = knn_imputer.fit_transform(df[numerical_cols_with_missing])
if categorical_cols_with_missing:
    df[categorical_cols_with_missing] = categorical_imputer.fit_transform(df[categorical_cols_with_missing])

# Persist both imputers together so the same fill values are applied at prediction time.
imputers = {'numerical': knn_imputer, 'categorical': categorical_imputer}
# Binding the return value (as the other dump() cells do) keeps the absolute
# artifact path out of the cell output.
path = dump(imputers, IMPUTATION_FILE)


# Comment the above code and write your own imputation code here

# Different options I found for imputation
# Handling NA values:
# - Remove rows with NA values: 
#    df.dropna(inplace=True)
# 
# - Replace NA values with a statistic (mean, median, or mode):
#    df.fillna(df.mean(), inplace=True)
# 
# - Use forward-fill or back-fill:
#    df.fillna(method='ffill', inplace=True)
#    # or
#    df.fillna(method='bfill', inplace=True)
# 
# - Replace NA values with a constant:
#    df.fillna(-1, inplace=True)
# 
# - Use interpolation:
#    df.interpolate(inplace=True)

# print("Size of the dataframe is ", df.shape)
# rows_with_missing_values = df[df.isna().any(axis=1)]
# path = dump(rows_with_missing_values, DROPNA_FILE)
# df.dropna(inplace=True)
# print("Size of the dataframe after removing dropna is ", df.shape)

# I decided to use KNN imputation for the numeric columns and the mode for the categorical columns, and to focus on the model


['/Users/pparks/Dev/USD-HackAThon-2023-Fall/model_inputs_outputs/model/artifacts/imputation.joblib']

##### Encoding Categorical features
> Notice that we do not want to encode the target feature nor the id column.

In [60]:
# Set the id and target columns aside, then remove them from the feature frame
# so that neither of them gets encoded.
ids, target = df[id_feature], df[target_feature]
df.drop(columns=[id_feature, target_feature], inplace=True)

# Cast every categorical column to str so that numeric-looking categories
# (1, 2, ...) are still picked up by the one-hot encoder.
df = df.astype({column: str for column in categorical_features})

# One-hot encode, keeping the 6 most frequent categories of each variable.
encoder = OneHotEncoder(top_categories=6)
df = encoder.fit(df).transform(df)

# Persist the fitted encoder so the testing dataset is encoded the same way.
path = dump(encoder, OHE_ENCODER_FILE)


#### Encoding the target feature

In [61]:
# Peek at the raw class labels before they are label-encoded.
print(target.values)

['other' 'speeding_driver_involved' 'other' ... 'other' 'other'
 'drunk_driver_involved']


In [62]:
# Map the string class labels to integer codes.
# Note: this rebinds `encoder`, which previously held the one-hot encoder.
encoder = LabelEncoder()
# LabelEncoder expects a 1-D array of labels; passing a column vector
# (reshape(-1, 1)) only worked because the resulting DataConversionWarning
# was silenced by the global warnings filter.
y = encoder.fit_transform(target.values)
# Persist the fitted encoder so predictions can be mapped back to label names.
dump(encoder, LABEL_ENCODER_FILE)
y

array([1, 2, 1, ..., 1, 1, 0])

In [63]:
# Inspect the first rows of the encoded feature frame.
df.head()

Unnamed: 0,fatals,month,day,hour,minute,age,permvit,pernotmvit,mod_year,deaths,...,weather_Rain,"weather_Fog, smog, smoke",weather_Snow,weather_Reported as unknown,lgt_cond_Daylight,lgt_cond_Dark - not lighted,lgt_cond_Dark - lighted,lgt_cond_Dawn,lgt_cond_Dark - unknown lighting,lgt_cond_Dusk
0,1,10,2,3.0,10.0,62,1,0,2003.0,1,...,0,0,0,0,0,1,0,0,0,0
1,1,6,21,8.0,45.0,40,1,0,2002.0,1,...,0,0,0,0,1,0,0,0,0,0
2,1,7,14,21.0,45.0,26,1,1,2003.0,0,...,0,0,0,0,1,0,0,0,0,0
3,1,9,15,20.0,46.0,64,1,1,1999.0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,9,28,20.0,24.0,45,1,0,1996.0,1,...,0,0,0,0,0,1,0,0,0,0


In [64]:
# TEST CODE DO NOT LEAVE ENABLED
# In debug mode, hold out 20% of the rows for evaluation and train on the rest.
if is_debug:
    X_train, X_test, y_train, y_test = train_test_split(
        df, y, test_size=0.2, random_state=42
    )
    # From here on, `df` / `y` refer to the training portion only.
    df, y = X_train, y_train

### Training the Classifier
We started from a Logistic Regression classifier and, after comparing several alternatives, settled on a Gradient Boosting classifier; feel free to try your own and compare the results.

In [65]:
# Model selection notes: classifiers tried and the score recorded for each.
#   LogisticRegression()                                        0.63
#   RandomForestClassifier(n_estimators=100)                    0.67
#   SVC(kernel='linear') / SVC(kernel='linear', probability=True)
#       (probability=False is MUCH faster)                      0.66
#   KNeighborsClassifier(n_neighbors=5)                         0.57
#   GaussianNB()                                                0.56
#   DecisionTreeClassifier(max_leaf_nodes=4, random_state=0)    0.64
#   GradientBoostingClassifier                                  0.69  <- selected

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV  # kept for the exhaustive-search alternative
from sklearn.model_selection import RandomizedSearchCV

# Fixed seed: subsample < 1.0, max_features and the randomized search all draw
# random numbers, so without it every run produces a different model.
RANDOM_STATE = 42

# Hyper-parameters used for the final model.
best_params = {
    'subsample': 0.8,
    'n_estimators': 200,
    'min_samples_split': 10,
    'min_samples_leaf': 4,
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for the
    # classifier it was equivalent to 'sqrt', so this keeps the same behavior
    # while remaining valid on current releases.
    'max_features': 'sqrt',
    'max_depth': 5,
    'learning_rate': 0.1,
}
model = GradientBoostingClassifier(random_state=RANDOM_STATE, **best_params)

# Set to True to re-run the (slow) hyper-parameter search.
train_params = False
if train_params:
    # References used for tuning:
    #   https://scikit-learn.org/stable/modules/grid_search.html
    #   https://medium.com/all-things-ai/in-depth-parameter-tuning-for-gradient-boosting-3363992e9bae
    param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'subsample': [0.8, 0.9, 1.0],
        # 'auto' dropped: removed from scikit-learn and a duplicate of 'sqrt'.
        'max_features': ['sqrt', 'log2'],
    }
    # The search started with scoring='accuracy' and was switched to
    # 'f1_macro' because the model is evaluated on the macro-averaged F1 score.
    # An earlier, smaller search noted
    # {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100}.
    random_search = RandomizedSearchCV(
        model,
        param_distributions=param_grid,
        n_iter=100,
        cv=3,
        scoring='f1_macro',
        random_state=RANDOM_STATE,
    )
    random_search.fit(df, y)
    best_params = random_search.best_params_

    print(best_params)
    model = GradientBoostingClassifier(random_state=RANDOM_STATE, **best_params)

# Train on the full feature frame (or on the training split in debug mode).
model.fit(df, y)

# Saving the model to use it for predictions
path = dump(model, PREDICTOR_FILE_PATH)


In [66]:
# TEST CODE DO NOT LEAVE ENABLED
# In debug mode, score the fitted model on the held-out split and print a report.
if is_debug:
    # Hard class predictions and per-class probabilities for the held-out rows.
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)

    # Each call returns (precision, recall, f-score, support).
    macro_scores = precision_recall_fscore_support(y_test, y_pred, average='macro')
    weighted_scores = precision_recall_fscore_support(y_test, y_pred, average='weighted')

    # All metrics are computed before anything is printed.
    report = [
        ('Accuracy:', accuracy_score(y_test, y_pred)),
        ('Macro Avg Precision:', macro_scores[0]),
        ('Weighted Avg Precision:', weighted_scores[0]),
        ('Macro Avg Recall:', macro_scores[1]),
        ('Weighted Avg Recall:', weighted_scores[1]),
        ('Macro Avg F1-Score:', macro_scores[2]),
        ('Weighted Avg F1-Score:', weighted_scores[2]),
        ('Weighted Avg AUC-Score:',
         roc_auc_score(y_test, y_pred_prob, average='weighted', multi_class='ovr')),
    ]
    for label, value in report:
        print(label, value)



Accuracy: 0.6959593729300066
Macro Avg Precision: 0.6314455146693686
Weighted Avg Precision: 0.6780284393927807
Macro Avg Recall: 0.578229313113034
Weighted Avg Recall: 0.6959593729300066
Macro Avg F1-Score: 0.583618563900676
Weighted Avg F1-Score: 0.6750943335237147
Weighted Avg AUC-Score: 0.8285459687176149


In [67]:
# Loud reminder in the output whenever the debug flag is still switched on.
if is_debug:
    print("DISABLE THE DEBUG FLAG BEFORE SUBMITTING THE NOTEBOOK!")

DISABLE THE DEBUG FLAG BEFORE SUBMITTING THE NOTEBOOK!


In [68]:

# Accuracy: 0.6920953852947671
# Macro Avg Precision: 0.6237889355795457
# Weighted Avg Precision: 0.6733327232094176
# Macro Avg Recall: 0.5755847767475675
# Weighted Avg Recall: 0.6920953852947671
# Macro Avg F1-Score: 0.5803224831418862
# Weighted Avg F1-Score: 0.671840351900582
# Weighted Avg AUC-Score: 0.828581996716839