In [64]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import root_mean_squared_error
import mlflow
import xgboost as xgb
from hyperopt import tpe, hp, STATUS_OK, fmin, Trials
from hyperopt.pyll import scope

In [51]:
mlflow.set_experiment('road_risk')

2025/10/14 02:55:37 INFO mlflow.tracking.fluent: Experiment with name 'road_risk' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/aravindrajeshmenon/Documents/DataScienceProjects/Projects/road_accident_risk/road_risk/notebooks/mlruns/549507082084492537', creation_time=1760406937748, experiment_id='549507082084492537', last_update_time=1760406937748, lifecycle_stage='active', name='road_risk', tags={}>

In [52]:
mlflow.get_tracking_uri()

'file:///Users/aravindrajeshmenon/Documents/DataScienceProjects/Projects/road_accident_risk/road_risk/notebooks/mlruns'

In [42]:
df = pd.read_csv('../data/train.csv')

In [43]:
df = df.drop(['id'], axis = 1)

In [44]:
df.sample()

Unnamed: 0,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
314106,highway,1,0.69,25,dim,foggy,True,False,evening,True,False,2,0.35


In [45]:
x = df.drop(['accident_risk'], axis = 1)
y = df['accident_risk'].values

In [46]:
x_train, x_val, y_train, y_val= train_test_split(x,y,train_size = 0.8, random_state = 42)


In [47]:
num_cols = df.select_dtypes(['int', 'float']).columns.to_list()
cat_cols = df.select_dtypes(['object', 'bool']).columns.to_list()

num_cols.remove('accident_risk')
print(num_cols)

['num_lanes', 'curvature', 'speed_limit', 'num_reported_accidents']


In [48]:
dv = DictVectorizer(sparse = False)

train_dict = x_train[cat_cols + num_cols].to_dict(orient = 'records')
x_train_encoded = dv.fit_transform(train_dict)

val_dict = x_val[cat_cols + num_cols].to_dict(orient = 'records')
x_val_encoded = dv.transform(val_dict)

In [56]:
with mlflow.start_run():
    mlflow.set_tag("model", "linear_regression")
    lr = LinearRegression()
    lr.fit(x_train_encoded, y_train)
    y_pred = lr.predict(x_val_encoded)
    rmse = root_mean_squared_error(y_val, y_pred)
    print(rmse)
    mlflow.log_metric("rmse", rmse)

0.0735308625898314


In [61]:
with mlflow.start_run():
    mlflow.set_tag("model", "Lasso")
    lasso = Lasso(alpha = 1)
    lasso.fit(x_train_encoded, y_train)
    y_pred = lasso.predict(x_val_encoded)
    rmse = root_mean_squared_error(y_val, y_pred)
    print(rmse)
    mlflow.log_metric("rmse", rmse)

0.16271187689427052


In [62]:
with mlflow.start_run():
    mlflow.set_tag("model", "Ridge")
    ridge = Ridge(alpha = 0.001)
    ridge.fit(x_train_encoded, y_train)
    y_pred = ridge.predict(x_val_encoded)
    rmse = root_mean_squared_error(y_val, y_pred)
    print(rmse)
    mlflow.log_metric("rmse", rmse)
    

0.07353086258195074


In [66]:
train = xgb.DMatrix(x_train_encoded, label = y_train)
valid = xgb.DMatrix(x_val_encoded, y_val)

In [67]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", 'xgboost')
        mlflow.log_params(params)
        booster = xgb.train(
            params = params, 
            dtrain = train, 
            num_boost_round = 200, 
            evals=[(train, "train"), (valid, "validation")],
            early_stopping_rounds = 50,
            verbose_eval = False
        )
        y_pred = booster.predict(valid)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)
        mlflow.xgboost.log_model(booster, name = "xgb_mlflow")
        return {'loss': rmse, 'status': STATUS_OK}
        

In [70]:
search_space = {
    'max_depth' : scope.int(hp.quniform("max_depth", 10,30,1)),
    'learning_rate' : hp.loguniform("learning_rate", -3, 0),
    'min_child_weight' : hp.choice("min_samples_split", [2,5,7,10]),
    'subsample' : hp.uniform("subsample", 0.7,1),
    'colsample_bytree' : hp.uniform("colsample_bytree", 0.7,1),
    'gamma' : hp.uniform("gamma", 0, 0.5),
    'reg_alpha' : hp.loguniform("reg_alpha", -5, 0),
    "reg_lambda" : hp.loguniform("reg_lambda", -5,0),
    'objective' : 'reg:squarederror'

}

In [71]:
trials = Trials()
best_result = fmin(
    fn = objective, 
    space = search_space, 
    algo = tpe.suggest, 
    max_evals = 50
)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

  xgb_model.save_model(model_data_path)




  2%|▏         | 1/50 [00:06<05:01,  6.15s/trial, best loss: 0.05662084290306092]

  xgb_model.save_model(model_data_path)




  4%|▍         | 2/50 [00:10<03:59,  5.00s/trial, best loss: 0.05645571729163345]

  xgb_model.save_model(model_data_path)




  6%|▌         | 3/50 [00:14<03:34,  4.56s/trial, best loss: 0.05645571729163345]

  xgb_model.save_model(model_data_path)




  8%|▊         | 4/50 [00:19<03:43,  4.87s/trial, best loss: 0.05645571729163345]

  xgb_model.save_model(model_data_path)




 10%|█         | 5/50 [00:22<03:11,  4.25s/trial, best loss: 0.05645571729163345]

  xgb_model.save_model(model_data_path)




 12%|█▏        | 6/50 [00:25<02:47,  3.80s/trial, best loss: 0.05645571729163345]

  xgb_model.save_model(model_data_path)




 14%|█▍        | 7/50 [00:29<02:40,  3.74s/trial, best loss: 0.056453279369115775]

  xgb_model.save_model(model_data_path)




 16%|█▌        | 8/50 [00:32<02:30,  3.57s/trial, best loss: 0.056453279369115775]

  xgb_model.save_model(model_data_path)




 18%|█▊        | 9/50 [00:36<02:25,  3.55s/trial, best loss: 0.056269459826753396]

  xgb_model.save_model(model_data_path)




 20%|██        | 10/50 [00:40<02:35,  3.89s/trial, best loss: 0.056269459826753396]

  xgb_model.save_model(model_data_path)




 22%|██▏       | 11/50 [00:45<02:43,  4.19s/trial, best loss: 0.056269459826753396]

  xgb_model.save_model(model_data_path)




 24%|██▍       | 12/50 [00:50<02:44,  4.33s/trial, best loss: 0.056269459826753396]

  xgb_model.save_model(model_data_path)




 26%|██▌       | 13/50 [00:54<02:35,  4.19s/trial, best loss: 0.056269459826753396]

  xgb_model.save_model(model_data_path)




 28%|██▊       | 14/50 [00:59<02:43,  4.53s/trial, best loss: 0.056269459826753396]

  xgb_model.save_model(model_data_path)




 30%|███       | 15/50 [01:02<02:18,  3.95s/trial, best loss: 0.056269459826753396]

  xgb_model.save_model(model_data_path)




 32%|███▏      | 16/50 [01:05<02:08,  3.78s/trial, best loss: 0.056269459826753396]

  xgb_model.save_model(model_data_path)




 34%|███▍      | 17/50 [01:09<02:05,  3.80s/trial, best loss: 0.056269459826753396]

  xgb_model.save_model(model_data_path)




 36%|███▌      | 18/50 [01:12<01:59,  3.73s/trial, best loss: 0.056269459826753396]

  xgb_model.save_model(model_data_path)




 38%|███▊      | 19/50 [01:16<01:51,  3.61s/trial, best loss: 0.056269459826753396]

  xgb_model.save_model(model_data_path)




 40%|████      | 20/50 [01:20<01:50,  3.68s/trial, best loss: 0.056269459826753396]

  xgb_model.save_model(model_data_path)




 42%|████▏     | 21/50 [01:24<01:56,  4.01s/trial, best loss: 0.056269459826753396]

  xgb_model.save_model(model_data_path)




 44%|████▍     | 22/50 [01:28<01:49,  3.90s/trial, best loss: 0.05625525187272339] 

  xgb_model.save_model(model_data_path)




 46%|████▌     | 23/50 [01:33<01:52,  4.16s/trial, best loss: 0.05625525187272339]

  xgb_model.save_model(model_data_path)




 48%|████▊     | 24/50 [01:37<01:47,  4.12s/trial, best loss: 0.05625525187272339]

  xgb_model.save_model(model_data_path)




 50%|█████     | 25/50 [01:40<01:36,  3.87s/trial, best loss: 0.05625525187272339]

  xgb_model.save_model(model_data_path)




 52%|█████▏    | 26/50 [01:43<01:24,  3.52s/trial, best loss: 0.05625525187272339]

  xgb_model.save_model(model_data_path)




 54%|█████▍    | 27/50 [01:46<01:19,  3.48s/trial, best loss: 0.05625525187272339]

  xgb_model.save_model(model_data_path)




 56%|█████▌    | 28/50 [01:50<01:18,  3.56s/trial, best loss: 0.05625525187272339]

  xgb_model.save_model(model_data_path)




 58%|█████▊    | 29/50 [01:53<01:10,  3.34s/trial, best loss: 0.05625525187272339]

  xgb_model.save_model(model_data_path)




 60%|██████    | 30/50 [01:56<01:04,  3.22s/trial, best loss: 0.05625525187272339]

  xgb_model.save_model(model_data_path)




 62%|██████▏   | 31/50 [01:58<00:57,  3.01s/trial, best loss: 0.05625525187272339]

  xgb_model.save_model(model_data_path)




 64%|██████▍   | 32/50 [02:04<01:07,  3.77s/trial, best loss: 0.05625525187272339]

  xgb_model.save_model(model_data_path)




 66%|██████▌   | 33/50 [02:08<01:06,  3.94s/trial, best loss: 0.05625525187272339]

  xgb_model.save_model(model_data_path)




 68%|██████▊   | 34/50 [02:12<01:02,  3.92s/trial, best loss: 0.05625525187272339]

  xgb_model.save_model(model_data_path)




 70%|███████   | 35/50 [02:16<00:58,  3.88s/trial, best loss: 0.056235952632478235]

  xgb_model.save_model(model_data_path)




 72%|███████▏  | 36/50 [02:19<00:53,  3.80s/trial, best loss: 0.056235952632478235]

  xgb_model.save_model(model_data_path)




 74%|███████▍  | 37/50 [02:23<00:48,  3.70s/trial, best loss: 0.056235952632478235]

  xgb_model.save_model(model_data_path)




 76%|███████▌  | 38/50 [02:27<00:45,  3.78s/trial, best loss: 0.056235952632478235]

  xgb_model.save_model(model_data_path)




 78%|███████▊  | 39/50 [02:30<00:40,  3.71s/trial, best loss: 0.056235952632478235]

  xgb_model.save_model(model_data_path)




 80%|████████  | 40/50 [02:35<00:38,  3.89s/trial, best loss: 0.056235952632478235]

  xgb_model.save_model(model_data_path)




 82%|████████▏ | 41/50 [02:38<00:33,  3.71s/trial, best loss: 0.056235952632478235]

  xgb_model.save_model(model_data_path)




 84%|████████▍ | 42/50 [02:40<00:26,  3.37s/trial, best loss: 0.056235952632478235]

  xgb_model.save_model(model_data_path)




 86%|████████▌ | 43/50 [02:43<00:22,  3.16s/trial, best loss: 0.056235952632478235]

  xgb_model.save_model(model_data_path)




 88%|████████▊ | 44/50 [02:46<00:18,  3.14s/trial, best loss: 0.056235952632478235]

  xgb_model.save_model(model_data_path)




 90%|█████████ | 45/50 [02:49<00:15,  3.14s/trial, best loss: 0.056235952632478235]

  xgb_model.save_model(model_data_path)




 92%|█████████▏| 46/50 [02:52<00:12,  3.07s/trial, best loss: 0.056235952632478235]

  xgb_model.save_model(model_data_path)




 94%|█████████▍| 47/50 [02:55<00:09,  3.03s/trial, best loss: 0.056235952632478235]

  xgb_model.save_model(model_data_path)




 96%|█████████▌| 48/50 [02:58<00:06,  3.02s/trial, best loss: 0.056235952632478235]

  xgb_model.save_model(model_data_path)




 98%|█████████▊| 49/50 [03:01<00:02,  2.93s/trial, best loss: 0.056235952632478235]

  xgb_model.save_model(model_data_path)




100%|██████████| 50/50 [03:04<00:00,  3.68s/trial, best loss: 0.056235952632478235]
