In [1]:
import warnings
import pandas as pd

warnings.filterwarnings("ignore")

In [2]:
MAIN_FOLDER = "/home/arman/it/AI_work/machine/road_accident_risk"

In [3]:
train_data = pd.read_csv(f"{MAIN_FOLDER}/data/train.csv")
test_data = pd.read_csv(f"{MAIN_FOLDER}/data/test.csv")
target_col = "accident_risk"

In [4]:
# train_data = pd.get_dummies(train_data, columns=['road_type', 'weather', 'time_of_day', 'lighting'])
# test_data = pd.get_dummies(test_data, columns=['road_type', 'weather', 'time_of_day', 'lighting'])
# train_data.dtypes

In [5]:
# Identify feature
cols = train_data.columns.tolist()

# Categorical features
categ_feats = [col for col in cols if train_data[col].dtype in ["object","category"] and col != target_col]
print(f"{categ_feats=}")

# Numerical features
num_feats = [col for col in cols if train_data[col].dtype not in ["object","category","bool"] and col not in ["id", target_col]]
print(f"{num_feats=}")


categ_feats=['road_type', 'lighting', 'weather', 'time_of_day']
num_feats=['num_lanes', 'curvature', 'speed_limit', 'num_reported_accidents']


In [6]:
# Creating new features based on the frequency of numerical features
"""
Add frequency and binning features to the dataset.

- For each column, create <col>_freq = how often each value appears in train data.
- For numeric columns, split values into 5 and 10 quantile bins (groups) to show rank or range.
"""

for col in cols:
    if col == "accident_risk":
        continue
    # Frequency encoding: how common each value is
    if col in categ_feats:
        freq = train_data[col].value_counts(normalize=True)
        train_data[f"{col}_freq"] = train_data[col].map(freq)
        test_data[f"{col}_freq"] = test_data[col].map(freq).fillna(train_data[f"{col}_freq"].mean())
    if col in num_feats:
        for q in [5, 10, 15]:
            try:
                train_data[f"{col}_bin{q}"], bins = pd.qcut(train_data[col], q=q, labels=False, retbins=True, duplicates="drop")
                test_data[f"{col}_bin{q}"] = pd.cut(test_data[col], bins=bins, labels=False, include_lowest=True)
            except Exception:
                train_data[f"{col}_bin{q}"] = test_data[f"{col}_bin{q}"] = 0



train_data.head()


Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,...,curvature_bin15,speed_limit_bin5,speed_limit_bin10,speed_limit_bin15,lighting_freq,weather_freq,time_of_day_freq,num_reported_accidents_bin5,num_reported_accidents_bin10,num_reported_accidents_bin15
0,0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,...,0,0,0,0,0.343822,0.303204,0.331252,0,0,0
1,1,urban,4,0.99,35,daylight,clear,True,False,evening,...,14,0,0,0,0.343822,0.346315,0.333821,0,0,0
2,2,rural,4,0.63,70,dim,clear,False,True,morning,...,9,3,3,3,0.355045,0.346315,0.334927,1,1,1
3,3,highway,4,0.07,35,dim,rainy,True,True,morning,...,0,0,0,0,0.355045,0.303204,0.334927,0,0,0
4,4,rural,1,0.58,60,daylight,foggy,False,False,evening,...,8,2,2,2,0.343822,0.350481,0.333821,0,0,0


In [7]:
# Mapping a column
map_col = "num_reported_accidents"
map_num_reported = {0:0, 1:0, 2:0, 3:2, 4:4, 5:3, 6:1, 7:0}
train_data[map_col] = train_data[map_col].map(map_num_reported)
test_data[map_col] = test_data[map_col].map(map_num_reported)

# Dropping unnecessary columns
remove = categ_feats + num_feats + ["road_signs_present"]
train_data = train_data.drop(columns=remove)
test_data = test_data.drop(columns=remove)

# Dropping ID and duplicates
train_data.drop(columns="id", inplace=True)
train_data.drop_duplicates(inplace=True)

train_data.head()

Unnamed: 0,public_road,holiday,school_season,accident_risk,road_type_freq,num_lanes_bin5,num_lanes_bin10,num_lanes_bin15,curvature_bin5,curvature_bin10,curvature_bin15,speed_limit_bin5,speed_limit_bin10,speed_limit_bin15,lighting_freq,weather_freq,time_of_day_freq,num_reported_accidents_bin5,num_reported_accidents_bin10,num_reported_accidents_bin15
0,True,False,True,0.13,0.330974,0,0,0,0,0,0,0,0,0,0.343822,0.303204,0.331252,0,0,0
1,False,True,True,0.35,0.330974,2,2,2,4,9,14,0,0,0,0.343822,0.346315,0.333821,0,0,0
2,True,True,False,0.3,0.333593,2,2,2,3,6,9,3,3,3,0.355045,0.346315,0.334927,1,1,1
3,True,False,False,0.21,0.335433,2,2,2,0,0,0,0,0,0,0.355045,0.303204,0.334927,0,0,0
4,False,True,False,0.56,0.333593,0,0,0,2,5,8,2,2,2,0.343822,0.350481,0.333821,0,0,0


In [8]:
train_data.to_csv(f"{MAIN_FOLDER}/data/prepared_train.csv")
test_data.to_csv(f"{MAIN_FOLDER}/data/prepared_test.csv")