In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e10/sample_submission.csv
/kaggle/input/playground-series-s5e10/train.csv
/kaggle/input/playground-series-s5e10/test.csv


## 1. Importing libraries

In [2]:
#for data processing 
import pandas as pd
import numpy as np

#for data processing (specific to ML modelling)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, LabelEncoder

#for training + eval
from sklearn.metrics import mean_squared_error, r2_score
from lightgbm import LGBMRegressor

## 2. Reading in libraries 

In [3]:
df = pd.read_csv("/kaggle/input/playground-series-s5e10/train.csv")
df_test = pd.read_csv("/kaggle/input/playground-series-s5e10/test.csv")
df_sub = pd.read_csv("/kaggle/input/playground-series-s5e10/sample_submission.csv")


## 3. Exploratory Data Analysis 

In [4]:
df.head()

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
0,0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3
3,3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56


In [5]:
df_test.head()

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents
0,517754,highway,2,0.34,45,night,clear,True,True,afternoon,True,True,1
1,517755,urban,3,0.04,45,dim,foggy,True,False,afternoon,True,False,0
2,517756,urban,2,0.59,35,dim,clear,True,False,afternoon,True,True,1
3,517757,rural,4,0.95,35,daylight,rainy,False,False,afternoon,False,False,2
4,517758,highway,2,0.86,35,daylight,clear,True,False,evening,False,True,3


In [6]:
#checking null values for training data 
df.isnull().sum()

id                        0
road_type                 0
num_lanes                 0
curvature                 0
speed_limit               0
lighting                  0
weather                   0
road_signs_present        0
public_road               0
time_of_day               0
holiday                   0
school_season             0
num_reported_accidents    0
accident_risk             0
dtype: int64

In [7]:


#checking null values for testing data 
df_test.isnull().sum()

id                        0
road_type                 0
num_lanes                 0
curvature                 0
speed_limit               0
lighting                  0
weather                   0
road_signs_present        0
public_road               0
time_of_day               0
holiday                   0
school_season             0
num_reported_accidents    0
dtype: int64

## 4. Data Preprocessing

### This includes cleaning the data, assigning X and y etc. 

In [8]:
# for training data
X = df.drop(columns=['accident_risk'])
y = df['accident_risk']

In [9]:
#for test data 
X_test_final = df_test

#why doesn't test data have target

In [11]:
# retrieving categorical columns and numerical columns 
numerical_cols =  df.select_dtypes(include="number").columns.tolist()
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns.tolist()


print(f"categorical columns : {categorical_cols}")
print(f"numerical columns : {numerical_cols}")

categorical columns : ['road_type', 'lighting', 'weather', 'road_signs_present', 'public_road', 'time_of_day', 'holiday', 'school_season']
numerical columns : ['id', 'num_lanes', 'curvature', 'speed_limit', 'num_reported_accidents', 'accident_risk']


In [12]:
#apply label encoding consistently to both train and test
for col in categorical_cols:
    le = LabelEncoder()
    #fit on combined data to ensure all categories are seen
    combined = pd.concat([X[col], X_test_final[col]], axis=0)
    le.fit(combined)
    X[col] = le.transform(X[col])
    X_test_final[col] = le.transform(X_test_final[col])

In [13]:
X.head()

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents
0,0,2,2,0.06,35,0,2,0,1,0,0,1,1
1,1,2,4,0.99,35,0,0,1,0,1,1,1,0
2,2,1,4,0.63,70,1,0,0,1,2,1,0,2
3,3,0,4,0.07,35,1,2,1,1,2,0,0,1
4,4,1,1,0.58,60,0,1,0,0,1,1,0,1


In [14]:
X_test_final.head()

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents
0,517754,0,2,0.34,45,2,0,1,1,0,1,1,1
1,517755,2,3,0.04,45,1,1,1,0,0,1,0,0
2,517756,2,2,0.59,35,1,0,1,0,0,1,1,1
3,517757,1,4,0.95,35,0,2,0,0,0,0,0,2
4,517758,0,2,0.86,35,0,0,1,0,1,0,1,3


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## 5. Training the data + Making predictions 

In [16]:
# state the model + parameters
model = LGBMRegressor(
    objective='regression',
    metric='rmse',
    learning_rate=0.05,
    num_leaves=31,
    random_state=42
)

#fit the model/ training it 
model.fit(X_train, y_train)

#make predictions
y_pred = model.predict(X_test)

# Show first few predictions
print("Sample predictions for X_test:")
print(y_pred[:10])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024406 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 422
[LightGBM] [Info] Number of data points in the train set: 414203, number of used features: 13
[LightGBM] [Info] Start training from score 0.352605
Sample predictions for X_test:
[0.13356373 0.33036508 0.25712421 0.2914435  0.32241881 0.03940119
 0.64863996 0.23617857 0.27664685 0.30141647]


## 6. Evaluation


In [17]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"R²:   {r2:.4f}")

RMSE: 0.0566
R²:   0.8842


### BUT WE HAVE TO MAKE FINAL PREDICTIONS ON THE GIVEN TEST DATA TO SUBMIT

In [18]:
y_pred_final = model.predict(X_test_final)
# Show first few predictions
print("Sample predictions for X_test:")
print(y_pred_final[:10])

Sample predictions for X_test:
[0.29065006 0.12674676 0.18886699 0.33736463 0.41899801 0.470082
 0.25985551 0.19631396 0.35436147 0.31686013]


## 7. Submission

In [21]:
# Fill the existing empty DataFrame

submission = pd.DataFrame({'id': df_test['id'], 'accident_risk': y_pred_final})
submission

# Save to CSV
submission.to_csv('submission.csv', index=False)