# Label Encoding

In [1]:
import pandas as pd

# Location of the input CSV, kept in a variable so it is easy to spot and change.
DATA_PATH = '/content/df_dropped_4.csv'
df = pd.read_csv(DATA_PATH)

In [2]:
# Show the dtype of every column; the object-typed columns are the ones that
# still hold non-numeric values.
df.dtypes

Source                 int64
Severity               int64
Start_Lat            float64
Start_Lng            float64
Distance(mi)         float64
Description           object
City                  object
County                object
State                 object
Airport_Code          object
Temperature(F)       float64
Humidity(%)          float64
Pressure(in)         float64
Visibility(mi)       float64
Wind_Direction         int64
Wind_Speed(mph)      float64
Precipitation(in)    float64
Weather_Condition     object
Amenity                 bool
Crossing                bool
Give_Way                bool
Junction                bool
No_Exit                 bool
Railway                 bool
Station                 bool
Stop                    bool
Traffic_Signal          bool
Sunrise_Sunset         int64
Hour                   int64
Day                    int64
Month                  int64
Year                   int64
Weekday                int64
Duration_Minutes     float64
Desc_Word_Coun

In [3]:
# Remove the 'Description' column from the frame.
df = df.drop(['Description'], axis=1)

In [4]:
from sklearn.preprocessing import LabelEncoder

labels = ['City', 'County', 'State', 'Airport_Code', 'Weather_Condition']

# Label encode each categorical column with its OWN fitted encoder.
# Reusing one LabelEncoder for every column overwrites its learned classes on
# each fit, so only the last column's mapping would survive. Keeping one encoder
# per column makes it possible to inverse_transform the integer codes, or to
# encode new data the same way when the trained model is reused later.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target
# values; OrdinalEncoder is the feature-side equivalent - consider switching.
label_encoders = {}
for label in labels:
    le = LabelEncoder()
    df[label] = le.fit_transform(df[label])
    label_encoders[label] = le  # column name -> fitted encoder

In [5]:
# 'Severity' is the target; every other column is used as a feature.
y = df['Severity']
X = df.drop(columns=['Severity'])

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing with a fixed seed; stratifying on y
# because this is a classification task.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [6]:
from sklearn.ensemble import RandomForestClassifier
# from tqdm import tqdm  # When using tqdm with random forest it becomes too slow as it needs "warm_start=True"

# Random forest settings gathered in one place.
rf_params = {
    'n_estimators': 100,  # number of trees in the forest
    'random_state': 42,   # fixed seed
    'n_jobs': 6,          # parallel workers
    'verbose': 1,         # print progress while fitting / predicting
}
rf = RandomForestClassifier(**rf_params)

# Train on the training partition.
rf.fit(X_train, y_train)

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  5.4min
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed: 12.7min finished


In [7]:
import joblib

# Persist the fitted forest to disk with joblib and report where it went.
filename = "/media/abood/backup/datasets/ENSA M2/ML/label_encoded_rf_model.pkl"
joblib.dump(rf, filename)
print("Model saved as {}".format(filename))

Model saved as /media/abood/backup/datasets/ENSA M2/ML/label_encoded_rf_model.pkl


In [8]:
# Predict the target for the held-out test features with the fitted forest.
y_pred = rf.predict(X_test)

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    6.4s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   17.7s finished


In [9]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)

# --- Metrics ---
# Precision, recall and F1 all use the same 'weighted' averaging option.
weighted = {'average': 'weighted'}
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, **weighted)
rec = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# --- Output ---
scalar_metrics = (
    ("📊 Accuracy:", acc),    # overall share of correct predictions
    ("📊 Precision:", prec),  # how many selected items were relevant
    ("📊 Recall:", rec),      # how many relevant items were selected
    ("📊 F1 Score:", f1),     # harmonic mean of precision and recall
)
for title, value in scalar_metrics:
    print(title, value)
print("📊 Confusion Matrix:\n", conf_matrix)
print("📊 Classification Report:\n", report)

📊 Accuracy: 0.9281101679044939
📊 Precision: 0.9259867462343904
📊 Recall: 0.9281101679044939
📊 F1 Score: 0.926674496162888
📊 Confusion Matrix:
 [[  10180    2619     271      11]
 [   1504 1106414   28709    8682]
 [    202   44861  185934     166]
 [     41   12999    2432   20726]]
📊 Classification Report:
               precision    recall  f1-score   support

           1       0.85      0.78      0.81     13081
           2       0.95      0.97      0.96   1145309
           3       0.86      0.80      0.83    231163
           4       0.70      0.57      0.63     36198

    accuracy                           0.93   1425751
   macro avg       0.84      0.78      0.81   1425751
weighted avg       0.93      0.93      0.93   1425751



# One-hot and Binary Encoding

In [1]:
import pandas as pd

# Location of the input CSV, kept in a variable so it is easy to spot and change.
DATA_PATH = '/content/df_dropped_4.csv'
df = pd.read_csv(DATA_PATH)

In [2]:
# Remove the 'Description' column from the frame.
df = df.drop(['Description'], axis=1)

In [3]:
!pip install category_encoders


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
import category_encoders as ce

# Columns to binary encode. The original note singles out 'City' and 'County'
# as having many more unique values than 'State'; 'Airport_Code' and
# 'Weather_Condition' are binary encoded here as well.
cols_to_encode = ['City', 'County', 'Airport_Code', 'Weather_Condition']

# Fit the binary encoder on those columns and get their encoded replacement.
encoder = ce.BinaryEncoder(cols=cols_to_encode)
encoded = encoder.fit_transform(df[cols_to_encode])

# Swap the original columns for the encoded ones in a single step.
df = pd.concat([df.drop(columns=cols_to_encode), encoded], axis=1)

In [5]:
# One-hot encode 'State': the column is replaced by one indicator column per
# distinct value, named 'State_<value>'.
df = pd.get_dummies(df, columns=['State'])

In [6]:
# Display the encoded frame as the cell output to check the new columns.
df

Unnamed: 0,Source,Severity,Start_Lat,Start_Lng,Distance(mi),Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,...,State_SD,State_TN,State_TX,State_UT,State_VA,State_VT,State_WA,State_WI,State_WV,State_WY
0,1,3,39.865147,-84.058723,0.010,36.9,91.0,29.68,10.0,0,...,False,False,False,False,False,False,False,False,False,False
1,1,2,39.928059,-82.831184,0.010,37.9,100.0,29.65,10.0,0,...,False,False,False,False,False,False,False,False,False,False
2,1,2,39.063148,-84.032608,0.010,36.0,100.0,29.67,10.0,13,...,False,False,False,False,False,False,False,False,False,False
3,1,3,39.747753,-84.205582,0.010,35.1,96.0,29.64,9.0,13,...,False,False,False,False,False,False,False,False,False,False
4,1,2,39.627781,-84.188354,0.010,36.0,89.0,29.65,6.0,13,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7128747,0,2,34.002480,-117.379360,0.543,86.0,40.0,28.92,10.0,15,...,False,False,False,False,False,False,False,False,False,False
7128748,0,2,32.766960,-117.148060,0.338,70.0,73.0,29.39,10.0,13,...,False,False,False,False,False,False,False,False,False,False
7128749,0,2,33.775450,-117.847790,0.561,73.0,64.0,29.74,10.0,12,...,False,False,False,False,False,False,False,False,False,False
7128750,0,2,33.992460,-118.403020,0.772,71.0,81.0,29.62,10.0,13,...,False,False,False,False,False,False,False,False,False,False


In [7]:
# 'Severity' is the target; every other column is used as a feature.
y = df['Severity']
X = df.drop(columns=['Severity'])

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing with a fixed seed; stratifying on y
# because this is a classification task.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Random forest settings gathered in one place.
rf_params = {
    'n_estimators': 100,  # number of trees in the forest
    'random_state': 42,   # fixed seed
    'n_jobs': 6,          # parallel workers
    'verbose': 1,         # print progress while fitting / predicting
}
rf = RandomForestClassifier(**rf_params)

# Train on the training partition.
rf.fit(X_train, y_train)

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  4.9min
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed: 12.6min finished


In [9]:
import joblib

# Persist the fitted forest to disk with joblib and report where it went.
filename = "/content/binary_encoded_rf_model_2.pkl"
joblib.dump(rf, filename)
print("Model saved as {}".format(filename))

Model saved as /media/abood/backup/datasets/ENSA M2/ML/binary_encoded_rf_model_2.pkl


In [10]:
# Predict the target for the held-out test features with the fitted forest.
y_pred = rf.predict(X_test)

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    8.8s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   26.5s finished


In [11]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)

# --- Metrics ---
# Precision, recall and F1 all use the same 'weighted' averaging option.
weighted = {'average': 'weighted'}
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, **weighted)
rec = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# --- Output ---
scalar_metrics = (
    ("📊 Accuracy:", acc),    # overall share of correct predictions
    ("📊 Precision:", prec),  # how many selected items were relevant
    ("📊 Recall:", rec),      # how many relevant items were selected
    ("📊 F1 Score:", f1),     # harmonic mean of precision and recall
)
for title, value in scalar_metrics:
    print(title, value)
print("📊 Confusion Matrix:\n", conf_matrix)
print("📊 Classification Report:\n", report)

📊 Accuracy: 0.9308238254786425
📊 Precision: 0.9285969568940233
📊 Recall: 0.9308238254786425
📊 F1 Score: 0.929209593574974
📊 Confusion Matrix:
 [[   9743    3080     250       8]
 [   1292 1110296   24802    8919]
 [    164   44191  186680     128]
 [     37   13259    2498   20404]]
📊 Classification Report:
               precision    recall  f1-score   support

           1       0.87      0.74      0.80     13081
           2       0.95      0.97      0.96   1145309
           3       0.87      0.81      0.84    231163
           4       0.69      0.56      0.62     36198

    accuracy                           0.93   1425751
   macro avg       0.84      0.77      0.80   1425751
weighted avg       0.93      0.93      0.93   1425751



# Conclusion

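Both models took about the same time to train (≈12.7 min with label encoding vs ≈12.6 min with binary and one-hot encoding), so no training performance was lost with binary and one-hot encoding despite the additional columns. And most importantly, we got a slightly higher weighted precision with binary and one-hot encoding (by about 0.003: 0.9286 vs 0.9260).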

In [6]:
# keep the binary encoded dataset to work on

import zipfile
import os

# Output locations: the plain CSV and the zip archive that will contain it.
csv_path = '/content/binary_encoded_df_2.csv'
zip_path = '/content/binary_encoded_df_2.zip'

# Step 1: write the frame to CSV without the index column.
df.to_csv(csv_path, index=False)

# Step 2: compress the CSV into the archive, stored under its bare file name
# rather than its full path.
with zipfile.ZipFile(zip_path, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    archive.write(csv_path, arcname=os.path.basename(csv_path))

# Confirm that both steps completed.
print("CSV file has been zipped and saved !")

CSV file has been zipped and saved !
