**Supervised Learning Models: Predictions**

In [31]:

# 📚 Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import shap
import matplotlib.pyplot as plt


In [32]:
# Read UK raw data for labelling the cols
UK_data = pd.read_csv('/Users/mahnooriqbal/COMP702 Project/ML-AI-Risk-Analysis-AV-Data-/Datasets/UK-cleaned_data.csv')
# Read US scaled data
US_data = pd.read_csv('/Users/mahnooriqbal/COMP702 Project/ML-AI-Risk-Analysis-AV-Data-/Datasets/US_imputed_raw_data.csv')

# Print shape of each DataFrame
UK_data = UK_data.drop(['longitude', 'latitude'], axis=1)


print("UK shape:", UK_data.shape)
print("US shape:", US_data.shape)



UK shape: (229782, 21)
US shape: (4159, 21)


  UK_data = pd.read_csv('/Users/mahnooriqbal/COMP702 Project/ML-AI-Risk-Analysis-AV-Data-/Datasets/UK-cleaned_data.csv')


In [33]:
# 🧹 Make sure both have a binary target column called 'severity'
# Assume: severity = 1 if fatal/severe, 0 otherwise
# Make sure to preprocess these before use (drop NA, encode categoricals, etc.)

# UK_df_scaled.head()
# US_df_scaled.head()
# we have to rescale these as we must merge the accidents into binary
print(UK_data['Highest Injury Severity Alleged'].unique())
print(US_data['Highest Injury Severity Alleged'].unique())

['Minor' 'Serious' 'Fatality']
['No Injuries Reported' 'Minor' 'Serious' 'Moderate' 'Fatality']


In [34]:
# For UK Dataset
UK_data['Highest Injury Severity Alleged'] = UK_data['Highest Injury Severity Alleged'].apply(lambda x: 0 if x == 'Minor' else 1)

# For US Dataset
def map_us_severity(x):
    if x in ['No Injuries Reported', 'Minor']:
        return 0
    else:  # Serious or Fatality
        return 1

US_data['Highest Injury Severity Alleged'] = US_data['Highest Injury Severity Alleged'].apply(map_us_severity)
# note uk data has no 'no induries hence we will binary minor and no injuries together 
# and serious, moderarte and fatality together as minor/no injury and serious/fatalities
print(UK_data['Highest Injury Severity Alleged'].unique())
print(US_data['Highest Injury Severity Alleged'].unique())

[0 1]
[0 1]


In [35]:
# reencode and scale 
from sklearn.preprocessing import LabelEncoder
from tabulate import tabulate
import pandas as pd

# Define your columns (fixed syntax errors and duplicates)
categorical_columns = [
    'Highest Injury Severity Alleged', 'Roadway Type', 'Lighting',
    'Weather', 'Roadway Surface', 'City', 'State',
    'Make', 'Model', 'Model Year', 'ADS Equipped?', 'Automation System Engaged?',
    'Posted Speed Limit (MPH)', 'SV Pre-Crash Movement', 'SV Contact Area'# Note: This should be numerical but included here for encoding
]

datetime_cols = ['Incident Date', 'Incident Time (24:00)']
numerical_cols = []  # Add any truly numerical columns here
index_cols = ['Report ID', 'Report Version']

US_categorical_cols = [
    'Highest Injury Severity Alleged', 'Roadway Type', 'Lighting',
    'Weather', 'Roadway Surface', 'City', 'State',
    'Make', 'Model', 'Model Year', 'ADS Equipped?', 'Automation System Engaged?',
    'Posted Speed Limit (MPH)', 'SV Pre-Crash Movement', 'SV Contact Area'# Note: This should be numerical but included here for encoding
]

UK_categorical_cols = [
    'Highest Injury Severity Alleged', 'Roadway Type', 'Lighting',
    'Weather', 'Roadway Surface', 'City', 'State',
    'Make', 'Model', 'Model Year', 
    'Posted Speed Limit (MPH)','SV Pre-Crash Movement', 'SV Contact Area' # Note: This should be numerical but included here for encoding
]

# === Define columns ===
categorical_columns = [
    'Highest Injury Severity Alleged', 'Roadway Type', 'Lighting',
    'Weather', 'Roadway Surface', 'City', 'State','Country','Report ID', 'Report Version',
    'Make', 'Model', 'Model Year', 'ADS Equipped?', 'Automation System Engaged?',
    'Posted Speed Limit (MPH)', 'SV Pre-Crash Movement', 'SV Contact Area','Crash With'
]

datetime_cols = ['Incident Date', 'Incident Time (24:00)']
numerical_cols = []  # Add real numerical columns here if any


def encode_data(df, categorical_cols, datetime_cols, numerical_cols, index_cols):
    """
    Encodes categorical columns using LabelEncoder and datetime columns as numeric values.
    """
    from sklearn.preprocessing import LabelEncoder
    import pandas as pd

    df_encoded = df.copy()
    encoders = {}

    # Encode categorical columns
    for col in categorical_cols:
        if col in df_encoded.columns:
            df_encoded[col] = df_encoded[col].astype(str).fillna("Unknown")
            le = LabelEncoder()
            df_encoded[col] = le.fit_transform(df_encoded[col])
            encoders[col] = le

    # Convert datetime columns to numeric formats
    if 'Incident Date' in datetime_cols and 'Incident Date' in df_encoded.columns:
        df_encoded['Incident Date'] = pd.to_datetime(df_encoded['Incident Date'], errors='coerce')
        df_encoded['Incident_Year'] = df_encoded['Incident Date'].dt.year
        df_encoded['Incident_Month'] = df_encoded['Incident Date'].dt.month
        df_encoded['Incident_DayOfWeek'] = df_encoded['Incident Date'].dt.dayofweek
        df_encoded.drop(columns=['Incident Date'], inplace=True)

    if 'Incident Time (24:00)' in datetime_cols and 'Incident Time (24:00)' in df_encoded.columns:
        time_parsed = pd.to_datetime(df_encoded['Incident Time (24:00)'], format='%H:%M:%S', errors='coerce')
        df_encoded['Incident Time (24:00)'] = (
            time_parsed.dt.hour.fillna(0).astype(int) * 3600 +
            time_parsed.dt.minute.fillna(0).astype(int) * 60 +
            time_parsed.dt.second.fillna(0).astype(int)
        )


    return df_encoded, encoders


# === Encode both datasets ===
US_encoded_df, us_encoders = encode_data(
    df=US_data, # already imputed
    categorical_cols=categorical_columns,
    datetime_cols=datetime_cols,
    numerical_cols=numerical_cols,
    index_cols=index_cols
)

UK_encoded_df, uk_encoders = encode_data(
    df=UK_data,
    categorical_cols=categorical_columns,
    datetime_cols=datetime_cols,
    numerical_cols=numerical_cols,
    index_cols=index_cols
)

# === Preview ===
print("US Data Shape:", US_encoded_df.shape)
print("UK Data Shape:", UK_encoded_df.shape)
print("\nUK Preview:")
print(tabulate(UK_encoded_df.head(), headers='keys', tablefmt='pretty'))

print("\nUS Preview:")
print(tabulate(US_encoded_df.head(), headers='keys', tablefmt='pretty'))

US Data Shape: (4159, 23)
UK Data Shape: (229782, 23)

UK Preview:
+---+-----------+----------------+------+-------+------------+---------------+----------------------------+-----------------------+------+-------+--------------+-----------------+--------------------------+----------+------------+---------------------------------+-----------------------+---------+-----------------+---------+---------------+----------------+--------------------+
|   | Report ID | Report Version | Make | Model | Model Year | ADS Equipped? | Automation System Engaged? | Incident Time (24:00) | City | State | Roadway Type | Roadway Surface | Posted Speed Limit (MPH) | Lighting | Crash With | Highest Injury Severity Alleged | SV Pre-Crash Movement | Weather | SV Contact Area | Country | Incident_Year | Incident_Month | Incident_DayOfWeek |
+---+-----------+----------------+------+-------+------------+---------------+----------------------------+-----------------------+------+-------+--------------+----------

In [36]:
from sklearn.preprocessing import StandardScaler
from tabulate import tabulate

def scale_and_preview(df_US, df_UK, scaler_type='standard'):
    """
    Scale two dataframes (e.g., US and UK) using StandardScaler or MinMaxScaler,
    and print their shapes and head previews.
    """
    # Choose the scaler
    if scaler_type == 'standard':
        scaler = StandardScaler()
    elif scaler_type == 'minmax':
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler()
    else:
        raise ValueError("scaler_type must be 'standard' or 'minmax'")
    
    # Scale both dataframes
    US_scaled = scaler.fit_transform(df_US)
    UK_scaled = scaler.transform(df_UK)  # assume same features

    # Convert back to DataFrames with original column names
    US_scaled_df = pd.DataFrame(US_scaled, columns=df_US.columns)
    UK_scaled_df = pd.DataFrame(UK_scaled, columns=df_UK.columns)

    # Preview
    print("US Data Shape:", US_scaled_df.shape)
    print("UK Data Shape:", UK_scaled_df.shape)

    print("\nUK Preview:")
    print(tabulate(UK_scaled_df.head(), headers='keys', tablefmt='pretty'))

    print("\nUS Preview:")
    print(tabulate(US_scaled_df.head(), headers='keys', tablefmt='pretty'))

    return US_scaled_df, UK_scaled_df
US_scaled_df, UK_scaled_df = scale_and_preview(US_encoded_df, UK_encoded_df, scaler_type='standard')

US Data Shape: (4159, 23)
UK Data Shape: (229782, 23)

UK Preview:
+---+---------------------+--------------------+---------------------+---------------------+--------------------+---------------------+----------------------------+-----------------------+---------------------+---------------------+---------------------+----------------------+--------------------------+---------------------+---------------------+---------------------------------+-----------------------+---------------------+---------------------+---------+--------------------+---------------------+----------------------+
|   |      Report ID      |   Report Version   |        Make         |        Model        |     Model Year     |    ADS Equipped?    | Automation System Engaged? | Incident Time (24:00) |        City         |        State        |    Roadway Type     |   Roadway Surface    | Posted Speed Limit (MPH) |      Lighting       |     Crash With      | Highest Injury Severity Alleged | SV Pre-Crash Movement |

In [37]:
# 📊 Shared Evaluation Function
def evaluate_model(y_true, y_pred, y_proba, model_name, dataset_label):
    print(f"\n📊 {model_name} Evaluation on {dataset_label} Dataset")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred))
    print("ROC AUC:", roc_auc_score(y_true, y_proba))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

In [39]:

# 🚦 Logistic Regression
def run_logistic_regression(df, label="Dataset"):
    print(f"\n🚦 Running Logistic Regression on {label}")
    df = df.dropna()
    X = pd.get_dummies(df.drop("Highest Injury Severity Alleged", axis=1), drop_first=True)
    y = df["Highest Injury Severity Alleged"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    evaluate_model(y_test, y_pred, y_proba, "Logistic Regression", label)

    print("\n🧠 Coefficients (Logistic Regression):")
    for feature, coef in zip(X_train.columns, model.coef_[0]):
        direction = "↑ increases" if coef > 0 else "↓ decreases"
        print(f"{feature}: {coef:.3f} → {direction} severity risk")

# ✅ Run Each Model Separately
# 🔹 Logistic Regression
run_logistic_regression(UK_scaled_df, label="UK (Human Driving)")
run_logistic_regression(US_scaled_df, label="US (Autonomous Driving)")



🚦 Running Logistic Regression on UK (Human Driving)


ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [40]:
# 🌲 Random Forest
def run_random_forest(df, label="Dataset"):
    print(f"\n🌲 Running Random Forest on {label}")
    df = df.dropna()
    X = pd.get_dummies(df.drop("Highest Injury Severity Alleged", axis=1), drop_first=True)
    y = df["Highest Injury Severity Alleged"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    evaluate_model(y_test, y_pred, y_proba, "Random Forest", label)

# 🔹 Random Forest
run_random_forest(UK_scaled_df, label="UK (Human Driving)")
run_random_forest(US_scaled_df, label="US (Autonomous Driving)")


🌲 Running Random Forest on UK (Human Driving)


ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [41]:
# ⚡ XGBoost + SHAP
def run_xgboost(df, label="Dataset"):
    print(f"\n⚡ Running XGBoost on {label}")
    df = df.dropna()
    X = pd.get_dummies(df.drop("Highest Injury Severity Alleged", axis=1), drop_first=True)
    y = df["Highest Injury Severity Alleged"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    evaluate_model(y_test, y_pred, y_proba, "XGBoost", label)

    # SHAP Explanation
    explainer = shap.Explainer(model, X_train)
    shap_values = explainer(X_test)

    print(f"\n📈 SHAP Summary Plot for {label}")
    shap.summary_plot(shap_values, X_test, show=False)
    plt.title(f"SHAP Feature Importance - {label}")
    plt.show()


# 🔹 XGBoost
run_xgboost(UK_scaled_df, label="UK (Human Driving)")
run_xgboost(US_scaled_df, label="US (Autonomous Driving)")


⚡ Running XGBoost on UK (Human Driving)


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got [-0.36713559  2.72378933]