In [264]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [265]:
us_accidents = pd.read_csv("C:/Users/Hobbs/OneDrive/Documents/SpartaHack2026/ShouldDrive_Spartahack11/US_Accidents_March23.csv")

In [266]:
us_accidents.head()


Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,Source2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,Source2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,0.01,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,0.01,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,0.01,...,False,False,False,False,True,False,Day,Day,Day,Day


In [267]:
michigan = us_accidents.loc[us_accidents['State'] == 'MI'].copy()


In [268]:
michigan['Risk_Level'] = np.where(
    michigan['Severity'] == 1,
    0,  # Low risk
    1   # High risk
)


In [269]:
michigan.columns

Index(['ID', 'Source', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat',
       'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 'Description',
       'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone',
       'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)',
       'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity',
       'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
       'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
       'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight', 'Risk_Level'],
      dtype='object')

In [270]:
model_cols = [
    "Risk_Level",
    "City",
    "Zipcode",
    "Temperature(F)",
    "Wind_Chill(F)",
    "Humidity(%)",
    "Pressure(in)",
    "Visibility(mi)",
    "Wind_Direction",
    "Wind_Speed(mph)",
    "Precipitation(in)",
    "Weather_Condition",
    "Sunrise_Sunset"
]

michigan_model = michigan[model_cols].copy()


In [271]:
michigan_model = michigan_model.dropna()

In [272]:
import numpy as np
import pandas as pd

def make_synth_low_risk(
    df: pd.DataFrame,
    n_new: int,
    risk_col: str = "Risk_Level",
    low_value: int = 0,
    seed: int = 42,
    numeric_jitter_frac: float = 0.05,
) -> pd.DataFrame:
    rng = np.random.default_rng(seed)

    low_df = df[df[risk_col] == low_value].copy()
    if low_df.empty:
        raise ValueError(f"No rows found where {risk_col} == {low_value!r}")

    # Bootstrap rows to preserve cross-feature relationships
    base_idx = rng.integers(0, len(low_df), size=n_new)
    synth = low_df.iloc[base_idx].copy()

    # Identify feature columns (exclude label only)
    feature_cols = [c for c in df.columns if c != risk_col]

    # Numeric vs categorical
    numeric_cols = [c for c in feature_cols if pd.api.types.is_numeric_dtype(df[c])]
    cat_cols = [c for c in feature_cols if c not in numeric_cols]

    # Resample categoricals from LOW distribution
    for c in cat_cols:
        probs = low_df[c].value_counts(normalize=True, dropna=True)
        if not probs.empty:
            synth[c] = rng.choice(
                probs.index.to_numpy(),
                size=n_new,
                replace=True,
                p=probs.to_numpy()
            )

    # Jitter numeric columns
    for c in numeric_cols:
        col = low_df[c].astype(float)
        sd = np.nanstd(col)
        if sd > 0:
            noise = rng.normal(0.0, sd * numeric_jitter_frac, size=n_new)
            synth[c] = synth[c].astype(float) + noise

    # Michigan-realistic clipping
    clip_bounds = {
        "Humidity(%)": (0, 100),
        "Visibility(mi)": (0, 30),
        "Wind_Speed(mph)": (0, 80),
        "Precipitation(in)": (0, 6),
        "Pressure(in)": (27, 32),
        "Temperature(F)": (-20, 100),
        "Wind_Chill(F)": (-60, 100),
    }

    for c, (lo, hi) in clip_bounds.items():
        if c in synth.columns and pd.api.types.is_numeric_dtype(synth[c]):
            synth[c] = synth[c].clip(lo, hi)

    # Ensure label consistency
    synth[risk_col] = low_value

    return synth


In [273]:
def make_synth_nonaccidents_mi(
    df: pd.DataFrame,
    n_new: int,
    seed: int = 42,
    quantile_low: float = 0.10,
    quantile_high: float = 0.90,
) -> pd.DataFrame:
    rng = np.random.default_rng(seed)
    synth = pd.DataFrame(index=range(n_new))

    # Columns to sample categorically from observed Michigan distribution
    cat_cols = [c for c in ["City", "Zipcode", "Sunrise_Sunset", "Weather_Simple"] if c in df.columns]
    for c in cat_cols:
        probs = df[c].value_counts(normalize=True, dropna=True)
        synth[c] = rng.choice(probs.index.to_numpy(), size=n_new, replace=True, p=probs.to_numpy())

    # Numeric weather columns: sample from "normal" quantile band to avoid storms/extremes
    num_cols = [c for c in [
        "Temperature(F)", "Wind_Chill(F)", "Humidity(%)", "Pressure(in)",
        "Visibility(mi)", "Wind_Speed(mph)", "Precipitation(in)"
    ] if c in df.columns]

    for c in num_cols:
        x = pd.to_numeric(df[c], errors="coerce").dropna()
        if x.empty:
            synth[c] = np.nan
            continue
        lo = x.quantile(quantile_low)
        hi = x.quantile(quantile_high)
        # Uniform within typical band; add tiny noise
        vals = rng.uniform(lo, hi, size=n_new)
        vals = vals + rng.normal(0, np.nanstd(x) * 0.01, size=n_new)
        synth[c] = vals

    # Clip to Michigan-realistic bounds
    clip_bounds = {
        "Humidity(%)": (0, 100),
        "Visibility(mi)": (0, 30),
        "Wind_Speed(mph)": (0, 80),
        "Precipitation(in)": (0, 6),
        "Pressure(in)": (27, 32),
        "Temperature(F)": (-20, 100),
        "Wind_Chill(F)": (-60, 100),
    }
    for c, (lo, hi) in clip_bounds.items():
        if c in synth.columns:
            synth[c] = pd.to_numeric(synth[c], errors="coerce").clip(lo, hi)

    # This is the NEW target for accident vs non-accident tasks
    synth["Risk_Level"] = 0

    return synth


In [274]:
nonacc = make_synth_nonaccidents_mi(michigan_model, n_new=9000, seed=42)

acc = michigan_model.copy()


acc_vs_nonacc = pd.concat([acc, nonacc], ignore_index=True)
acc_vs_nonacc["Risk_Level"].value_counts()


Risk_Level
1    96292
0    10023
Name: count, dtype: int64

In [275]:
n_to_add = 9000
synth_low = make_synth_low_risk(acc_vs_nonacc, n_new=n_to_add, low_value=0, seed=42)

michigan_augmented = pd.concat(
    [acc_vs_nonacc, synth_low],
    ignore_index=True
)

print(michigan_model.shape, "->", michigan_augmented.shape)
print(michigan_augmented["Risk_Level"].value_counts())


(97315, 13) -> (115315, 13)
Risk_Level
1    96292
0    19023
Name: count, dtype: int64


In [276]:
michigan_augmented.describe(include = 'all')

Unnamed: 0,Risk_Level,City,Zipcode,Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),Weather_Condition,Sunrise_Sunset
count,115315.0,115315,115315.0,115315.0,115315.0,115315.0,115315.0,115315.0,106315,115315.0,115315.0,106315,115315
unique,,496,11538.0,,,,,,23,,,68,2
top,,Grand Rapids,48507.0,,,,,,W,,,Fair,Day
freq,,18644,2207.0,,,,,,10852,,,32876,74312
mean,0.835034,,,47.685997,43.349586,71.118792,29.262687,7.787946,,9.264201,0.006158,,
std,0.371151,,,20.488497,24.362786,17.570829,0.336735,3.135912,,5.141775,0.029787,,
min,0.0,,,-19.0,-35.6,5.0,27.65,0.0,,0.0,0.0,,
25%,1.0,,,31.0,24.0,58.038373,29.04,5.705392,,6.0,0.0,,
50%,1.0,,,45.339122,41.0,74.0,29.22,10.0,,9.0,0.0,,
75%,1.0,,,65.26949,65.0,85.0,29.43,10.0,,13.0,0.000702,,


In [277]:
michigan_augmented["Weather_Condition"].unique()

array(['Overcast', 'Light Rain', 'Light Snow', 'Light Drizzle', 'Snow',
       'Rain', 'Heavy Snow', 'Mostly Cloudy', 'Haze', 'Scattered Clouds',
       'Light Freezing Rain', 'Light Freezing Drizzle', 'Partly Cloudy',
       'Heavy Rain', 'Heavy Thunderstorms and Rain', 'Light Ice Pellets',
       'Fog', 'Light Rain Showers', 'Clear', 'Thunderstorms and Rain',
       'Light Thunderstorms and Rain', 'Fair', 'Cloudy', 'Patches of Fog',
       'Shallow Fog', 'Light Rain with Thunder', 'Thunder', 'T-Storm',
       'Heavy T-Storm', 'Showers in the Vicinity',
       'Partly Cloudy / Windy', 'Fair / Windy', 'Thunder in the Vicinity',
       'Rain / Windy', 'Cloudy / Windy', 'Mostly Cloudy / Windy',
       'Light Rain / Windy', 'Wintry Mix', 'Light Snow / Windy',
       'Haze / Windy', 'Blowing Snow / Windy', 'N/A Precipitation',
       'Sleet', 'Thunder / Wintry Mix', 'Mist', 'Drizzle',
       'Sleet / Windy', 'Snow / Windy', 'Heavy Rain / Windy',
       'Light Drizzle / Windy', 'T-Storm / W

In [278]:
import numpy as np

w = (
    michigan_augmented["Weather_Condition"]
    .astype(str)          # protects against NaN
    .str.lower()
)

conditions = [
    w.str.contains(r"thunder|t[-\s]?storm|hail", regex=True),
    w.str.contains(r"snow|sleet|ice|freezing|wintry", regex=True),
    w.str.contains(r"rain|drizzle|shower", regex=True),
    w.str.contains(r"fog|mist|haze", regex=True),
    w.str.contains(r"cloud|overcast", regex=True),
    w.str.contains(r"clear|fair", regex=True),
]

choices = [
    "Storm",
    "Snow_Ice",
    "Rain",
    "Fog_Haze",
    "Cloudy",
    "Clear",
]

michigan_augmented["Weather_Simple"] = np.select(
    conditions,
    choices,
    default="Other"
)

michigan_augmented = michigan_augmented.drop(columns="Weather_Condition")


In [279]:
michigan_augmented.describe(include='all')

Unnamed: 0,Risk_Level,City,Zipcode,Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),Sunrise_Sunset,Weather_Simple
count,115315.0,115315,115315.0,115315.0,115315.0,115315.0,115315.0,115315.0,106315,115315.0,115315.0,115315,115315
unique,,496,11538.0,,,,,,23,,,2,7
top,,Grand Rapids,48507.0,,,,,,W,,,Day,Cloudy
freq,,18644,2207.0,,,,,,10852,,,74312,45637
mean,0.835034,,,47.685997,43.349586,71.118792,29.262687,7.787946,,9.264201,0.006158,,
std,0.371151,,,20.488497,24.362786,17.570829,0.336735,3.135912,,5.141775,0.029787,,
min,0.0,,,-19.0,-35.6,5.0,27.65,0.0,,0.0,0.0,,
25%,1.0,,,31.0,24.0,58.038373,29.04,5.705392,,6.0,0.0,,
50%,1.0,,,45.339122,41.0,74.0,29.22,10.0,,9.0,0.0,,
75%,1.0,,,65.26949,65.0,85.0,29.43,10.0,,13.0,0.000702,,


In [280]:
# Features and target
X = michigan_model.drop(columns=["Risk_Level"])
y = michigan_model["Risk_Level"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


Train size: (77852, 12)
Test size: (19463, 12)


In [281]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_cols = X.select_dtypes(include=["object", "category"]).columns
numeric_cols = X.select_dtypes(exclude=["object", "category"]).columns

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)


In [282]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=10,
    class_weight= "balanced",
    random_state=42,
    n_jobs=-1,
)

model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("rf", rf)
    ]
)

model.fit(X_train, y_train)


0,1,2
,steps,"[('preprocess', ...), ('rf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,10
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [283]:
y_prob = model.predict_proba(X_test)[:, 1]

threshold = 0.67  # try 0.6–0.9
y_pred = (y_prob >= threshold).astype(int)



In [284]:
from sklearn.metrics import classification_report, confusion_matrix


print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.02      0.99      0.04       205
           1       1.00      0.48      0.65     19258

    accuracy                           0.49     19463
   macro avg       0.51      0.73      0.34     19463
weighted avg       0.99      0.49      0.64     19463

[[ 202    3]
 [9985 9273]]


In [285]:
sample_input = X_test.iloc[[0]]

probs = model.predict_proba(sample_input)[0]
classes = model.classes_

Risk_Level_output = dict(zip(classes, probs))
predicted_risk = classes[probs.argmax()]

print("Predicted risk:", predicted_risk)
print("Real Risk:", y_test.iloc[0])
print("Probabilities:", risk_output)


Predicted risk: 1
Real Risk: 1
Probabilities: {'High': np.float64(0.7983066418987046), 'Low': np.float64(0.20169335810126973)}


In [286]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

X = michigan_model.drop(columns=["Risk_Level"])
y = michigan_model["Risk_Level"].astype(str)

# Encode target
le = LabelEncoder()
y_enc = le.fit_transform(y) 

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, stratify=y_enc, random_state=42
)


In [287]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

xgb = XGBClassifier(
    objective="binary:logistic",
    n_estimators=600,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    min_child_weight=5,
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1
)

model = Pipeline([
    ("preprocess", preprocessor),
    ("xgb", xgb)
])

model.fit(X_train, y_train)


0,1,2
,steps,"[('preprocess', ...), ('xgb', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9
,device,
,early_stopping_rounds,
,enable_categorical,False


In [288]:
michigan_augmented.to_csv(
    "michigan_augmented.csv",
    index=False,
    encoding="utf-8",
)
