In [0]:
%pip install xgboost

# Restart the Python process after the install; presumably required so the
# newly installed xgboost is importable in later cells — confirm against the
# Databricks `dbutils.library` documentation.
dbutils.library.restartPython()

In [0]:
import pandas as pd
import numpy as np
import joblib
import os
from datetime import datetime, timedelta
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, precision_recall_curve)

# Table the pipeline reads its feature rows from.
DATA_TABLE = "final_project.gold.ml_features"
# Directory under which every artifact below is written.
VOLUME_PATH = "/Volumes/final_project/graphs/graphs/"

# Artifact locations, all rooted at VOLUME_PATH.
MODEL_FILE = os.path.join(VOLUME_PATH, "police_model_final.pkl")
METRICS_FILE = os.path.join(VOLUME_PATH, "model_metrics_final.txt")
# The schedule file name is completed at run time with a date and ".csv".
SCHEDULE_FILE_PREFIX = os.path.join(VOLUME_PATH, "patrol_schedule_final_")

# Feature columns taken from the table as-is (before risk encodings are added).
BASE_FEATURES = [
    "hour_sin",
    "hour_cos",
    "day_of_week",
    "month_sin",
    "month_cos",
    "is_weekend",
]

def _add_risk_features(frame, district_risk_map, interaction_risk_map, global_mean_risk):
    """Return a copy of *frame* with the two target-encoded risk columns added.

    Keys missing from a map (unseen district or district/hour pair) fall back
    to ``global_mean_risk``. The input frame is not modified, which avoids
    writing into the slices returned by ``train_test_split``.
    """
    return frame.assign(
        district_risk=frame['District'].map(district_risk_map).fillna(global_mean_risk),
        interaction_risk=frame['dist_hour_key'].map(interaction_risk_map).fillna(global_mean_risk),
    )


def _select_threshold(precisions, recalls, thresholds, target_recall, fallback=0.5):
    """Pick the highest-precision threshold whose recall is >= *target_recall*.

    ``precisions`` and ``recalls`` carry one more point than ``thresholds``
    (the trailing point has no threshold), so only the first
    ``len(thresholds)`` points are considered. Returns *fallback* when no
    point reaches the target recall.
    """
    n_thresholds = len(thresholds)
    eligible = np.flatnonzero(recalls[:n_thresholds] >= target_recall)
    if eligible.size == 0:
        return fallback
    best = eligible[np.argmax(precisions[eligible])]
    return float(thresholds[best])


def _build_schedule_frame(start_time, districts, district_risk_map,
                          interaction_risk_map, global_mean_risk, horizon_hours=24):
    """Build one feature row per (hour, district) for the coming hours.

    Rows start at *start_time* and advance in one-hour steps for
    *horizon_hours* steps. Risk encodings are looked up from the training
    maps, defaulting to ``global_mean_risk`` for unseen keys.
    """
    rows = []
    for offset in range(horizon_hours):
        slot = start_time + timedelta(hours=offset)
        weekday = slot.weekday()  # Python convention: Monday=0 .. Sunday=6
        hour = slot.hour
        time_features = {
            'hour_sin': np.sin(2 * np.pi * hour / 24),
            'hour_cos': np.cos(2 * np.pi * hour / 24),
            # Maps Sunday->1, Monday->2, ..., Saturday->7.
            # NOTE(review): presumably mirrors Spark's dayofweek() numbering
            # used when the feature table was built — confirm.
            'day_of_week': 1 if weekday == 6 else weekday + 2,
            'month_sin': np.sin(2 * np.pi * slot.month / 12),
            'month_cos': np.cos(2 * np.pi * slot.month / 12),
            'is_weekend': 1 if weekday in (5, 6) else 0,
        }
        for dist in districts:
            # Must match the training key format "<district>_<hour>".
            dist_key = f"{dist}_{hour}"
            rows.append({
                **time_features,
                'district_risk': district_risk_map.get(dist, global_mean_risk),
                'interaction_risk': interaction_risk_map.get(dist_key, global_mean_risk),
                'Timestamp': slot,
                'District_ID': dist,
            })
    return pd.DataFrame(rows)


def run_pipeline():
    """Train the violent-crime classifier, persist it, and emit a 24h schedule.

    Steps:
      1. Load ``DATA_TABLE`` through the ``spark`` session (not defined in this
         file; assumed to be provided by the notebook runtime) into pandas.
      2. Derive the binary target ``Is_Violent`` and a district/hour key.
      3. Target-encode district and district/hour risk on the training split
         only, then train an ``XGBClassifier``.
      4. Choose a decision threshold with recall >= 0.60 and maximal
         precision, write metrics to ``METRICS_FILE`` and the model artifact
         to ``MODEL_FILE``.
      5. Score every district for the next 24 hours and write the rows
         labelled ``DISPATCH`` to a dated CSV.

    Returns ``None``. Load failures and a missing ``label_category`` column
    are reported via ``print`` and end the run early; other errors propagate.
    """
    print(f"Starting Final Balanced Pipeline using {DATA_TABLE}")

    try:
        df = spark.table(DATA_TABLE).toPandas()
    except Exception as e:
        # Top-level boundary: report and stop rather than crash the notebook.
        print(f"Error: {e}")
        return
    print(f"Loaded {len(df)} records.")

    df['District'] = df['District'].astype(int)

    try:
        labels = df['label_category']
    except KeyError:
        print("Column 'label_category' not found!")
        return
    df['Is_Violent'] = (labels == 'Violent').astype(int)

    if 'crime_hour' in df.columns:
        # Normalise to a nullable integer so the key reads "5_12" and never
        # "5_12.0"; a float hour column would otherwise make every schedule
        # lookup below miss and silently fall back to the global mean.
        df['hour_int'] = pd.to_numeric(df['crime_hour'], errors='coerce').round().astype('Int64')
    else:
        # No hour information available: every row shares the same bucket.
        df['hour_int'] = 12

    df['dist_hour_key'] = df['District'].astype(str) + "_" + df['hour_int'].astype(str)

    X = df[BASE_FEATURES + ['District', 'dist_hour_key']]
    y = df['Is_Violent']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print("Applying Target Encoding...")

    # Encodings are fitted on the training split only, so the test labels do
    # not leak into the features.
    global_mean_risk = y_train.mean()
    district_risk_map = y_train.groupby(X_train['District']).mean().to_dict()
    interaction_risk_map = y_train.groupby(X_train['dist_hour_key']).mean().to_dict()

    X_train = _add_risk_features(X_train, district_risk_map, interaction_risk_map, global_mean_risk)
    X_test = _add_risk_features(X_test, district_risk_map, interaction_risk_map, global_mean_risk)

    final_features = BASE_FEATURES + ['district_risk', 'interaction_risk']

    X_train_final = X_train[final_features]
    X_test_final = X_test[final_features]

    print("Step 1: Training Model (Balanced)...")

    xgb = XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        scale_pos_weight=1,
        n_estimators=400,
        learning_rate=0.05,
        max_depth=10,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )

    xgb.fit(X_train_final, y_train)

    print("Step 2: Metrics Evaluation")
    y_proba = xgb.predict_proba(X_test_final)[:, 1]

    precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)

    target_recall = 0.60
    # NOTE(review): the threshold is tuned on the same split the metrics are
    # reported on, so the figures below are likely optimistic.
    optimal_threshold = _select_threshold(precisions, recalls, thresholds, target_recall)

    y_pred = (y_proba >= optimal_threshold).astype(int)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)

    report = f"""
ACCURACY:  {acc:.4f}
PRECISION: {prec:.4f}
RECALL:    {rec:.4f}
F1_SCORE:  {f1:.4f}
ROC_AUC:   {auc:.4f}
THRESHOLD: {optimal_threshold:.4f}
"""
    print(report)

    with open(METRICS_FILE, "w", encoding="utf-8") as f:
        f.write(report)

    # Everything needed to score new rows: model, cut-off, column order and
    # the encoding maps with their fallback value.
    artifact = {
        "model": xgb,
        "threshold": optimal_threshold,
        "features": final_features,
        "district_risk_map": district_risk_map,
        "interaction_risk_map": interaction_risk_map,
        "global_mean_risk": global_mean_risk,
        "version": "xgboost_final"
    }

    joblib.dump(artifact, MODEL_FILE)
    print(f"Model saved to {MODEL_FILE}")

    print("Step 3: Generating Schedule")
    # NOTE(review): naive local time of the driver; confirm it matches the
    # timezone the training hours were recorded in.
    start_time = datetime.now().replace(minute=0, second=0, microsecond=0)
    districts = sorted(df['District'].unique().tolist())

    df_batch = _build_schedule_frame(
        start_time, districts, district_risk_map, interaction_risk_map, global_mean_risk
    )
    probs = xgb.predict_proba(df_batch[final_features])[:, 1]

    df_batch['Risk_Probability'] = probs
    # Same ">=" rule as the evaluation above so a probability exactly at the
    # threshold is treated identically in metrics and in the schedule.
    df_batch['Action'] = np.where(probs >= optimal_threshold, "DISPATCH", "MONITOR")

    schedule_filename = f"{SCHEDULE_FILE_PREFIX}{start_time.date()}.csv"
    output_cols = ['Timestamp', 'District_ID', 'Risk_Probability', 'Action']

    dispatch_rows = df_batch.loc[df_batch['Action'] == "DISPATCH", output_cols]
    dispatch_rows.sort_values(by='Risk_Probability', ascending=False) \
        .to_csv(schedule_filename, index=False)

    print(f"Schedule saved: {schedule_filename}")
    print("Pipeline Complete.")

# Entry point: run the whole pipeline when this file is executed directly;
# the guard is false when the module is imported, so importing has no effect.
if __name__ == "__main__":
    run_pipeline()