In [2]:
# 📦 Import required libraries
import pandas as pd
import numpy as np
import joblib
from sklearn.impute import SimpleImputer

# 📥 Restore the persisted classifiers and their decision thresholds
# NOTE(review): joblib.load deserialises pickled objects, so these files
# should only ever come from a trusted source.
tuned_rf = joblib.load('calibrated_random_forest_model.pkl')
tuned_xgb = joblib.load('calibrated_xgboost_model.pkl')
thresholds = joblib.load('optimal_thresholds.pkl')  # looked up by the keys 'rf' and 'xgb' below

# One cut-off per model; the actual values are echoed by the print that follows.
rf_threshold, xgb_threshold = thresholds['rf'], thresholds['xgb']

print(f"✅ Loaded models and thresholds: RF={rf_threshold}, XGB={xgb_threshold}")

# 📂 Read the dataset to score from CSV
data_path = "D:/stock_market_crash_prediction/data/processed/merged_data_top20_features.csv"  # Replace with your file
data = pd.read_csv(data_path)
print(f"✅ Dataset loaded! Shape: {data.shape}")

# 🎯 Split into the feature frame X and the optional target y.
# The 'is_crash' column is optional: without it every column is treated as a
# feature and y stays None. With it, rows whose target is missing are removed
# from `data` in place before the column is separated out.
if 'is_crash' not in data.columns:
    X, y = data.copy(), None
else:
    data.dropna(subset=['is_crash'], inplace=True)
    print("✅ Dropped rows with missing target.")
    X, y = data.drop(columns=['is_crash']), data['is_crash']

# 🧽 Fill missing feature values using the median strategy
# NOTE(review): the imputer is fitted on X itself, i.e. on the data being
# scored — confirm this matches how the models were trained.
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

# 🔮 Score with both models; column index 1 of predict_proba is what the
# rest of the cell stores as the crash probability.
rf_probs, xgb_probs = (
    model.predict_proba(X_imputed)[:, 1] for model in (tuned_rf, tuned_xgb)
)

# 🏁 Label a row 1 when its probability is at or above that model's threshold, else 0
rf_preds, xgb_preds = (
    (probs >= cutoff).astype(int)
    for probs, cutoff in ((rf_probs, rf_threshold), (xgb_probs, xgb_threshold))
)

# 📊 Assemble the output table: a copy of the features, the known target
# (only when one was loaded) and, per model, its probability and 0/1 label.
results_df = X.copy()
if y is not None:
    results_df['Actual_is_crash'] = y

# assign() appends the keyword columns in the order written here.
results_df = results_df.assign(
    RF_Crash_Probability=rf_probs,
    RF_Crash_Prediction=rf_preds,
    XGB_Crash_Probability=xgb_probs,
    XGB_Crash_Prediction=xgb_preds,
)

print("✅ Predictions added to dataset!")
print(results_df.head())

# 💾 Persist the scored table next to the notebook (row index not written)
results_df.to_csv("deployment_predictions.csv", index=False)
print("✅ Results saved as 'deployment_predictions.csv'")

# Optional summary: how often each (RF label, XGB label) combination occurs
prediction_columns = ['RF_Crash_Prediction', 'XGB_Crash_Prediction']
prediction_counts = results_df[prediction_columns].value_counts()
print("\n📊 Prediction Summary:")
print(prediction_counts)


✅ Loaded models and thresholds: RF=0.4, XGB=0.9987601960802718
✅ Dataset loaded! Shape: (3646, 21)
✅ Dropped rows with missing target.
✅ Predictions added to dataset!
   future_return    Close_y     High_y    Volume_x      Low_y       High_x  \
0       0.012348  20.040001  21.680000  3991400000  20.030001  1133.869995   
1      -0.000264  19.350000  20.129999  2491020000  19.340000  1136.630005   
2       0.007510  19.160000  19.680000  4972660000  18.770000  1139.189941   
3       0.005930  19.059999  19.709999  5270680000  18.700001  1142.459961   
4      -0.007817  18.129999  19.270000  4389590000  18.110001  1145.390015   

        Open_x     Open_y      Close_x        Low_x  ...  \
0  1116.560059  21.680000  1132.989990  1116.560059  ...   
1  1132.660034  20.049999  1136.520020  1129.660034  ...   
2  1135.709961  19.590000  1137.140015  1133.949951  ...   
3  1136.270020  19.680000  1141.689941  1131.319946  ...   
4  1140.520020  19.270000  1144.979980  1136.219971  ...   

   

In [6]:
# 🧪 Build a 100-row sample input file from the scored results.
# The scored CSV carries columns the prediction cell itself added (the four
# *_Crash_Probability / *_Crash_Prediction columns) and stores the target as
# 'Actual_is_crash'. Left in place, they would be treated as extra features
# if this sample were fed back through the prediction cell, so they are
# dropped and the target gets its input name 'is_crash' back.
# errors='ignore' and rename() both tolerate a scored file that lacks some of
# these columns (e.g. one produced from data without a target).
sample = (
    pd.read_csv(
        'D:/stock_market_crash_prediction/notebooks/deployment_predictions.csv',
        nrows=100,  # only the first 100 rows are needed; skip parsing the rest
    )
    .drop(
        columns=[
            'RF_Crash_Probability',
            'RF_Crash_Prediction',
            'XGB_Crash_Probability',
            'XGB_Crash_Prediction',
        ],
        errors='ignore',
    )
    .rename(columns={'Actual_is_crash': 'is_crash'})
)
sample.to_csv('D:/stock_market_crash_prediction/data/sample_input.csv', index=False)
