In [7]:
import pandas as pd
import numpy as np
import datetime
import joblib  # For saving models
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import warnings

# Silence every warning category so the notebook output stays readable.
warnings.filterwarnings('ignore')

# ---------------------------------------------------------
# 1. LOAD & CLEAN DATA
# ---------------------------------------------------------
# Point this path at wherever the dataset lives in your environment.
df = pd.read_csv('/kaggle/input/dataset/synthetic_data_with_weather_kunda.csv')

# The four raw date/time columns are cast to text so they can be joined
# into timestamps later. The cast renders a missing cell as the literal
# string 'nan', so that string is mapped back to a real NaN right away.
time_cols = [
    'malfunction_start',
    'malfunction_start_time',
    'malfunction_end',
    'malfunction_end_time',
]
for col in time_cols:
    as_text = df[col].astype(str)
    df[col] = as_text.replace('nan', np.nan)

# Keep only the rows in which all four date/time parts are present.
df = df.dropna(subset=time_cols)
# Combine Date+Time
def combine_datetime(date_col, time_col):
    """Join a date column and a time column into one datetime column.

    The two inputs are concatenated with a single space between them and
    the result is parsed with ``pd.to_datetime``. Because parsing uses
    ``errors='coerce'``, values that cannot be parsed come back as ``NaT``
    instead of raising.
    """
    stamp_text = date_col + ' ' + time_col
    return pd.to_datetime(stamp_text, errors='coerce')

# Build real timestamps for both ends of each outage from the split
# date/time columns, then discard rows where either end failed to parse.
for stamp_col, source_col in (('Fault_Start', 'malfunction_start'),
                              ('Fault_End', 'malfunction_end')):
    df[stamp_col] = combine_datetime(df[source_col], df[source_col + '_time'])
df = df.dropna(subset=['Fault_Start', 'Fault_End'])

# ---------------------------------------------------------
# 2. FEATURE ENGINEERING
# ---------------------------------------------------------
# Temporal features taken from the moment the fault started.
fault_start = df['Fault_Start']
df['Month'] = fault_start.dt.month
df['Hour'] = fault_start.dt.hour

# Kunda penalty: force to numeric; anything unparseable or missing becomes 0.
penalty = pd.to_numeric(df['kunda_risk_penalty'], errors='coerce').fillna(0)
df['kunda_risk_penalty'] = penalty

# "Wind x Kunda" interaction: wind speed scaled by the penalty, so wind
# contributes in proportion to the kunda risk of the area.
df['Wind_Risk_Interaction'] = df['day_wind'] * penalty

# DATA FIX (demo only): the regression target is simulated rather than
# measured. Target = 4 base hours + uniform noise in [0, 4) + 10 hours per
# unit of kunda penalty. The seed makes the noise reproducible.
np.random.seed(42)
jitter = np.random.uniform(0, 4, size=len(df))
df['Simulated_Hours'] = 4 + jitter + (penalty * 10)

# Integer-encode the two categorical columns; the fitted encoders are kept
# because inference needs them to translate labels in both directions.
le_plant = LabelEncoder()
le_fault = LabelEncoder()
df['Plant_Encoded'] = le_plant.fit_transform(df['main_work_center'])
df['Fault_Type_Encoded'] = le_fault.fit_transform(df['problem_code_text'])

# ---------------------------------------------------------
# 3. TRAIN MODELS
# ---------------------------------------------------------

# --- MODEL 1: FAULT TYPE PREDICTOR (Classifier) ---
# Features: plant, month, weather, kunda penalty -> encoded fault type.
features_cls = ['Plant_Encoded', 'Month', 'day_max_temp', 'day_wind', 'kunda_risk_penalty']
clf_fault = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    df[features_cls], df['Fault_Type_Encoded']
)

# --- MODEL 2: RESTORATION TIME PREDICTOR (Regressor) ---
# Features: plant, fault type, hour, weather, kunda penalty and the
# wind/kunda interaction -> simulated hours to fix.
features_reg = ['Plant_Encoded', 'Fault_Type_Encoded', 'Hour', 'day_max_temp', 'day_wind', 'kunda_risk_penalty', 'Wind_Risk_Interaction']
reg_time = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    df[features_reg], df['Simulated_Hours']
)

# --- HELPER: Calculate Average Time Between Faults ---
# Per plant: mean whole-day gap between consecutive fault starts (30 when
# there are not at least two faults to difference) and the latest fault start.
plant_stats = {}
for plant in df['main_work_center'].unique():
    starts = df.loc[df['main_work_center'] == plant, 'Fault_Start'].sort_values()
    gap_days = starts.diff().dt.days.mean() if len(starts) > 1 else 30
    plant_stats[plant] = {'avg_days': gap_days, 'last_date': starts.max()}

print("‚úÖ Models Trained Successfully.")

# --- SAVE MODELS ---
# Both models and both encoders go to the working directory.
for artifact, filename in (
    (clf_fault, 'fault_classifier_model.pkl'),
    (reg_time, 'restoration_time_model.pkl'),
    (le_plant, 'plant_encoder.pkl'),
    (le_fault, 'fault_encoder.pkl'),
):
    joblib.dump(artifact, filename)
print("‚úÖ Models Saved to Disk (pkl files).")

# ---------------------------------------------------------
# 4. INFERENCE FUNCTIONS (USER INTERFACE)
# ---------------------------------------------------------

def predict_single_plant(plant_name):
    """Print the next-fault forecast for one plant.

    The forecast has three parts: an expected date (the plant's last
    recorded fault plus its average gap between faults, both read from
    ``plant_stats``), a fault type from the classifier, and a repair
    duration from the regressor. Weather inputs are the dataset-wide means;
    the kunda penalty comes from the plant's first row in ``df``.

    Prints an error and returns early when ``plant_name`` is not one of the
    labels the plant encoder was fitted on. Nothing is returned.
    """
    if plant_name not in le_plant.classes_:
        print(f"Error: Plant '{plant_name}' not found.")
        return

    # WHEN: last fault date shifted forward by the mean inter-fault gap.
    history = plant_stats.get(plant_name)
    gap = datetime.timedelta(days=history['avg_days'])
    next_date = history['last_date'] + gap

    # CONTEXT: plant-specific risk, dataset-wide average weather.
    first_record = df[df['main_work_center'] == plant_name].iloc[0]
    risk = first_record['kunda_risk_penalty']
    mean_temp = df['day_max_temp'].mean()
    mean_wind = df['day_wind'].mean()

    # WHAT: classifier input follows the column order of features_cls.
    encoded_plant = le_plant.transform([plant_name])[0]
    cls_row = [encoded_plant, next_date.month, mean_temp, mean_wind, risk]
    fault_code = clf_fault.predict([cls_row])[0]
    fault_label = le_fault.inverse_transform([fault_code])[0]

    # HOW LONG: regressor input follows features_reg; the hour is fixed at
    # 12 (noon) because the forecast has no time of day.
    reg_row = [encoded_plant, fault_code, 12, mean_temp, mean_wind, risk, (mean_wind * risk)]
    hours = reg_time.predict([reg_row])[0]

    print(f"\n--- PREDICTION FOR {plant_name} ---")
    print(f"üìÖ Next Expected Fault:  {next_date.strftime('%Y-%m-%d')}")
    print(f"‚ö†Ô∏è Likely Fault Type:    {fault_label}")
    print(f"‚è±Ô∏è Est. Restoration:     {round(hours, 2)} Hours")
    print(f"üî• Kunda Risk Level:     {first_record['kunda_risk_factor']} (Penalty: {risk})")

def predict_future_faults(plant_name=None):
    """
    Goal 1: Input Plant Name -> Get Next Fault Prediction

    With a plant name, prints the forecast for that plant only. With the
    default ``None``, prints a forecast for every distinct plant in
    ``df['main_work_center']``. Nothing is returned.
    """
    if plant_name is not None:
        predict_single_plant(plant_name)
        return

    print("\nüîé GENERATING PREDICTIONS FOR ALL PLANTS...")
    for name in df['main_work_center'].unique():
        predict_single_plant(name)

def predict_time_to_resolve(plant_name, fault_type, current_time_str):
    """
    Goal 2: Input Context -> Get Restoration Time

    Prints the estimated repair duration, and the clock time at which the
    fault should be resolved, for a fault of ``fault_type`` reported at
    ``plant_name`` at ``current_time_str``. Nothing is returned.

    Args:
        plant_name: Plant label; must be one of ``le_plant.classes_``.
        fault_type: Fault label; must be one of ``le_fault.classes_``.
        current_time_str: Report time, in any form ``pd.to_datetime`` can
            parse into a single timestamp (e.g. ``'2025-12-17 14:30:00'``).

    Invalid input is reported with a printed error message followed by an
    early return; no exception is raised for it.
    """
    if plant_name not in le_plant.classes_:
        print("Error: Plant not found.")
        return

    # An unseen label would make le_fault.transform() raise ValueError, so
    # reject it here with a message, the same way an unknown plant is handled.
    if fault_type not in le_fault.classes_:
        print(f"Error: Fault type '{fault_type}' not found.")
        return

    # Catch only parsing failures; a bare `except:` would also swallow
    # KeyboardInterrupt and SystemExit.
    try:
        dt = pd.to_datetime(current_time_str)
    except (ValueError, TypeError, OverflowError):
        print("Error: Invalid Time Format")
        return
    # pd.to_datetime can hand back None/NaT (e.g. for None input) or an index
    # (for list input) without raising; none of those has a usable `.hour`.
    if not isinstance(dt, pd.Timestamp):
        print("Error: Invalid Time Format")
        return

    # Prepare Input: the plant's risk is read from its first row in df.
    plant_row = df[df['main_work_center'] == plant_name].iloc[0]
    risk = plant_row['kunda_risk_penalty']

    plant_code = le_plant.transform([plant_name])[0]
    fault_code = le_fault.transform([fault_type])[0]

    # Use dataset-wide average weather for the prediction.
    avg_temp = df['day_max_temp'].mean()
    avg_wind = df['day_wind'].mean()

    # Column order must match features_reg, the list the regressor was fit on.
    X_time = [[plant_code, fault_code, dt.hour, avg_temp, avg_wind, risk, (avg_wind * risk)]]
    pred_hours = reg_time.predict(X_time)[0]

    resolve_time = dt + datetime.timedelta(hours=pred_hours)

    print(f"\n--- RESTORATION ESTIMATE ---")
    print(f"üè≠ Plant: {plant_name} ({plant_row['kunda_risk_factor']})")
    print(f"üîß Issue: {fault_type}")
    print(f"‚è±Ô∏è Duration: {round(pred_hours, 2)} Hours")
    print(f"‚úÖ Resolved By: {resolve_time.strftime('%Y-%m-%d %H:%M:%S')}")

‚úÖ Models Trained Successfully.
‚úÖ Models Saved to Disk (pkl files).


In [8]:
# No argument: print a forecast for every plant found in the training data.
predict_future_faults()


üîé GENERATING PREDICTIONS FOR ALL PLANTS...

--- PREDICTION FOR PLANT_01 ---
üìÖ Next Expected Fault:  2025-10-25
‚ö†Ô∏è Likely Fault Type:    Leak
‚è±Ô∏è Est. Restoration:     7.53 Hours
üî• Kunda Risk Level:     Extreme (Penalty: 0.21)

--- PREDICTION FOR PLANT_02 ---
üìÖ Next Expected Fault:  2025-10-28
‚ö†Ô∏è Likely Fault Type:    Motor Failure
‚è±Ô∏è Est. Restoration:     7.96 Hours
üî• Kunda Risk Level:     High (Penalty: 0.1585)

--- PREDICTION FOR MAINT_01 ---
üìÖ Next Expected Fault:  2025-10-27
‚ö†Ô∏è Likely Fault Type:    Leak
‚è±Ô∏è Est. Restoration:     4.15 Hours
üî• Kunda Risk Level:     Very Secure (Penalty: -0.1087)

--- PREDICTION FOR PLANT_03 ---
üìÖ Next Expected Fault:  2025-10-29
‚ö†Ô∏è Likely Fault Type:    Short Circuit
‚è±Ô∏è Est. Restoration:     5.97 Hours
üî• Kunda Risk Level:     Medium (Penalty: 0.0349)


In [10]:
# Format: Plant Name, Fault Type, Current Time
# The time string is parsed with pd.to_datetime inside the function; its
# hour is one of the regressor inputs.
predict_time_to_resolve('MAINT_01', 'Sensor Fault', '2025-12-17 14:30:00')


--- RESTORATION ESTIMATE ---
üè≠ Plant: MAINT_01 (Very Secure)
üîß Issue: Sensor Fault
‚è±Ô∏è Duration: 4.32 Hours
‚úÖ Resolved By: 2025-12-17 18:49:03


# Use saved model

In [15]:
import pandas as pd
import numpy as np
import datetime
import joblib  # This is the library to load the saved .pkl files
import warnings

# Silence every warning category so the script output stays readable.
warnings.filterwarnings('ignore')

# ---------------------------------------------------------
# 1. LOAD SAVED MODELS & ENCODERS
# ---------------------------------------------------------
# The four .pkl files are looked up in the current working directory.
print("‚è≥ Loading models from disk...")
try:
    clf_fault = joblib.load('fault_classifier_model.pkl')
    reg_time = joblib.load('restoration_time_model.pkl')
    le_plant = joblib.load('plant_encoder.pkl')
    le_fault = joblib.load('fault_encoder.pkl')
    print("‚úÖ Models loaded successfully!")
except FileNotFoundError:
    print("‚ùå Error: Model files not found. Please run the training script first.")
    # `exit()` is an interactive-shell helper installed by the `site` module
    # and is not meant for programs. Raise SystemExit directly, with a
    # non-zero status so callers can tell that loading failed.
    raise SystemExit(1)

# ---------------------------------------------------------
# 2. DEFINE HELPER DATA (Context)
# ---------------------------------------------------------
# Only the models are loaded here, so the per-plant "Risk Score" is looked
# up from this hand-maintained table rather than from the training CSV.
# NOTE(review): the training notebook's output shows MAINT_01's risk factor
# as 'Very Secure', while the label below is 'Secure' -- confirm which one
# is intended.

plant_context_map = {
    # Format: 'Plant_Name': {'Risk': Penalty_Value, 'Label': 'Risk_Label'}
    'PLANT_01': {'Risk': 0.2100,  'Label': 'Extreme'},   # Korangi
    'PLANT_02': {'Risk': 0.1585,  'Label': 'High'},      # Surjani
    'PLANT_03': {'Risk': 0.0349,  'Label': 'Medium'},    # Nazimabad
    'MAINT_01': {'Risk': -0.1087, 'Label': 'Secure'}     # Clifton
}

# Average inter-arrival times. These values are hard-coded for this script;
# they are not the ones computed during training. In a real app the
# training script's `plant_stats` dictionary would be saved to a .pkl file
# and loaded here instead.
plant_stats = {
    'PLANT_01': {'avg_days': 15, 'last_date': datetime.datetime(2025, 10, 1)},
    'PLANT_02': {'avg_days': 12, 'last_date': datetime.datetime(2025, 11, 15)},
    'PLANT_03': {'avg_days': 20, 'last_date': datetime.datetime(2025, 9, 20)},
    'MAINT_01': {'avg_days': 25, 'last_date': datetime.datetime(2025, 12, 5)},
}

# Default weather fed to both models at inference time.
# NOTE(review): described as the training averages -- verify against the
# dataset, since they are typed in by hand here.
AVG_TEMP = 32.5
AVG_WIND = 15.0

# ---------------------------------------------------------
# 3. INFERENCE FUNCTIONS
# ---------------------------------------------------------

def predict_future_faults(plant_name=None):
    """
    Print a fault forecast dated relative to the moment of the call.

    For each plant handled, the forecast date is the current time plus the
    plant's average fault interval from ``plant_stats`` (30 days when the
    plant has no entry there). The fault type comes from the classifier and
    the repair duration from the regressor, both fed the default weather
    constants and the plant's risk from ``plant_context_map``.

    Args:
        plant_name: Plant to forecast. Any falsy value (the default
            ``None``, or an empty string) forecasts every plant listed in
            ``plant_context_map``.

    Nothing is returned; results are printed.
    """
    now = datetime.datetime.now()

    def report(name):
        """Print the forecast block for one plant, or a notice if unknown."""
        if name not in le_plant.classes_:
            print(f"‚ö†Ô∏è Plant '{name}' not found.")
            return

        # Whole days only: the stored average is truncated by int().
        interval_days = int(plant_stats.get(name, {'avg_days': 30})['avg_days'])
        forecast_date = now + datetime.timedelta(days=interval_days)

        zone = plant_context_map.get(name, {'Risk': 0, 'Label': 'Unknown'})
        risk = zone['Risk']

        # Classifier input: [plant, month, temp, wind, risk].
        plant_code = le_plant.transform([name])[0]
        cls_row = [plant_code, forecast_date.month, AVG_TEMP, AVG_WIND, risk]
        fault_code = clf_fault.predict([cls_row])[0]
        fault_label = le_fault.inverse_transform([fault_code])[0]

        # Regressor input: [plant, fault, hour, temp, wind, risk,
        # wind*risk]; the hour is fixed at 12.
        reg_row = [plant_code, fault_code, 12, AVG_TEMP, AVG_WIND, risk, AVG_WIND * risk]
        hours = reg_time.predict([reg_row])[0]

        print("-" * 40)
        print(f"üè≠ PLANT: {name}")
        print(f"üìÖ Next Expected Fault:  {forecast_date.strftime('%Y-%m-%d')} (in {interval_days} days)")
        print(f"‚ö†Ô∏è Likely Issue:         {fault_label}")
        print(f"‚è±Ô∏è Est. Restoration:     {round(hours, 2)} Hours")
        print(f"üî• Kunda Risk Zone:      {zone['Label']}")

    if not plant_name:
        print(f"\nüîé PREDICTING FUTURE FAULTS (Starting from {now.strftime('%Y-%m-%d')})...\n")
        for name in plant_context_map:
            report(name)
        return

    report(plant_name)

            
def predict_resolution_time(plant_name, fault_type, current_time_str):
    """
    Predicts when a specific reported fault will be fixed.

    Prints the estimated repair duration and the resulting resolution time
    for a fault of ``fault_type`` reported at ``plant_name`` at
    ``current_time_str``. Nothing is returned.

    Args:
        plant_name: Plant label; must be one of ``le_plant.classes_``.
        fault_type: Fault label; must be one of ``le_fault.classes_``.
        current_time_str: Report time, in any form ``pd.to_datetime`` can
            parse into a single timestamp (``YYYY-MM-DD HH:MM:SS`` is the
            format named in the error message).

    Invalid input is reported with a printed error message followed by an
    early return; no exception is raised for it.
    """
    if plant_name not in le_plant.classes_:
        print("‚ùå Error: Plant not known.")
        return
    if fault_type not in le_fault.classes_:
        print(f"‚ùå Error: Fault type '{fault_type}' not known.")
        return

    # Catch only parsing failures; a bare `except:` would also swallow
    # KeyboardInterrupt and SystemExit.
    try:
        dt = pd.to_datetime(current_time_str)
    except (ValueError, TypeError, OverflowError):
        print("‚ùå Error: Invalid date format. Use YYYY-MM-DD HH:MM:SS")
        return
    # pd.to_datetime can hand back None/NaT (e.g. for None input) or an index
    # (for list input) without raising; none of those has a usable `.hour`.
    if not isinstance(dt, pd.Timestamp):
        print("‚ùå Error: Invalid date format. Use YYYY-MM-DD HH:MM:SS")
        return

    # 1. Prepare Inputs
    p_code = le_plant.transform([plant_name])[0]
    f_code = le_fault.transform([fault_type])[0]
    # Plants missing from the lookup table fall back to a risk of 0.
    context = plant_context_map.get(plant_name, {'Risk': 0})
    risk = context['Risk']
    interaction = AVG_WIND * risk

    # 2. Predict
    # Input: [Plant, Fault, Hour, Temp, Wind, Risk, Interaction]
    X_reg = [[p_code, f_code, dt.hour, AVG_TEMP, AVG_WIND, risk, interaction]]
    pred_hours = reg_time.predict(X_reg)[0]

    resolve_time = dt + datetime.timedelta(hours=pred_hours)

    print("\n--- üõ†Ô∏è TICKET RESOLUTION ESTIMATE ---")
    print(f"üìç Location:      {plant_name}")
    print(f"üîß Fault:         {fault_type}")
    print(f"üïí Reported At:   {current_time_str}")
    print(f"‚è≥ Est. Duration: {round(pred_hours, 2)} Hours")
    print(f"‚úÖ Resolved By:   {resolve_time.strftime('%Y-%m-%d %H:%M:%S')}")

# ---------------------------------------------------------
# 4. RUN TESTS
# ---------------------------------------------------------

# Test 1: Predict Future for ALL
# With no argument, a forecast is printed for every plant listed in
# plant_context_map.
predict_future_faults()

# Test 2: Predict specific ticket resolution
# Arguments are plant name, fault type, and the time the fault was reported.
predict_resolution_time('PLANT_02', 'Motor Failure', '2025-12-25 09:30:00')

‚è≥ Loading models from disk...
‚úÖ Models loaded successfully!

üîé PREDICTING FUTURE FAULTS (Starting from 2025-12-16)...

----------------------------------------
üè≠ PLANT: PLANT_01
üìÖ Next Expected Fault:  2025-12-31 (in 15 days)
‚ö†Ô∏è Likely Issue:         Short Circuit
‚è±Ô∏è Est. Restoration:     8.0 Hours
üî• Kunda Risk Zone:      Extreme
----------------------------------------
üè≠ PLANT: PLANT_02
üìÖ Next Expected Fault:  2025-12-28 (in 12 days)
‚ö†Ô∏è Likely Issue:         Leak
‚è±Ô∏è Est. Restoration:     6.73 Hours
üî• Kunda Risk Zone:      High
----------------------------------------
üè≠ PLANT: PLANT_03
üìÖ Next Expected Fault:  2026-01-05 (in 20 days)
‚ö†Ô∏è Likely Issue:         Motor Failure
‚è±Ô∏è Est. Restoration:     6.97 Hours
üî• Kunda Risk Zone:      Medium
----------------------------------------
üè≠ PLANT: MAINT_01
üìÖ Next Expected Fault:  2026-01-10 (in 25 days)
‚ö†Ô∏è Likely Issue:         Motor Failure
‚è±Ô∏è Est. Restoration:     5.25 Hours