In [3]:
# Core
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Train/Test Split
from sklearn.model_selection import train_test_split

# Classification Algorithms
from sklearn.ensemble import RandomForestClassifier   # Strong baseline
from sklearn.linear_model import LogisticRegression   # Simple baseline
from xgboost import XGBClassifier                     # Advanced boosting

# Evaluation Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [4]:
dataset=pd.read_csv('/Energy_Theft_Detection.csv')

In [5]:
dataset.sample(10)

Unnamed: 0,Customer_ID,Age,Energy_Consumption,Location,Time_of_Use,Previous_Bills,Average_Temperature,Energy_Usage_History,Payment_Method,Consumption_Type
314,6717e1bf-9355-4429-82f5-766ee079222c,55,746.0,Urban,Day,227.0,24.602937,Abnormal,Debit Card,Commercial
351,03a58500-e92c-4c76-a668-faaee89e5655,74,771.0,Rural,Day,378.0,27.528446,Normal,Credit Card,Commercial
649,af3b439a-b3ae-4a53-bb19-1fe6a2df4ead,75,659.0,Suburban,Day,421.0,22.257668,Abnormal,Debit Card,Residential
803,52723009-55d5-4271-b41d-8a26bb1c0ba6,70,646.0,Rural,Night,473.0,22.42776,Normal,Credit Card,Commercial
772,024fb4fa-62a7-4a17-aef9-93930ef4885b,25,602.0,Urban,Night,399.0,20.434795,Normal,Debit Card,Residential
310,4e481c3c-0659-482d-ba7a-3e0ffd244a92,66,546.0,Suburban,Day,64.0,31.423611,Abnormal,Cash,Residential
812,488e7d48-6e47-4bde-b07f-c9d4b889fa1b,36,231.0,Urban,Day,176.0,31.35705,Normal,Cash,Residential
715,2e619fbd-ceb9-4af8-b0aa-5af733b4d409,61,917.0,Rural,Night,128.0,19.601835,Abnormal,Cash,Commercial
70,b71299d1-6208-451d-bd2f-678f420c7475,69,429.0,Suburban,Day,152.0,25.525117,Abnormal,Credit Card,Commercial
184,3874ec05-82a7-4b07-a138-777ffebdd062,19,436.0,Rural,Day,73.0,16.023428,Normal,Cash,Commercial


In [7]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050 entries, 0 to 1049
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Customer_ID           1050 non-null   object 
 1   Age                   1050 non-null   int64  
 2   Energy_Consumption    1008 non-null   float64
 3   Location              1050 non-null   object 
 4   Time_of_Use           1050 non-null   object 
 5   Previous_Bills        1018 non-null   float64
 6   Average_Temperature   1040 non-null   float64
 7   Energy_Usage_History  1050 non-null   object 
 8   Payment_Method        1050 non-null   object 
 9   Consumption_Type      1050 non-null   object 
dtypes: float64(3), int64(1), object(6)
memory usage: 82.2+ KB


In [11]:
imputer = SimpleImputer(strategy='most frequent')

In [14]:
imputer = SimpleImputer(strategy='most_frequent')
numerical_cols = ['Energy_Consumption', 'Previous_Bills', 'Average_Temperature']
dataset[numerical_cols] = imputer.fit_transform(dataset[numerical_cols])

In [16]:
from sklearn.preprocessing import OneHotEncoder

categorical_cols = ['Location','Time_of_Use','Energy_Usage_History','Payment_Method','Consumption_Type']

encoder = OneHotEncoder()
encoded = encoder.fit_transform(dataset[categorical_cols])

# Convert to DataFrame
encoded_df = pd.DataFrame(encoded.toarray(), columns=encoder.get_feature_names_out(categorical_cols))

# Drop original categorical columns
dataset.drop(categorical_cols + ['Customer_ID'], axis=1, inplace=True)

# Concatenate encoded features
dataset = pd.concat([dataset, encoded_df], axis=1)

In [17]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050 entries, 0 to 1049
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Age                            1050 non-null   int64  
 1   Energy_Consumption             1050 non-null   float64
 2   Previous_Bills                 1050 non-null   float64
 3   Average_Temperature            1050 non-null   float64
 4   Location_Rural                 1050 non-null   float64
 5   Location_Suburban              1050 non-null   float64
 6   Location_Urban                 1050 non-null   float64
 7   Time_of_Use_Day                1050 non-null   float64
 8   Time_of_Use_Night              1050 non-null   float64
 9   Energy_Usage_History_Abnormal  1050 non-null   float64
 10  Energy_Usage_History_Normal    1050 non-null   float64
 11  Payment_Method_Cash            1050 non-null   float64
 12  Payment_Method_Credit Card     1050 non-null   f

In [18]:
scaler = MinMaxScaler(feature_range=(0,5))
dataset[numerical_cols] = scaler.fit_transform(dataset[numerical_cols])

In [19]:
X = dataset.drop("Energy_Usage_History_Normal", axis=1)  # Example: one-hot encoded target column
y = dataset["Energy_Usage_History_Normal"]               # Target: Normal vs Abnormal

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("precision:", precision_score(y_test, y_pred))

Accuracy: 1.0
precision: 1.0


In [24]:
import joblib

# 1. Save the Trained Model
joblib.dump(model, 'model.pkl')

# 2. Save the Scaler (used for X numeric features)
joblib.dump(scaler, 'scaler.pkl')

# 3. Save the Feature Encoder (used for X categorical features)
joblib.dump(encoder, 'encoder.pkl')

# 4. Save the Target Encoder (used to decode y predictions back to text)
joblib.dump(encoder, 'target_encoder.pkl')

print("All artifacts saved successfully:")
print("- model.pkl")
print("- scaler.pkl")
print("- encoder.pkl")
print("- target_encoder.pkl")

All artifacts saved successfully:
- model.pkl
- scaler.pkl
- encoder.pkl
- target_encoder.pkl


In [25]:
pip install gradio



In [None]:
import gradio as gr
import pandas as pd
import joblib
import numpy as np

# 1. Load your saved artifacts
# Ensure these files (model.pkl, scaler.pkl, encoder.pkl) are in the same folder
model = joblib.load('model.pkl')
scaler = joblib.load('scaler.pkl')
encoder = joblib.load('encoder.pkl')

# 2. Define the Prediction Function
def predict_energy_theft(age, energy_cons, location, time_of_use, prev_bills, avg_temp, pay_method, cons_type):

    # A. Create DataFrame from User Inputs
    # We add 'Energy_Usage_History' as "Normal" temporarily because your
    # saved encoder expects this column to exist to perform the transformation.
    input_data = {
        'Age': [age],
        'Energy_Consumption': [energy_cons],
        'Location': [location],
        'Time_of_Use': [time_of_use],
        'Previous_Bills': [prev_bills],
        'Average_Temperature': [avg_temp],
        'Energy_Usage_History': ['Normal'],  # Placeholder value
        'Payment_Method': [pay_method],
        'Consumption_Type': [cons_type]
    }
    input_df = pd.DataFrame(input_data)

    # B. Scaling (Numerical Features)
    # Applies the exact MinMax (0-5) scaling from your training
    numerical_cols = ['Energy_Consumption', 'Previous_Bills', 'Average_Temperature']
    input_df[numerical_cols] = scaler.transform(input_df[numerical_cols])

    # C. Encoding (Categorical Features)
    # Must match the list used in training exactly
    categorical_cols = ['Location', 'Time_of_Use', 'Energy_Usage_History', 'Payment_Method', 'Consumption_Type']

    # Transform and create new DataFrame with encoded columns
    encoded_vals = encoder.transform(input_df[categorical_cols])
    encoded_df = pd.DataFrame(
        encoded_vals.toarray(),
        columns=encoder.get_feature_names_out(categorical_cols)
    )

    # D. Combine Features
    # We drop the original categorical columns and attach the new encoded ones
    # We also keep 'Age' which wasn't scaled/encoded
    final_input = pd.concat([input_df[['Age'] + numerical_cols], encoded_df], axis=1)

    # E. Align Columns with Training Data
    # In your notebook, you dropped "Energy_Usage_History_Normal" for X.
    # We must do the same here to match the model's expected input shape.
    if 'Energy_Usage_History_Normal' in final_input.columns:
        final_input = final_input.drop('Energy_Usage_History_Normal', axis=1)

    # F. Predict
    prediction = model.predict(final_input)
    prediction_prob = model.predict_proba(final_input)

    # Decode Result (Assuming 1 = Normal based on typical encoding)
    result_text = "Normal" if prediction[0] == 1 else "Abnormal (Potential Theft)"
    confidence = np.max(prediction_prob) * 100

    return f"{result_text} (Confidence: {confidence:.2f}%)"

# 3. Create the Gradio Interface
# We map the inputs to match the function arguments above
interface = gr.Interface(
    fn=predict_energy_theft,
    inputs=[
        gr.Number(label="Age", value=35),
        gr.Number(label="Energy Consumption (kWh)", value=500),
        gr.Dropdown(["Urban", "Rural", "Suburban"], label="Location"),
        gr.Dropdown(["Day", "Night"], label="Time of Use"),
        gr.Number(label="Previous Bills ($)", value=150),
        gr.Number(label="Average Temperature (Â°C)", value=25),
        gr.Dropdown(["Credit Card", "Debit Card", "Cash"], label="Payment Method"),
        gr.Dropdown(["Residential", "Commercial"], label="Consumption Type"),
    ],
    outputs="text",
    title="Energy Theft Detection System",
    description="Enter customer details to predict if energy usage is Normal or Abnormal."
)

# 4. Launch the App
interface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7ce1bbc730eef98e4b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


