In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Function to determine Rwanda's seasons
def get_rwanda_season(date):
    """Return the season label for a date, based only on its month.

    Args:
        date: Any object exposing an integer ``month`` attribute
            (e.g. ``datetime`` or ``pandas.Timestamp``).

    Returns:
        'Long Rainy Season' for March-May, 'Short Rainy Season' for
        October-December, and 'Dry Season' for every other month.
    """
    rainy_months = {
        3: 'Long Rainy Season',
        4: 'Long Rainy Season',
        5: 'Long Rainy Season',
        10: 'Short Rainy Season',
        11: 'Short Rainy Season',
        12: 'Short Rainy Season',
    }
    # Every month that is not listed as rainy is reported as dry.
    return rainy_months.get(date.month, 'Dry Season')

: 

In [None]:

# Generate an hourly timestamp index covering every hour of 2017 through 2024.
start_date = datetime(2017, 1, 1)
# End at 23:00 on the last day: a bare date means midnight, which would stop
# the range at 2024-12-31 00:00 and drop the remaining 23 hours of that day.
end_date = datetime(2024, 12, 31, 23)
# Lowercase 'h' is the hourly alias; the uppercase 'H' spelling is deprecated
# as of pandas 2.2.
date_range = pd.date_range(start=start_date, end=end_date, freq='h')



In [None]:
# NOTE(review): this cell used to run `pip uninstall pandas`. Without `-y` that
# command waits for a confirmation prompt that a notebook cell cannot answer,
# and it removes a package every later cell imports, so it is not executed here.
# If the environment really is broken (e.g. a numpy/pandas binary mismatch),
# repair it in a dedicated setup cell at the top of the notebook and restart
# the kernel afterwards -- TODO confirm whether any repair is still needed.


In [None]:
# Simulate one value per hourly timestamp. The global seed is fixed first and
# the random draws happen in a fixed order, so the dataset is reproducible.
np.random.seed(42)
n_hours = len(date_range)
energy_consumption = np.random.uniform(low=5, high=30, size=n_hours)      # Energy consumption (kWh)
appliance_usage = np.random.uniform(low=2, high=15, size=n_hours)         # Appliance usage (kWh)
cost_per_kWh = np.random.choice([0.10, 0.12, 0.15], size=n_hours)         # Cost per kWh (RWF)
weather_temp = np.random.uniform(low=10, high=30, size=n_hours)           # Temperature (°C)
# Season label for each timestamp, derived from its month.
season = list(map(get_rwanda_season, date_range))
# Monday-Friday (weekday index 0-4) is a weekday, Saturday/Sunday a weekend.
weekday_weekend = np.where(date_range.weekday < 5, 'Weekday', 'Weekend').tolist()
special_event = np.random.choice([0, 1], size=n_hours)                    # Special events indicator



In [None]:
# Predicted energy cost (RWF): hourly consumption times the hourly tariff,
# rounded to two decimal places.
predicted_energy_cost = np.round(energy_consumption * cost_per_kWh, 2)



In [None]:
# Assemble the dataset column by column; insertion order is the column order
# of the resulting table. Continuous measurements are rounded to 2 decimals.
data = {}
data['Date'] = date_range
data['Time of Day'] = date_range.time
data['Energy Consumption (kWh)'] = energy_consumption.round(2)
data['Appliance Usage (kWh)'] = appliance_usage.round(2)
data['Cost per kWh (RWF)'] = cost_per_kWh.round(2)
data['Weather (Temp °C)'] = weather_temp.round(2)
data['Season'] = season
data['Weekday/Weekend'] = weekday_weekend
data['Predicted Energy Cost (RWF)'] = predicted_energy_cost
data['Special Event'] = special_event



In [None]:
# Turn the column dictionary into a DataFrame (one row per hourly timestamp).
df = pd.DataFrame(data=data)


In [None]:

# Write the dataset to the Kaggle working directory, without the row index.
output_csv = '/kaggle/working/energy_usage_prediction_rwanda.csv'
df.to_csv(output_csv, index=False)

print("Dataset saved as 'energy_usage_prediction_rwanda.csv' in the Kaggle working directory.")


In [None]:
import pandas as pd

# Load the dataset
file_path = '/kaggle/working/energy_usage_prediction_rwanda.csv'  # Assuming it's saved in the Kaggle working directory
# Parse 'Date' while reading: a CSV stores it as text, and the time-series plot
# below needs real datetimes to get a proper time axis.
df = pd.read_csv(file_path, parse_dates=['Date'])

# df.info() prints its summary and returns None, so call it as a statement and
# keep df.head() as the cell's last expression so the frame is rendered as a table.
df.info()
df.head()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style
sns.set(style="whitegrid")

# Visualization 1: Energy consumption over time (sample of data for clarity)
plt.figure(figsize=(12, 6))
# 'Date' is read back from the CSV as plain strings. Convert it to datetime so
# matplotlib draws a continuous time axis instead of treating each sampled
# timestamp as its own category (one tick label per row).
sample_data = (
    df.sample(1000, random_state=42)  # Sample data for better visibility
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values(by='Date')
)
plt.plot(sample_data['Date'], sample_data['Energy Consumption (kWh)'], label='Energy Consumption (kWh)', alpha=0.8)
plt.title("Energy Consumption Over Time (Sample Data)", fontsize=16)
plt.xlabel("Date", fontsize=12)
plt.ylabel("Energy Consumption (kWh)", fontsize=12)
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()

# Visualization 2: mean energy consumption per season, drawn on an explicit
# figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=df,
    x='Season',
    y='Energy Consumption (kWh)',
    estimator=np.mean,
    palette='viridis',
    ax=ax,
)
ax.set_title("Average Energy Consumption by Season", fontsize=16)
ax.set_xlabel("Season", fontsize=12)
ax.set_ylabel("Average Energy Consumption (kWh)", fontsize=12)
# Tilt the season labels.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

# Visualization 3: histogram (30 bins) with a KDE overlay of hourly energy
# consumption.
fig, ax = plt.subplots(figsize=(10, 6))
consumption = df['Energy Consumption (kWh)']
sns.histplot(consumption, kde=True, color='blue', bins=30, ax=ax)
ax.set_title("Distribution of Energy Consumption", fontsize=16)
ax.set_xlabel("Energy Consumption (kWh)", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
fig.tight_layout()
plt.show()

# Visualization 4: annotated heatmap of the pairwise correlations between the
# numerical columns.
numeric_columns = [
    'Energy Consumption (kWh)',
    'Appliance Usage (kWh)',
    'Cost per kWh (RWF)',
    'Weather (Temp °C)',
    'Predicted Energy Cost (RWF)',
]
correlation_matrix = df[numeric_columns].corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", square=True, ax=ax)
ax.set_title("Correlation Matrix", fontsize=16)
fig.tight_layout()
plt.show()



In [None]:
# Step 1: Simulate Historical Customer Payment Data
# One row per (customer, month) for 100 customers from 2017 to 2024.
customers = [f"Customer_{i}" for i in range(1, 101)]  # 100 customers
# "MS" (month start) yields one timestamp per month and is accepted by every
# pandas release; the bare "M" alias is deprecated as of pandas 2.2.
months = pd.date_range(start="2017-01-01", end="2024-12-31", freq="MS")
customer_monthly_data = pd.DataFrame(
    [(customer, month) for customer in customers for month in months],
    columns=["Customer", "Month"],
)

# Generate historical payments with random fluctuations (seeded for reproducibility).
# The draws are independent, so the lag features below carry no real signal.
np.random.seed(42)
customer_monthly_data["Payment (RWF)"] = np.random.uniform(5000, 20000, len(customer_monthly_data))

# Step 2: Add lagged payment features
# "Month" already holds timestamps (it comes from date_range), so no conversion
# is needed. Sort so that shift() walks each customer's months in time order.
customer_monthly_data = customer_monthly_data.sort_values(by=["Customer", "Month"])
payments_by_customer = customer_monthly_data.groupby("Customer")["Payment (RWF)"]
customer_monthly_data["Previous Month Payment"] = payments_by_customer.shift(1)
customer_monthly_data["Two Months Ago Payment"] = payments_by_customer.shift(2)

# Drop rows with NaN values from lagged features (the first two months of
# every customer have no complete lag history).
customer_monthly_data = customer_monthly_data.dropna()


In [None]:

# Step 3: Train/Test Split
# Use the last 6 months for testing for each customer.
# A row mask is used instead of groupby(...).apply(lambda x: x.iloc[...]):
# applying over a frame that still contains the grouping column is deprecated
# in pandas 2.2, and once that column is excluded "Customer" would vanish from
# the result.
# The stable sort groups rows by customer while keeping each customer's row
# order, matching the ordering groupby-apply would have produced.
ordered_payments = customer_monthly_data.sort_values("Customer", kind="mergesort")
# cumcount(ascending=False) numbers each customer's rows from the end (last row = 0).
rows_from_end = ordered_payments.groupby("Customer").cumcount(ascending=False)
is_test_row = (rows_from_end < 6).to_numpy()
train_data = ordered_payments[~is_test_row].reset_index(drop=True)
test_data = ordered_payments[is_test_row].reset_index(drop=True)

# Features and Target
feature_columns = ["Previous Month Payment", "Two Months Ago Payment"]
X_train = train_data[feature_columns]
y_train = train_data["Payment (RWF)"]
X_test = test_data[feature_columns]
y_test = test_data["Payment (RWF)"]



In [None]:
# Re-importing necessary libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

# Step 4: Train a Model
# This cell previously rebuilt the seeded dataset, re-split it and re-trained
# the same seeded model three times in a row. A single pass over the
# train/test split prepared in the cells above gives the same model and metrics.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Step 5: Predict and Evaluate on the held-out months
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Visualization 1: Predicted vs. Actual Values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.7, edgecolor='k')
# Red diagonal = perfect prediction (predicted == actual).
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2)
plt.title("Predicted vs. Actual Values", fontsize=16)
plt.xlabel("Actual Payments (RWF)", fontsize=12)
plt.ylabel("Predicted Payments (RWF)", fontsize=12)
plt.tight_layout()
plt.show()

# Visualization 2: Residual Distribution
residuals = y_test - y_pred
plt.figure(figsize=(10, 6))
sns.histplot(residuals, kde=True, bins=30, color='blue')
plt.title("Residual Distribution", fontsize=16)
plt.xlabel("Residuals (Actual - Predicted)", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.tight_layout()
plt.show()

# Visualization 3: Feature Importance
feature_importances = model.feature_importances_
features = X_train.columns
plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importances, y=features, palette="viridis")
plt.title("Feature Importance", fontsize=16)
plt.xlabel("Importance Score", fontsize=12)
plt.ylabel("Features", fontsize=12)
plt.tight_layout()
plt.show()

# Predict next month's payment for one example customer.
random_customer = test_data[test_data["Customer"] == "Customer_10"]
latest_month = random_customer.iloc[-1]
# For the month AFTER the last observed one, the latest payment becomes the
# "previous month" feature and the one before it the "two months ago" feature.
# Reusing the last row's own lag columns would re-predict the last observed month.
# Passing a DataFrame keeps the column names the model was fitted with.
random_customer_features = pd.DataFrame(
    {
        "Previous Month Payment": [latest_month["Payment (RWF)"]],
        "Two Months Ago Payment": [latest_month["Previous Month Payment"]],
    }
)
next_month_payment_prediction = model.predict(random_customer_features)[0]

# Only the last expression of a cell is displayed, so show the metrics and the
# prediction together.
mae, mse, r2, next_month_payment_prediction


In [None]:
# Self-contained run of the payment pipeline: simulate data, build lag
# features, split, train, evaluate and plot.

# Step 1: Recreate Dataset (Simulating customer payment data)
customers = [f"Customer_{i}" for i in range(1, 101)]
# "MS" (month start) gives one timestamp per month on every pandas release;
# the bare "M" alias is deprecated as of pandas 2.2.
months = pd.date_range(start="2017-01-01", end="2024-12-31", freq="MS")
customer_monthly_data = pd.DataFrame(
    [(customer, month) for customer in customers for month in months],
    columns=["Customer", "Month"],
)

np.random.seed(42)
customer_monthly_data["Payment (RWF)"] = np.random.uniform(5000, 20000, len(customer_monthly_data))
# Sort so that shift() walks each customer's months in time order.
customer_monthly_data = customer_monthly_data.sort_values(by=["Customer", "Month"])
payments_by_customer = customer_monthly_data.groupby("Customer")["Payment (RWF)"]
customer_monthly_data["Previous Month Payment"] = payments_by_customer.shift(1)
customer_monthly_data["Two Months Ago Payment"] = payments_by_customer.shift(2)
# The first two months of each customer have no complete lag history.
customer_monthly_data = customer_monthly_data.dropna()

# Step 2: Train/Test Split -- the last 6 rows of every customer are held out.
# A row mask replaces groupby(...).apply(...), whose handling of the grouping
# column is deprecated in pandas 2.2 (the "Customer" column would be dropped).
rows_from_end = customer_monthly_data.groupby("Customer").cumcount(ascending=False)
is_test_row = (rows_from_end < 6).to_numpy()
train_data = customer_monthly_data[~is_test_row].reset_index(drop=True)
test_data = customer_monthly_data[is_test_row].reset_index(drop=True)
X_train = train_data[["Previous Month Payment", "Two Months Ago Payment"]]
y_train = train_data["Payment (RWF)"]
X_test = test_data[["Previous Month Payment", "Two Months Ago Payment"]]
y_test = test_data["Payment (RWF)"]

# Step 3: Train the Model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Step 4: Predictions
y_pred = model.predict(X_test)

# Step 5: Evaluate the Model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Visualization 1: Predicted vs. Actual Values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.7, edgecolor='k')
# Red diagonal = perfect prediction (predicted == actual).
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2)
plt.title("Predicted vs. Actual Values", fontsize=16)
plt.xlabel("Actual Payments (RWF)", fontsize=12)
plt.ylabel("Predicted Payments (RWF)", fontsize=12)
plt.tight_layout()
plt.show()

# Visualization 2: Residual Distribution
residuals = y_test - y_pred
plt.figure(figsize=(10, 6))
sns.histplot(residuals, kde=True, bins=30, color='blue')
plt.title("Residual Distribution", fontsize=16)
plt.xlabel("Residuals (Actual - Predicted)", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.tight_layout()
plt.show()

# Visualization 3: Feature Importance
feature_importances = model.feature_importances_
features = X_train.columns
plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importances, y=features, palette="viridis")
plt.title("Feature Importance", fontsize=16)
plt.xlabel("Importance Score", fontsize=12)
plt.ylabel("Features", fontsize=12)
plt.tight_layout()
plt.show()

# Display metrics (last expression of the cell).
mae, mse, r2


In [None]:
# Re-importing visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Visualization 1: predicted vs. actual payments on the test rows, with a red
# y = x reference line spanning the observed range.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.7, edgecolor='k')
actual_range = [y_test.min(), y_test.max()]
ax.plot(actual_range, actual_range, color='red', linewidth=2)
ax.set_title("Predicted vs. Actual Values", fontsize=16)
ax.set_xlabel("Actual Payments (RWF)", fontsize=12)
ax.set_ylabel("Predicted Payments (RWF)", fontsize=12)
fig.tight_layout()
plt.show()

# Visualization 2: distribution of the residuals (actual minus predicted).
residuals = y_test - y_pred
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(residuals, kde=True, bins=30, color='blue', ax=ax)
ax.set_title("Residual Distribution", fontsize=16)
ax.set_xlabel("Residuals (Actual - Predicted)", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
fig.tight_layout()
plt.show()

# Visualization 3: importance the fitted model assigns to each feature.
feature_importances = model.feature_importances_
features = X_train.columns
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feature_importances, y=features, palette='viridis', ax=ax)
ax.set_title("Feature Importance", fontsize=16)
ax.set_xlabel("Importance Score", fontsize=12)
ax.set_ylabel("Features", fontsize=12)
fig.tight_layout()
plt.show()

# Display metrics
mae, mse, r2


In [None]:
# Step 1: Export the model -- persist the trained estimator to disk.
import joblib

model_filename = "customer_payment_model.pkl"
joblib.dump(model, model_filename)

# Step 2: Load the model back from the file that was just written.
loaded_model = joblib.load(model_filename)

# Step 3: Prepare a sample customer's data -- a single row using the same two
# feature columns the model was trained on (example values).
sample_customer_data = pd.DataFrame(
    [[15000, 14000]],
    columns=["Previous Month Payment", "Two Months Ago Payment"],
)

# Step 4: Test the reloaded model on the sample row and display the result.
predicted_payment = loaded_model.predict(sample_customer_data)[0]
predicted_payment

In [None]:
# Train a smaller forest to reduce the size of the saved model:
# fewer trees (n_estimators) and a cap on tree depth (max_depth).
forest_settings = {
    "n_estimators": 50,
    "max_depth": 10,
    "random_state": 42,
}
optimized_model = RandomForestRegressor(**forest_settings)
optimized_model.fit(X_train, y_train)



In [None]:
# Persist the smaller forest next to the notebook with joblib.
optimized_model_filename = "optimized_customer_payment_model.pkl"
joblib.dump(value=optimized_model, filename=optimized_model_filename)



In [None]:
# Report how large the saved optimized model is on disk, in MB (bytes / 1024^2).
import os
size_in_bytes = os.stat(optimized_model_filename).st_size
optimized_model_size = size_in_bytes / (1024 * 1024)

# Run the optimized model on the sample customer row.
optimized_predicted_payment = optimized_model.predict(sample_customer_data)[0]

optimized_model_size, optimized_predicted_payment


In [None]:
import pickle

# Round-trip the optimized model through the standard-library pickle module.
# The file is produced by this notebook itself; never unpickle untrusted files.
pickle_path = "optimized_customer_payment_model.pkl"

with open(pickle_path, "wb") as sink:
    pickle.dump(optimized_model, sink)

# Read the model back, as a later prediction session would.
with open(pickle_path, "rb") as source:
    loaded_model = pickle.load(source)

# Predict for the sample customer row with the reloaded model.
predicted_payment = loaded_model.predict(sample_customer_data)[0]
print("Predicted Payment:", predicted_payment)


In [None]:
pip install onnxruntime


In [None]:
# Step 2: Export the model to ONNX
# Use the skl2onnx (sklearn-onnx) library to convert the trained model.

In [None]:
pip install skl2onnx


In [None]:
pip install onnxruntime


In [None]:
pip install skl2onnx-*.whl


In [None]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
import joblib

# Reload the optimized model that was saved to disk.
optimized_model = joblib.load("optimized_customer_payment_model.pkl")

# Describe the model input for the converter: a float tensor named
# "float_input" with an unspecified number of rows and 2 feature columns.
initial_type = [("float_input", FloatTensorType([None, 2]))]
onnx_model = convert_sklearn(optimized_model, initial_types=initial_type)

# Serialize the converted model and write the bytes to an .onnx file.
onnx_bytes = onnx_model.SerializeToString()
with open("optimized_customer_payment_model.onnx", "wb") as onnx_file:
    onnx_file.write(onnx_bytes)


In [None]:

pip install skl2onnx

