In [1]:
# Simulate training data for an EV charging-station app. Features considered: distance, time of day, day of week, weather, and type of vehicle.

import pandas as pd  # to work with structured data
import numpy as np  # fornumerical operations and generating random numbers

# Seed NumPy's global generator so every run simulates the same dataset.
np.random.seed(42)

# Number of simulated rows in the training set.
n_samples = 500

# Draw each raw feature column from the seeded global stream (the order of the
# draws matters for reproducibility), then append the numeric vehicle code.
data = pd.DataFrame({
    # Distance from the user to the station, in kilometres.
    'distance_km': np.random.uniform(low=0.1, high=10, size=n_samples),
    # Hour of the day, 0-23 (the upper bound of randint is exclusive).
    'hour_of_day': np.random.randint(low=0, high=24, size=n_samples),
    # Day of the week, 0=Mon ... 6=Sun.
    'day_of_week': np.random.randint(low=0, high=7, size=n_samples),
    # Vehicle category as a string label.
    'vehicle_type': np.random.choice(a=['Motorbike', 'Car', 'Tuk-tuk'], size=n_samples),
    # Ambient temperature in degrees Celsius.
    'weather_temp': np.random.uniform(low=15, high=35, size=n_samples),
}).assign(
    # The model needs numeric inputs, so map each vehicle label to an integer code.
    vehicle_type_encoded=lambda frame: frame['vehicle_type'].map(
        {'Motorbike': 0, 'Car': 1, 'Tuk-tuk': 2}
    )
)

# Create label: available_slots (0 to 5)
# Assume availability tends to be lower during peak hours (7-9am, 5-7pm), varies with temp and distance
def generate_slots(row):
    """Simulate the number of free charging slots (0-5) for one sample row.

    Starting from a baseline of 3 slots, the value is adjusted by:
      * -2 during peak hours (07-09 and 17-19, bounds inclusive),
      * -distance_km / 10 (farther stations get fewer free slots),
      * +(35 - weather_temp) / 10 (cooler rows get MORE free slots),
      * Gaussian noise with mean 0 and standard deviation 0.5, drawn from
        NumPy's global random generator.

    Args:
        row: Mapping-like record exposing 'hour_of_day', 'distance_km' and
            'weather_temp' (for example one DataFrame row).

    Returns:
        int: the adjusted value passed through round() and clamped to 0..5.
    """
    hour = row['hour_of_day']
    is_peak = (7 <= hour <= 9) or (17 <= hour <= 19)

    # Apply the adjustments one at a time, in this fixed order, so the
    # floating-point result is fully determined by the inputs and the noise.
    slots = 3
    if is_peak:
        slots = slots - 2
    slots = slots - row['distance_km'] / 10
    slots = slots + (35 - row['weather_temp']) / 10
    slots = slots + np.random.normal(0, 0.5)

    # Round to a whole slot count, then clamp into the valid 0..5 range.
    whole_slots = int(round(slots))
    if whole_slots < 0:
        return 0
    if whole_slots > 5:
        return 5
    return whole_slots

# Label every simulated row by running the slot simulator across the columns
# of each row (one generate_slots call per row).
data['available_slots'] = data.apply(generate_slots, axis='columns')

# Preview the first five rows of the labelled dataset.
data.head(5)


Unnamed: 0,distance_km,hour_of_day,day_of_week,vehicle_type,weather_temp,vehicle_type_encoded,available_slots
0,3.807947,21,2,Tuk-tuk,33.892284,2,2
1,9.512072,16,1,Tuk-tuk,28.664961,2,2
2,7.34674,8,2,Car,24.943495,1,1
3,6.026719,0,0,Motorbike,27.356945,0,3
4,1.644585,20,2,Motorbike,32.3781,0,4


In [2]:
#data preparation and model training

from sklearn.model_selection import train_test_split # split data for training and testing
from sklearn.ensemble import RandomForestRegressor # The machine-learning model
from sklearn.metrics import mean_squared_error # A metric to measure how well your model’s predictions match the true values

# Model inputs (X) and the target we are predicting (y).
X = data.loc[:, [
    'distance_km',
    'hour_of_day',
    'day_of_week',
    'vehicle_type_encoded',
    'weather_temp',
]]
y = data.loc[:, 'available_slots']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree random forest and fit it on the training split
# (fit() returns the fitted estimator itself).
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows and report the mean squared error.
y_pred = model.predict(X_test)
print(f"Mean Squared Error on Test Set: {mean_squared_error(y_test, y_pred):.2f}")


Mean Squared Error on Test Set: 0.46


A Mean Squared Error (MSE) of 0.46 means:

On average, the squared difference between the model’s predicted number of available slots and the true (simulated) value is 0.46. To interpret this in the original “slots” units, you can take the square root—this is the Root Mean Squared Error (RMSE):

$\text{RMSE} = \sqrt{0.46} \approx 0.68$. In other words, a typical prediction is off by about 0.68 slots, which means the model usually predicts within one slot of the true availability. In a real deployment, if the model says “2.3 slots available,” you’d know that the true value is most likely between about 1.6 and 3.0 slots (2.3 ± 0.68), assuming errors on real data resemble those on this simulated test set.

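An average squared error under 1 is strong performance for a label that ranges from 0 to 5. To improve further (if you want even lower error), add more features (richer weather data, traffic, historical usage). Note that the simulated labels include random noise (standard deviation 0.5) and are rounded to whole slots, so part of this error cannot be removed by a better model.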



In [5]:
#predict availability and recommend best station

from geopy.distance import geodesic # Calculate the distance between points.

# Mock charging stations: each entry has a display name and a
# (latitude, longitude) pair that is passed to geopy's geodesic().
stations = [
    {'name': station_name, 'location': coordinates}
    for station_name, coordinates in (
        ('Station A', (-1.285, 36.82)),
        ('Station B', (-1.26, 36.80)),
        ('Station C', (-1.25, 36.78)),
    )
]

# Example request context: where the user is and the conditions right now.
user_location = (-1.28, 36.81)  # (latitude, longitude)
current_hour = 14               # 24-hour clock
current_day = 2                 # Wednesday (0=Mon ... 6=Sun)
vehicle_type = 'Car'            # must be one of the labels the model was trained on
weather_temp = 28               # degrees Celsius

def recommend_stations(user_location, stations, model, *, hour_of_day=None,
                       day_of_week=None, vehicle=None, temperature_c=None):
    """Rank charging stations by predicted availability, then by proximity.

    Args:
        user_location: (latitude, longitude) of the user.
        stations: Iterable of dicts with a 'name' and a 'location'
            ((latitude, longitude)) entry.
        model: Fitted regressor whose predict() accepts a DataFrame with the
            columns 'distance_km', 'hour_of_day', 'day_of_week',
            'vehicle_type_encoded' and 'weather_temp'.
        hour_of_day: Hour used for the prediction. Defaults to the
            notebook-level ``current_hour``.
        day_of_week: Day index used for the prediction. Defaults to the
            notebook-level ``current_day``.
        vehicle: 'Motorbike', 'Car' or 'Tuk-tuk'. Defaults to the
            notebook-level ``vehicle_type``.
        temperature_c: Temperature in Celsius. Defaults to the notebook-level
            ``weather_temp``.

    Returns:
        list[dict]: One dict per station with 'station', 'distance_km'
        (rounded to 2 decimals) and 'predicted_available_slots' (rounded to
        2 decimals), sorted by predicted availability descending and then by
        distance ascending. Empty when ``stations`` is empty.

    Raises:
        KeyError: If the vehicle type is not one of the known labels.
    """
    # Explicit arguments win; otherwise fall back to the notebook-level example
    # context so the original three-argument call keeps working unchanged.
    hour = current_hour if hour_of_day is None else hour_of_day
    day = current_day if day_of_week is None else day_of_week
    vehicle_name = vehicle_type if vehicle is None else vehicle
    temperature = weather_temp if temperature_c is None else temperature_c

    vehicle_map = {'Motorbike': 0, 'Car': 1, 'Tuk-tuk': 2}
    v_encoded = vehicle_map[vehicle_name]

    # Materialise once so any iterable works and can be traversed repeatedly.
    stations = list(stations)
    if not stations:
        return []  # nothing to rank; avoid calling the model on an empty frame

    # Geodesic distance (km) from the user to every station.
    distances = [geodesic(user_location, station['location']).km for station in stations]

    # One feature row per station, predicted in a single batch. Column order
    # matches the order used when the model was trained; scalar context values
    # are broadcast across all rows.
    features = pd.DataFrame({
        'distance_km': distances,
        'hour_of_day': hour,
        'day_of_week': day,
        'vehicle_type_encoded': v_encoded,
        'weather_temp': temperature,
    })
    predicted = model.predict(features)

    recommendations = [
        {
            'station': station['name'],
            'distance_km': round(dist, 2),
            'predicted_available_slots': round(predicted_slots, 2),
        }
        for station, dist, predicted_slots in zip(stations, distances, predicted)
    ]

    # Primary key: predicted availability (descending); tie-break: distance (ascending).
    recommendations.sort(key=lambda x: (-x['predicted_available_slots'], x['distance_km']))

    return recommendations

# Rank the mock stations for the example user and print them best-first.
recommendations = recommend_stations(
    user_location=user_location,
    stations=stations,
    model=model,
)
for r in recommendations:
    print(r)


{'station': 'Station C', 'distance_km': 4.71, 'predicted_available_slots': np.float64(3.2)}
{'station': 'Station B', 'distance_km': 2.48, 'predicted_available_slots': np.float64(3.05)}
{'station': 'Station A', 'distance_km': 1.24, 'predicted_available_slots': np.float64(3.04)}


Station C
Predicted available slots: 3.20 (highest of the three)
Distance: 4.71 km from you

Our ranking sorts first by the model’s predicted availability, so even though C is the farthest, it’s expected to have the most open chargers.

Station B
Predicted available slots: 3.05
Distance: 2.48 km
Why second? It has the next-highest predicted availability. It’s also closer than C, but predicted availability is the primary sort key.

Station A
Predicted available slots: 3.04
Distance: 1.24 km (closest of the three)
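Although it’s the nearest station, the model forecasts slightly fewer open slots than B and C, so it ranks third. Note that the gap between B and A (3.05 vs 3.04) is far smaller than the model’s typical error (RMSE ≈ 0.68), so this ordering is not a confident distinction.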