### Import packages

In [13]:
from collections import defaultdict
import contextily as ctx
from datetime import datetime
import folium
import geopandas as gpd
from matplotlib.colors import Normalize, to_hex
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.optimize import curve_fit
from scipy.stats import kstest
from shapely.geometry import Polygon, Point
import seaborn as sns


### Load and inspect the data
- Load every sheet of the Excel workbook into a single DataFrame and inspect it for data quality.
- Remove rows where the latitude (`Lat`) or longitude (`Log`) value is 0.


In [None]:
# Location of the raw multi-sheet workbook
file_path = "/Users/mayar/Desktop/MIT/Research_Fellow/ENERGY_SENSING/DATA/2022_vitals.xlsx"

# Column names assigned to every sheet (the sheets are read without a header row)
column_names = [
    "deviceID", "Timestamp", "Lat", "Log", "SOC_batt", "temp_batt", "volatge_batt",
    "voltage_particle", "current_batt", "isCharging", "isCharginS", "isCharged",
    "Temp_int", "Hum_int", "solar_current", "Cellular_signal_strength", "index"
]

# sheet_name=None returns a {sheet name: DataFrame} dictionary with every sheet
sheets_dict = pd.read_excel(file_path, sheet_name=None, header=None)


def fix_alignment(row):
    """Shift a row left so that it starts at its first plausible deviceID.

    A cell counts as a deviceID when it is a string longer than 5 characters.
    The returned list always has ``len(column_names)`` entries: the row is
    cut at that width and padded with ``None`` when it is shorter. Rows
    without any such cell come back as all ``None``.
    """
    cells = row.tolist()
    width = len(column_names)
    start = next(
        (pos for pos, cell in enumerate(cells) if isinstance(cell, str) and len(cell) > 5),
        None,
    )
    if start is None:
        return [None] * width
    window = cells[start:start + width]
    return window + [None] * (width - len(window))


# Clean each sheet separately, then stack the results
processed_sheets = []
for sheet_name, sheet_data in sheets_dict.items():
    # Keep at most as many columns as there are expected names
    sheet_data = sheet_data.iloc[:, :len(column_names)]

    # Re-align every row and label the columns
    sheet_data = sheet_data.apply(fix_alignment, axis=1, result_type="expand")
    sheet_data.columns = column_names

    # Discard rows without a deviceID and repeated header rows
    id_column = sheet_data['deviceID']
    keep_row = id_column.notna() & (id_column != "deviceID")
    processed_sheets.append(sheet_data[keep_row])

# One DataFrame for all sheets
df = pd.concat(processed_sheets, ignore_index=True)

# Drop every row whose Lat or Log equals 0
has_zero_coordinate = (df['Lat'] == 0) | (df['Log'] == 0)
df = df[~has_zero_coordinate]

# Rebuild the pandas index and store a 1-based running number in 'index'
df = df.reset_index(drop=True)
df['index'] = df.index + 1

# Save the cleaned data back to Excel (optional)
output_path = "/Users/mayar/Desktop/MIT/Research_Fellow/ENERGY_SENSING/DATA/2022_vitals_cleaned.xlsx"
df.to_excel(output_path, index=False)

# Preview of the cleaned data
print(df.head())


### Convert Timestamps and Create a Spatial Grid
- Convert the Timestamp column from Unix format to human-readable datetime.
- Group the GPS data into a spatial grid for coverage analysis.

In [None]:
# Convert Unix timestamp (seconds) to datetime
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')

# Ensure 'Lat' and 'Log' are numeric (unparseable values become NaN)
df['Lat'] = pd.to_numeric(df['Lat'], errors='coerce')
df['Log'] = pd.to_numeric(df['Log'], errors='coerce')

# Spatial resolution: 120 m grid. 111320 is used as metres per degree of latitude.
grid = 120
lat_resolution = grid / 111320
df['Lat_Grid'] = (df['Lat'] // lat_resolution) * lat_resolution

# Longitude resolution depends on latitude. It is derived from the snapped
# latitude (Lat_Grid), not the raw one, so that every point in the same
# latitude band shares exactly the same cell width. With the raw latitude each
# point got its own width, and two points in the same cell ended up with
# different Log_Grid floats, which breaks the equality tests below.
df['Lon_Resolution'] = grid / (111320 * np.cos(np.radians(df['Lat_Grid'])))
df['Log_Grid'] = (df['Log'] // df['Lon_Resolution']) * df['Lon_Resolution']

# Temporal Resolution: 10s intervals (currently disabled)
# temporal_resolution = '10s'
# df['Time_Bin'] = df['Timestamp'].dt.floor(temporal_resolution)

# Drop auxiliary column
df = df.drop(columns=['Lon_Resolution'])

# Step 1: Sort by deviceID and Timestamp
df = df.sort_values(by=['deviceID', 'Timestamp'])

# Step 2: Previous sample of the same device
df['Prev_Lat_Grid'] = df.groupby('deviceID')['Lat_Grid'].shift(1)
df['Prev_Log_Grid'] = df.groupby('deviceID')['Log_Grid'].shift(1)
df['Prev_Timestamp'] = df.groupby('deviceID')['Timestamp'].shift(1)

# Step 3: Identify whether the device is still in the same grid cell as its previous sample
df['Same_Grid'] = (df['Lat_Grid'] == df['Prev_Lat_Grid']) & (df['Log_Grid'] == df['Prev_Log_Grid'])

# Step 4: Time elapsed inside the current cell since the previous sample.
# The first sample of a visit (Same_Grid is False, including the first sample
# of each device) contributes 0: the gap before it was spent elsewhere and
# must not count towards the dwell time of the new cell.
df['Time_Diff'] = (df['Timestamp'] - df['Prev_Timestamp']).dt.total_seconds()
df['Time_Diff'] = df['Time_Diff'].where(df['Same_Grid'], 0.0)

# Step 5: Assign a visit ID that increases every time a sample is not in the same cell as the previous one
df['Group'] = (~df['Same_Grid']).cumsum()

# Step 6: Running dwell time within each visit
df['Cumulative_Time'] = df.groupby(['deviceID', 'Lat_Grid', 'Log_Grid', 'Group'])['Time_Diff'].cumsum()

# Step 7: Drop the samples recorded once a visit has lasted 3 hours (10800 s) or more.
# Only those late samples are removed; the device's other rows are kept.
df = df[~(df['Cumulative_Time'] >= 10800)]

# Drop helper columns
df = df.drop(columns=['Prev_Lat_Grid', 'Prev_Log_Grid', 'Prev_Timestamp', 'Same_Grid', 'Time_Diff', 'Group', 'Cumulative_Time'])


### Plot cumulative sampling maps

In [None]:
# Bounding box of all GPS fixes in df
min_lat, max_lat = df['Lat'].min(), df['Lat'].max()
min_lon, max_lon = df['Log'].min(), df['Log'].max()

# Cell edge length in metres and the matching step in degrees of latitude
grid_size = 120
lat_resolution = grid_size / 111320


def lon_resolution_at_lat(lat):
    """Return the longitude step (degrees) of one cell edge at latitude `lat`."""
    return grid_size / (111320 * np.cos(np.radians(lat)))


# Tile the bounding box row by row, starting at its south-west corner
grid = []
lat = min_lat
while lat < max_lat:
    lon_res = lon_resolution_at_lat(lat)
    north = lat + lat_resolution
    lon = min_lon
    while lon < max_lon:
        east = lon + lon_res
        grid.append(Polygon([(lon, lat), (east, lat), (east, north), (lon, north)]))
        lon = east
    lat = north

# Grid cells with a zero-initialised hit counter
grid_gdf = gpd.GeoDataFrame({'geometry': grid, 'Count': 0}, crs="EPSG:4326")

# GPS fixes as point geometries
df_gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Log'], df['Lat']), crs="EPSG:4326")

# Add one to the first cell that contains each point; points in no cell are ignored
for geom in df_gdf.geometry:
    match = grid_gdf.contains(geom)
    if not match.any():
        continue
    grid_gdf.loc[match.idxmax(), 'Count'] += 1

# Fractional power scaling compresses the range of counts for display
gamma = 0.3  # Adjust between 0.2 - 0.5 for better visibility
grid_gdf['Scaled_Count'] = (grid_gdf['Count'] + 1) ** gamma

# Draw the scaled counts with a jet colormap on top of an OSM basemap
fig, ax = plt.subplots(figsize=(12, 12))
heatmap_style = dict(
    column='Scaled_Count',
    cmap='jet',
    legend=True,
    alpha=0.4,
    # edgecolor='gray'
)
grid_gdf.plot(ax=ax, **heatmap_style)

ctx.add_basemap(ax, crs=grid_gdf.crs.to_string(), source=ctx.providers.OpenStreetMap.Mapnik)

ax.set_title('Heatmap of Measurements Across Stockholm (120x120m Grid)', fontsize=12)
ax.set_axis_off()

plt.show()


In [None]:
# The ten grid polygons with the highest point count
top_10_cells = grid_gdf.nlargest(10, 'Count')

# Keep only the points that fall inside one of those polygons (work on a copy)
inside_top_cell = df_gdf.geometry.apply(lambda pt: any(top_10_cells.contains(pt)))
df_top_cells = df_gdf[inside_top_cell].copy()

# Seconds between consecutive rows of the same device within the same snapped cell
per_device_cell = df_top_cells.groupby(['Lat_Grid', 'Log_Grid', 'deviceID'])
df_top_cells['Time_Diff'] = per_device_cell['Timestamp'].diff().dt.total_seconds()

# Sampling frequency per snapped cell = 1 / mean time difference
mean_gap = df_top_cells.groupby(['Lat_Grid', 'Log_Grid'])['Time_Diff'].mean().dropna()
sampling_frequency = mean_gap.apply(lambda gap: 1 / gap)

top_10_frequencies = sampling_frequency.nlargest(10)
# Flatten to a regular table for display
top_10_frequencies_df = top_10_frequencies.reset_index()
top_10_frequencies_df.columns = ['Lat_Grid', 'Log_Grid', 'Sampling_Frequency (Hz)']
top_10_frequencies_df


In [None]:
# Requires df with a datetime 'Timestamp' column and a 'SOC_batt' column.
# Count the distinct calendar days on which any row reports SOC_batt below 50.
df['Date'] = df['Timestamp'].dt.date  # Calendar date of each sample
low_soc_dates = df.loc[df['SOC_batt'] < 50, 'Date']
days_with_depletion = low_soc_dates.nunique()

# Show the number of such days
days_with_depletion


In [None]:
# Step 1: Assign Vehicles to Grid Cells Using Polygon Containment
# Start by turning every (Log, Lat) pair into a point geometry.
gps_points = gpd.points_from_xy(df['Log'], df['Lat'])
df_gdf = gpd.GeoDataFrame(df, geometry=gps_points, crs="EPSG:4326")

# Look up the grid cell that contains a point
def find_grid_cell(point, grid_gdf):
    """Return the index label of the first cell of `grid_gdf` containing `point.geometry`.

    `point` is any object exposing a `geometry` attribute (e.g. one row of a
    GeoDataFrame). Returns None when no cell contains the geometry.
    """
    hits = grid_gdf.contains(point.geometry)
    return hits.idxmax() if hits.any() else None

# Tag every GPS point with the index of the grid polygon that contains it
df_gdf['Grid_Cell'] = df_gdf.apply(find_grid_cell, axis=1, args=(grid_gdf,))

# Points that fall in no polygon are discarded
df_gdf = df_gdf.dropna(subset=['Grid_Cell'])

# Step 2: Order each device's samples in time for the transition analysis
df_gdf = df_gdf.sort_values(by=['deviceID', 'Timestamp'])

# Step 3: Flag depleted samples (SOC_batt below the threshold)
depletion_threshold = 50
df_gdf['Depleted'] = df_gdf['SOC_batt'] < depletion_threshold

# Step 4: Split into pre-depletion (not flagged) and post-depletion (flagged) samples
is_depleted = df_gdf['Depleted']
df_pre_depletion = df_gdf[~is_depleted].copy()
df_post_depletion = df_gdf[is_depleted].copy()

# Step 5: Pair each pre-depletion sample with the cell of the same device's next sample
next_cell = df_pre_depletion.groupby('deviceID')['Grid_Cell'].shift(-1)
df_pre_depletion['Next_Grid_Cell'] = next_cell

# The last sample of each device has no successor and is left out
df_transitions = df_pre_depletion.dropna(subset=['Next_Grid_Cell'])

# Step 6: Count transitions (rows = origin cell, columns = destination cell) ...
pair_counts = df_transitions.groupby(['Grid_Cell', 'Next_Grid_Cell']).size()
transition_counts = pair_counts.unstack(fill_value=0)

# ... and normalise every row so that it sums to 1
row_totals = transition_counts.sum(axis=1)
transition_probabilities = transition_counts.div(row_totals, axis=0)

# Step 7: Most likely successor of a grid cell
def predict_next_grid(current_grid, transition_matrix):
    """Return the column label with the highest value in row `current_grid`.

    Returns None when `current_grid` is not a row label of `transition_matrix`,
    i.e. no transition out of that cell is known.
    """
    if current_grid not in transition_matrix.index:
        return None
    outgoing = transition_matrix.loc[current_grid]
    return outgoing.idxmax()

# Step 8: Validate Predictions Using Post-Depletion Data
# For every post-depletion sample, predict the most likely next cell from its current cell.
df_post_depletion['Predicted_Grid_Cell'] = df_post_depletion['Grid_Cell'].apply(
    lambda x: predict_next_grid(x, transition_probabilities)
)

# The ground truth for a "next cell" prediction is the cell of the same device's
# following sample, not the cell the prediction was made from.
# NOTE(review): the following sample is the next row of that device in the current
# row order of df_post_depletion; this relies on the frame still being sorted by
# deviceID and Timestamp.
df_post_depletion['Actual_Next_Grid_Cell'] = (
    df_post_depletion.groupby('deviceID')['Grid_Cell'].shift(-1)
)

# Step 9: Measure Prediction Accuracy
# Only rows that have both a prediction and an observed next cell can be scored.
scorable = (
    df_post_depletion['Predicted_Grid_Cell'].notna()
    & df_post_depletion['Actual_Next_Grid_Cell'].notna()
)
df_post_depletion['Correct_Prediction'] = scorable & (
    df_post_depletion['Predicted_Grid_Cell'] == df_post_depletion['Actual_Next_Grid_Cell']
)
# The mean is NaN when no row is scorable
accuracy = df_post_depletion.loc[scorable, 'Correct_Prediction'].mean()

print(f"Prediction Accuracy: {accuracy:.2%}")

# Step 10: Save the Transition Matrix and Post-Depletion Validation
output_path_transition_matrix = "/Users/mayar/Desktop/MIT/Research_Fellow/ENERGY_SENSING/DATA/Transition_Matrix.xlsx"
transition_probabilities.to_excel(output_path_transition_matrix)

output_path_post_depletion = "/Users/mayar/Desktop/MIT/Research_Fellow/ENERGY_SENSING/DATA/Post_Depletion_Validation.xlsx"
df_post_depletion.to_excel(output_path_post_depletion, index=False)

# Step 11: Analyze and Print the Top 10 Most Likely Transitions
# Stacking turns the matrix into one row per (origin, destination) pair. The two
# index levels are renamed explicitly before flattening: renaming 'level_0' /
# 'level_1' afterwards has no effect when the matrix axes already carry names,
# so the From_Grid / To_Grid headers were never produced.
most_likely_transitions = (
    transition_probabilities.stack()
    .rename_axis(['From_Grid', 'To_Grid'])
    .reset_index(name='Probability')
    .sort_values(by='Probability', ascending=False)
)

print("Top 10 Most Likely Transitions:")
print(most_likely_transitions.head(10))


In [None]:
# Re-create the post-depletion GeoDataFrame with geometry taken from the
# snapped coordinates (Log_Grid, Lat_Grid)
snapped_points = gpd.points_from_xy(df_post_depletion['Log_Grid'], df_post_depletion['Lat_Grid'])
df_post_depletion = gpd.GeoDataFrame(df_post_depletion, geometry=snapped_points, crs="EPSG:4326")


def _predicted_centroid_coord(cell, axis):
    """Return coordinate `axis` ('x' or 'y') of the centroid of grid polygon `cell`, or None if unknown."""
    if cell not in grid_gdf.index:
        return None
    return getattr(grid_gdf.loc[cell, 'geometry'].centroid, axis)


# Represent every predicted cell by the centroid of its polygon
predicted_points = df_post_depletion[['Predicted_Grid_Cell']].dropna().copy()
predicted_cells = predicted_points['Predicted_Grid_Cell']
predicted_points['Lat_Pred'] = predicted_cells.apply(_predicted_centroid_coord, args=('y',))
predicted_points['Lon_Pred'] = predicted_cells.apply(_predicted_centroid_coord, args=('x',))
predicted_points = predicted_points.dropna()

# Predicted centroids as point geometries
centroid_points = gpd.points_from_xy(predicted_points['Lon_Pred'], predicted_points['Lat_Pred'])
gdf_predicted = gpd.GeoDataFrame(predicted_points, geometry=centroid_points, crs="EPSG:4326")

# Draw grid, actual points (red) and predicted points (blue) on one axis
fig, ax = plt.subplots(figsize=(12, 12))
grid_gdf.plot(ax=ax, color='lightgrey', edgecolor='grey', alpha=0.4)

marker_style = dict(markersize=20, alpha=0.7)
df_post_depletion.plot(ax=ax, color='red', label="Actual Trajectory", **marker_style)
gdf_predicted.plot(ax=ax, color='blue', label="Predicted Trajectory", **marker_style)

# OSM basemap underneath
ctx.add_basemap(ax, crs=df_post_depletion.crs.to_string(), source=ctx.providers.OpenStreetMap.Mapnik)

# Title, legend, hide axes
ax.set_title("Actual vs Predicted Trajectory (Post-Depletion)", fontsize=14)
ax.legend()
ax.set_axis_off()

plt.show()


In [None]:
# Calendar date of every post- and pre-depletion sample, computed once
post_dates = df_post_depletion['Timestamp'].dt.date
pre_dates = df_pre_depletion['Timestamp'].dt.date

# Days on which at least one depleted sample exists
depleted_days = post_dates.unique()

# One figure per depleted day
for day in depleted_days:
    # Samples of the current day
    df_day = df_post_depletion[post_dates == day]
    df_pre_depletion_day = df_pre_depletion[pre_dates == day]

    # Grid polygons visited after depletion (red)
    gdf_actual = grid_gdf.loc[grid_gdf.index.isin(df_day['Grid_Cell'])].copy()
    gdf_actual['Color'] = 'red'

    # Grid polygons predicted for the depleted samples (blue)
    predicted_grid_cells = df_day['Predicted_Grid_Cell'].dropna().unique()
    gdf_predicted = grid_gdf.loc[grid_gdf.index.isin(predicted_grid_cells)].copy()
    gdf_predicted['Color'] = 'blue'

    # Grid polygons visited before depletion (black)
    pre_depletion_grid_cells = df_pre_depletion_day['Grid_Cell'].unique()
    gdf_pre_depletion = grid_gdf.loc[grid_gdf.index.isin(pre_depletion_grid_cells)].copy()
    gdf_pre_depletion['Color'] = 'black'

    # Layers in drawing order: (polygons, colour, legend label)
    layers = [
        (gdf_pre_depletion, 'black', "Pre-Depletion Trajectory"),
        (gdf_actual, 'red', "Actual Trajectory"),
        (gdf_predicted, 'blue', "Predicted Trajectory"),
    ]

    # Nothing to draw for this day
    if all(layer.empty for layer, _, _ in layers):
        print(f"Skipping {day}: No valid data for plotting.")
        continue

    fig, ax = plt.subplots(figsize=(12, 12))

    # Full grid as a faint background
    grid_gdf.plot(ax=ax, color='lightgrey', edgecolor='grey', alpha=0.2)

    # Draw every non-empty layer
    for layer, colour, caption in layers:
        if layer.empty:
            continue
        layer.plot(ax=ax, color=colour, alpha=0.5, edgecolor=colour, label=caption)

    # A failing basemap only prints a message; the figure is still shown
    try:
        ctx.add_basemap(ax, crs=grid_gdf.crs.to_string(), source=ctx.providers.OpenStreetMap.Mapnik)
    except Exception as e:
        print(f"Basemap Error on {day}: {e}")

    # Title, legend, hide axes
    ax.set_title(f"Actual vs Predicted Trajectory (Post-Depletion) - {day}", fontsize=14)
    ax.legend()
    ax.set_axis_off()

    plt.show()


In [None]:
## CURRENT
# Replay all samples in time order and, for each time threshold, switch the sensor
# OFF whenever the sample's grid cell was sensed less than TIME_THRESHOLD seconds
# ago. While OFF, the SOC drop since the device's previous sample is added to
# Energy_Saved_<label>, which is then added to SOC_batt.
#
# Fixes relative to the earlier revision of this cell:
#   * `diff` is initialised on every row; the ON-branch prints used to read it
#     before any assignment (NameError on the first row of a fresh kernel).
#   * "same device as previous row" is only evaluated for i > 0; at i == 0,
#     iloc[i - 1] silently wrapped around to the last row of the frame.
#   * when the previous row belongs to another device, the SOC difference is taken
#     against this device's last *preceding* row (tracked in last_device_row)
#     instead of the device's final row in the whole dataset.
df_temp = df.copy().reset_index(drop=True)  # Ensure indices are sequential
df_temp = df_temp.sort_values(by=['Timestamp']).reset_index(drop=True)

# Define different time thresholds to compare
time_thresholds = {
    "3 sec": 3,
    "12 sec": 12
}

# Daily mean SOC per (Date, deviceID) for each threshold
soc_depletion_results = {}

# Iterate over different time thresholds
for label, TIME_THRESHOLD in time_thresholds.items():
    energy_col = f'Energy_Saved_{label}'
    sensor_col = f'Sensor_ON_{label}'

    last_sensed_time = {}  # grid cell -> timestamp of the last ON sample
    stored_energy = {}     # grid cell -> saved energy carried at that cell
    last_device_row = {}   # deviceID -> index of its most recent processed row

    # Previous date
    prev_date = None

    for i in range(len(df_temp)):
        row = df_temp.iloc[i]
        grid_key = (row['Lat_Grid'], row['Log_Grid'])  # Unique grid identifier
        current_time = row['Timestamp']
        current_date = row['Date']
        device = row['deviceID']

        # SOC drop credited on this row; stays 0 unless the OFF branch computes one
        diff = 0

        # True only when a previous row exists and it belongs to the same device
        same_device_as_prev = i > 0 and device == df_temp.iloc[i - 1]['deviceID']

        # Reset stored energy at the start of a new day
        if prev_date is not None and current_date != prev_date:
            stored_energy = {}  # Reset stored energy for all grid cells
            df_temp.loc[i:, energy_col] = 0  # Reset energy savings for the new day
            print(f"[RESET] Reset stored energy for new day: {current_date}")

        prev_date = current_date  # Update previous date tracker

        # A (nearly) full battery also resets the savings from this row onwards
        if df_temp.loc[i, 'SOC_batt'] > 99:
            stored_energy[grid_key] = 0
            df_temp.loc[i:, energy_col] = 0

        # If the grid was sensed recently (within the threshold), turn OFF the sensor
        if grid_key in last_sensed_time and (current_time - last_sensed_time[grid_key]).total_seconds() < TIME_THRESHOLD:
            df_temp.at[i, sensor_col] = False

            # Accumulate stored energy
            if i > 0 and pd.notna(df_temp.iloc[i - 1]['SOC_batt']) and pd.notna(row['SOC_batt']):

                if same_device_as_prev:
                    # The previous row is this device's previous sample
                    prev_soc = df_temp.iloc[i - 1]['SOC_batt']
                    carried = df_temp.loc[i - 1, energy_col]
                    tag = "[OFF]"
                else:
                    # Use this device's last preceding row; with no earlier row
                    # there is no drop to credit.
                    prev_idx = last_device_row.get(device)
                    prev_soc = df_temp.at[prev_idx, 'SOC_batt'] if prev_idx is not None else row['SOC_batt']
                    # Energy_Saved is always written with .loc[i:, ...], i.e. broadcast to
                    # all later rows, so the device's final row holds the value currently
                    # being carried forward.
                    carried = df_temp.loc[df_temp.deviceID == device, energy_col].iloc[-1]
                    tag = "CHANGE [OFF]"

                # Only a drop in SOC counts; a rise (or a NaN difference) gives 0
                diff = max(0, prev_soc - row['SOC_batt'])
                stored_energy[grid_key] = carried + diff
                df_temp.loc[i:, energy_col] = stored_energy[grid_key]

                print(f"{tag}: Accumulated {diff:.2f}% for device {device}. Total stored: {stored_energy[grid_key]:.2f}%")

        else:
            # Update last sensed time when the sensor turns ON
            last_sensed_time[grid_key] = current_time
            df_temp.at[i, sensor_col] = True

            # Ensure stored_energy is initialized per grid cell without overwriting previous values
            if grid_key not in stored_energy:
                if i == 0:
                    stored_energy[grid_key] = 0  # First iteration, no prior energy
                elif same_device_as_prev:
                    # Carry forward the stored energy from the previous row
                    stored_energy[grid_key] = df_temp.loc[i - 1, energy_col]
                else:
                    # Carry forward the value currently broadcast to this device's rows
                    stored_energy[grid_key] = df_temp.loc[df_temp.deviceID == device, energy_col].iloc[-1]
                df_temp.loc[i:, energy_col] = stored_energy[grid_key]

            tag = "[ON]" if same_device_as_prev else "CHANGE [ON]"
            print(f"{tag}: Accumulated {diff:.2f}% for device {device}. Total stored: {stored_energy[grid_key]:.2f}%")

        # Remember this row as the device's most recent one
        last_device_row[device] = i

    # Compute new SOC_batt with savings, capped at 100
    df_temp[f'SOC_batt_{label}'] = df_temp['SOC_batt'] + df_temp[energy_col]
    df_temp[f'SOC_batt_{label}'] = df_temp[f'SOC_batt_{label}'].clip(upper=100)

    # Compute SOC depletion for this threshold
    daily_soc = df_temp.groupby(['Date', 'deviceID'])[f'SOC_batt_{label}'].mean()
    soc_depletion_results[label] = daily_soc


# Baseline: Compute SOC depletion without constraints
soc_depletion_results["No Constraint"] = df_temp.groupby(['Date', 'deviceID'])['SOC_batt'].mean()

# Convert results to a DataFrame for plotting
soc_depletion_df = pd.DataFrame(soc_depletion_results)

# Save the updated dataset with sensor states and energy savings for each threshold
output_path = "/Users/mayar/Desktop/MIT/Research_Fellow/ENERGY_SENSING/DATA/updated_SOC_batt_with_energy_savings.xlsx"
df_temp.to_excel(output_path, index=False)



In [None]:
import matplotlib.pyplot as plt

# One line style per scenario
line_styles = {"No Constraint": "-", "3 sec": "--", "12 sec": ":"}

# Colours are reused cyclically when there are more devices than colours
predefined_colors = ['red', 'blue', 'green', 'black', 'purple']

# All device IDs that occur in any scenario
device_ids = set().union(
    *(series.index.get_level_values('deviceID').unique() for series in soc_depletion_results.values())
)

# Fixed colour per device, assigned in sorted device order
color_map = {}
for position, device_id in enumerate(sorted(device_ids)):
    color_map[device_id] = predefined_colors[position % len(predefined_colors)]

# One curve per (scenario, device)
plt.figure(figsize=(12, 6))

for label, soc_series in soc_depletion_results.items():  # soc_series is indexed by (Date, deviceID)
    device_level = soc_series.index.get_level_values('deviceID')
    for device_id in device_level.unique():
        device_data = soc_series[device_level == device_id]
        dates = device_data.index.get_level_values('Date')
        plt.plot(
            dates,               # X-axis: Dates
            device_data.values,  # Y-axis: SOC values
            linestyle=line_styles[label],
            color=color_map[device_id],
            marker='o',
            markersize=3,
            label=f"Device {device_id} - {label}"
        )

plt.xlabel('Date')
plt.ylabel('Mean SOC (%)')
plt.title('SOC Depletion Comparison Across Devices and Time Constraints')

# Legend sits outside the axes, anchored at the upper right edge of the plot
legend_placement = dict(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.legend(**legend_placement)
plt.grid(True)
plt.xticks(rotation=45)

# Leave room for the external legend
plt.tight_layout()

plt.show()


In [None]:
# Grid-only variant of the energy-saving simulation: samples are replayed in
# timestamp order and the sensor is switched OFF whenever the sample's grid cell
# was sensed less than TIME_THRESHOLD seconds earlier. While OFF, the SOC drop
# relative to the previous row is added to Energy_Saved_<label>.
# NOTE(review): this cell never looks at deviceID, so "previous row" means the
# previous row in global time order, which may belong to a different device.
df_temp = df.copy().reset_index(drop=True)  # Ensure indices are sequential
df_temp=df_temp.sort_values(by=['Timestamp']).reset_index(drop=True) 

# Define different time thresholds to compare
time_thresholds = {
    "3 sec": 3,
    "12 sec": 12
}

# Create a dictionary to store SOC and sensor states for each threshold
soc_depletion_results = {}

# Iterate over different time thresholds
for label, TIME_THRESHOLD in time_thresholds.items():

    # Track last sensed timestamp, and stored energy during OFF periods
    # (both dictionaries are keyed by grid cell)
    last_sensed_time = {}
    stored_energy = {}
 
    # Previous date
    prev_date=None
    
    for i in range(len(df_temp)):
        row = df_temp.iloc[i]
        grid_key = (row['Lat_Grid'], row['Log_Grid'])  # Unique grid identifier
        current_time = row['Timestamp']
        current_date = row['Date']

        # Initialise inter-row differences when OFF
        # NOTE(review): this runs on every row, so max(d_diff_prev, d_diff) below
        # always equals d_diff.
        d_diff_prev=0 
        
        # Reset stored energy at the start of a new day
        if prev_date is not None and current_date != prev_date:
            stored_energy={}  # Reset stored energy for all grid cells
            df_temp.loc[i:, f'Energy_Saved_{label}'] = 0  # Reset energy savings for the new day
            print(f"[RESET] Reset stored energy for new day: {current_date}")

        prev_date = current_date  # Update previous date tracker

        # SOC above 99 also resets the savings from this row onwards
        if df_temp.loc[i, 'SOC_batt']>99:
            stored_energy[grid_key]=0
            df_temp.loc[i:, f'Energy_Saved_{label}']=0

        # If the grid was sensed recently (within the threshold), turn OFF the sensor
        if grid_key in last_sensed_time and (current_time - last_sensed_time[grid_key]).total_seconds() < TIME_THRESHOLD:
            df_temp.at[i, f'Sensor_ON_{label}'] = False   

            # Accumulate stored energy
            if i > 0 and pd.notna(df_temp.iloc[i - 1]['SOC_batt']) and pd.notna(row['SOC_batt']):

                # Only a drop in SOC counts; a rise is clamped to 0
                d_diff = max(0, df_temp.iloc[i - 1]['SOC_batt'] - row['SOC_batt']) #current inter-row difference
                diff = max(d_diff_prev, d_diff)
                # NOTE(review): the base value is read from row i-1, which still holds the
                # pre-reset value when one of the resets above fired on this row.
                stored_energy[grid_key] = df_temp.loc[i-1, f'Energy_Saved_{label}']
                stored_energy[grid_key] += diff
                
                # Every write uses .loc[i:, ...], i.e. it is broadcast to all later rows
                if diff != 0: 
                    df_temp.loc[i:, f'Energy_Saved_{label}'] = stored_energy[grid_key]
                else:
                    df_temp.loc[i:, f'Energy_Saved_{label}'] = df_temp.loc[i-1, f'Energy_Saved_{label}']

                d_diff_prev=diff
                print(f"[DEBUG] OFF: Accumulated {diff:.2f}% for grid {grid_key}. Total stored: {stored_energy[grid_key]:.2f}%")


        else:
            # Update last sensed time when the sensor turns ON
            last_sensed_time[grid_key] = current_time
            df_temp.at[i, f'Sensor_ON_{label}'] = True
            d_diff_prev=0

            # Ensure stored_energy is initialized per grid cell without overwriting previous values
            if grid_key not in stored_energy:
                if i > 0:
                    #Carry forward the stored energy from the last known row
                    stored_energy[grid_key] = df_temp.loc[i-1, f'Energy_Saved_{label}']
                    df_temp.loc[i:, f'Energy_Saved_{label}'] = stored_energy[grid_key]
                elif i == 0:
                    stored_energy[grid_key] = 0  # First iteration, no prior energy 
                    df_temp.loc[i:, f'Energy_Saved_{label}'] = stored_energy[grid_key]  
            

    
    # Compute new SOC_batt with savings (capped at 100)
    df_temp[f'SOC_batt_{label}'] = df_temp['SOC_batt'] + df_temp[f'Energy_Saved_{label}']
    df_temp[f'SOC_batt_{label}'] = df_temp[f'SOC_batt_{label}'].clip(upper=100)

    # Compute SOC depletion for this threshold (mean over all rows of each date)
    daily_soc = df_temp.groupby('Date')[f'SOC_batt_{label}'].mean()
    soc_depletion_results[label] = daily_soc


# Baseline: Compute SOC depletion without constraints
soc_depletion_results["No Constraint"] = df_temp.groupby('Date')['SOC_batt'].mean()

# Convert results to a DataFrame for plotting
soc_depletion_df = pd.DataFrame(soc_depletion_results)

# Save the updated dataset with sensor states and energy savings for each threshold
output_path = "/Users/mayar/Desktop/MIT/Research_Fellow/ENERGY_SENSING/DATA/updated_SOC_batt_with_energy_savings.xlsx"
df_temp.to_excel(output_path, index=False)

# One curve per column of soc_depletion_df, all against the frame's index
plt.figure(figsize=(12, 6))
for label, daily_mean in soc_depletion_df.items():
    plt.plot(soc_depletion_df.index, daily_mean, marker='o', linestyle='-', label=label)

# Axis labels and title
plt.xlabel('Date')
plt.ylabel('Mean SOC (%)')
plt.title('SOC Depletion Comparison Across Different Time Constraints (With Energy Storage)')

# Legend, grid and slanted date ticks
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)

# Render the figure
plt.show()


In [None]:
df_temp = df.copy().reset_index(drop=True)  # Ensure indices are sequential
df_temp = df_temp.sort_values(by=['Timestamp']).reset_index(drop=True) 

# Define different time thresholds to compare
time_thresholds = {
    "3 sec": 3,
    "12 sec": 12
}

# Create a dictionary to store SOC and sensor states for each threshold
soc_depletion_results = {}

# Iterate over different time thresholds
for label, TIME_THRESHOLD in time_thresholds.items():

    # Track last sensed timestamp per grid cell and stored energy per device
    last_sensed_time = {}
    stored_energy = {}  
    last_device_row = {}  # To track the last row where each device appeared

    # Previous date
    prev_date = None
    
    for i in range(len(df_temp)):
        row = df_temp.iloc[i]
        device_id = row['deviceID']
        grid_key = (row['Lat_Grid'], row['Log_Grid'])  # Unique grid identifier
        device_key = (device_id, row['Lat_Grid'], row['Log_Grid'])  # Unique per device-grid
        current_time = row['Timestamp']
        current_date = row['Date']

        # Initialize inter-row differences when OFF
        d_diff_prev = 0 
        
        # Reset stored energy at the start of a new day
        if prev_date is not None and current_date != prev_date:
            stored_energy.clear()  # Reset stored energy for all devices
            df_temp.loc[df_temp['Date'] == current_date, f'Energy_Saved_{label}'] = 0  
            print(f"[RESET] Reset stored energy for new day: {current_date}")

        prev_date = current_date  # Update previous date tracker

        # Reset stored energy if SOC is at maximum for this device
        if df_temp.loc[i, 'SOC_batt'] > 99:
            stored_energy[device_key] = 0
            df_temp.at[i, f'Energy_Saved_{label}'] = 0

        # **Grid-based sensing decision (shared across devices)**
        if grid_key in last_sensed_time and (current_time - last_sensed_time[grid_key]).total_seconds() < TIME_THRESHOLD:
            df_temp.at[i, f'Sensor_ON_{label}'] = False   

            # **Device-specific energy storage**
            if i > 0 and pd.notna(df_temp.iloc[i - 1]['SOC_batt']) and pd.notna(row['SOC_batt']):
                d_diff = max(0, df_temp.iloc[i - 1]['SOC_batt'] - row['SOC_batt'])  # Current inter-row difference
                diff = max(d_diff_prev, d_diff)

                if device_key not in stored_energy:
                    stored_energy[device_key] = 0  # Ensure initialized storage

                # Find the last preceding row for this device
                if device_id in last_device_row:
                    prev_idx = last_device_row[device_id]  # The last row where this device appeared
                    stored_energy[device_key] = df_temp.at[prev_idx, f'Energy_Saved_{label}']
                    stored_energy[device_key] += diff  # Accumulate for this device only

                    # Store the energy saved for this specific device
                    if diff != 0: 
                        df_temp.at[i, f'Energy_Saved_{label}'] = stored_energy[device_key]
                    else:
                        df_temp.at[i, f'Energy_Saved_{label}'] = df_temp.at[prev_idx, f'Energy_Saved_{label}']
                else:
                    df_temp.at[i, f'Energy_Saved_{label}'] = 0  # If no previous row, start at 0

                d_diff_prev = diff
                print(f"[DEBUG] OFF: Accumulated {diff:.2f}% for device {device_id} at grid {grid_key}. Total stored: {stored_energy[device_key]:.2f}%")

        else:
            # **Grid-based sensing decision**
            last_sensed_time[grid_key] = current_time  # Same grid key shared across devices
            df_temp.at[i, f'Sensor_ON_{label}'] = True
            d_diff_prev = 0

            # Ensure stored_energy is initialized per device without overwriting previous values
            if device_key not in stored_energy:
                if device_id in last_device_row:
                    prev_idx = last_device_row[device_id]  # Get last row for this device
                    # Carry forward the stored energy from the last known row for this device
                    stored_energy[device_key] = df_temp.at[prev_idx, f'Energy_Saved_{label}']
                else:
                    stored_energy[device_key] = 0  # No previous entry, initialize to 0

                df_temp.at[i, f'Energy_Saved_{label}'] = stored_energy[device_key]

        # Update last row for this device
        last_device_row[device_id] = i  

    # Compute new SOC_batt with savings per device
    df_temp[f'SOC_batt_{label}'] = df_temp['SOC_batt'] + df_temp[f'Energy_Saved_{label}']
    df_temp[f'SOC_batt_{label}'] = df_temp[f'SOC_batt_{label}'].clip(upper=100)  # Cap SOC at 100%

    # Compute SOC depletion for this threshold per device
    daily_soc = df_temp.groupby(['Date', 'deviceID'])[f'SOC_batt_{label}'].mean().unstack()
    soc_depletion_results[label] = daily_soc

# Baseline: mean measured SOC per (Date, deviceID), one column per device.
# SOC_batt itself is not altered by the threshold loop above, so the frame
# left over from its last iteration is a valid source for the baseline.
soc_depletion_results["No Constraint"] = (
    df_temp.groupby(['Date', 'deviceID'])['SOC_batt'].mean().unstack()
)

# Combine every scenario into one frame (outer column level = scenario label)
soc_depletion_df = pd.concat(soc_depletion_results, axis=1)

# Save the updated dataset with sensor states and energy savings for each threshold
output_path = "/Users/mayar/Desktop/MIT/Research_Fellow/ENERGY_SENSING/DATA/updated_SOC_batt_with_energy_savings.xlsx"
df_temp.to_excel(output_path, index=False)

# Line style per scenario; labels not listed here fall back to a solid line
line_styles = {
    "No Constraint": ":",
    "3 sec": "--",
    "12 sec": "-"
}

# Plot SOC depletion for different devices and thresholds
plt.figure(figsize=(12, 6))

# The loop variable is deliberately NOT called df_temp: reusing that name
# would replace the working DataFrame with a pivot table once the cell ends.
for label, daily_table in soc_depletion_results.items():
    scenario_style = line_styles.get(label, "-")
    for device_id in daily_table.columns:  # One line per device
        plt.plot(
            daily_table.index,
            daily_table[device_id],
            linestyle=scenario_style,
            marker='o',
            label=f"Device {device_id} - {label}",
        )

plt.xlabel('Date')
plt.ylabel('Mean SOC (%)')
plt.title('SOC Depletion Comparison Across Devices and Time Constraints')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)

# Show plot
plt.show()


In [None]:
# For every time threshold, replay the grid-level sensing rule and tally how
# many rows would have had the sensor switched OFF.
off_counts = {}

for label, TIME_THRESHOLD in time_thresholds.items():
    # Fresh copy per scenario so the Sensor_ON flags never leak between runs
    df_copy = df.copy()
    df_copy['Sensor_ON'] = True

    last_sensed_time = {}  # grid cell -> timestamp of its most recent ON reading
    off_count = 0

    for i, row in df_copy.iterrows():
        grid_key = (row['Lat_Grid'], row['Log_Grid'])
        current_time = row['Timestamp']

        sensed_recently = (
            grid_key in last_sensed_time
            and (current_time - last_sensed_time[grid_key]).total_seconds() < TIME_THRESHOLD
        )

        if not sensed_recently:
            # Sensor stays ON: this reading becomes the cell's new reference time
            last_sensed_time[grid_key] = current_time
            continue

        # Cell was covered within the threshold: switch the sensor OFF
        df_copy.at[i, 'Sensor_ON'] = False
        off_count += 1

    off_counts[label] = off_count

# One row per scenario label
off_counts_df = pd.DataFrame.from_dict(off_counts, orient='index', columns=['Sensor OFF Count'])

# Display results
off_counts_df

In [None]:
import pandas as pd

# Per-day diagnostic tables, keyed by a human-readable title
debug_metrics = {}

# 1. Count how many rows have the sensor ON vs OFF per day.
#    reindex() pins the column order to [False, True] and creates whichever
#    state never occurs, so the two-name rename below is always valid.
sensor_on_off_daily = (
    df.groupby('Date')['Sensor_ON']
    .value_counts()
    .unstack()
    .fillna(0)
    .reindex(columns=[False, True], fill_value=0)
)
sensor_on_off_daily.columns = ['Sensor_OFF_Count', 'Sensor_ON_Count']
debug_metrics["Sensor ON/OFF Count Per Day"] = sensor_on_off_daily

# 2. SOC statistics per day (MultiIndex columns flattened to "<column>_<stat>")
soc_stats_daily = df.groupby('Date')[['SOC_batt', 'SOC_Without_Constraint']].agg(['mean', 'min', 'max'])
soc_stats_daily.columns = ['_'.join(col) for col in soc_stats_daily.columns]
debug_metrics["SOC Statistics Per Day"] = soc_stats_daily

# 3. SOC change per day (last reading minus first reading of the day)
daily_soc_change = df.groupby('Date').agg(
    First_SOC=('SOC_batt', 'first'),
    Last_SOC=('SOC_batt', 'last'),
    First_SOC_Without_Constraint=('SOC_Without_Constraint', 'first'),
    Last_SOC_Without_Constraint=('SOC_Without_Constraint', 'last')
)
daily_soc_change['SOC_Change'] = daily_soc_change['Last_SOC'] - daily_soc_change['First_SOC']
daily_soc_change['SOC_Change_Without_Constraint'] = (
    daily_soc_change['Last_SOC_Without_Constraint'] - daily_soc_change['First_SOC_Without_Constraint']
)
debug_metrics["Daily SOC Change"] = daily_soc_change[['SOC_Change', 'SOC_Change_Without_Constraint']]

# 4. Number of rows flagged as charging per day
charging_events_per_day = df.groupby('Date')['isCharging'].sum().to_frame()
debug_metrics["Charging Events Per Day"] = charging_events_per_day

# 5. ON<->OFF transitions per day. The flag is cast to int first because
#    diff() on a boolean column yields an object-dtype result; with integers
#    each state change is exactly 1.0 after abs().
df['Sensor_Transition'] = df['Sensor_ON'].astype(int).diff().abs()
sensor_transitions_per_day = df.groupby('Date')['Sensor_Transition'].sum().to_frame()
debug_metrics["Sensor ON/OFF Transitions Per Day"] = sensor_transitions_per_day

# Side-by-side view of all tables (outer column level = table title)
debug_df = pd.concat(debug_metrics.values(), axis=1, keys=debug_metrics.keys())
debug_df


In [None]:
# Per-day diagnostic tables, keyed by a human-readable title
debug_metrics = {}

# 1. Number of rows flagged as charging per day
charging_events_per_day = df.groupby('Date')['isCharging'].sum()
debug_metrics["Charging Events Per Day"] = charging_events_per_day

# 2. Mean SOC per day, split by charging state (one column per isCharging value)
soc_during_charging = df.groupby(['Date', 'isCharging'])['SOC_batt'].mean().unstack()
debug_metrics["SOC During Charging vs Not Charging"] = soc_during_charging

# 3. Mean battery current per day over the rows where the sensor is ON
avg_batt_current_on = df[df['Sensor_ON']].groupby('Date')['current_batt'].mean()
debug_metrics["Avg Battery Current When Sensor is ON"] = avg_batt_current_on

# 4. Mean solar current per day, split by sensor state
solar_current_comparison = df.groupby(['Date', 'Sensor_ON'])['solar_current'].mean().unstack()
debug_metrics["Solar Charging When Sensor is ON vs OFF"] = solar_current_comparison

# 5. Number of distinct grid cells per day. A cell is the (Lat_Grid, Log_Grid)
#    pair; counting distinct Lat_Grid values alone would merge every cell that
#    shares a latitude band. Rows with a missing coordinate are ignored.
unique_grid_cells_per_day = (
    df[['Date', 'Lat_Grid', 'Log_Grid']]
    .dropna(subset=['Lat_Grid', 'Log_Grid'])
    .drop_duplicates()
    .groupby('Date')
    .size()
    .rename('Unique_Grid_Cells')
)
debug_metrics["Unique Grid Cells Covered Per Day"] = unique_grid_cells_per_day

# Side-by-side view of all tables (outer column level = table title)
debug_df = pd.concat(debug_metrics.values(), axis=1, keys=debug_metrics.keys())

# Display the debug DataFrame
debug_df

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Battery capacity in mAh, used to convert integrated current into % SOC
BATTERY_CAPACITY = 10000

# Minimum re-sensing interval per grid cell (10 minutes, in seconds)
TIME_THRESHOLD = 600

# Convert the Timestamp column to datetime
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')

# Step 1: Sort data by device and timestamp (index becomes 0..n-1 in that order)
df = df.sort_values(by=['deviceID', 'Timestamp']).reset_index(drop=True)

# Step 2: Identify sensor activity based on the 10-minute rule
df['Sensor_ON'] = True  # Default: Sensor is ON

last_sensed_time = {}  # grid cell -> timestamp of its most recent ON reading
last_SOC = {}          # grid cell -> SOC recorded at that reading

off_count = 0  # How many rows had the sensor switched OFF

# NOTE(review): rows are visited device by device, not chronologically, while
# last_sensed_time is shared by all devices. A later device can therefore be
# compared against a timestamp from its own future (negative difference, which
# also counts as "< TIME_THRESHOLD"). Confirm whether a chronological pass is
# what is actually intended here.
print("\n=== DEBUG LOG: SENSOR ON/OFF STATUS ===")
for i, row in df.iterrows():
    grid_key = (row['Lat_Grid'], row['Log_Grid'])  # Unique grid identifier
    current_time = row['Timestamp']

    # If the grid was sensed recently (within 10 minutes), turn OFF the sensor
    if grid_key in last_sensed_time and (current_time - last_sensed_time[grid_key]).total_seconds() < TIME_THRESHOLD:
        df.at[i, 'Sensor_ON'] = False
        off_count += 1
        print(f"[OFF] {row['Timestamp']} | Grid: {grid_key} | Device: {row['deviceID']} | SOC: {row['SOC_batt']:.2f}%")
    else:
        # Update last sensed time when the sensor turns ON
        last_sensed_time[grid_key] = current_time
        last_SOC[grid_key] = row['SOC_batt']
        print(f"[ON]  {row['Timestamp']} | Grid: {grid_key} | Device: {row['deviceID']} | SOC: {row['SOC_batt']:.2f}%")

print(f"\nTotal times sensor was turned OFF due to 10-min rule: {off_count}\n")

# Snapshot of the measured SOC before Step 3 overwrites SOC_batt in place;
# it seeds the unconstrained baseline further down.
measured_soc = df['SOC_batt'].copy()

# True where a row and its predecessor belong to the same device. The frame is
# sorted device-major, so the first row of each device must not be integrated
# against the last row of the previous device (wrong battery, and a time gap
# that is unrelated to this device).
same_device_as_prev = df['deviceID'].eq(df['deviceID'].shift())


def _soc_delta_percent(i):
    """Return the SOC change (in %) integrated between rows i-1 and i of df."""
    dt = (df.at[i, 'Timestamp'] - df.at[i-1, 'Timestamp']).total_seconds() / 3600  # hours
    I_batt = df.at[i, 'current_batt']  # Current in mA
    SOC_change_mAh = -1 * I_batt * dt
    return (SOC_change_mAh / BATTERY_CAPACITY) * 100


# Step 3: Calculate SOC changes only when the sensor is ON
df['SOC_Change'] = 0.0  # Default no change

print("\n=== DEBUG LOG: SOC CHANGES ===")
for i in range(1, len(df)):
    if not same_device_as_prev.at[i]:
        continue  # First row of a device keeps its measured SOC
    if df.at[i, 'Sensor_ON']:  # Process only if sensor is ON
        SOC_change_percentage = _soc_delta_percent(i)

        # Apply SOC change on top of the previous row's (possibly updated) SOC
        df.at[i, 'SOC_Change'] = SOC_change_percentage
        df.at[i, 'SOC_batt'] = max(0, min(100, df.at[i-1, 'SOC_batt'] + SOC_change_percentage))  # Keep SOC in range

        print(f"[SOC UPDATE] {df.at[i, 'Timestamp']} | Device: {df.at[i, 'deviceID']} | ΔSOC: {SOC_change_percentage:.2f}% | New SOC: {df.at[i, 'SOC_batt']:.2f}%")

# Step 4: Compute the daily coverage achieved while the sensor is ON
df['Date'] = df['Timestamp'].dt.date  # Extract date

# A grid cell is the (Lat_Grid, Log_Grid) pair, so distinct pairs are counted;
# counting distinct Lat_Grid values alone would under-count the cells.
daily_coverage = (
    df.loc[df['Sensor_ON'], ['Date', 'Lat_Grid', 'Log_Grid']]
    .drop_duplicates()
    .groupby('Date')
    .size()
    .reset_index()
)
daily_coverage.columns = ['Date', 'Unique_Grid_Cells_Sensed']

# Baseline scenario: integrate every row regardless of the 10-minute rule,
# starting each device from its own first measured SOC.
df['SOC_Without_Constraint'] = measured_soc

for i in range(1, len(df)):
    if not same_device_as_prev.at[i]:
        continue  # First row of a device keeps its measured SOC
    SOC_change_percentage = _soc_delta_percent(i)
    df.at[i, 'SOC_Without_Constraint'] = max(0, min(100, df.at[i-1, 'SOC_Without_Constraint'] + SOC_change_percentage))

# Group by date to calculate mean SOC per day
soc_depletion_comparison = df.groupby('Date')[['SOC_batt', 'SOC_Without_Constraint']].mean()

# Plot SOC depletion comparison
plt.figure(figsize=(10, 6))
plt.plot(soc_depletion_comparison.index, soc_depletion_comparison['SOC_batt'], marker='o', label='With 10-min Constraint')
plt.plot(soc_depletion_comparison.index, soc_depletion_comparison['SOC_Without_Constraint'], marker='s', linestyle='dashed', label='Without 10-min Constraint')

plt.xlabel('Date')
plt.ylabel('Mean SOC (%)')
plt.title('SOC Depletion Comparison (With vs Without 10-Min Constraint)')
plt.legend()
plt.grid(True)

# Show plot
plt.xticks(rotation=45)
plt.show()


In [None]:
# Minimum sensing interval (in seconds)
min_sensing_interval = 600  # 10 minutes

# Work on a copy so re-running the cell never mutates df
df_all = df.copy()

# Step 1: Sort by deviceID and Timestamp (original index labels are kept)
df_all = df_all.sort_values(by=['deviceID', 'Timestamp'])

# Step 2: Initialize tracking columns
df_all['Last_Sensed_Timestamp'] = pd.NaT
df_all['Turn_Off_Sensor'] = False  # Default is ON

# grid cell -> timestamp of its most recent ON reading
last_sensed_time = {}

# Human-readable ON/OFF trace, used only for the sanity print below
debug_logs = []

for index, row in df_all.iterrows():
    # Round the grid coordinates so float noise cannot split one cell into two keys
    grid_key = (round(row['Lat_Grid'], 6), round(row['Log_Grid'], 6))
    current_time = row['Timestamp']

    if grid_key in last_sensed_time:
        time_diff = (current_time - last_sensed_time[grid_key]).total_seconds()

        if time_diff < min_sensing_interval:
            # Cell was sensed less than 10 minutes ago: sensor stays OFF and
            # the cell's reference timestamp is left unchanged.
            df_all.at[index, 'Turn_Off_Sensor'] = True
            debug_logs.append(f"🔴 OFF: {current_time} at {grid_key} (time_diff={time_diff:.2f}s)")
            continue

        debug_logs.append(f"🟢 ON: {current_time} at {grid_key} (time_diff={time_diff:.2f}s)")
    else:
        debug_logs.append(f"🟢 ON (First Time): {current_time} at {grid_key}")

    # Shared ON path: record the reading as the cell's new reference time
    df_all.at[index, 'Turn_Off_Sensor'] = False
    last_sensed_time[grid_key] = current_time
    df_all.at[index, 'Last_Sensed_Timestamp'] = current_time

# Print the first 20 log entries to verify ON/OFF behavior
for log in debug_logs[:20]:
    print(log)

# Step 3: Hold the battery level constant while the sensor is OFF
df_all['SOC_batt_with_constraint'] = df_all['SOC_batt']  # Initialize new SOC values

is_off = df_all['Turn_Off_Sensor']
is_on = ~is_off

# Previous SOC of the same device (NaN on each device's first row)
df_all['Prev_SOC'] = df_all.groupby('deviceID')['SOC_batt_with_constraint'].shift(1)
df_all.loc[is_off, 'SOC_batt_with_constraint'] = df_all.loc[is_off, 'Prev_SOC']

# Step 4: Integrate battery current over the elapsed time when the sensor is ON
df_all['Δt_hours'] = df_all.groupby('deviceID')['Timestamp'].diff().dt.total_seconds() / 3600
# 10000 is the same value as BATTERY_CAPACITY (mAh) in the 10-minute-rule cell
df_all['ΔSOC'] = -1 * df_all['current_batt'] * df_all['Δt_hours'] / 10000 * 100

df_all.loc[is_on, 'SOC_batt_with_constraint'] = df_all.loc[is_on, 'Prev_SOC'] + df_all.loc[is_on, 'ΔSOC']

# Step 5: Forward-fill gaps within each (device, grid cell) group.
# GroupBy.ffill() replaces the deprecated groupby(...).fillna(method='ffill').
df_all['SOC_batt_with_constraint'] = (
    df_all.groupby(['deviceID', 'Lat_Grid', 'Log_Grid'])['SOC_batt_with_constraint'].ffill()
)

# Step 6: Fall back to the measured SOC where no value could be derived,
# then clip to the valid 0% - 100% range
df_all['SOC_batt_with_constraint'] = df_all['SOC_batt_with_constraint'].fillna(df_all['SOC_batt'])
df_all['SOC_batt_with_constraint'] = df_all['SOC_batt_with_constraint'].clip(0, 100)

# Remove helper columns
df_all = df_all.drop(columns=['Prev_SOC', 'Δt_hours', 'ΔSOC'])

# Step 7: Assign each reading to a grid polygon by containment
df_gdf = gpd.GeoDataFrame(df_all, geometry=gpd.points_from_xy(df_all['Log'], df_all['Lat']), crs="EPSG:4326")


def find_grid_cell(row, grid_gdf):
    """Return the index label of the first polygon in grid_gdf containing row.geometry, or None."""
    match = grid_gdf.contains(row.geometry)
    return match.idxmax() if match.any() else None


# NOTE: grid_gdf is not created in this cell; it must already exist in the
# notebook session when this line runs.
df_gdf['Grid_Cell'] = df_gdf.apply(lambda row: find_grid_cell(row, grid_gdf), axis=1)

# Drop rows where no grid cell was matched (outliers)
df_gdf = df_gdf.dropna(subset=['Grid_Cell'])

# Step 8: Compute spatio-temporal coverage from the readings taken with the sensor ON
df_active_sensing = df_gdf[~df_gdf['Turn_Off_Sensor']].copy()

# Count the number of unique grid cells sensed per day
df_active_sensing['Date'] = df_active_sensing['Timestamp'].dt.date
daily_spatio_temporal_coverage = df_active_sensing.groupby('Date')['Grid_Cell'].nunique()

# Step 9: Aggregate daily SOC values
df_gdf['Date'] = df_gdf['Timestamp'].dt.date
soc_daily_no_constraint = df_gdf.groupby('Date')['SOC_batt'].mean()
soc_daily_with_constraint = df_gdf.groupby('Date')['SOC_batt_with_constraint'].mean()

# Step 10: Plot SOC depletion comparison (With vs Without the 10-minute constraint)
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(soc_daily_no_constraint.index, soc_daily_no_constraint, label="Always ON (No Constraint)", color="red", linestyle="--", marker="o")
ax.plot(soc_daily_with_constraint.index, soc_daily_with_constraint, label="With 10-min Rule", color="green", linestyle="-", marker="o")

# Formatting
ax.set_title("Battery SOC Depletion: Always ON vs 10-Minute Sensing Constraint", fontsize=14)
ax.set_xlabel("Date", fontsize=12)
ax.set_ylabel("Average SOC (%)", fontsize=12)
ax.legend()
ax.grid(True)

# Show the plot
plt.xticks(rotation=45)
plt.show()

print("Updated spatio-temporal coverage and energy savings calculated successfully.")


In [None]:
# Quick look at the first 20 rows of the ON/OFF bookkeeping columns
preview_columns = ['Timestamp', 'Lat_Grid', 'Log_Grid', 'Turn_Off_Sensor', 'Last_Sensed_Timestamp']
preview = df_all.loc[:, preview_columns].head(20)
print(preview)


In [None]:
# Step 1: Compare the daily mean SOC before and after applying the 10-minute rule.
# 'SOC_batt_new' is used when present; otherwise fall back to
# 'SOC_batt_with_constraint', the name the constraint pipeline cell writes.
# NOTE(review): confirm which of the two columns is the intended "after" series.
after_column = 'SOC_batt_new' if 'SOC_batt_new' in df_gdf.columns else 'SOC_batt_with_constraint'

df_gdf['SOC_batt_before'] = df_gdf['SOC_batt']  # Before applying 10min rule
df_gdf['SOC_batt_after'] = df_gdf[after_column]  # After applying 10min rule

# Aggregate by day
df_gdf['Date'] = df_gdf['Timestamp'].dt.date
energy_usage_before = df_gdf.groupby('Date')['SOC_batt_before'].mean()
energy_usage_after = df_gdf.groupby('Date')['SOC_batt_after'].mean()

# Step 2: Plot both daily series on one axis
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(energy_usage_before.index, energy_usage_before, label="Before 10-min Rule", color="red", linestyle="--", marker="o")
ax.plot(energy_usage_after.index, energy_usage_after, label="After 10-min Rule", color="green", linestyle="-", marker="o")

# Formatting
ax.set_title("Average Battery SOC Before and After 10-Minute Sensing Interval", fontsize=14)
ax.set_xlabel("Date", fontsize=12)
ax.set_ylabel("Average SOC (%)", fontsize=12)
ax.legend()
ax.grid(True)

# Show the plot
plt.xticks(rotation=45)
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Sensing intervals to test (in seconds): 10min, 15min, 1h, 8h, 24h
sensing_intervals = [600, 900, 3600, 28800, 86400]

# interval (s) -> daily mean of the simulated SOC
energy_savings = {}

# The time since a device last visited the same grid cell does not depend on
# the interval under test, so it is computed once instead of once per interval.
prev_visit = df_gdf.groupby(['deviceID', 'Lat_Grid', 'Log_Grid'])['Timestamp'].shift(1)
seconds_since_visit = (df_gdf['Timestamp'] - prev_visit).dt.total_seconds()
hours_since_visit = seconds_since_visit / 3600
delta_mah = -1 * df_gdf['current_batt'] * hours_since_visit
# C_batt is not defined in this cell and must already exist in the session
# (presumably the battery capacity in mAh - verify against the defining cell).
delta_percent = (delta_mah / C_batt) * 100

for interval in sensing_intervals:
    # A revisit sooner than `interval` switches the sensor OFF. First visits
    # have a NaN time difference, compare as False and therefore stay ON.
    turn_off = seconds_since_visit < interval

    # Per-interval columns are kept in df_gdf under their original names
    df_gdf[f'Prev_Timestamp_{interval}'] = prev_visit
    df_gdf[f'Time_Diff_{interval}'] = seconds_since_visit
    df_gdf[f'Turn_Off_Sensor_{interval}'] = turn_off
    df_gdf[f'Δt_hours_{interval}'] = hours_since_visit
    df_gdf[f'ΔSOC_mAh_{interval}'] = delta_mah
    df_gdf[f'ΔSOC_percent_{interval}'] = delta_percent

    # Apply the SOC change only where the sensor is ON. An undefined change
    # (first visit to a cell, or a missing current reading) counts as zero,
    # so it no longer turns the row's SOC into NaN.
    applied_change = delta_percent.where(~turn_off, 0).fillna(0)
    df_gdf[f'SOC_batt_{interval}'] = (df_gdf['SOC_batt'] + applied_change).clip(0, 100)

    # Aggregate by day
    energy_savings[interval] = df_gdf.groupby(df_gdf['Timestamp'].dt.date)[f'SOC_batt_{interval}'].mean()

# Step 2: Plot the daily mean SOC for every interval
fig, ax = plt.subplots(figsize=(12, 6))

# Reference series, computed in the before/after comparison cell
ax.plot(energy_usage_before.index, energy_usage_before, label="Before Any Rule", color="red", linestyle="--", marker="o")

# One colour and one legend label per interval, in the same order as sensing_intervals
colors = ['green', 'blue', 'purple', 'orange', 'brown']
interval_labels = ["10 min", "15 min", "1 hr", "8 hrs", "24 hrs"]

for interval, interval_label, color in zip(sensing_intervals, interval_labels, colors):
    daily_mean = energy_savings[interval]
    ax.plot(daily_mean.index, daily_mean, label=f"After {interval_label} Rule", color=color, linestyle="-", marker="o")

# Formatting
ax.set_title("Average Battery SOC Before and After Different Sensing Intervals", fontsize=14)
ax.set_xlabel("Date", fontsize=12)
ax.set_ylabel("Average SOC (%)", fontsize=12)
ax.legend()
ax.grid(True)

# Show the plot
plt.xticks(rotation=45)
plt.show()



In [None]:
from matplotlib.patches import Patch

# Calendar date of every post-depletion row, computed once and reused below
post_depletion_dates = df_post_depletion['Timestamp'].dt.date

# Extract unique days where depletion occurred
depleted_days = post_depletion_dates.unique()

# One map per depleted day
for day in depleted_days:
    # Filter data for the current depleted day
    df_day_pre = df_pre_coverage[df_pre_coverage['Date'] == day]
    df_day_new = df_new_coverage_only[df_new_coverage_only['Date'] == day]
    df_day_actual = df_post_depletion[post_depletion_dates == day]
    # Actual and predicted cells come from the same rows; only the column differs
    df_day_predicted = df_day_actual

    # Select the grid polygons touched by each trajectory
    gdf_pre = grid_gdf.loc[grid_gdf.index.isin(df_day_pre['Grid_Cell'])].copy()
    gdf_pre['Color'] = 'black'  # Pre-depletion trajectory

    gdf_new = grid_gdf.loc[grid_gdf.index.isin(df_day_new['Grid_Cell'])].copy()
    gdf_new['Color'] = 'green'  # Newly sensed due to 10min rule

    gdf_actual = grid_gdf.loc[grid_gdf.index.isin(df_day_actual['Grid_Cell'])].copy()
    gdf_actual['Color'] = 'red'  # Actual trajectory after depletion

    predicted_grid_cells = df_day_predicted['Predicted_Grid_Cell'].dropna().unique()
    gdf_predicted = grid_gdf.loc[grid_gdf.index.isin(predicted_grid_cells)].copy()
    gdf_predicted['Color'] = 'blue'  # Predicted trajectory

    # (layer, colour, legend label) in drawing order, bottom to top
    layers = [
        (gdf_pre, 'black', "Pre-Depletion Trajectory"),
        (gdf_actual, 'red', "Actual Trajectory (Post-Depletion)"),
        (gdf_predicted, 'blue', "Predicted Trajectory"),
        (gdf_new, 'green', "Newly Sensed Cells (10-min Interval)"),
    ]

    # Skip if no relevant data for the day
    if all(layer.empty for layer, _, _ in layers):
        print(f"Skipping {day}: No valid data for plotting.")
        continue

    # Create the plot
    fig, ax = plt.subplots(figsize=(12, 12))

    # Plot Grid Cells (Background)
    grid_gdf.plot(ax=ax, color='lightgrey', edgecolor='grey', alpha=0.2)

    # Polygon layers are drawn as patch collections, which ax.legend() cannot
    # turn into legend entries on its own. A proxy Patch with the same colours
    # is collected for every layer that is actually drawn.
    legend_handles = []
    for layer, color, layer_label in layers:
        if layer.empty:
            continue
        layer.plot(ax=ax, color=color, alpha=0.5, edgecolor=color, label=layer_label)
        legend_handles.append(Patch(facecolor=color, edgecolor=color, alpha=0.5, label=layer_label))

    # Add Basemap (best effort: a tile download failure must not lose the plot)
    try:
        ctx.add_basemap(ax, crs=grid_gdf.crs.to_string(), source=ctx.providers.OpenStreetMap.Mapnik)
    except Exception as e:
        print(f"Basemap Error on {day}: {e}")

    # Formatting
    ax.set_title(f"Trajectory Visualization with 10-Minute Sensing Constraint - {day}", fontsize=14)
    ax.legend(handles=legend_handles)
    ax.set_axis_off()

    # Show plot
    plt.show()


In [None]:
# Bounding box of all measurements
min_lat, max_lat = df['Lat'].min(), df['Lat'].max()
min_lon, max_lon = df['Log'].min(), df['Log'].max()

# Cell edge length in meters, converted to degrees
# (111320 is used as the number of meters per degree)
grid_size = 120
lat_resolution = grid_size / 111320


def lon_resolution_at_lat(lat):
    """Return the longitude span, in degrees, of one grid cell at latitude `lat`."""
    return grid_size / (111320 * np.cos(np.radians(lat)))


# Build the grid row by row (south to north), each row west to east
grid = []
lat = min_lat
while lat < max_lat:
    lon_res = lon_resolution_at_lat(lat)  # constant along one row
    row_top = lat + lat_resolution
    lon = min_lon
    while lon < max_lon:
        cell_right = lon + lon_res
        grid.append(Polygon([
            (lon, lat),
            (cell_right, lat),
            (cell_right, row_top),
            (lon, row_top)
        ]))
        lon = cell_right
    lat = row_top

# Grid cells with a measurement counter initialised to zero
grid_gdf = gpd.GeoDataFrame({'geometry': grid, 'Count': 0}, crs="EPSG:4326")

# Measurements as point geometries
df_gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Log'], df['Lat']), crs="EPSG:4326")

# Credit every measurement to the first grid cell that contains it
for measurement_point in df_gdf.geometry:
    containing = grid_gdf.contains(measurement_point)
    if containing.any():
        grid_gdf.loc[containing.idxmax(), 'Count'] += 1

# Fractional power scaling compresses the range so sparse cells remain visible
gamma = 0.3  # Adjust for visibility
grid_gdf['Scaled_Count'] = np.power(grid_gdf['Count'] + 1, gamma)

# Map the scaled counts onto the 'jet' colormap
norm = Normalize(vmin=grid_gdf['Scaled_Count'].min(), vmax=grid_gdf['Scaled_Count'].max())
cmap = plt.get_cmap('jet')

# Precompute one hex colour per cell
grid_gdf['Color'] = [to_hex(cmap(norm(scaled))) for scaled in grid_gdf['Scaled_Count']]

# Folium map centered on the mean measurement position
map_center = [df['Lat'].mean(), df['Log'].mean()]
m = folium.Map(location=map_center, zoom_start=12, tiles='Cartodb dark_matter')


def style_function(feature):
    """Return the Folium style dict for one grid cell, using its precomputed colour."""
    cell_style = {
        'fillColor': feature['properties']['Color'],
        'color': 'black',
        'weight': 0.1,
        'fillOpacity': 0.4
    }
    return cell_style


# Add the grid as a GeoJSON layer with a measurement-count tooltip
grid_layer = folium.GeoJson(
    grid_gdf,
    name="Measurement Grid",
    style_function=style_function,
    tooltip=folium.GeoJsonTooltip(fields=['Count'], aliases=["Measurements:"])
)
grid_layer.add_to(m)

# Add layer control
folium.LayerControl().add_to(m)

# Display the map
m


### Initial Analysis
#### Check Data Distribution
- Before plotting, inspect the distribution of the Count column to confirm the skew. If the Count values have a large range (e.g., some counts are much higher than others), you can apply a logarithmic scale to the color mapping. This makes smaller variations more distinguishable.


In [None]:
# Number of measurements that fall into each (Lat_Grid, Log_Grid) cell
cell_sizes = df.groupby(['Lat_Grid', 'Log_Grid']).size()
coverage = cell_sizes.reset_index(name='Count')

# How many cells share each coverage count, listed from the highest count down
count_frequency = coverage['Count'].value_counts()
coverage_freq = pd.DataFrame({
    'Coverage Count': count_frequency.index,
    'Frequency': count_frequency.to_numpy(),
}).sort_values(by='Coverage Count', ascending=False)

# Largest number of measurements seen in a single cell; also the bin count below
max_count = coverage['Count'].max()

# Histogram with a KDE overlay; the y-axis is cut at 2
hist_ax = sns.histplot(coverage['Count'], bins=max_count, kde=True, color='blue')
hist_ax.set_title('Distribution of Coverage Counts')
hist_ax.set_xlabel('Coverage Count')
hist_ax.set_ylabel('Frequency')
hist_ax.set_ylim(0, 2)
plt.show()


In [113]:
# df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')

# # Define Zipf-Mandelbrot function
# def zipf_mandelbrot_func(r, s, q, C):
#     return C / (r + np.abs(q)) ** s  # Ensure q is positive

# # Define resolutions to test
# spatial_resolutions = [0.00001, 0.0001, 0.001, 0.01]
# temporal_resolutions = ['10S', '30S', '1T', '5T']

# # Store results
# results = []

# for spatial_resolution in spatial_resolutions:
#     for temporal_resolution in temporal_resolutions:
#         # Create spatial grid
#         df['Lat_Grid'] = (df['Lat'] // spatial_resolution) * spatial_resolution
#         df['Log_Grid'] = (df['Log'] // spatial_resolution) * spatial_resolution
        
#         # Create temporal bins
#         df['Time_Bin'] = df['Timestamp'].dt.floor(temporal_resolution)

#         # Group by spatial grid and count occurrences
#         coverage = df.groupby(['Lat_Grid', 'Log_Grid']).size().reset_index(name='Count')

#         # Sort data in Zipfian order
#         sorted_counts = np.sort(coverage['Count'])[::-1]  # Descending order
#         ranks = np.arange(1, len(sorted_counts) + 1)  # Rank numbers

#         # Fit Zipf-Mandelbrot
#         try:
#             params, _ = curve_fit(zipf_mandelbrot_func, ranks, sorted_counts, 
#                                   p0=[1, 1, max(sorted_counts)], 
#                                   bounds=([0.5, 0.0001, 0], [3, 10, np.inf]))

#             s_fit, q_fit, C_fit = params
#             expected_values = zipf_mandelbrot_func(ranks, s_fit, q_fit, C_fit)
            
#             # Compute residuals
#             residuals = sorted_counts - expected_values
#             std_residuals = np.std(residuals)
            
#             # Perform KS test
#             ks_stat, p_value = kstest(sorted_counts, zipf_mandelbrot_func, args=(s_fit, q_fit, C_fit))

#             # Compute AIC (Akaike Information Criterion)
#             AIC = -2 * np.log(p_value) + 2 * 3  # 3 parameters: s, q, C

#             # Store results
#             results.append({
#                 'Spatial_Resolution': spatial_resolution,
#                 'Temporal_Resolution': temporal_resolution,
#                 'KS_Statistic': ks_stat,
#                 'p_value': p_value,
#                 'Std_Residuals': std_residuals,
#                 'AIC': AIC
#             })

#         except RuntimeError:
#             print(f"Fit failed for Spatial={spatial_resolution}, Temporal={temporal_resolution}")

# # Convert results to DataFrame
# results_df = pd.DataFrame(results)

# # Select the best resolution (min AIC, high p-value, low KS statistic)
# best_result = results_df.sort_values(by=['AIC', 'KS_Statistic'], ascending=[True, True]).iloc[0]
# print("Best Resolution Parameters:")
# print(best_result)


In [None]:
# Rank the per-cell coverage counts: rank 1 is the most-visited cell
sorted_counts = np.flip(np.sort(coverage['Count']))  # Descending order
ranks = np.arange(len(sorted_counts)) + 1  # 1, 2, ..., number of cells

# Zipf-Mandelbrot law: f(r) = C / (r + |q|)^s
def zipf_mandelbrot_func(r, s, q, C):
    """Evaluate the Zipf-Mandelbrot frequency for rank(s) `r`.

    `s` is the exponent, `q` the rank offset (its absolute value is used,
    so a negative `q` behaves like the positive one) and `C` the scale.
    """
    shifted_rank = r + np.abs(q)
    return C / np.power(shifted_rank, s)

# Fit the Zipf-Mandelbrot model; the parameter bounds keep the optimiser
# away from values that cause numerical issues.
initial_guess = [1, 1, max(sorted_counts)]
param_bounds = ([0.5, 0.0001, 0], [3, 10, np.inf])
params, _ = curve_fit(zipf_mandelbrot_func, ranks, sorted_counts, p0=initial_guess, bounds=param_bounds)
s_fit, q_fit, C_fit = params

# Model prediction at every rank
expected_values = zipf_mandelbrot_func(ranks, s_fit, q_fit, C_fit)

# Absolute and relative misfit (observed minus expected)
residuals = sorted_counts - expected_values
relative_residuals = residuals / expected_values

# Residuals against rank (log-scaled x axis, linear y axis)
residual_fig, residual_ax = plt.subplots(figsize=(10, 6))
residual_ax.scatter(ranks, residuals, alpha=0.6, color="red", label="Residuals (Observed - Expected)")
residual_ax.axhline(0, linestyle="--", color="black", alpha=0.6)
residual_ax.set_xscale("log")
residual_ax.set_yscale("linear")
residual_ax.set_xlabel("Rank", fontsize=12)
residual_ax.set_ylabel("Residual (Observed - Expected)", fontsize=12)
residual_ax.set_title("Residuals from Zipf-Mandelbrot Fit", fontsize=14)
residual_ax.legend()
residual_ax.grid(True, which="both", linestyle="--", alpha=0.5)
plt.show()


In [None]:
# Outlier threshold: a count is flagged when it differs from the fitted
# value by more than this multiplicative factor, in either direction.
tolerance_factor = 3

# Identify outliers (values too far from expected Zipfian behavior)
outlier_mask = (sorted_counts > expected_values * tolerance_factor) | (sorted_counts < expected_values / tolerance_factor)

# Count the number of removed points
num_outliers = outlier_mask.sum()
print(f"Number of detected outliers: {num_outliers}")

# Remove outliers; counts and ranks share one mask so they stay aligned
filtered_counts = sorted_counts[~outlier_mask]
filtered_ranks = ranks[~outlier_mask]

# Exclude extreme values: drop the first 10% and the last 10% of the
# remaining points. Counts and ranks have the same length, so a single pair
# of bounds serves both arrays.
lower_bound_c = lower_bound_r = int(0.1 * len(filtered_counts))
upper_bound_c = upper_bound_r = int(0.9 * len(filtered_counts))
filtered_counts = filtered_counts[lower_bound_c:upper_bound_c]
filtered_ranks = filtered_ranks[lower_bound_r:upper_bound_r]

# Fail with a clear message instead of an opaque error from max()/curve_fit
if len(filtered_counts) == 0:
    raise ValueError("No data points left after outlier removal and trimming; cannot re-fit the model.")

# Re-fit Zipf-Mandelbrot with filtered data (same bounds as the first fit)
params_filtered, _ = curve_fit(
    zipf_mandelbrot_func,
    filtered_ranks,
    filtered_counts,
    p0=[1, 1, filtered_counts.max()],
    bounds=([0.5, 0.0001, 0], [3, 10, np.inf])
)

# Extract new parameters
s_fit_filtered, q_fit_filtered, C_fit_filtered = params_filtered

# Compute expected values with new parameters
expected_values_filtered = zipf_mandelbrot_func(filtered_ranks, s_fit_filtered, q_fit_filtered, C_fit_filtered)

# Plot cleaned data vs. new Zipf-Mandelbrot fit on log-log axes
plt.figure(figsize=(10, 6))
plt.scatter(filtered_ranks, filtered_counts, label="Filtered Data (No Outliers)", alpha=0.6, color="blue")
plt.plot(filtered_ranks, expected_values_filtered, 'r-', linewidth=2, label="Re-Fitted Zipf-Mandelbrot Model")

plt.xscale("log")
plt.yscale("log")
plt.xlabel("Rank", fontsize=12)
plt.ylabel("Coverage Count", fontsize=12)
plt.title("Zipf-Mandelbrot Fit After Outlier Removal & Re-Fitting", fontsize=14)
plt.legend()
plt.grid(True, which="both", linestyle="--", alpha=0.5)

plt.show()


In [None]:
def _rank_ks_test(counts, model_values):
    """Kolmogorov-Smirnov comparison of observed and modelled rank shares.

    `kstest` expects a cumulative distribution function; the Zipf-Mandelbrot
    curve is a rank-frequency function, not a CDF, so it cannot be passed to
    `kstest` directly. Instead, both the observed counts and the fitted values
    are normalised into cumulative shares over the same ranks and the largest
    absolute gap between them is used as the KS statistic.

    Args:
        counts: Observed counts, ordered by rank.
        model_values: Fitted model values at the same ranks.

    Returns:
        Tuple ``(ks_statistic, p_value)``.

    Raises:
        ValueError: If ``counts`` is empty.
    """
    counts = np.asarray(counts, dtype=float)
    model_values = np.asarray(model_values, dtype=float)
    if counts.size == 0:
        raise ValueError("KS test needs at least one data point")

    total_observations = counts.sum()
    empirical_cdf = np.cumsum(counts) / total_observations
    model_cdf = np.cumsum(model_values) / model_values.sum()
    ks_stat = np.max(np.abs(empirical_cdf - model_cdf))

    # Asymptotic Kolmogorov distribution of sqrt(n) * D.
    # NOTE(review): n is taken as the total count, the distribution is
    # discrete and the parameters were fitted on the same data, so the
    # p-value is only approximate - confirm this is acceptable.
    p_value = stats.kstwobign.sf(ks_stat * np.sqrt(total_observations))
    return ks_stat, p_value


# Middle 80% of the filtered points (first 10% and last 10% excluded).
# NOTE(review): filtered_counts was already trimmed by 10% per side when it
# was built, so this is a second trim - confirm intended.
lower_bound = int(0.1 * len(filtered_counts))
upper_bound = int(0.9 * len(filtered_counts))

model_values_filtered = zipf_mandelbrot_func(filtered_ranks, s_fit_filtered, q_fit_filtered, C_fit_filtered)

ks_stat_truncated, p_value_truncated = _rank_ks_test(
    filtered_counts[lower_bound:upper_bound],
    model_values_filtered[lower_bound:upper_bound],
)
ks_stat_after, p_value_after = _rank_ks_test(filtered_counts, model_values_filtered)


print(f"Truncated KS Test - Statistic: {ks_stat_truncated:.4f}, p-value: {p_value_truncated:.4f}")
print(f"Full-range KS Test - Statistic: {ks_stat_after:.4f}, p-value: {p_value_after:.4f}")


In [None]:
from scipy.stats import norm

# NOTE(review): s_fit comes from the first fit on the unfiltered data, while
# the AIC below uses the re-fitted (filtered) parameters - confirm intended.
print(f"Estimated Zipf-Mandelbrot Exponent (s): {s_fit:.4f}")

# Gaussian log-likelihood of the filtered counts around the re-fitted curve.
# NOTE(review): the noise scale is the standard deviation of the counts
# themselves rather than of the fit residuals - confirm intended.
model_mean = zipf_mandelbrot_func(filtered_ranks, s_fit_filtered, q_fit_filtered, C_fit_filtered)
noise_scale = np.std(filtered_counts)
log_likelihood = np.sum(norm.logpdf(filtered_counts, loc=model_mean, scale=noise_scale))

# AIC with 3 fitted parameters: s, q, C
n_params = 3
AIC_fixed = -2 * log_likelihood + 2 * n_params

print(f"Fixed AIC: {AIC_fixed:.4f}")



In [None]:
# Pure Zipf rank-frequency law: f(r) = C / r^s
def zipf_func(r, s, C):
    """Evaluate the pure Zipf curve ``C / r ** s``.

    Args:
        r: Rank, or array of ranks.
        s: Exponent applied to the rank.
        C: Scale factor in the numerator.

    Returns:
        The model value(s) at ``r``.
    """
    rank_power = r ** s
    return C / rank_power

# Fit the two-parameter pure Zipf model to the filtered data
zipf_initial_guess = [1, max(filtered_counts)]
zipf_bounds = ([0.5, 0], [3, np.inf])
params_zipf, _ = curve_fit(zipf_func, filtered_ranks, filtered_counts, p0=zipf_initial_guess, bounds=zipf_bounds)

s_zipf, C_zipf = params_zipf
expected_values_zipf = zipf_func(filtered_ranks, s_zipf, C_zipf)

# Gaussian log-likelihood of the counts around the pure Zipf curve.
# `norm` is the name imported by `from scipy.stats import norm` in an
# earlier cell.
pointwise_logpdf = norm.logpdf(
    filtered_counts,
    loc=expected_values_zipf,
    scale=np.std(filtered_counts),
)
log_likelihood_zipf = np.sum(pointwise_logpdf)

# AIC with 2 fitted parameters: s, C
n_params_zipf = 2
AIC_zipf = -2 * log_likelihood_zipf + 2 * n_params_zipf

print(f"Pure Zipf Log-Likelihood: {log_likelihood_zipf:.4f}")
print(f"Pure Zipf AIC: {AIC_zipf:.4f}")


In [None]:
# Size of each 10% tail, computed once so both tails always match.
n_points = len(filtered_counts)
tail_size = int(0.1 * n_points)

# Start of the bottom tail as an explicit index: `x[-tail_size:]` would
# return the WHOLE array when tail_size is 0 (fewer than 10 points).
bottom_start = n_points - tail_size

# Top 10% (frequent ranks)
top_residuals = filtered_counts[:tail_size] - expected_values_filtered[:tail_size]

# Bottom 10% (rare ranks)
bottom_residuals = filtered_counts[bottom_start:] - expected_values_filtered[bottom_start:]

# Plot residuals of both tails side by side
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.scatter(filtered_ranks[:tail_size], top_residuals, color='red', alpha=0.6)
plt.axhline(0, linestyle='--', color='black', alpha=0.6)
plt.xscale('log')
plt.title("Top 10% Residuals")
plt.xlabel("Rank")
plt.ylabel("Residual")

plt.subplot(1, 2, 2)
plt.scatter(filtered_ranks[bottom_start:], bottom_residuals, color='blue', alpha=0.6)
plt.axhline(0, linestyle='--', color='black', alpha=0.6)
plt.xscale('log')
plt.title("Bottom 10% Residuals")
plt.xlabel("Rank")
plt.ylabel("Residual")

plt.tight_layout()
plt.show()


In [None]:
# Focus on the middle 80% of the filtered points
lower_bound = int(0.1 * len(filtered_counts))
upper_bound = int(0.9 * len(filtered_counts))

truncated_counts = np.asarray(filtered_counts[lower_bound:upper_bound], dtype=float)
truncated_model = zipf_mandelbrot_func(
    filtered_ranks[lower_bound:upper_bound],
    s_fit_filtered, q_fit_filtered, C_fit_filtered
)

# `kstest` expects a cumulative distribution function, but the
# Zipf-Mandelbrot curve is a rank-frequency function. Compare cumulative
# shares instead: the KS statistic is the largest gap between the observed
# and the modelled cumulative share over the same ranks.
total_observations = truncated_counts.sum()
empirical_cdf = np.cumsum(truncated_counts) / total_observations
model_cdf = np.cumsum(truncated_model) / truncated_model.sum()
ks_stat_truncated = np.max(np.abs(empirical_cdf - model_cdf))

# Asymptotic Kolmogorov distribution of sqrt(n) * D.
# NOTE(review): n is taken as the total count, the distribution is discrete
# and the parameters were fitted on the same data, so the p-value is only
# approximate - confirm this is acceptable.
p_value_truncated = stats.kstwobign.sf(ks_stat_truncated * np.sqrt(total_observations))

print(f"Truncated KS Test - Statistic: {ks_stat_truncated:.4f}, p-value: {p_value_truncated:.4f}")


In [None]:
# Residuals of the re-fitted model on the filtered data
expected_values_filtered = zipf_mandelbrot_func(filtered_ranks, s_fit_filtered, q_fit_filtered, C_fit_filtered)
residuals_filtered = filtered_counts - expected_values_filtered

# Residuals against rank (log-scaled x axis, linear y axis)
filtered_resid_fig, filtered_resid_ax = plt.subplots(figsize=(10, 6))
filtered_resid_ax.scatter(
    filtered_ranks,
    residuals_filtered,
    alpha=0.6,
    color="red",
    label="Residuals (Observed - Expected)",
)
filtered_resid_ax.axhline(0, linestyle="--", color="black", alpha=0.6)
filtered_resid_ax.set_xscale("log")
filtered_resid_ax.set_yscale("linear")
filtered_resid_ax.set_xlabel("Rank", fontsize=12)
filtered_resid_ax.set_ylabel("Residual (Observed - Expected)", fontsize=12)
filtered_resid_ax.set_title("Residuals After Outlier Removal", fontsize=14)
filtered_resid_ax.legend()
filtered_resid_ax.grid(True, which="both", linestyle="--", alpha=0.5)

plt.show()


### Plot
- Analyze sensor coverage by aggregating the spatial grid.
- Visualize coverage heatmaps. To better visualize the data, apply logarithmic scaling to the color values. This will compress the range of large values and expand the smaller values for more differentiation in color. 
- We also apply fractional power scaling, which highlights smaller values and makes subtle differences more visible: the log-transformed values are raised to a fractional power, $\log(x+1)^{0.5}$. This amplifies small differences while keeping the general scale.
##### Note:
- If $x = 0$, the standard `np.log(x)` returns `-inf` (with a runtime warning) because the logarithm of 0 is undefined. 
`np.log1p(x)` handles this safely by computing $\log(1 + x)$, which is defined for all non-negative inputs, including $0$.
- The square root further compresses the range of the values.  
It emphasizes smaller differences by reducing the impact of large values. For example:  
$\log(x+1)^{0.5}$ grows more slowly than $\log(x+1)$ as $x$ increases.

**This transformation is particularly useful for skewed data such as the `Count` values, where:**

- Most data points are small.
- A few extreme values (outliers) dominate.




In [None]:
# Colour values: square root of log(1 + count), which compresses the large
# counts and spreads out the small ones
log_scaled_values = np.log1p(coverage['Count'])**0.5

# One marker per grid cell, coloured by the scaled count
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sc = heatmap_ax.scatter(
    coverage['Log_Grid'],
    coverage['Lat_Grid'],
    c=log_scaled_values,
    cmap='jet',
    s=30,
    edgecolor='k',
    alpha=0.8,
)

# Colour bar labelled with the transformation that was applied
cbar = heatmap_fig.colorbar(sc, ax=heatmap_ax)
cbar.set_label('√(Log(Coverage Count + 1))', fontsize=12)
cbar.ax.tick_params(labelsize=10)

# Axis labels and title
heatmap_ax.set_xlabel('Longitude Grid', fontsize=14)
heatmap_ax.set_ylabel('Latitude Grid', fontsize=14)
heatmap_ax.set_title('Coverage Heatmap with Logarithmic Scale', fontsize=16)

# Reference grid lines
heatmap_ax.grid(visible=True, linestyle='--', alpha=0.6)

# Larger tick labels for readability
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Render
heatmap_fig.tight_layout()
plt.show()


### Analyse the Temporal Sampling

In [None]:
# Time gap, in seconds, between each row and the row before it.
# NOTE(review): gaps are taken between consecutive rows of df as currently
# ordered; if df mixes several deviceIDs or is not time-sorted, confirm this
# is the intended comparison.
df['Delta_t'] = df['Timestamp'].diff().dt.total_seconds()

# Expected sampling interval in seconds
expected_interval = 60

# Number of valid gaps (count() skips the NaN of the first row)
tot_count = df['Delta_t'].count()
print(f"Number of values: {tot_count}")

# Gaps longer than the expected interval, i.e. lower-frequency measurements.
# The message is built from expected_interval so it stays correct if the
# threshold is changed.
highf_count = (df['Delta_t'] > expected_interval).sum()
print(f"Number of values higher than {expected_interval}sec: {highf_count}")

# Rows that share the previous row's timestamp (gap of exactly zero);
# reuses Delta_t instead of recomputing the diff
zero_count = (df['Delta_t'] == 0.0).sum()
print(f"Number of values equal to 0.0sec: {zero_count}")


### Energy and Coverage Model Preparation
- Create columns to represent:

    - Whether a street segment is already covered.
    - Battery state changes.

##### Filtering out Outliers

In [220]:
# Gap in seconds between each row and its predecessor
df['Delta_t'] = df['Timestamp'].diff().dt.total_seconds()

# Largest gap (in seconds) still accepted
acceptable_threshold = 60

# Keep only rows whose gap is strictly positive and no larger than the
# threshold (rows with a NaN gap fail both comparisons and are dropped),
# then renumber the index from zero.
gap_seconds = df['Delta_t']
has_acceptable_gap = (gap_seconds <= acceptable_threshold) & (gap_seconds > 0)
df = df[has_acceptable_gap].reset_index(drop=True)

The battery capacity is 10,000 mAh, so SOC_Change (calculated from current and time) must be converted into a percentage of the total capacity before being added to `SOC_batt`.

### SOC Update Formula:
$SOC_{new} = SOC_{old} + \frac{\Delta SOC_{mAh}}{C_{batt}} \times 100$ 

and 

$\Delta SOC_{mAh} = -1 \times I_{batt} \times \Delta t$ (mAh change based on current and time)

Where:
- $SOC_{old}$ is $SOC_{batt}$
- $I_{batt}$: Net battery current (`current_batt`) in mA (positive when the battery is discharging, negative when it is charging).
- $\Delta t$: Time difference in hours between consecutive rows.
- $C_{batt}$: Battery capacity in mAh (10,000 mAh).


In [None]:
# Battery capacity in mAh
battery_capacity = 10000

# Convert Delta_t from seconds to hours, overwriting the column in place.
# NOTE(review): re-running this cell divides by 3600 again - confirm the cell
# is only executed once per data load.
df['Delta_t'] = df['Delta_t'] / 3600  # Time difference in hours

# SOC change as a percentage of capacity:
# -(current [mA] * time [h]) / capacity [mAh] * 100
df['SOC_Change'] = (-1 * df['current_batt'] * df['Delta_t'] / battery_capacity) * 100

# Zero the change for rows whose recorded SOC is already at or above 90%.
# NOTE(review): this zeroes negative (discharging) changes as well as
# positive ones - confirm that is intended.
df.loc[df['SOC_batt'] >= 90, 'SOC_Change'] = 0

# Add the change to the recorded SOC, then cap the result to the 0-100 range.
# NOTE(review): SOC_batt is overwritten in place, so re-running the cell
# applies the change again.
df['SOC_batt'] = df['SOC_batt'] + df['SOC_Change']
df['SOC_batt'] = df['SOC_batt'].clip(lower=0, upper=100)

# Assumption: above 90% charging is stopped. The rule is applied a second
# time, now on the updated SOC.
# NOTE(review): for rows that newly reach 90%, SOC_batt already includes the
# change that is reset to 0 here.
df.loc[df['SOC_batt'] >= 90, 'SOC_Change'] = 0

# Preview the updated DataFrame
print(df[['Timestamp', 'Lat', 'Log', 'SOC_batt', 'SOC_Change', 'Delta_t', 'current_batt']])

In [None]:
# State of charge against time
soc_fig, soc_ax = plt.subplots(figsize=(12, 6))
soc_ax.plot(
    df['Timestamp'],
    df['SOC_batt'],
    label='State of Charge (SOC)',
    color='blue',
    linewidth=1.5,
)
soc_ax.set_xlabel('Time', fontsize=14)
soc_ax.set_ylabel('SOC (%)', fontsize=14)
soc_ax.set_title('Battery State of Charge Over Time', fontsize=16)
soc_ax.grid(alpha=0.5, linestyle='--')
soc_ax.legend(fontsize=12)
soc_fig.tight_layout()
plt.show()


In [None]:
# Split rows by the sign of the per-row SOC change (zero changes fall in
# neither group)
soc_change = df['SOC_Change']
increasing = df[soc_change > 0]
decreasing = df[soc_change < 0]

print(f"Number of times SOC increases: {len(increasing)}")
print(f"Number of times SOC decreases: {len(decreasing)}")


In [None]:
# A row is marked as covered when an earlier row has the same grid cell and
# time bin; the first occurrence of each combination stays unmarked.
coverage_key = ['Lat_Grid', 'Log_Grid', 'Time_Bin']
df['Is_Covered'] = df.duplicated(subset=coverage_key)

# Preview the updated DataFrame
print(df.head())

In [None]:
def energy_aware_switch(row):
    """Choose the sensor state for one row.

    Args:
        row: Mapping-like record with ``'Is_Covered'`` and ``'SOC_batt'``.

    Returns:
        ``'OFF'`` when the row is covered and SOC is above 20,
        ``'ON'`` when the row is not covered and SOC is above 10,
        ``'IDLE'`` otherwise.
    """
    soc = row['SOC_batt']
    if row['Is_Covered']:
        # Covered: switch off unless the battery is at 20 or below
        return 'OFF' if soc > 20 else 'IDLE'
    # Not covered: measure only while the battery is above 10
    return 'ON' if soc > 10 else 'IDLE'

# Decide the sensor state row by row
df['Sensor_State'] = df.apply(energy_aware_switch, axis=1)

# Running count of rows that are not marked as covered
not_covered = ~df['Is_Covered']
df['Cumulative_Spatial_Coverage'] = not_covered.cumsum()

# Number of rows per (grid cell, time bin) combination
cell_time_keys = ['Lat_Grid', 'Log_Grid', 'Time_Bin']
temporal_coverage = df.groupby(cell_time_keys).size().reset_index(name='Frequency')

# Combinations sampled more than twice; kept as the last expression so the
# notebook displays the table
is_high_coverage = temporal_coverage['Frequency'] > 2
high_coverage_cells = temporal_coverage[is_high_coverage]
high_coverage_cells


In [None]:
# Cumulative coverage: number of distinct (Lat_Grid, Log_Grid) cells seen in
# rows whose Sensor_State is 'ON', up to and including each row.
# Vectorised replacement for a row-by-row loop; positional NumPy arrays are
# used so the result does not depend on the DataFrame index being unique.
sensor_on = (df['Sensor_State'] == 'ON').to_numpy()
first_on_visit = np.zeros(len(df), dtype=bool)
first_on_visit[sensor_on] = ~df.loc[sensor_on, ['Lat_Grid', 'Log_Grid']].duplicated().to_numpy()
df['Cumulative_Coverage'] = np.cumsum(first_on_visit)

# Running mean of SOC_batt over all rows so far. Built as a float column from
# the start (a column initialised with integer 0 and then assigned floats
# triggers pandas' incompatible-dtype warning), and in O(n) rather than
# re-summing the full history on every row.
soc_values = df['SOC_batt'].to_numpy(dtype=float)
df['Average_SOC'] = np.cumsum(soc_values) / np.arange(1, len(df) + 1)

# Preview results
print(df[['Timestamp', 'Cumulative_Coverage', 'Average_SOC']])


In [None]:
def _plot_series_over_time(column, series_label, y_label, title, colour):
    """Draw one column of df against df['Timestamp'] in its own figure."""
    plt.figure(figsize=(10, 6))
    plt.plot(df['Timestamp'], df[column], label=series_label, color=colour)
    plt.xlabel('Time', fontsize=14)
    plt.ylabel(y_label, fontsize=14)
    plt.title(title, fontsize=16)
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.legend(fontsize=12)
    plt.tight_layout()
    plt.show()


# Plot cumulative coverage over time
_plot_series_over_time(
    'Cumulative_Coverage',
    'Cumulative Coverage',
    'Cumulative Coverage',
    'Cumulative Coverage Over Time',
    'b',
)

# Plot average SOC over time
_plot_series_over_time(
    'Average_SOC',
    'Average SOC',
    'Average SOC (%)',
    'Average SOC Over Time',
    'g',
)


In [None]:
# Baseline: sensors are assumed to be always ON, so every row contributes its
# grid cell. Baseline coverage is the number of distinct
# (Lat_Grid, Log_Grid) cells seen up to and including each row.
# Vectorised replacement for a row-by-row loop that re-summed the whole SOC
# history on every iteration (O(n^2)).
first_visit = ~df.duplicated(subset=['Lat_Grid', 'Log_Grid']).to_numpy()
df['Baseline_Coverage'] = np.cumsum(first_visit)

# Running mean of SOC_batt over all rows so far.
# NOTE(review): this is computed from the same SOC_batt column as the
# energy-aware Average_SOC, so the two series appear to be identical and any
# SOC comparison between them would be zero - confirm the baseline model.
baseline_soc_values = df['SOC_batt'].to_numpy(dtype=float)
df['Baseline_SOC'] = np.cumsum(baseline_soc_values) / np.arange(1, len(df) + 1)

# Compare cumulative coverage and SOC
print(df[['Timestamp', 'Cumulative_Coverage', 'Baseline_Coverage', 'Average_SOC', 'Baseline_SOC']])


In [None]:
# Final cumulative coverage of each strategy (value in the last row)
energy_aware_coverage = df['Cumulative_Coverage'].iloc[-1]
baseline_coverage = df['Baseline_Coverage'].iloc[-1]

# Mean of each SOC column.
# NOTE(review): Average_SOC and Baseline_SOC already hold running means, so
# this is a mean of running means - confirm that the last value was not the
# intended figure.
energy_aware_avg_soc = df['Average_SOC'].mean()
baseline_avg_soc = df['Baseline_SOC'].mean()

# Relative differences, in percent of the baseline value
coverage_delta = energy_aware_coverage - baseline_coverage
coverage_improvement = coverage_delta / baseline_coverage * 100
soc_delta = baseline_avg_soc - energy_aware_avg_soc
soc_savings = soc_delta / baseline_avg_soc * 100

# Report
summary_lines = [
    f"Energy-Aware Total Coverage: {energy_aware_coverage}",
    f"Baseline Total Coverage: {baseline_coverage}",
    f"Coverage Improvement: {coverage_improvement:.2f}%",
    f"Energy Savings: {soc_savings:.2f}%",
]
print("\n".join(summary_lines))
