In [1]:
import csv

file_path = r"C:\Users\Narula\india_selected_cities_cleaned.csv"

# Number of data rows shown in the quick preview below.
PREVIEW_ROWS = 5

# Sanity check of the raw file: print the header line and the first few data
# rows exactly as the csv module parses them (every field is still a string).
try:
    # newline='' lets the csv module do its own line-ending handling.
    with open(file_path, mode='r', newline='', encoding='utf-8') as file:
        csv_reader = csv.reader(file)

        # Read the header. The None default avoids a StopIteration on an
        # empty file, which the generic handler below would otherwise report
        # as "An error occurred: " with no message at all.
        headers = next(csv_reader, None)

        if headers is None:
            print(f"The file at {file_path} is empty.")
        else:
            print("Headers:", headers)

            print(f"\nTop {PREVIEW_ROWS} Rows:\n")
            for i, row in enumerate(csv_reader):
                if i >= PREVIEW_ROWS:   # stop once the preview is complete
                    break
                print(row)

except FileNotFoundError:
    print(f"The file at {file_path} was not found.")
except Exception as e:
    # Best-effort preview: report anything else (decoding errors, malformed
    # CSV, permission problems) instead of aborting the notebook.
    print(f"An error occurred: {e}")


Headers: ['scrape_id', 'lat', 'lon', 'locationId', 'city', 'state', 'country', 'last_updated', 'AQI_IN', 'AQI_US', 'CO_PPB', 'H_PERCENT', 'NO2_PPB', 'O3_PPB', 'PM10_UGM3', 'PM2_5_UGM3', 'SO2_PPB', 'T_C', 'PM1_UGM3', 'TVOC_PPM', 'Noise_DB']

Top 5 Rows:

['624', '30.6324', '76.7244', '48F6EE546F34', 'Manauli', 'Punjab', 'India', '2025-11-09 17:22:00', '92', '149', '', '45', '', '', '68', '55', '', '24.5', '36.0', '0.21', '65.0']
['635', '30.6999', '76.835', 'PLLODA000621', 'Chandigarh', 'Chandigarh', 'India', '2025-11-09 17:23:00', '83', '137', '454.0', '38', '9.0', '13.0', '60', '50', '3.0', '19.0', '', '', '']
['636', '30.7058', '76.8532', '8680', 'Panchkula', 'Haryana', 'India', '2025-11-09 17:22:00', '82', '134', '309.0', '38', '10.0', '8.0', '60', '49', '2.0', '19.0', '', '', '']
['637', '30.6995', '76.8176', 'PLLODA000600', 'Chandigarh', 'Chandigarh', 'India', '2025-11-09 17:23:00', '87', '142', '511.0', '38', '8.0', '18.0', '60', '52', '3.0', '19.0', '', '', '']
['639', '30.6912'

In [2]:
import pandas as pd

# Read the whole dataset (file_path is defined in the first cell)
df = pd.read_csv(file_path)

# One row per distinct (locationId, lat, lon, city) combination. A sensor
# whose coordinates or city label vary between rows therefore appears once
# per distinct combination.
location_columns = ['locationId', 'lat', 'lon', 'city']
unique_locations = df.loc[:, location_columns].drop_duplicates()

# Plain list of dicts, one dict per location
locations_array = unique_locations.to_dict(orient='records')

print("Unique Locations:")
for location in locations_array:
    print(location)

print(f"\nTotal unique nodes: {len(locations_array)}")


Unique Locations:
{'locationId': '48F6EE546F34', 'lat': 30.6324, 'lon': 76.7244, 'city': 'Manauli'}
{'locationId': 'PLLODA000621', 'lat': 30.6999, 'lon': 76.835, 'city': 'Chandigarh'}
{'locationId': '8680', 'lat': 30.7058, 'lon': 76.8532, 'city': 'Panchkula'}
{'locationId': 'PLLODA000600', 'lat': 30.6995, 'lon': 76.8176, 'city': 'Chandigarh'}
{'locationId': '48F6EE549CB8', 'lat': 30.6912, 'lon': 76.7443, 'city': 'Sahibzada Ajit Singh Nagar'}
{'locationId': '-160063', 'lat': 30.716, 'lon': 76.852, 'city': 'Panchkula'}
{'locationId': '48F6EE568468', 'lat': 30.6969, 'lon': 76.7556, 'city': 'Chandigarh'}
{'locationId': 'E4B06332EC44', 'lat': 30.7127, 'lon': 76.8283, 'city': 'Chandigarh'}
{'locationId': 'PLLODA000170', 'lat': 30.7046, 'lon': 76.7457, 'city': 'Mauli'}
{'locationId': 'PLLODA000590', 'lat': 30.7034, 'lon': 76.7301, 'city': 'Mauli'}
{'locationId': 'PLLODA000245', 'lat': 30.715, 'lon': 76.7651, 'city': 'Chandigarh'}
{'locationId': '13876', 'lat': 30.7199, 'lon': 76.7386, 'city':

In [3]:
import pandas as pd
import folium
from IPython.display import display

# Load CSV
df = pd.read_csv(file_path)

# Get unique sensor locations: one row per distinct
# (locationId, lat, lon, city) combination
unique_locations = df[['locationId', 'lat', 'lon', 'city']].drop_duplicates()

# Center the map on the mean coordinate of all plotted sensors
center_lat = unique_locations['lat'].mean()
center_lon = unique_locations['lon'].mean()

m = folium.Map(location=[center_lat, center_lon], zoom_start=11)

# Plot ONLY the unique locations.
# itertuples keeps each column's dtype and avoids building a Series per row.
for sensor in unique_locations.itertuples(index=False):
    folium.Marker(
        location=[sensor.lat, sensor.lon],
        # Popup text is rendered as HTML, where a plain "\n" collapses to a
        # space; <br> is needed to put ID and city on separate lines.
        popup=f"ID: {sensor.locationId}<br>City: {sensor.city}",
    ).add_to(m)

# Display map inside Jupyter Notebook
display(m)


Calculate the Global Moran's I for each pollutant to check whether its values are spatially clustered across the Tricity region.

In [13]:
import pandas as pd
import geopandas as gpd
from libpysal.weights import KNN
from esda import Moran
import numpy as np

file_path = r"C:\Users\Narula\india_selected_cities_cleaned.csv"

# Keep a single snapshot per sensor: order the rows by timestamp and retain
# the last row of every locationId. 'last_updated' is still text at this
# point, so the ordering is lexicographic -- chronological as long as the
# 'YYYY-MM-DD HH:MM:SS' layout seen in the sample rows holds throughout.
df = (
    pd.read_csv(file_path)
    .sort_values('last_updated')
    .drop_duplicates('locationId', keep='last')
)

pollutants = [
    'AQI_IN',
    'CO_PPB',
    'NO2_PPB',
    'O3_PPB',
    'PM10_UGM3',
    'PM2_5_UGM3',
    'SO2_PPB',
]

# Force every pollutant column to a numeric dtype; unparseable entries -> NaN
df = df.assign(**{
    name: pd.to_numeric(df[name], errors='coerce') for name in pollutants
})

# -----------------------------------------------------------
# GeoDataFrame conversion (points built from lon/lat, tagged EPSG:4326)
# -----------------------------------------------------------
sensor_points = gpd.points_from_xy(df['lon'], df['lat'])
gdf = gpd.GeoDataFrame(df, geometry=sensor_points, crs='EPSG:4326')

# -----------------------------------------------------------
# Spatial weights (K-nearest neighbors, k = 6)
# -----------------------------------------------------------
w = KNN.from_dataframe(gdf, k=6)
w.transform = 'R'     # row standardization

# One entry per pollutant, filled in by the Moran's I loop further down
results = []

# -----------------------------------------------------------
# Function to interpret Moran’s I
# -----------------------------------------------------------
def interpret_moran(i, p, alpha=0.05, strong_threshold=0.3):
    """Translate a Moran's I statistic and its p-value into a text label.

    Parameters
    ----------
    i : float
        Global Moran's I value.
    p : float
        p-value associated with ``i``.
    alpha : float, optional
        Significance level. Anything that is not ``p < alpha`` (including a
        NaN p-value) is reported as not significant.
    strong_threshold : float, optional
        ``abs(i)`` above this value is labelled "Strong", otherwise
        "Moderate".

    Returns
    -------
    str
        "No significant spatial clustering" when the result is not
        significant; otherwise "<Strong|Moderate> spatial clustering" for
        ``i >= 0`` and "<Strong|Moderate> spatial dispersion" for ``i < 0``.
    """
    if not p < alpha:
        return "No significant spatial clustering"

    strength = "Strong" if abs(i) > strong_threshold else "Moderate"

    # A significant *negative* Moran's I means neighbouring values are
    # dissimilar, i.e. dispersion -- it must not be reported as clustering.
    pattern = "dispersion" if i < 0 else "clustering"

    return f"{strength} spatial {pattern}"

# -----------------------------------------------------------
# Compute Moran’s I
# -----------------------------------------------------------
# Moran's pseudo p-value (p_sim) comes from random permutations. Seeding
# NumPy's global RNG makes the table repeatable between runs.
# NOTE(review): assumes the installed esda draws its permutations from
# NumPy's global RNG -- confirm for the version in use.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

for col in pollutants:

    series = gdf[col]

    # --- CASE 1: Column all missing ---
    if series.isna().all():
        results.append([col, np.nan, np.nan, np.nan, "No valid data"])
        continue

    # Fill missing values with the column mean so every sensor in the
    # weights matrix has a value. Imputed sensors sit exactly at the mean.
    series = series.fillna(series.mean())

    # --- CASE 2: No variance (constant column) ---
    # The statistic is skipped when all values are identical.
    if series.std() == 0:
        results.append([col, np.nan, np.nan, np.nan, "No variance in data"])
        continue

    # Moran calculation on the row-standardized KNN weights
    mi = Moran(series, w)

    interpretation = interpret_moran(mi.I, mi.p_sim)

    results.append([
        col,
        round(mi.I, 4),
        round(mi.p_sim, 5),
        round(mi.z_sim, 4),
        interpretation
    ])

# -----------------------------------------------------------
# Final Results Table
# -----------------------------------------------------------
results_df = pd.DataFrame(
    results,
    columns=["Pollutant", "Moran_I", "p-value", "z-score", "Interpretation"]
)

print(results_df)


    Pollutant  Moran_I  p-value  z-score                     Interpretation
0      AQI_IN   0.0326    0.155   1.0037  No significant spatial clustering
1      CO_PPB   0.0835    0.095   1.4084  No significant spatial clustering
2     NO2_PPB   0.3685    0.002   4.4945          Strong spatial clustering
3      O3_PPB   0.3433    0.001   4.1159          Strong spatial clustering
4   PM10_UGM3   0.0233    0.203   0.7663  No significant spatial clustering
5  PM2_5_UGM3   0.0335    0.197   0.7994  No significant spatial clustering
6     SO2_PPB   0.1573    0.033   2.3452        Moderate spatial clustering


In [18]:
import pandas as pd
import geopandas as gpd
from libpysal.weights import KNN
from esda import Moran
import numpy as np
from collections import Counter

# -----------------------------------------------------------
# Load Data
# -----------------------------------------------------------
file_path = r"C:\Users\Narula\india_selected_cities_cleaned.csv"

# Pollutant columns analysed below
pollutants = [
    'AQI_IN',
    'CO_PPB',
    'NO2_PPB',
    'O3_PPB',
    'PM10_UGM3',
    'PM2_5_UGM3',
    'SO2_PPB',
]

df = pd.read_csv(file_path)

# Parse the timestamps and coerce every pollutant column to numeric
# (values that cannot be parsed as numbers become NaN).
df = df.assign(
    last_updated=pd.to_datetime(df['last_updated']),
    **{name: pd.to_numeric(df[name], errors='coerce') for name in pollutants},
)

# -----------------------------------------------------------
# Keep only sensors north of latitude 30.60 (intended to drop the two
# southern-most points on the map). Rows with a missing lat are dropped too,
# because NaN > 30.60 is False.
# NOTE(review): the Manauli sensor in the sample rows has lat 30.6324, which
# this filter keeps -- confirm the threshold matches the intent.
# -----------------------------------------------------------
df = df.loc[df['lat'] > 30.60]


# -----------------------------------------------------------
# Classification Function
# -----------------------------------------------------------
def interpret_moran(moran_I, p):
    """Classify one Moran's I result as "Strong", "Moderate" or "None".

    A result counts as significant when ``p < 0.05``. Significant results are
    "Strong" when ``abs(moran_I) > 0.3`` and "Moderate" otherwise; everything
    else is "None". Only the magnitude of ``moran_I`` is inspected, so a
    significant negative value gets the same label as a positive one.
    """
    significant = p < 0.05
    if not significant:
        return "None"
    return "Strong" if abs(moran_I) > 0.3 else "Moderate"


# Counters for each pollutant: label ("Strong" / "Moderate" / "None") ->
# number of timestamps that received that label
final_counts = {col: Counter() for col in pollutants}

# Number of nearest neighbours used for the spatial weights.
K_NEIGHBORS = 6

# Moran's pseudo p-value (p_sim) comes from random permutations. Seeding
# NumPy's global RNG makes the counts repeatable between runs.
# NOTE(review): assumes the installed esda draws its permutations from
# NumPy's global RNG -- confirm for the version in use.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)


# -----------------------------------------------------------
# Loop over each timestamp
# -----------------------------------------------------------
for ts, sub in df.groupby('last_updated'):

    # Every sensor needs K_NEIGHBORS *other* sensors as neighbours, so a
    # snapshot must hold at least K_NEIGHBORS + 1 rows.
    if len(sub) <= K_NEIGHBORS:
        continue

    # Create GeoDataFrame for the sensors reporting at this timestamp
    gdf = gpd.GeoDataFrame(
        sub,
        geometry=gpd.points_from_xy(sub['lon'], sub['lat']),
        crs='EPSG:4326'
    )

    # Try creating row-standardized spatial weights. Best effort: a snapshot
    # whose weights cannot be built is skipped. `except Exception` (not a
    # bare except) keeps Ctrl-C / kernel interrupts working.
    try:
        w = KNN.from_dataframe(gdf, k=K_NEIGHBORS)
        w.transform = 'R'
    except Exception:
        continue

    # Compute Moran's I for every pollutant
    for col in pollutants:

        series = gdf[col]

        # If column has no valid values at this timestamp
        if series.isna().all():
            final_counts[col]["None"] += 1
            continue

        # Fill missing values with this snapshot's mean
        series = series.fillna(series.mean())

        # If zero variance (all values identical), count the snapshot as
        # "None" without computing the statistic
        if series.std() == 0:
            final_counts[col]["None"] += 1
            continue

        # Compute Moran's I; a failed computation is counted as "None"
        try:
            mi = Moran(series, w)
            interpretation = interpret_moran(mi.I, mi.p_sim)
        except Exception:
            interpretation = "None"

        # Store interpretation
        final_counts[col][interpretation] += 1


# -----------------------------------------------------------
# Final Summary Table
# -----------------------------------------------------------
# One row per pollutant: how many timestamps fell into each class.
# A label that never occurred reads as 0 from the Counter.
summary_rows = []
for pollutant in pollutants:
    counts = final_counts[pollutant]
    summary_rows.append(
        [pollutant, counts["Strong"], counts["Moderate"], counts["None"]]
    )

summary_df = pd.DataFrame(
    summary_rows,
    columns=["Pollutant", "Strong_Count", "Moderate_Count", "None_Count"],
)

print("\n=== Final Moran's I Spatial Pattern Summary ===\n")
print(summary_df)


 There are 2 disconnected components.
  W.__init__(self, neighbors, id_order=ids, **kwargs)
 There are 2 disconnected components.
  W.__init__(self, neighbors, id_order=ids, **kwargs)
 There are 2 disconnected components.
  W.__init__(self, neighbors, id_order=ids, **kwargs)
 There are 2 disconnected components.
  W.__init__(self, neighbors, id_order=ids, **kwargs)
 There are 2 disconnected components.
  W.__init__(self, neighbors, id_order=ids, **kwargs)
 There are 2 disconnected components.
  W.__init__(self, neighbors, id_order=ids, **kwargs)
 There are 2 disconnected components.
  W.__init__(self, neighbors, id_order=ids, **kwargs)



=== Final Moran's I Spatial Pattern Summary ===

    Pollutant  Strong_Count  Moderate_Count  None_Count
0      AQI_IN           140             475         979
1      CO_PPB           289             546         759
2     NO2_PPB           848             537         209
3      O3_PPB           645             459         490
4   PM10_UGM3           124             396        1074
5  PM2_5_UGM3           143             471         980
6     SO2_PPB           289             322         983


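- **Strong_Norm** → share of timestamps with strong spatial clustering (p < 0.05 and |Moran's I| > 0.3)
- **Moderate_Norm** → share of timestamps with moderate spatial clustering (p < 0.05 and |Moran's I| ≤ 0.3)
- **None_Norm** → share of timestamps with no meaningful spatial clustering (p ≥ 0.05, or Moran's I could not be computed)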

*(spatial clustering - whether nearby monitoring stations have similar pollutant levels)*


In [19]:
# Number of classified timestamps per pollutant
summary_df["Total"] = (
    summary_df["Strong_Count"]
    .add(summary_df["Moderate_Count"])
    .add(summary_df["None_Count"])
)

# Express each class count as a share of that pollutant's total
for label in ("Strong", "Moderate", "None"):
    summary_df[f"{label}_Norm"] = summary_df[f"{label}_Count"] / summary_df["Total"]

# Final display: normalized columns only, rounded to 4 decimals
normalized_view = summary_df[
    ["Pollutant", "Strong_Norm", "Moderate_Norm", "None_Norm"]
].round(4)

print("\n=== Normalized Moran's I Spatial Pattern Scores ===\n")
print(normalized_view)


=== Normalized Moran's I Spatial Pattern Scores ===

    Pollutant  Strong_Norm  Moderate_Norm  None_Norm
0      AQI_IN       0.0878         0.2980     0.6142
1      CO_PPB       0.1813         0.3425     0.4762
2     NO2_PPB       0.5320         0.3369     0.1311
3      O3_PPB       0.4046         0.2880     0.3074
4   PM10_UGM3       0.0778         0.2484     0.6738
5  PM2_5_UGM3       0.0897         0.2955     0.6148
6     SO2_PPB       0.1813         0.2020     0.6167
