In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point, box
import contextily as ctx

In [2]:
# Load the survey responses and the cluster table from disk.
dfsurvey = pd.read_excel('survey.xlsx')
dfclusters = pd.read_csv('clusters.csv')

# Coordinate reference systems used in this notebook:
#   - the raw coordinates are declared as WGS 84 longitude/latitude (EPSG:4326)
#   - everything is then reprojected to UTM zone 43N (EPSG:32643, covers Delhi)
#     so that distances and grid sizes can be expressed in metres.
WGS84_CRS = "EPSG:4326"
UTM_43N_EPSG = 32643

# Vectorised point construction (x = longitude, y = latitude) instead of
# building shapely Points one row at a time.
surveycords = gpd.points_from_xy(
    dfsurvey["GPS-Longitude"].astype(float),
    dfsurvey["GPS-Latitude"].astype(float),
)
# NOTE(review): X/Y are treated as longitude/latitude here - confirm against
# the producer of clusters.csv.
clusterscords = gpd.points_from_xy(dfclusters["X"], dfclusters["Y"])

gdf_clusters = gpd.GeoDataFrame(dfclusters, geometry=clusterscords, crs=WGS84_CRS)
gdf_survey = gpd.GeoDataFrame(dfsurvey, geometry=surveycords, crs=WGS84_CRS)

# Reproject both layers to the same metric CRS.
gdf_clusters = gdf_clusters.to_crs(epsg=UTM_43N_EPSG)
gdf_survey = gdf_survey.to_crs(epsg=UTM_43N_EPSG)


def create_grid(point, size=100):
    """Build an axis-aligned square centred on ``point``.

    Args:
        point: Object exposing ``x`` and ``y`` coordinates (the centre).
        size: Side length of the square, in the same units as the
            coordinates of ``point``. Defaults to 100.

    Returns:
        The polygon produced by ``box`` spanning ``size / 2`` on every side
        of the centre.
    """
    offset = size / 2.0
    min_x, min_y = point.x - offset, point.y - offset
    max_x, max_y = point.x + offset, point.y + offset
    return box(min_x, min_y, max_x, max_y)

# Attach a square footprint to every cluster point by calling create_grid with
# its default size (100, in CRS units - metres after the reprojection to
# EPSG:32643). The point geometry stays the active geometry column.
gdf_clusters['grid'] = gdf_clusters.geometry.apply(create_grid)



In [3]:
# Re-key the cluster table on its square footprint so the spatial join tests
# survey points against the grid cells rather than the cluster centre points.
gdf_grids = gdf_clusters.drop(columns='geometry').copy()
gdf_grids['geometry'] = gdf_clusters['grid']

# Inner join: keeps only the survey points that fall inside a grid cell and
# appends that cell's cluster attributes to each of them.
# NOTE(review): a survey point lying inside several overlapping cells shows up
# once per cell here, which inflates downstream counts - confirm that cells
# never overlap or de-duplicate explicitly.
combined_data = gpd.sjoin(gdf_survey, gdf_grids, predicate='within')

# Survey points that matched no cell. Take an explicit copy: the columns added
# below would otherwise be written to a slice of gdf_survey and trigger
# pandas' SettingWithCopyWarning.
is_matched = gdf_survey.index.isin(combined_data.index)
non_overlapping_points = gdf_survey[~is_matched].copy()

# Columns contributed by the cluster side of the join (everything the plain
# survey frame does not already have).
cluster_columns = [col for col in combined_data.columns
                   if col not in non_overlapping_points.columns and col != 'geometry']

# Fill those columns with a sentinel so both frames share one schema.
for col in cluster_columns:
    non_overlapping_points[col] = 'No Cluster'

all_points = pd.concat([combined_data, non_overlapping_points], ignore_index=True)


In [4]:
# Overview map: every cluster and every survey point on an OpenStreetMap basemap.
fig, ax = plt.subplots(figsize=(20, 20))

# Draw the two layers; the labels feed the legend below.
gdf_clusters.plot(
    ax=ax,
    color='orange',
    edgecolor='red',
    alpha=0.5,
    label='Clusters',
)
gdf_survey.plot(
    ax=ax,
    color='black',
    markersize=10,
    label='Survey Points',
)

# Basemap tiles are warped to the CRS of the plotted layers.
ctx.add_basemap(
    ax,
    crs=gdf_clusters.crs,
    source=ctx.providers.OpenStreetMap.Mapnik,
)

ax.set(
    xlabel='Easting (meters)',
    ylabel='Northing (meters)',
    title='Survey Data and Clusters',
)
ax.legend()
ax.grid(True)
fig.tight_layout()

plt.show()
fig.savefig("all-overlap.jpeg", format='jpeg')

In [5]:
# Fill colour per cluster group id; also reused by the pie chart further down.
colors = {
    1: 'purple',
    2: 'blue',
    3: 'lightgreen',
    4: 'yellow',
    5: 'orange',
    6: 'red',
}

fig, ax = plt.subplots(figsize=(20, 20))

# One layer per cluster group so each gets its own colour and legend entry.
for cluster_group in colors:
    color = colors[cluster_group]
    in_group = gdf_clusters['Cluster_Group'] == cluster_group
    gdf_clusters.loc[in_group].plot(
        ax=ax,
        color=color,
        alpha=0.5,
        label=f'T{cluster_group}',
    )

# Survey points go on top of the coloured clusters.
gdf_survey.plot(ax=ax, color='black', markersize=5, label='Survey Points')

ctx.add_basemap(
    ax,
    crs=gdf_clusters.crs,
    source=ctx.providers.OpenStreetMap.Mapnik,
)

ax.set(
    xlabel='Easting (meters)',
    ylabel='Northing (meters)',
    title='Survey Data Over Cluster Type',
)
ax.legend()
ax.grid(True)
fig.tight_layout()

plt.show()
fig.savefig("labeled-overlap.jpeg", format='jpeg')

In [8]:
# Tally survey points per cluster group. Counts are taken directly from the
# data and the percentages derived from them, so the two always agree (the
# reverse derivation breaks when Cluster_Group contains missing values, which
# value_counts drops).
group_counts = all_points['Cluster_Group'].value_counts()
cluster_type = pd.DataFrame({
    'Cluster_Group': group_counts.index,
    'Percentage': group_counts.to_numpy() / group_counts.sum() * 100,
    'Count': group_counts.to_numpy(),
})

# Group ids become numeric; the 'No Cluster' sentinel is coerced to NaN.
cluster_type['Cluster_Group'] = pd.to_numeric(
    cluster_type['Cluster_Group'], downcast='integer', errors='coerce'
)

NO_CLUSTER_COLOR = "#B0B0B0"
labels = {1: "Cluster T1", 2: "Cluster T2", 3: "Cluster T3", 4: "Cluster T4", 5: "Cluster T5", 6: "Cluster T6", "No Cluster": "No Cluster"}

# NaN (i.e. the coerced sentinel) is shown as "No Cluster"; a numeric id that
# is missing from `labels` still gets a truthful "Cluster T<n>" caption rather
# than being mislabelled as "No Cluster".
label_map = [
    labels.get(grp, f"Cluster T{int(grp)}") if pd.notna(grp) else "No Cluster"
    for grp in cluster_type['Cluster_Group']
]

# `colors` is the palette defined with the cluster-type map. NaN and any id
# missing from the palette fall back to grey instead of raising KeyError.
color_map = [colors.get(grp, NO_CLUSTER_COLOR) for grp in cluster_type['Cluster_Group']]


def make_autopct(counts):
    """Create an ``autopct`` callback that prints "count (pct%)" on pie wedges.

    Args:
        counts: Iterable of absolute counts, one per wedge. It is summed once,
            when the callback is created, so one-shot iterables work and the
            total is not recomputed for every wedge.

    Returns:
        A function taking a wedge percentage (0-100) and returning a string
        such as ``'3 (75.0%)'``, where the count is recovered by applying the
        percentage to the total and rounding to the nearest integer.
    """
    total = sum(counts)

    def my_autopct(pct):
        # int() keeps the ':d' format valid even if round() returns a
        # non-int numeric type.
        count = int(round(pct * total / 100.0))
        return f'{count:d} ({pct:.1f}%)'

    return my_autopct

# Pie chart of how the surveyed points are distributed across cluster groups.
fig, pie_ax = plt.subplots(figsize=(10, 10))
wedges, texts, autotexts = pie_ax.pie(
    cluster_type['Percentage'],
    labels=label_map,
    colors=color_map,
    autopct=make_autopct(cluster_type['Count']),
)

# White text for the "count (pct%)" captions drawn inside the wedges.
plt.setp(autotexts, color='white')

pie_ax.set_title('Percentage Distribution of Surveyed Cluster Groups', pad=40)
pie_ax.axis('equal')
fig.tight_layout()
plt.show()
fig.savefig("cluster-distribution.jpeg", format='jpeg')

In [None]:
# Category captions for the coded survey answers (code 1 maps to the first entry).
income_labels = ["Not earning", "1 - 25,000", "25,001 - 40,000",
                 "40,001 - 60,000", "60,001 - 75,000", "75,001 - 100,000", "100,001+"]

education_labels = ["No schooling", "5th Class", "8th Class", "12th Class", "Graduate", "Post Graduate"]

MAX_HOUSEHOLD_SIZE = 10

# Keep only rows whose coded answers fall inside the labelled ranges (income
# <= 7, education <= 6) and whose household size is at most 10.
# NOTE(review): presumably larger codes are "other/refused" sentinels or
# outliers - confirm against the questionnaire.
filtered_df = all_points[
    (all_points['income'] <= len(income_labels))
    & (all_points['hh_count'] <= MAX_HOUSEHOLD_SIZE)
    & (all_points['education'] <= len(education_labels))
]

# Tick captions "<group> (<n rows>)". The sizes come from the same groupby that
# DataFrame.boxplot(by=...) uses, so the caption order always matches the box
# order (a string-sorted value_counts can disagree, e.g. '10' < '2' or NaN
# groups).
cluster_sizes = filtered_df.groupby('Cluster_Group').size()
cluster_labels = [f'{group} ({size})' for group, size in cluster_sizes.items()]

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(25, 10))

# (column, panel title, y-axis label, category captions or None for numeric axis)
panels = [
    ('income', 'Income', 'Income Categories', income_labels),
    ('hh_count', 'Household Size', 'Household Size', None),
    ('education', 'Education', 'Education Completed', education_labels),
]

for panel_ax, (column, title, ylabel, category_labels) in zip(axes, panels):
    if category_labels is not None:
        # Fix one tick per category code before plotting.
        panel_ax.set_yticks(range(1, len(category_labels) + 1))

    filtered_df.boxplot(column=column, by='Cluster_Group', ax=panel_ax, grid=False)

    panel_ax.set_title(title)
    if category_labels is not None:
        panel_ax.set_yticklabels(category_labels)
    panel_ax.set_xlabel('Cluster Group')
    panel_ax.set_ylabel(ylabel)
    panel_ax.set_xticklabels(cluster_labels)

# Replaces the "Boxplot grouped by ..." suptitle that pandas adds automatically.
fig.suptitle('Cluster Groups Variations', fontsize=15, y=1.02)
fig.tight_layout(rect=[0, 0, 1, 0.95])

fig.savefig("cluster-group-by-data.jpeg", format='jpeg', bbox_inches='tight', dpi=300)

