In [1]:
import pandas as pd
import folium
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Load the clustered city dataset produced earlier in the project and list
# its columns so the leftover index column ('Unnamed: 0') is visible.
CSV_PATH = "ReallyFinalDFwithClustering.csv"
df = pd.read_csv(CSV_PATH)
df.columns

Index(['Unnamed: 0', 'City', 'Latitude', 'Longitude', 'Population',
       'GrowthRate', 'SexRatio', 'LiteracyRate', 'Total Schools', 'Teachers',
       'Venue Counts', 'ClusterLabels'],
      dtype='object')

In [3]:
# Keep the city name, its coordinates, the clustering features and the
# cluster label; this drops the leftover 'Unnamed: 0' column from the CSV.
selected_columns = [
    'City', 'Latitude', 'Longitude',
    'Population', 'GrowthRate', 'SexRatio', 'LiteracyRate',
    'Total Schools', 'Teachers', 'Venue Counts',
    'ClusterLabels',
]
df = df[selected_columns]
df

Unnamed: 0,City,Latitude,Longitude,Population,GrowthRate,SexRatio,LiteracyRate,Total Schools,Teachers,Venue Counts,ClusterLabels
0,AHMEDABAD,23.021624,72.579707,7045313,22.31,903.0,86.65,2912,28401,100,1
1,SURAT,21.10053,72.86654,6079231,42.19,788.0,86.65,2447,25624,100,1
2,PALANPUR,21.20874,72.77632,3116045,24.43,936.0,66.39,2730,19361,92,3
3,VADODARA,22.32228,73.17644,3093795,14.16,934.0,81.21,1724,15259,100,3
4,RAJKOT,22.304587,70.802098,3015229,19.87,924.0,82.2,1990,16583,42,3
5,BHAVNAGAR,23.02643,72.64431,2393272,16.53,931.0,76.84,1329,12448,50,3
6,BHUJ,23.247245,69.668339,2090313,32.03,907.0,71.58,2121,13397,13,3
7,ANAND,22.5632,72.96028,2090276,12.57,921.0,85.79,1382,10633,19,2
8,NADIAD,22.69801,72.85675,2053769,12.81,937.0,84.31,1633,10557,5,2
9,MEHSANA,23.58272,72.14589,2027727,9.91,925.0,84.26,1291,10893,4,2


# Data Used for the Project 
<ul>
<li>List of Cities (District Headquarters) from the District Table - https://en.wikipedia.org/wiki/List_of_districts_of_Gujarat
<li>Kaggle Dataset - 2015-16-District-wise Education Data India
<li>Geo Location Data - Geocoder Library/API
<li>Location Data - Total Venues per City - Foursquare API
</ul>

# Features used for Clustering 
<ul>
<li>Population
<li>Population Growth Rate
<li>Sex Ratio of Female:Male (per 1000)
<li>Literacy Rate
<li>Total Schools in the District
<li>Teachers Count in the District
<li>Foursquare Venue Counts for the city
</ul>

# Clustered Cities 
<ul>
<li>Cluster 0 - GODHRA, SURENDRANAGAR, JUNAGADH, AMRELI, JAMNAGAR, HIMMATNAGAR, PATAN
<li>Cluster 1 - AHMEDABAD, SURAT
<li>Cluster 2 - ANAND, NADIAD, MEHSANA, VALSAD, BHARUCH, GANDHINAGAR
<li>Cluster 3 - PALANPUR, VADODARA, RAJKOT, BHAVNAGAR, BHUJ
</ul>

In [6]:
# Per-cluster mean of every numeric column.
# numeric_only=True keeps the string 'City' column out of the aggregation:
# pandas >= 2.0 raises a TypeError instead of silently dropping it.
df.groupby('ClusterLabels').mean(numeric_only=True)

Unnamed: 0_level_0,Latitude,Longitude,Population,GrowthRate,SexRatio,LiteracyRate,Total Schools,Teachers,Venue Counts
ClusterLabels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,22.733366,72.079362,1486544.0,13.982857,944.714286,74.478571,1226.285714,8595.142857,6.285714
1,22.061077,72.723123,6562272.0,32.25,845.5,86.65,2679.5,27012.5,100.0
2,22.392753,72.751683,1802190.0,13.553333,925.5,84.018333,1302.166667,9972.833333,11.166667
3,22.421856,71.813501,2741731.0,21.404,926.4,75.644,1978.8,15409.6,59.4


# Cluster Analysis Based on Feature Mean Values
<ul>
<li>(Tier 0 Cities) - Cluster Label 0 - Low Population, High Sex Ratio, Low Growth Rate, Low Literacy Rate, Low Schools and Teachers, Low Venue Counts
<li>(Tier 1 Cities) - Cluster Label 2 - Low Population, High Sex Ratio, Low Growth Rate, High Literacy Rate, Low Schools and Teachers, Low Venue Counts
<li>(Tier 2 Cities) - Cluster Label 3 - Medium Population, High Sex Ratio, Medium Growth Rate, Low Literacy Rate, Medium Schools and Teachers, Medium Venue Counts 
<li>(Tier 3 Cities) - Cluster Label 1 - High Population, Low Sex Ratio, High Growth Rate, High Literacy Rate, High Schools and Teachers, High Venue Counts
</ul>


In [7]:
# Pairwise Pearson correlation of the numeric columns.
# The string 'City' column is excluded explicitly: pandas >= 2.0 no longer
# drops non-numeric columns in DataFrame.corr() and raises a ValueError instead.
# select_dtypes is used (rather than corr(numeric_only=True)) so the cell also
# runs on older pandas versions that lack that keyword.
df.select_dtypes(include='number').corr()

Unnamed: 0,Latitude,Longitude,Population,GrowthRate,SexRatio,LiteracyRate,Total Schools,Teachers,Venue Counts,ClusterLabels
Latitude,1.0,-0.092551,-0.198267,-0.348154,0.277101,-0.033665,-0.236904,-0.227164,-0.290056,-0.14298
Longitude,-0.092551,1.0,0.124255,-0.10809,-0.020639,0.227125,0.039402,0.089374,0.188953,-0.01534
Population,-0.198267,0.124255,1.0,0.634083,-0.689175,0.408182,0.827382,0.973093,0.835688,0.171648
GrowthRate,-0.348154,-0.10809,0.634083,1.0,-0.807444,-0.012587,0.718314,0.69749,0.541941,0.211226
SexRatio,0.277101,-0.020639,-0.689175,-0.807444,1.0,-0.446017,-0.484647,-0.663015,-0.526805,-0.140528
LiteracyRate,-0.033665,0.227125,0.408182,-0.012587,-0.446017,1.0,0.052356,0.302191,0.187597,0.210813
Total Schools,-0.236904,0.039402,0.827382,0.718314,-0.484647,0.052356,1.0,0.914677,0.767134,0.353084
Teachers,-0.227164,0.089374,0.973093,0.69749,-0.663015,0.302191,0.914677,1.0,0.877748,0.297138
Venue Counts,-0.290056,0.188953,0.835688,0.541941,-0.526805,0.187597,0.767134,0.877748,1.0,0.396876
ClusterLabels,-0.14298,-0.01534,0.171648,0.211226,-0.140528,0.210813,0.353084,0.297138,0.396876,1.0


# Some Observations based on Correlation Metrics

<ol>
    <li> No feature is strongly correlated with the Cluster Label (every |r| is below 0.4)
    <li> Higher Population is strongly positively correlated with the number of schools, teachers and venue counts (r &gt; 0.8 for each)
    <li> There is a negative correlation (r &asymp; -0.69) between Population and SexRatio (Female:Male, per 1000), i.e. more populous cities tend to have fewer females per 1000 males
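    <li> Venue Counts has the largest correlation value with the Cluster Label (r &asymp; 0.40); since cluster labels are arbitrary identifiers rather than an ordered scale, this is only indicative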
</ol>

In [13]:
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

# Initial map centre and zoom (presumably chosen to frame Gujarat -- confirm).
latitude = 22.243571
longitude = 71.221622

# Cluster labels are 0-based integers, so (largest label + 1) colours are
# needed for every label to be a valid index into the colour list.
nclusters = int(df['ClusterLabels'].max()) + 1

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=7)

# One evenly spaced colour from the 'seismic' colormap per cluster label,
# converted to hex strings because folium expects CSS colours.
colors_array = cm.seismic(np.linspace(0, 1, nclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one circle marker per city, coloured by its cluster label.
for lat, lon, poi, cluster in zip(df['Latitude'], df['Longitude'], df['City'], df['ClusterLabels']):
    # Index directly with the 0-based label. The previous `cluster-1` only
    # worked for label 0 through negative-index wrap-around and gave every
    # cluster its neighbour's colour.
    marker_color = rainbow[int(cluster)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters