# Population Ratio per Cluster

In [1]:
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [2]:
with open('cluster_model.pkl', 'rb') as model_in:
    km = pickle.load(model_in)

In [3]:
km

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=27, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=510, tol=0.0001, verbose=0)

In [4]:
df_adverts = pd.read_csv('adverts_geocoded.csv')
df_adverts.head(3)

Unnamed: 0.1,Unnamed: 0,id,category,list_type,beds,baths,sq_ft_price,price_KES,district,neighborhood,street,latitude,longitude,clean_grab
0,0,3394167,Flat & Apartment,For Rent,2,0,0,120000,Westlands,Brookside,0,-1.256163,36.79194,OK
1,1,3399035,Flat & Apartment,For Rent,2,0,77,96000,Ridgeways,0,0,-1.230687,36.849247,OK
2,2,3433894,Flat & Apartment,For Rent,2,0,46,70000,Dagoretti North,Valley Arcade,Lavington,-1.290723,36.769157,OK


In [5]:
advert_locs = df_adverts.loc[:,['latitude', 'longitude']]
advert_locs.head()

Unnamed: 0,latitude,longitude
0,-1.256163,36.79194
1,-1.230687,36.849247
2,-1.290723,36.769157
3,-1.258453,36.804754
4,-1.26327,36.815948


In [6]:
advert_locs.shape

(4000, 2)

In [7]:
advert_locs.dtypes

latitude     float64
longitude    float64
dtype: object

In [8]:
advert_locs.describe()

Unnamed: 0,latitude,longitude
count,4000.0,4000.0
mean,-1.264698,36.693415
std,0.604558,2.909096
min,-1.430525,-119.417932
25%,-1.29427,36.781686
50%,-1.279206,36.79937
75%,-1.265717,36.803513
max,36.778261,36.963968


In [9]:
extreme = df_adverts['latitude'] >= 0
df_adverts.loc[extreme, :]

Unnamed: 0.1,Unnamed: 0,id,category,list_type,beds,baths,sq_ft_price,price_KES,district,neighborhood,street,latitude,longitude,clean_grab
54,54,3433417,Studio,For Rent,1,1,0,100000,Dagoretti North,Kilimani,Tigoni,0.0,0.0,ZERO_RESULTS
156,156,3430967,Flat & Apartment,For Rent,4,0,54,135000,Dagoretti North,Kilimani,Nairobi,0.0,0.0,ZERO_RESULTS
258,258,3426717,Flat & Apartment,For Rent,3,4,0,90000,Dagoretti North,Kilimani,Nairobi,0.0,0.0,ZERO_RESULTS
1838,1838,3421610,Studio,For Rent,1,0,0,40000,Dagoretti North,Kilimani,Ring Road,0.0,0.0,ZERO_RESULTS
1881,1881,3433862,Flat & Apartment,For Rent,4,2,0,75000,Dagoretti North,Kilimani,Walk To Yaya,0.0,0.0,ZERO_RESULTS
1947,1947,3429200,Flat & Apartment,For Rent,3,3,0,75000,Dagoretti North,Valley Arcade,Walk To Valley Arcade Shopping,0.0,0.0,ZERO_RESULTS
3190,3190,3432233,Flat & Apartment,For Sale,2,0,0,26000000,Kamukunji Constituency,Eastleigh,California,36.778261,-119.417932,OK
3304,3304,3429197,Flat & Apartment,For Sale,4,4,8029,21000000,Dagoretti North,Lavington,Walk To Valley Arcade,0.0,0.0,ZERO_RESULTS


In [10]:
# drop outliers
advert_locs.drop(index = (df_adverts.loc[extreme, :].index), inplace = True)

advert_locs.describe()

Unnamed: 0,latitude,longitude
count,3992.0,3992.0
mean,-1.276445,36.796863
std,0.025429,0.033817
min,-1.430525,36.603554
25%,-1.29427,36.781686
50%,-1.279513,36.79937
75%,-1.265717,36.803513
max,-1.130738,36.963968


In [11]:
ss = StandardScaler()
advert_locs = ss.fit_transform(advert_locs)

In [12]:
advert_locs

array([[ 0.79767287, -0.14558427],
       [ 1.79963681,  1.54923592],
       [-0.56155044, -0.81938729],
       ...,
       [-0.43013412, -0.53743826],
       [-2.30262994, -3.67439659],
       [-0.70104509,  0.07414285]])

In [13]:
advert_locs = pd.DataFrame(advert_locs, columns = ['latitude', 'longitude'])

In [14]:
# assign cluster labels to advert data
advert_locs['cluster'] = km.predict(advert_locs)

In [15]:
len(advert_locs['cluster'].unique())

27

In [16]:
adverts = []
for i in range(26):
    adverts.append(advert_locs.groupby('cluster').count().loc[i][0])

In [17]:
pop = pd.read_csv('./datasets/Nairobi_Area_Population_Clusters.csv')
pop.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Lat,Lon,Population,clusters
0,0,9429455,-1.050139,36.698194,4.18689,22
1,1,9429456,-1.050139,36.699306,4.18689,22
2,2,9429457,-1.050139,36.699583,4.18689,22
3,3,9429458,-1.050139,36.70125,4.18689,22
4,4,9429459,-1.050139,36.702639,4.18689,22


In [18]:
advert_pop_ratio = []
for i in range(26):
    advert_pop_ratio.append(adverts[i] / pop.groupby('clusters')['Population'].mean()[i])

In [19]:
advert_pop_ratio

[0.4138615600228044,
 0.38876832172369763,
 15.484692780414552,
 0.1869707039525563,
 7.013443544767667,
 8.677806028216148,
 13.905433374146378,
 3.8918651695858117,
 7.285022756596774,
 19.072198902002317,
 1.8933190504003379,
 17.667609339873216,
 7.5316742779287145,
 0.5520075987393818,
 4.454998177595786,
 11.486704766442559,
 28.080449520180938,
 20.143562296702154,
 2.6626519200886385,
 0.3301990744820585,
 2.6913768535135545,
 6.514870455178825,
 2.8970256699956436,
 10.304434522855544,
 38.010932151911206,
 4.089884204236117]

In [20]:
advert_pop_ratio = [0] + advert_pop_ratio

In [22]:
pop['advert ratio'] = pop['clusters'].map(lambda x: advert_pop_ratio[x])

In [25]:
pop

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Lat,Lon,Population,clusters,advert ratio
0,0,9429455,-1.050139,36.698194,4.18689,22,6.514870
1,1,9429456,-1.050139,36.699306,4.18689,22,6.514870
2,2,9429457,-1.050139,36.699583,4.18689,22,6.514870
3,3,9429458,-1.050139,36.701250,4.18689,22,6.514870
4,4,9429459,-1.050139,36.702639,4.18689,22,6.514870
...,...,...,...,...,...,...,...
238612,322062,10287703,-1.499861,36.968750,4.25467,4,0.186971
238613,322063,10287704,-1.499861,36.969028,4.25467,4,0.186971
238614,322064,10287705,-1.499861,36.969306,4.25467,4,0.186971
238615,322065,10287706,-1.499861,36.969583,4.25467,4,0.186971


In [30]:
pop.groupby('clusters')['advert ratio'].mean().sort_values(ascending=False).to_frame()

Unnamed: 0_level_0,advert ratio
clusters,Unnamed: 1_level_1
25,38.010932
17,28.08045
18,20.143562
10,19.072199
12,17.667609
3,15.484693
7,13.905433
16,11.486705
24,10.304435
6,8.677806


In [231]:
pop.to_csv('./datasets/Nairobi_Advert_Ratios.csv')