In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings

from sklearn.preprocessing import MinMaxScaler

# #############################################################################

# NOTE(review): this silences every warning for the whole notebook, including pandas' chained-assignment
# and library deprecation warnings raised by the cells below — consider narrowing the filter
warnings.filterwarnings('ignore')
# apply seaborn's default styling to the matplotlib figures drawn later on
sns.set()

# #############################################################################

def full_display(self):
    """Show ``self`` through the notebook's ``display`` with pandas' row and column limits lifted.

    The two display options are only changed inside the context manager, so the
    session-wide pandas settings are back to their previous values on return.
    """
    no_limits=('display.max_rows',None,'display.max_columns',None)
    with pd.option_context(*no_limits):
        display(self)

In [None]:
# please read my study's report to learn more about what I'm doing here; without it, this notebook will hardly make any
# sense. you can retrieve it from https://bit.ly/2m6vRE8

In [None]:
# we'll start by loading the rsf dataset

# unfortunately, I had to get rid of the arabic and farsi characters first, otherwise pandas won't parse the file
# I'm also specifying comma as the decimal separator and reading the 'Score 2019' column as float
# (the builtin float is used here because the np.float alias no longer exists in current numpy releases)

url='https://raw.githubusercontent.com/ltripo/democracy_and_prosperity/master/datasets/A.1.I_rsf_raw_dataset.csv'

df_rsf=pd.read_csv(url,decimal=',',dtype={'Score 2019':float})

full_display(df_rsf)

In [None]:
# all I am interested in are country names and the latest (ie. 2019) score for each of them, so every other
# column goes, except 'Zone': on an afterthought, it will help handle missing values in what comes next

# selection and renaming are done in one go

df_rsf=(
    df_rsf[['EN_country','Score 2019','Zone']]
    .rename(columns={'EN_country':'country_name','Score 2019':'rsf_index','Zone':'region'})
)

# capitalised country names are a bit clumsy, so every text (object) column, ie. country and zone names,
# is converted to lower case

for col in df_rsf.select_dtypes(include='object').columns:
    df_rsf[col]=df_rsf[col].str.lower()

# I want rsf_index to go down, instead of up, from more freedom of speech to less, hence the reciprocal

df_rsf['rsf_index']=df_rsf['rsf_index']**(-1)

full_display(df_rsf)

In [None]:
# rsf_index is now rescaled to an arbitrary range (my personal pick is 5-10) with a minmaxscaler
# the same scaler object is refitted on other columns further down the notebook

mms=MinMaxScaler(feature_range=(5,10))

# fit_transform returns a single-column 2-d array, which is flattened back into the column
df_rsf['rsf_index']=mms.fit_transform(df_rsf[['rsf_index']]).ravel()

full_display(df_rsf)

In [None]:
# last touch: 'russian federation' becomes 'russia', so that we don't run into inconsistencies later on
# (any cell of the dataframe whose whole value is 'russian federation' is replaced)

df_rsf=df_rsf.replace({'russian federation':'russia'})

In [None]:
# I'm done with the rsf dataset for now, let's move on to the next one (pew)

url='https://raw.githubusercontent.com/ltripo/democracy_and_prosperity/master/datasets/A.1.II_pew_raw_dataset.csv'

df_pew=pd.read_csv(url)

# shown as the cell's output
df_pew

In [None]:
# because of the way I've pulled the data from the original source (which was a PDF file) all numeric columns
# carry extra characters, so they need cleaning: only the first two characters of each cell matter, and the
# result should be float. the first column (country names) is left alone

for col in df_pew.columns[1:]:
    df_pew[col]=df_pew[col].str[:2].astype('float')

df_pew

In [None]:
# I now have to deal with the NaN at line 19. taking into account what Pew say in their documentation,
# I believe the best alternative is to impute that NaN with its column's lowest value

# the filled column is assigned back explicitly: an in-place fillna on a selected column is chained
# assignment, which pandas does not guarantee to propagate to the dataframe

df_pew['explicit']=df_pew['explicit'].fillna(df_pew['explicit'].min())

df_pew

In [None]:
# it so happens that I have no interest in so many columns, I want just the one. so I'm reckoning the overall geometric mean
# (as in the present situation, factors contribute towards a final result) and contracting columns so as to end up with the
# essential information only

# the order of the root follows the length of this list, so it is the only place to edit

pew_factors=['policies','beliefs','protests_v','explicit','protests_e','destabilise','security']

# skipna=False behaves like a plain product: a missing factor yields a missing index
df_pew['pew_index']=df_pew[pew_factors].prod(axis=1,skipna=False)**(1/len(pew_factors))

# additionally, I'm converting country names to lower case

df_pew['country_name']=df_pew['country_name'].str.lower()

# .copy() gives an independent dataframe, so the assignment below is not a write to a slice
df_pew=df_pew[['country_name','pew_index']].copy()

# I would also like to scale the data and order the rows by score from highest to lowest so I can take a decent look at the
# dataframe

df_pew['pew_index']=mms.fit_transform(df_pew[['pew_index']]).ravel()

df_pew=df_pew.sort_values('pew_index',ascending=False).reset_index(drop=True)

df_pew

In [None]:
# there it is. the rsf dataframe already carries each country's region, so a left merge on country_name keeps
# every rsf country and attaches pew_index wherever pew has a score (NaN otherwise). the result goes into a
# new dataframe, which will make the treatment of missing values easier

df_rsf_pew=df_rsf.merge(df_pew,how='left',on='country_name',sort=True)

df_rsf_pew=df_rsf_pew.loc[:,['country_name','region','rsf_index','pew_index']]

full_display(df_rsf_pew)

In [None]:
# the region names are renamed to current usage (in english); a name that is not a key below becomes NaN
# NOTE(review): 'eeac' is relabelled 'east_asia' — confirm that this is what the rsf abbreviation stands for

df_rsf_pew['region']=df_rsf_pew['region'].map({
    'afrique':'africa',
    'asie-pacifique':'asia_pacific',
    'eeac':'east_asia',
    'mena':'middle_east',
    'ue balkans':'europe',
    'north_am':'north_am',
    'latam_carib':'latam_carib',
})

full_display(df_rsf_pew)

In [None]:
# I am now left with a lot of missing values to handle. fortunately, I already have countries sorted out by region so  
# I can simply impute to each country marked with a NaN in pew_index the region average for that index

def region_average(self,chosen_index):
    """Fill the missing values of one column with that column's mean within each row's region.

    self: dataframe with a 'region' column and the column named by chosen_index; it is modified in place
    chosen_index: name of the numeric column to impute

    A value stays NaN when its row has no region or when its region has no observed value at all.
    Works with any row index (the rows do not need to be labelled 0..n-1). Returns None.
    """
    # mean of the observed values per region; rows with a missing region belong to no group
    regional_means=self.groupby('region')[chosen_index].mean()
    # regional mean looked up for every row; regions without an entry map to NaN
    fill_values=self['region'].map(regional_means)
    # whole-column assignment instead of element-wise self[col][i]=..., which is chained assignment and is
    # not guaranteed by pandas to write back to the dataframe
    self[chosen_index]=self[chosen_index].fillna(fill_values)

# fill the gaps in pew_index with regional averages (region_average changes df_rsf_pew itself)
region_average(df_rsf_pew,'pew_index')
        
full_display(df_rsf_pew)

In [None]:
# now it's time to face the truth. I've picked both these indices (rsf and pew) in the hope that each tells a different
# story, ie. that they are not significantly correlated, otherwise I would be inflating variance. let's see if
# data science proves me wrong

# I'm using spearman because it's more appropriate for rank-based comparisons

# only the two index columns are passed on: recent pandas versions raise when corr meets text columns
# such as country_name and region, whereas older versions silently skipped them

df_rsf_pew[['rsf_index','pew_index']].corr(method='spearman')

In [None]:
# these variables aren't uncorrelated, and maybe in a different context one of them could be dropped, but it's still nothing
# scary for the purposes of our study. each of them still tells a slightly different story. therefore, I will keep both
# on board for the time being

# I'm now moving on to a different kind of information, viz. that on social mobility. but remember, I'm still working on
# the democracy side

url='https://raw.githubusercontent.com/ltripo/democracy_and_prosperity/master/datasets/A.2.I_oecd_migration_raw_dataset.csv'

# raw oecd migration table, kept untouched under this name; the next cell builds a cleaned copy from it
df_oecd_migration_1=pd.read_csv(url)

full_display(df_oecd_migration_1)

In [None]:
# I've tried to make the above more usable with the spreadsheet editor, but it still looks terrible. I only need every other
# row as well as the third column's values, so here's what I'll do

# rows 0, 2, ..., 72 of the raw table are kept and renumbered 0-36
# NOTE(review): this assumes that the raw table has exactly 74 rows — confirm if the source file changes

df_oecd_migration=df_oecd_migration_1.iloc[0:74:2].reset_index(drop=True)

# the original 'percentage' column goes, and '% of total population' takes over its name

df_oecd_migration=df_oecd_migration.drop(columns='percentage')

df_oecd_migration=df_oecd_migration.rename(columns={'% of total population':'percentage'})

df_oecd_migration

In [None]:
# looks pretty clean right now; a few values are still missing, though (they are marked '..'). there's no point
# averaging them out here, let me worry about that later. for now only the rows with an actual value are kept

df_oecd_migration=df_oecd_migration[df_oecd_migration['percentage']!='..'].reset_index(drop=True)

df_oecd_migration

In [None]:
# there you go. now let's move on to the next dataset

url='https://raw.githubusercontent.com/ltripo/democracy_and_prosperity/master/datasets/A.2.II_oecd_generations_raw_dataset.csv'

# 'generations' is read as float (builtin float, since the np.float alias no longer exists in current numpy)
df_oecd_generations=pd.read_csv(url,dtype={'generations':float})

df_oecd_generations

In [None]:
# there are no major issues here since I had to type this one from scratch. next one on the list is the world bank's ease of
# doing business score

url='https://raw.githubusercontent.com/ltripo/democracy_and_prosperity/master/datasets/A.2.III_ease_db_world_bank_raw_dataset.csv'

df_wb_ease=pd.read_csv(url)

full_display(df_wb_ease)

In [None]:
# each of the three social mobility datasets is now turned into an index on the 5-10 scale

# let's see: a high percentage of migrants is good, a great ease of doing business is also good. so only the
# 'generations' column of df_oecd_generations has to be inverted before scaling

# oecd migration: lower-case country names, scale, rename

df_oecd_migration=df_oecd_migration.reset_index(drop=True)
df_oecd_migration['country_name']=df_oecd_migration['country_name'].str.lower()
df_oecd_migration[['percentage']]=mms.fit_transform(df_oecd_migration[['percentage']])
df_oecd_migration=df_oecd_migration.rename(columns={'percentage':'oecd_mig_index'})

# oecd generations: reciprocal, scale, rename

df_oecd_generations['generations']=df_oecd_generations['generations']**(-1)
df_oecd_generations[['generations']]=mms.fit_transform(df_oecd_generations[['generations']])
df_oecd_generations=df_oecd_generations.rename(columns={'generations':'oecd_gen_index'})

# world bank ease of doing business: lower-case country names, scale, rename

df_wb_ease['country_name']=df_wb_ease['country_name'].str.lower()
df_wb_ease[['ease_db']]=mms.fit_transform(df_wb_ease[['ease_db']])
df_wb_ease=df_wb_ease.rename(columns={'ease_db':'wb_index'})

In [None]:
# same recipe as before: merge the data, then treat the missing values. the merges are full outer ones, so
# every country from every dataset is preserved and no information is lost

df_democracy=(
    df_wb_ease
    .merge(df_rsf_pew,how='outer',on='country_name',sort=True)
    .merge(df_oecd_migration,how='outer',on='country_name',sort=True)
    .merge(df_oecd_generations,how='outer',on='country_name',sort=True)
)

df_democracy=df_democracy.loc[:,['country_name','region','rsf_index','pew_index','oecd_mig_index','oecd_gen_index','wb_index']]

full_display(df_democracy)

In [None]:
# it's never too late to search for inconsistencies: boolean masks of the rows lacking an rsf or a
# world bank score, respectively

isna_rsf=df_democracy['rsf_index'].isna()
isna_wb=df_democracy['wb_index'].isna()

df_democracy[isna_rsf]

In [None]:
# same check for the world bank index: rows of df_democracy where wb_index is missing
df_democracy[isna_wb]

In [None]:
# we seem to be quite alright, except for the oecs which is not really a country, which is why I'm dropping it
# NOTE(review): 130 is a hard-coded row label — confirm that it still points at the oecs row whenever the
# source datasets change; the row labels used a few cells below are numbered after this drop

df_democracy=df_democracy.drop(130).reset_index(drop=True)

In [None]:
# we still have a lot of missing values left to work on though: total number of empty cells

df_democracy.isna().sum().sum()

In [None]:
# number of rows with at least one missing value

df_democracy.isna().any(axis=1).sum()

In [None]:
# first, let's worry about the region column: which rows have no region at all?

isna_region=df_democracy['region'].isna()

df_democracy[isna_region]

In [None]:
# there seem to be no major issues here. I'll just jump in and set the region manually
# the row labels below are those shown in the previous cell's output, grouped by the region they receive

manual_regions={
    'latam_carib':[5,11,14,46,162,163,164,170],
    'asia_pacific':[67,90,109,113,132,155,189],
    'north_am':[140],
    'europe':[146],
}

for region_name,row_labels in manual_regions.items():
    for row_label in row_labels:
        df_democracy.at[row_label,'region']=region_name

In [None]:
# it shouldn't be a problem now to fill in missing values with regional averages
# (oecd_mig_index is deliberately left out, it gets its own treatment in the next cell)

for index_name in ['rsf_index','pew_index','oecd_gen_index','wb_index']:
    region_average(df_democracy,index_name)

In [None]:
# as for the oecd_mig_index: apparently, none of the countries with missing values seem to be particularly sought
# after by international migrants, which is probably why they were not surveyed by the oecd. it is therefore appropriate to
# impute a value of 5 to each of them (remember, we are working on a normalised 5-10 scale)

# the column is converted to numbers and its gaps filled in a single assignment; writing through
# df[col][mask]=... is chained assignment, which pandas does not guarantee to reach the dataframe

df_democracy['oecd_mig_index']=pd.to_numeric(df_democracy['oecd_mig_index']).fillna(5)

full_display(df_democracy)

In [None]:
# as to the oecd_gen_index: there are no averages for either east_asia or middle_east countries. by peeking into these
# countries' gini coefficients, it appears that we're rather safe imputing to them the asia_pacific average

asia_pacific_average=df_democracy.loc[df_democracy['region']=='asia_pacific','oecd_gen_index'].mean()

# every remaining gap receives that average (explicit column assignment rather than chained indexing)
df_democracy['oecd_gen_index']=df_democracy['oecd_gen_index'].fillna(asia_pacific_average)

full_display(df_democracy)

In [None]:
# before we move on to the next step, I seem to recall that some countries from the df_oecd_migration dataset had no data
# associated with them. this doesn't necessarily mean that they are not sought after by international migrants. it's hard to
# infer the data, though, from other datasets, even from the same survey, but it's certainly possible to infer which
# ones are, and which are not, sought after by international migrants and/or asylum seekers based on the respective inflows.
# so I'll go ahead and attribute to these countries the overall mean, instead of the minimum, to avoid their getting penalised

oecd_mig_average=df_democracy['oecd_mig_index'].mean()

l_countries=['canada','czech republic','ireland','israel','japan','south korea','mexico','new zealand','russia']

# a single .loc assignment over all listed countries; the previous per-country df[col][mask]=... writes
# were chained assignment, which pandas does not guarantee to reach the dataframe
df_democracy.loc[df_democracy['country_name'].isin(l_countries),'oecd_mig_index']=oecd_mig_average

df_democracy[df_democracy['oecd_mig_index']==oecd_mig_average]

In [None]:
# right now we're getting a bit more scientific. let's compare the above variables' variances to see if there's
# one we can drop. both lists are read again by the bar chart in the next cell

l_labels=['rsf_index','pew_index','oecd_mig_index','oecd_gen_index','wb_index']

# each variance is computed once, rounded to two decimals
l_variances=[round(df_democracy[label].var(),2) for label in l_labels]

for var,variance in zip(l_labels,l_variances):
    print(f'{var}: {variance}')

In [None]:
# or graphically

def bar_plot_var_1():
    """Bar chart of the variances held in the global l_variances, one bar per name in the global l_labels."""
    sns.set()
    positions=np.arange(len(l_labels))
    plt.bar(positions,l_variances,color='b')
    plt.title('democracy variables as compared by variance',fontsize=12)
    plt.xticks(positions,l_labels,fontsize=11,rotation=30)
    plt.ylabel('variance',fontsize=12)
    plt.show()

bar_plot_var_1()

In [None]:
# let's take a look at how they correlate with one another
# (only numeric columns are passed on: recent pandas versions raise when corr meets text columns such as
# country_name and region, whereas older versions silently skipped them)

df_democracy.select_dtypes(include='number').corr(method='spearman')

In [None]:
# or graphically

def heat_plot_var(df):
    """Draw a heatmap of the absolute spearman correlations between the numeric columns of df.

    df: dataframe; text columns are left out before correlating, as recent pandas versions raise on them

    Only the lower triangle is drawn: the diagonal and the upper triangle are masked. Returns None.
    """
    # computed once and reused for both the mask shape and the heatmap
    corr=df.select_dtypes(include='number').corr(method='spearman').abs()
    # True marks a hidden cell: diagonal and everything above it
    mask=np.triu(np.ones(corr.shape,dtype=bool))
    fig,ax=plt.subplots(figsize=(5,5))
    sns.heatmap(corr,mask=mask,ax=ax,cmap='coolwarm',annot=True)

heat_plot_var(df_democracy)

In [None]:
# looking at the above graphs, I don't like the way pew_index shows a lot of variance and rather minor covariance:
# it was built on regional averages, so it's not supposed to be a quality index. besides that, the oecd_mig_index has
# rather poor relative variance. also, I'm now afraid the oecd_gen_index does not really tell a different story from the
# wb_index and if that checks out, we would be inflating variance and reinforcing trends, which is obviously against good
# practice. therefore, I am dropping these three indices, based on the more informed analysis I'm now able to perform

# for the purposes of the machine learning part I'll prefer to calculate a democracy index as a combination
# of its parts. for the lack of a solid basis to attribute weights, this calculation is going to be a simple
# geometric mean. I'm using the geometric mean because in the present situation, factors contribute towards a final result

# before that, wb_index is recast as its natural logarithm, as it is more likely than not, in my opinion, that the
# ease of doing business makes a lot of difference at the bottom, yet has only marginal utility at the top.
# this may not make a great difference but still makes sense to me

# step 1: map wb_index onto [1, e], take the logarithm, then bring the result back to the 5-10 scale

mms_wb=MinMaxScaler(feature_range=(1,np.exp(1)))

df_democracy[['wb_index']]=mms_wb.fit_transform(df_democracy[['wb_index']])
df_democracy['wb_index']=np.log(df_democracy['wb_index'])
df_democracy[['wb_index']]=mms.fit_transform(df_democracy[['wb_index']])

# step 2: geometric mean of the two indices that are kept, rescaled to 5-10

df_democracy['democracy_index']=(df_democracy['rsf_index']*df_democracy['wb_index'])**(1/2)
df_democracy[['democracy_index']]=mms.fit_transform(df_democracy[['democracy_index']])

# step 3: drop the discarded indices, strip trailing blanks from country names and sort by name

df_democracy=df_democracy.drop(columns=['pew_index','oecd_mig_index','oecd_gen_index'])

df_democracy['country_name']=df_democracy['country_name'].str.rstrip()

df_democracy=df_democracy.sort_values('country_name').reset_index(drop=True)

full_display(df_democracy)

In [None]:
# there is still one thing left to do before we move on to the prosperity prong, which is to study both the variance, and
# therefore relevance, of the democracy_index variable as well as its covariance with the previous indices

l_labels=['rsf_index','wb_index','democracy_index']

# each variance is computed once, rounded to two decimals
l_variances=[round(df_democracy[label].var(),2) for label in l_labels]

for var,variance in zip(l_labels,l_variances):
    print(f'{var}: {variance}')

In [None]:
# or graphically

def bar_plot_var_2():
    """Bar chart of the global l_variances (last bar in red) with a dashed line at democracy_index's variance.

    Reads the globals l_labels, l_variances and df_democracy.
    """
    sns.set()
    positions=np.arange(len(l_labels))
    democracy_variance=round(df_democracy['democracy_index'].var(),2)
    plt.bar(positions,l_variances,color=['b','b','r'])
    plt.title('democracy variables as compared by variance',fontsize=12)
    plt.xticks(positions,l_labels,fontsize=11,rotation=30)
    plt.ylabel('variance',fontsize=12)
    # reference line spanning the chart from the first bar to the last
    plt.hlines(democracy_variance,0,len(l_labels)-1,linestyle='dashed',color='red')
    plt.show()

bar_plot_var_2()

In [None]:
# there is still significant variance associated with democracy_index, though it has shrunk, which I assume is due to a
# preexisting correlation between rsf_index and wb_index. but what about covariance? let's look it over graphically
 
heat_plot_var(df_democracy)

In [None]:
# that looks all right to me, insofar as democracy_index was calculated on the basis of the other two, yet probably
# neither of them could simply replace it. we are now ready to move on to prosperity
# as you can see in my report file, I have decided to use the world happiness ranking alone for that

url='https://raw.githubusercontent.com/ltripo/democracy_and_prosperity/master/datasets/B.I_gwp_raw_dataset.csv'

df_gwp=pd.read_csv(url)

full_display(df_gwp)

In [None]:
# now, I have to sort this mess out thereby getting a clean dataframe
# using regular expressions is best

# both patterns are applied to the raw column before it gets overwritten. the first one captures the leading run
# of characters that are neither digits, dots nor brackets (the name); the second one a score written as one
# digit, a decimal point and three digits. the point is escaped so that it only matches a literal '.', and
# expand=False makes extract return a series rather than a one-column dataframe

clean_cn=df_gwp['country_name'].str.extract(r'([^0-9\.()]+)',expand=False)
clean_ind=df_gwp['country_name'].str.extract(r'(\d\.\d\d\d)',expand=False)

# country names go to lower case, scores become numbers

df_gwp['country_name']=clean_cn.str.lower()
df_gwp['gwp_index']=pd.to_numeric(clean_ind)

# gwp_index is scaled like every other index

df_gwp[['gwp_index']]=mms.fit_transform(df_gwp[['gwp_index']])

df_gwp=df_gwp.sort_values('country_name').reset_index(drop=True)

full_display(df_gwp)

In [None]:
# there you go. it's now time to merge the df_democracy and df_gwp dataframes into a df_dem_prosp consolidated dataframe
# country names are stripped of surrounding blanks on both sides first, so that the merge keys match

df_democracy['country_name']=df_democracy['country_name'].str.strip()
df_gwp['country_name']=df_gwp['country_name'].str.strip()

# outer merge, then keep the four columns of interest, with gwp_index presented as prosperity_index

df_dem_prosp=(
    df_democracy
    .merge(df_gwp,how='outer',on='country_name',sort=True)
    .loc[:,['country_name','region','democracy_index','gwp_index']]
    .rename(columns={'gwp_index':'prosperity_index'})
)

full_display(df_dem_prosp)

In [None]:
# last missing values to impute: gaps in prosperity_index are filled with the average of each country's region

region_average(df_dem_prosp,'prosperity_index')

full_display(df_dem_prosp)

In [None]:
# let's take a look now at what we've got. this is our final dataset as plotted on a scatter chart

def scat_plot(df):
    """Scatter the first column of df (democracy) against its second column (prosperity).

    df is indexed as a 2-d array (df[:,0], df[:,1]), so despite its name it is expected to be an array
    rather than a dataframe.
    """
    sns.set()
    democracy,prosperity=df[:,0],df[:,1]
    plt.scatter(democracy,prosperity,c='b',s=10)
    plt.xlabel('Democracy')
    plt.ylabel('Prosperity')
    plt.title('Democracy versus Prosperity for 196 Countries')
    plt.show()
    
# 2-d array of the two index columns (democracy_index, prosperity_index), one row per country
# DataFrame.as_matrix no longer exists in pandas; to_numpy on the selected columns gives the same array
X=df_dem_prosp[df_dem_prosp.columns[2:]].to_numpy()
labels_true=df_dem_prosp['country_name']

scat_plot(X)

In [None]:
# please refer to my study's report in order to understand why I chose affinity propagation and learn more about how it works.
# the use of a machine learning algorithm involves trade-offs. in this case, a number of clusters which is either too
# narrow or too large has poor explanatory value. explaining relationships between countries requires assigning 
# meaningful characteristics to country clusters, but one can only do that if those clusters appear to make 
# sense when visualised. you can toy with the algorithm's parameters yourself and come to your own conclusions. when I took
# my turn I kept in mind that I needed to keep the silhouette score at an acceptable level (which I 
# made sure I did by gauging the damping and preference parameters)

# the snippet below was adapted from:
# https://scikit-learn.org/stable/auto_examples/cluster/plot_affinity_propagation.html#sphx-glr-auto-examples-cluster-plot-
# affinity-propagation-py

from sklearn.cluster import AffinityPropagation
from sklearn import metrics

# #############################################################################
# Compute Affinity Propagation
# damping and preference are the two parameters tuned by hand (see the notes at the top of this cell)
af = AffinityPropagation(damping=.8,preference=-5,verbose=True).fit(X)
# row indices in X of the cluster centres, and the cluster label given to every row
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

n_clusters_ = len(cluster_centers_indices)

print('Estimated number of clusters: %d' % n_clusters_)
print("Silhouette Score: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

# #############################################################################
# Plot result
from itertools import cycle

# start from a clean figure
sns.set()
plt.close('all')
plt.figure(1)
plt.clf()

# one colour per cluster, taken in turn from this repeating sequence of matplotlib colour codes
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    # boolean mask of the rows assigned to cluster k
    class_members = labels == k
    cluster_center = X[cluster_centers_indices[k]]
    # members as small dots, the centre as a large circle with a black edge
    plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    
    # a line from the centre to each member of the cluster
    for x in X[class_members]:
        plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)

plt.title('Democracy versus Prosperity for 196 Countries')
plt.xlabel('Democracy')
plt.ylabel('Prosperity')
plt.show()

In [None]:
# for the sake of sound methodology, I will now perform a simple demonstration of how both k-means and dbscan performed 
# poorly on this dataset

# the snippet below was adapted from:
# https://www.geeksforgeeks.org/elbow-method-for-optimal-value-of-k-in-kmeans/

from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

def elbow(X,max_k=9,random_state=None):
    """Print and plot k-means distortion and inertia for k = 1..max_k (the elbow method).

    X: 2-d array, one row per sample
    max_k: largest number of clusters tried; the default gives the range 1-9
    random_state: passed on to KMeans; None keeps the library default, an integer makes the curves repeatable

    Returns None: both measures are printed per k and drawn as a line chart each.
    """
    K=range(1,max_k+1)
    distortions={}
    inertias={}
    for k in K:
        # one fit per k is enough: fit returns the fitted model itself
        kmeanModel=KMeans(n_clusters=k,random_state=random_state).fit(X)
        # distortion: mean euclidean distance from each sample to its closest cluster centre
        closest=np.min(cdist(X,kmeanModel.cluster_centers_,'euclidean'),axis=1)
        distortions[k]=sum(closest)/X.shape[0]
        inertias[k]=kmeanModel.inertia_

    # same report for both measures: the values per k, then the curve
    for measure,values in (('Distortion',distortions),('Inertia',inertias)):
        print(measure+':')
        for key,val in values.items():
            print(str(key)+' : '+str(val))

        plt.plot(K,list(values.values()),'bx-')
        plt.xlabel('Values of K')
        plt.ylabel(measure)
        plt.title('The Elbow Method using '+measure)
        plt.show()

elbow(X)

In [None]:
# both above elbow graphs point to 4 as the optimal number of clusters. let's see how the silhouette score turns out
# for both 4 and 8 clusters (one line is printed per cluster count: the count, then the score)

range_n_clusters=[4,8]

for n_cluster in range_n_clusters:
    kmeans = KMeans(n_clusters=n_cluster)
    labels = kmeans.fit(X).predict(X)
    print (n_cluster, metrics.silhouette_score(X,labels))

In [None]:
# now dbscan

from sklearn.cluster import DBSCAN

# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
# boolean mask, True for the rows that dbscan reports as core samples
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present (noise rows carry the label -1).
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
# NOTE(review): the score below is computed on all labels, noise included — confirm that this is intended
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))

# #############################################################################
# Plot result
import matplotlib.pyplot as plt

# Black removed and is used for noise instead.
# one colour per distinct label, evenly spaced over the Spectral colour map
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    # core samples of this label: large markers
    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    # remaining samples of this label: small markers
    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

In [None]:
# as you can see these algorithms did not yield a convenient partition of the dataset (if at all)

# we are now ready to export the final data for further analysis. but first we need to create a new column in the
# dataframe to accommodate cluster number information

# the labels come from af, the affinity propagation model fitted and plotted above, so the exported clusters
# are exactly the plotted ones; fitting a second model with the same parameters is not guaranteed to
# reproduce them

df_dem_prosp['cluster_#']=af.labels_

# NOTE(review): absolute placeholder path — point it at an existing folder before running
df_dem_prosp.to_csv('C:/foo/bar/export_dem_prosp.csv',index=False)

print('Done.')