In [1]:
import pandas as pd
import os
import csv
import numpy as np
from scipy import ndimage
from matplotlib import pyplot as plt
from sklearn import manifold, datasets
from sklearn.cluster import AgglomerativeClustering

In [2]:
# NOTE: '__file__' is a string literal here (not the __file__ variable), so
# os.path.dirname returns '' and the CSV path below is resolved relative to
# the current working directory.
cur_dir = os.path.dirname('__file__')
# Load the raw indicator table used by the rest of the notebook.
data = pd.read_csv(os.path.join(cur_dir, "WDI_Data.csv"))

In [3]:
def preprocess(df):
    """Return a copy of ``df`` with an added ``'21st'`` summary column.

    For each row, the new column holds the mean of every column from the one
    labelled ``'2000'`` through the last column (``DataFrame.mean`` skips
    missing values by default).  The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a column labelled ``'2000'``.  Assumes the columns from
        there on hold numeric yearly values -- TODO confirm against the CSV.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the extra ``'21st'`` column.
    """
    df = df.copy()
    # Label-based column slice; ``.loc`` replaces the removed ``.ix`` indexer.
    df['21st'] = df.loc[:, '2000':].mean(axis=1)
    return df

In [4]:
def one_year(data, year='21st'):
    """Reshape long-format indicator rows into a country-by-indicator table.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'Indicator Name'``, ``'Country Name'``
        and the column named by ``year``.
    year : str
        Column whose values fill the table (default ``'21st'``).

    Returns
    -------
    pandas.DataFrame
        One column per indicator.  The index is the ``'Country Name'``
        sequence of the first indicator group.  An empty frame is returned
        when ``data`` has no rows.

    Notes
    -----
    Values are placed positionally, so every indicator group is assumed to
    list the same countries in the same order as the first group -- TODO
    confirm for the input file.  Groups of differing length make the
    DataFrame constructor raise.
    """
    indic_dict = {}
    names = []
    # Single pass over the groups: take each group's value column as a whole
    # instead of appending row by row.
    for position, (indicator, group) in enumerate(data.groupby('Indicator Name')):
        if position == 0:
            # Row labels come from the first group only.
            names = group['Country Name'].tolist()
        indic_dict[indicator] = group[year].tolist()

    return pd.DataFrame(indic_dict, index=names)


def only_countries(df):
    """Drop rows whose index label looks like an aggregate rather than a country.

    A row is removed when its index label contains any of the marker
    substrings listed below (case-sensitive substring test).

    NOTE(review): because this is plain substring matching, a label such as
    'American Samoa' also matches 'America' and is dropped -- confirm that
    this is acceptable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels are strings.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to the rows that matched none of the markers.
    """
    words = ['World', 'income', '(developing only)', 'OECD', 'countries', 'Euro', 'Asia', 'America', 'situations']
    keep_mask = [
        not any(marker in label for marker in words)
        for label in df.index
    ]
    return df[keep_mask]

def few_na_cols(df, thresh=0.95, required_countries=['United States']):
    """Keep only the columns of ``df`` that are sufficiently populated.

    A column is retained when both conditions hold:

    * its fraction of non-null rows is at least ``thresh``;
    * it is non-null in the row of every label in ``required_countries``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by the labels used in ``required_countries``.
    thresh : float
        Minimum fraction of non-null rows for a column to be kept.
    required_countries : iterable
        Index labels that must have a value in every kept column.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to the retained columns, in their original order.
    """
    kept = []
    n_rows = len(df)
    for col_name in df.columns:
        missing = df[col_name].isnull()
        coverage = float((~missing).sum()) / n_rows
        # Every required label is looked up (no short-circuit), so an
        # unknown label raises regardless of earlier results.
        present = [not missing.loc[country] for country in required_countries]
        if coverage >= thresh and all(present):
            kept.append(col_name)
    return df[kept]

In [5]:
# Add the '21st' column (row-wise mean of the columns from '2000' onward).
# `data` is rebound to the result, so re-running this cell feeds the
# already-processed frame back into preprocess.
data = preprocess(data)

In [7]:
# Build the table with one row per country and one column per indicator,
# filled from the default '21st' column.
df = one_year(data)
# print df.index

In [8]:
# Fraction of non-null entries per indicator column.  Despite the name,
# `null_pcts` holds availability (non-null share), not the null share.
null_pcts = [
    float(df[column].notnull().sum()) / len(df[column])
    for column in df.columns
]

# One row per indicator, best-covered indicators first.
temp = pd.DataFrame(data=null_pcts, index=df.columns,
                    columns=['Data Availability'])
temp = temp.sort_values('Data Availability', ascending=False)
print(temp.head())

                                                    Data Availability
Surface area (sq. km)                                        0.995968
Population, total                                            0.995968
Population growth (annual %)                                 0.995968
Land area (sq. km)                                           0.991935
Population density (people per sq. km of land a...           0.991935


In [9]:
def create_group_csv(csv_dwnld, save_to):
    """Export the indicator names of a downloaded extract to a CSV file.

    Call this after creating a CSV for a single country, for a single year,
    for a group of indicators.  The values of its 'Series Name' column are
    written as one row to ``save_to``.

    Parameters
    ----------
    csv_dwnld : str
        File name of the downloaded extract, joined onto ``cur_dir``.
    save_to : str
        Path of the CSV file to write (opened in 'w' mode).
    """
    extract = pd.read_csv(os.path.join(cur_dir, csv_dwnld))
    series_names = extract['Series Name'].tolist()

    with open(save_to, 'w') as out_file:
        csv.writer(out_file).writerow(series_names)
        
## And then this gets the corresponding indicator list.
def get_indicator_list_from_csv(csv_file):
    """Read the indicator names stored in the first row of ``csv_file``.

    Parameters
    ----------
    csv_file : str
        Path of a CSV file whose first row holds the indicator names.

    Returns
    -------
    list of str
        The fields of the first row, with every field equal to the string
        ``"nan"`` removed.  An empty list is returned when the file
        contains no rows.
    """
    with open(csv_file) as f:
        # Only the first row is used, so stop reading after it; the default
        # value covers an empty file instead of raising IndexError.
        first_row = next(csv.reader(f), [])
    return [s for s in first_row if s != "nan"]

## example usage
# create_group_csv('Data_Extract_From_World_Development_Indicators_Data.csv', 'health.csv')
# health_indicators = get_indicator_list_from_csv('health.csv')

In [10]:
def highest_pop_df(df, n=None, by='Population, total'):
    """Return the first ``n`` rows of ``df`` after sorting by ``by``, descending.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column named by ``by``.
    n : int or None
        Number of leading rows to keep; ``None`` keeps every row.
    by : str
        Column to sort on (default ``'Population, total'``).

    Returns
    -------
    pandas.DataFrame
        A sorted frame; the input is left untouched.
    """
    ranked = df.sort_values(by, ascending=False)
    limit = len(ranked) if n is None else n
    return ranked.iloc[:limit]

In [11]:
def cluster_label_dict(countries, clustering):
    """Group country names by the cluster label assigned to them.

    Parameters
    ----------
    countries : sequence
        Names, indexed in the same order as ``clustering.labels_``.
    clustering : object
        Any object exposing a ``labels_`` sequence (one label per sample).

    Returns
    -------
    dict
        Maps each label to the list of names carrying it, in input order.
    """
    members_by_label = {}
    for position, cluster_id in enumerate(clustering.labels_):
        members_by_label.setdefault(cluster_id, []).append(countries[position])
    return members_by_label

In [12]:
# Indicator groups whose names are read from '<group>.csv' files.
groups = ['health', 'education', 'economics']

# Feature sets keyed by name: one hand-picked set plus one per group file.
features = {
    'Simple': ['Population, total', 'CO2 emissions (kt)', 'Net migration', 'Trade (% of GDP)'],
}
for g in groups:
    features[g] = get_indicator_list_from_csv('{0}.csv'.format(g))

In [16]:
# Order all rows by total population, largest first.
highest_pop = highest_pop_df(df)

# Drop aggregate rows, then keep the economics indicators that are well
# populated and present for each of the first 30 remaining rows.
countries = only_countries(highest_pop)
top = countries.index[:30]
economics = few_na_cols(countries[features['economics']], required_countries=top)

# Use at most the first 70 rows that have no missing economics values.
sample = economics.dropna().iloc[:70]
# NOTE: `countries` is rebound here from a DataFrame to a list of names.
countries = sample.index.tolist()

# `.values` replaces `DataFrame.as_matrix()`, which was deprecated and later
# removed from pandas; both return the frame's data as a NumPy array.
X = sample.values
n_samples, n_features = X.shape

# NOTE(review): no random_state is passed to the embedding; consider fixing
# one if run-to-run reproducibility of the clusters matters.
X_red = manifold.SpectralEmbedding(n_components=2).fit_transform(X)

# for linkage in ('ward', 'average', 'complete'):
for linkage in ('average',):
    clustering = AgglomerativeClustering(linkage=linkage, n_clusters=10)
    clustering.fit(X_red)
    clust_dict = cluster_label_dict(countries, clustering)
    print(clust_dict)

{0: ['Indonesia', 'Nigeria', 'Iran, Islamic Rep.', 'Turkey', 'Thailand', 'South Africa', 'Colombia', 'Argentina', 'Poland', 'Venezuela, RB', 'Malaysia', 'Saudi Arabia'], 1: ['China', 'United States', 'Japan', 'Australia', 'Netherlands'], 2: ['Pakistan', 'Bangladesh', 'Philippines', 'Vietnam', 'Egypt, Arab Rep.', 'Ukraine', 'Algeria', 'Iraq', 'Peru', 'Romania', 'Chile', 'Kazakhstan'], 3: ['India', 'Brazil', 'Russian Federation', 'Mexico', 'Germany', 'France', 'United Kingdom', 'Italy', 'Korea, Rep.', 'Spain', 'Canada'], 4: ['Ethiopia', 'Tanzania', 'Ghana', 'Yemen, Rep.', 'Syrian Arab Republic', 'Cameroon', "Cote d'Ivoire"], 5: ['Nepal', 'Afghanistan', 'Niger', 'Burkina Faso', 'Mali', 'Senegal'], 6: ['Congo, Dem. Rep.'], 7: ['Myanmar', 'Kenya', 'Sudan', 'Morocco', 'Uzbekistan', 'Sri Lanka', 'Angola', 'Ecuador', 'Guatemala'], 8: ['Uganda', 'Malawi', 'Zambia'], 9: ['Mozambique', 'Madagascar', 'Cambodia', 'Zimbabwe']}
