In [1]:
#conda install -c conda-forge folium

In [2]:
# Import Libraries
import numpy as np
import pandas as pd
import re

In [352]:
# Initilize folium and get geojson country data
import folium
import json

country_shapes = 'data/countries.geojson'

f = open(country_shapes)
map_data = json.load(f)
f.close()

map_data['features'][3]['properties']['ADMIN']

country_names = []
for i in range(len(map_data['features'])):
    name = map_data['features'][i]['properties']['ADMIN']
    if name in ['French Southern and Antarctic Lands', 'United States Minor Outlying Islands', 'United States Virgin Islands',
                'Saint Martin', 'Western Sahara']:
        pass
    else:
        country_names.append(name)

country_names[0:3]

['Aruba', 'Afghanistan', 'Angola']

In [349]:
# Define normalization function
def normalize(data, column_list):
    for i,cols in enumerate(column_list):    
        data[cols] = data[cols] - data[cols].min()
        data[cols] = data[cols] / data[cols].max()
        data[cols] = data[cols]

    return data

# Define a function to retrive and clean wikipedia data
def get_wiki(link, table_index, column_dict, verbose, bad_rows):
    df = pd.read_html(link)[table_index]

    for i in bad_rows:
        df.drop(i, axis=0, inplace=True)

    column_names = list(range(df.shape[1]))
    
    for keys in list(column_dict.keys()):
        column_names[keys] = column_dict[keys]

    df = pd.DataFrame(np.asarray(df), columns=column_names)
    df = df[list(column_dict.values())]
    
    for i in df.index:
        country = str(df.loc[i,'Country'])
        country = country.split(' (')[0]
        country = country.split(' [')[0]
        country = country.split(' *')[0]
        country = country.split(' *')[0]
        country = country.split('(')[0]
        country = country.split('[')[0]
        country = country.split('*')[0]
        df.loc[i,'Country'] = country

        for col in df.columns[1:]:
            number = str(df.loc[i,col])
            if number[0] == '-':
                multiplier = -1
                number = number[1:]
            elif number[0] == '−':
                multiplier = -1
                number = number[1:]
            else:
                multiplier = 1
            
            number = number.replace(' ','')
            number = number.split('(')[0]
            number = number.split('[')[0]
            number = number.split('-')[0]
            number = number.split('−')[0]
            number = number.split('–')[0]

            if '%' in number:
                multiplier = multiplier * .01

            number = number.replace('%', '')
            number = number.replace(',', '')
            number = number.replace('<', '')
            number = number.replace('>', '')
            
            try:
                number = float(number) * multiplier
                df.loc[i,col] = number
            except:
                print(i,df.loc[i,'Country'],col,number)

    # Aligns country names with master list
    extra = set(df['Country']).difference(set(country_names))
    missing = set(country_names).difference(set(df['Country']))
    replacements = {}

    for names in list(missing):
        tag = names.replace('United ', '')
        tag = tag.replace('North ', '')
        tag = tag.replace('Northern ', '')
        tag = tag.replace('South ', '')
        tag = tag.replace('Southern ', '')
        tag = tag.replace('Republic of ', '')
        tag = tag.replace('Saint ', '')
        tag = tag[0:4]
        
        good_options = []

        for options in list(extra):
            if tag in options:
                good_options.append(options)

        if len(good_options) == 1:
            replacements[good_options[0]] = names

    if verbose == True:
        print(replacements)

    replacements['United States'] = 'United States of America'
    replacements['Democratic Republic of Congo'] = 'Democratic Republic of the Congo'
    replacements['Congo, Democratic Republic of the'] = 'Democratic Republic of the Congo'
    replacements['DR Congo'] = 'Democratic Republic of the Congo'
    replacements['Congo'] = 'Republic of Congo'
    replacements['Republic of the Congo'] = 'Republic of Congo'
    replacements['Czechia'] = 'Czech Republic'
    replacements['Eswatini'] = 'Swaziland'
    replacements['São Tomé and Príncipe'] = 'Sao Tome and Principe'
    replacements['North Macedonia'] = 'Macedonia'
    replacements['Guinea-Bissau'] = 'Guinea Bissau'
    replacements["Côte d'Ivoire"] = 'Ivory Coast'
    replacements["Lao People's Democratic Republic"] = 'Laos'
    replacements["Korea, Democratic People's Republic of"] = 'North Korea'

    for i in df.index:
        if df.loc[i,'Country'] in list(replacements.keys()):
            df.loc[i,'Country'] = replacements[df.loc[i,'Country']]

    df.set_index('Country', inplace=True)
    
    return df

def join_all(data_list, drop_na):
    df = data_list[0]
    
    for data in data_list[1:]:
        df = df.join(data, how='outer')

    if drop_na == True:
        df.dropna(inplace=True)

    return df

In [343]:
# Get age range data
link = 'https://en.wikipedia.org/wiki/List_of_countries_by_age_structure'
column_dict = {0:'Country',
               1:'0-14_PCT',
               2:'15-64_PCT',
               3:'Over65_PCT'}

ages = get_wiki(link, 0, column_dict, False, [])

# Get population and population density data
link = 'https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population_density'
column_dict = {0:'Country',
               1:'TotalPop',
               4:'Density_POPKM2'}

pop = get_wiki(link, 0, column_dict, False, [])
pop = normalize(pop, list(pop.columns))

# Get fertility rate data
link = 'https://en.wikipedia.org/wiki/List_of_countries_by_past_fertility_rate'
column_dict = {0:'Country',
               9:'90sFertilityRate',
               13:'10sFertilityRate'}

fert = get_wiki(link, 1, column_dict, False, [])
fert = normalize(fert, list(fert.columns))

# Get child mortality data
link = 'https://en.wikipedia.org/wiki/List_of_countries_by_infant_and_under-five_mortality_rates'
column_dict = {0:'Country',
               1:'ChildMortality'}

mort = get_wiki(link, 0, column_dict, False, [])
mort = normalize(mort, list(mort.columns))

# Get obesity rate data
link = 'https://en.wikipedia.org/wiki/List_of_countries_by_obesity_rate'
column_dict = {0:'Country',
               2:'ObesityRate'}

obi = get_wiki(link, 0, column_dict, False, [])
obi = normalize(obi, list(obi.columns))

demographics = join_all([ages, pop, fert, mort, obi], True)

demographics


Unnamed: 0_level_0,0-14_PCT,15-64_PCT,Over65_PCT,TotalPop,Density_POPKM2,90sFertilityRate,10sFertilityRate,ChildMortality,ObesityRate
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Afghanistan,0.4062,0.567,0.0268,0.005033,0.00293,0.896254,0.659777,0.507359,0.057725
Albania,0.176,0.6937,0.1303,0.000358,0.004757,0.220461,0.09539,0.069264,0.332767
Algeria,0.2958,0.6425,0.0617,0.005545,0.000912,0.412104,0.294118,0.187013,0.429542
Angola,0.4783,0.5013,0.023,0.004331,0.001344,0.841499,0.769475,0.632035,0.103565
Antigua and Barbuda,0.2252,0.6857,0.0891,0.000012,0.01014,0.119597,0.157393,0.042424,0.285229
...,...,...,...,...,...,...,...,...,...
Venezuela,0.2566,0.6616,0.0818,0.00354,0.001489,0.286744,0.205087,0.194805,0.398981
Vietnam,0.2261,0.7048,0.0691,0.012234,0.014177,0.283862,0.135135,0.157576,0.0
Yemen,0.3916,0.5804,0.028,0.00414,0.002978,1.0,0.523052,0.490909,0.254669
Zambia,0.4574,0.5199,0.0227,0.002444,0.001248,0.726225,0.650238,0.519481,0.101868


In [344]:
# Get average tempature data
link = 'https://en.wikipedia.org/wiki/List_of_countries_by_average_yearly_temperature'
column_dict = {0:'Country',
               1:'AveTemp_C'}

temp = get_wiki(link, 0, column_dict, False, [])
temp.loc['South Sudan','AveTemp_C'] = 28.0
temp = normalize(temp, list(temp.columns))

# Get annual rainfall data
link = 'https://en.wikipedia.org/wiki/List_of_countries_by_average_annual_precipitation'
column_dict = {1:'Country',
               2:'AvePrecip_MM'}

rain = get_wiki(link, 0, column_dict, False, [])
rain.loc['Republic of Serbia', 'AvePrecip_MM'] = 896.0
rain = normalize(rain, list(rain.columns))

# Get capital city latitude data
link = 'https://en.wikipedia.org/wiki/List_of_national_capitals_by_latitude'
column_dict = {2:'Country',
               0:'CapitalLat'}

cap_lat = get_wiki(link, 1, column_dict, False, [])
cap_lat.loc['South Africa','CapitalLat'] = -29
cap_lat.loc['Georgia','CapitalLat'] = 41.72
cap_lat = normalize(cap_lat, list(cap_lat.columns))

# Get area data
link = 'https://simple.wikipedia.org/wiki/List_of_countries_by_area'
column_dict = {1:'Country',
               2:'Area_KM2'}

area = get_wiki(link, 0, column_dict, False, [])
area.loc['Denmark','Area_KM2'] = 43094.0
area.loc['France','Area_KM2'] = 640679.0

# Get forest area data
link = 'https://en.wikipedia.org/wiki/List_of_countries_by_forest_area'
column_dict = {0:'Country',
               4:'ForestArea_HCT'}

forest = get_wiki(link, 1, column_dict, False, [])

for i in area.index:
    try:
        forest.loc[i,'FroestRatio'] = forest.loc[i,'ForestArea_HCT'] / area.loc[i,'Area_KM2']
    except: 
        forest.loc[i,'FroestRatio'] = 0

forest.drop('ForestArea_HCT', axis=1, inplace=True)
forest = normalize(forest, list(forest.columns))
area = normalize(area, list(area.columns))

# Get coastline ratio data
link = 'https://en.wikipedia.org/wiki/List_of_countries_by_length_of_coastline'
column_dict = {0:'Country',
               8:'CoastRatio'}

coast = get_wiki(link, 0, column_dict, False, [])

for i in area.index:
    try:
        if coast.loc[i,'CoastRatio'] > 0:
            pass
        else:
            coast.loc[i,'CoastRatio'] = 0
    except:
        coast.loc[i,'CoastRatio'] = 0

coast = normalize(coast, list(coast.columns))

coast.head()

climate = join_all([temp, rain, cap_lat, area, forest, coast], True)

climate

Unnamed: 0_level_0,AveTemp_C,AvePrecip_MM,CapitalLat,Area_KM2,FroestRatio,CoastRatio
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,0.533591,0.095903,0.67034,0.038146,0.019967,0.0
Albania,0.497919,0.455307,0.721597,0.001681,0.295874,0.009481
Algeria,0.827883,0.022036,0.68725,0.139297,0.008822,0.002801
Angola,0.799643,0.307883,0.343021,0.072914,0.575964,0.00619
Antigua and Barbuda,0.927467,0.314091,0.538914,0.000026,0.195122,0.031472
...,...,...,...,...,...,...
Venezuela,0.912604,0.628802,0.488941,0.053599,0.543832,0.0129
Vietnam,0.88585,0.55959,0.568431,0.019371,0.476608,0.026753
Yemen,0.868014,0.046245,0.525402,0.030878,0.011210,0.011342
Zambia,0.795184,0.310987,0.293274,0.044017,0.641919,0.0


In [345]:
# Get economic sector data
link = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_sector_composition'
column_dict = {0:'Country',
               2:'Agr_PCT',
               3:'Ind_PCT',
               4:'Serv_PCT'}

comp = get_wiki(link, 2, column_dict, False, [])
comp.loc['South Sudan', 'Agr_PCT'] = comp.loc['Sudan', 'Agr_PCT']
comp.loc['South Sudan', 'Ind_PCT'] = comp.loc['Sudan', 'Ind_PCT']
comp.loc['South Sudan', 'Serv_PCT'] = comp.loc['Sudan', 'Serv_PCT']
comp.loc['Bolivia', 'Agr_PCT'] = 4.7
comp.loc['Bolivia', 'Ind_PCT'] = 40.4
comp.loc['Bolivia', 'Serv_PCT'] = 54.9
comp.loc['South Korea', 'Agr_PCT'] = 2.7
comp.loc['South Korea', 'Ind_PCT'] = 39.8
comp.loc['South Korea', 'Serv_PCT'] = 57.5

# Get GDP data
link = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(PPP)_per_capita'
column_dict = {0:'Country',
               6:'GDPpC_PPP'}

ppp = get_wiki(link, 1, column_dict, False, [])
ppp.loc['Palestine','GDPpC_PPP'] = 5400.0
ppp = normalize(ppp, list(ppp.columns))

#Get poverty data
link = 'https://en.wikipedia.org/wiki/List_of_countries_by_percentage_of_population_living_in_poverty'
column_dict = {0:'Country',
               1:'ExtremePoverty_PCT',
               3:'Poverty_PCT'}

pov = get_wiki(link, 1, column_dict, False, [])

# Get HDI data
link = 'https://en.wikipedia.org/wiki/List_of_countries_by_Human_Development_Index'
column_dict = {2:'Country',
               3:'HDI'}

hdi = get_wiki(link, 0, column_dict, False, [])
hdi.loc['Somalia','HDI'] = 0
hdi.loc['North Korea','HDI'] = 0
hdi.loc['Kosovo','HDI'] = 0.80

# Get median wealth and inequality data
link = 'https://en.wikipedia.org/wiki/List_of_countries_by_wealth_per_adult'
column_dict = {0:'Country',
               1:'MedianWealth',
               3:'Gini_PCT'}

wealth = get_wiki(link, 1, column_dict, False, [])
wealth.loc['South Sudan','MedianWealth'] = wealth.loc['Sudan','MedianWealth']
wealth.loc['South Sudan','Gini_PCT'] = wealth.loc['Sudan','Gini_PCT']
wealth.loc['Ivory Coast','MedianWealth'] = wealth.loc['Nigeria','MedianWealth']
wealth.loc['Ivory Coast','Gini_PCT'] = wealth.loc['Nigeria','Gini_PCT']
wealth.loc['Somalia','MedianWealth'] = wealth.loc['Haiti','MedianWealth']
wealth.loc['Somalia','Gini_PCT'] = wealth.loc['Haiti','Gini_PCT']
wealth.loc['Uzbekistan','MedianWealth'] = wealth.loc['Bangladesh','MedianWealth']
wealth.loc['Uzbekistan','Gini_PCT'] = wealth.loc['Bangladesh','Gini_PCT']
wealth.loc['North Korea','MedianWealth'] = wealth.loc['Brunei','MedianWealth']
wealth.loc['North Korea','Gini_PCT'] = wealth.loc['Brunei','Gini_PCT']
wealth.loc['Cuba','MedianWealth'] = wealth.loc['Colombia','MedianWealth']
wealth.loc['Cuba','Gini_PCT'] = wealth.loc['Colombia','Gini_PCT']
wealth.loc['Dominican Republic','MedianWealth'] = wealth.loc['Maldives','MedianWealth']
wealth.loc['Dominican Republic','Gini_PCT'] = wealth.loc['Maldives','Gini_PCT']
wealth.loc['Guatemala','MedianWealth'] = wealth.loc['Albania','MedianWealth']
wealth.loc['Guatemala','Gini_PCT'] = wealth.loc['Albania','Gini_PCT']
wealth.loc['Honduras','MedianWealth'] = wealth.loc['Philippines','MedianWealth']
wealth.loc['Honduras','Gini_PCT'] = wealth.loc['Philippines','Gini_PCT']
wealth.loc['Bhutan','MedianWealth'] = wealth.loc['Indonesia','MedianWealth']
wealth.loc['Bhutan','Gini_PCT'] = wealth.loc['Indonesia','Gini_PCT']
wealth.loc['Kosovo','MedianWealth'] = wealth.loc['Republic of Serbia','MedianWealth']
wealth.loc['Kosovo','Gini_PCT'] = wealth.loc['Republic of Serbia','Gini_PCT']
wealth.loc['Macedonia','MedianWealth'] = wealth.loc['Romania','MedianWealth']
wealth.loc['Macedonia','Gini_PCT'] = wealth.loc['Romania','Gini_PCT']

wealth = normalize(wealth, list(wealth.columns))

economy = join_all([comp, ppp, hdi, wealth], True)

economy

Unnamed: 0_level_0,Agr_PCT,Ind_PCT,Serv_PCT,GDPpC_PPP,HDI,MedianWealth,Gini_PCT
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Afghanistan,0.23,0.211,0.559,0.009491,0.511,0.002083,0.544794
Albania,0.217,0.242,0.541,0.095506,0.795,0.058412,0.433414
Algeria,0.133,0.393,0.474,0.077768,0.748,0.008121,0.835351
Angola,0.102,0.614,0.284,0.042776,0.581,0.003612,0.733656
Argentina,0.108,0.281,0.611,0.154046,0.845,0.007562,0.748184
...,...,...,...,...,...,...,...
Venezuela,0.047,0.404,0.549,0.05025,0.711,0.027523,0.673123
Vietnam,0.153,0.333,0.513,0.052686,0.704,0.016811,0.723971
Yemen,0.203,0.118,0.679,0.012635,0.47,0.003966,0.912833
Zambia,0.075,0.353,0.57,0.019646,0.584,0.001921,0.905569


In [360]:
# Get general religion data
def get_religion(table_number):
    link = 'https://en.wikipedia.org/wiki/Religions_by_country'
    column_dict = {0:'Country',
                   3:'Christain_PCT',
                   5:'Muslim_PCT',
                   7:'Irreligion_PCT',
                   9:'Hindu_PCT',
                   11:'Buddhist_PCT',
                   13:'Folk_PCT',
                   15:'Other_PCT',
                   17:'Jewish_PCT'}
    df = get_wiki(link, table_number, column_dict, False, [])

    return df

relg_dfs = []
for i in [10,11,12,13,14,16,17,18,19,21,22,23,24,25,29,30,31,32,33,34,36,37,38,39]:
    relg_dfs.append(get_religion(i))

relg = pd.concat(relg_dfs, axis=0)

for col in relg.columns:
    relg[col] = relg[col] * .01

relg.head(3)

4 Congo, Democratic Republic of the Jewish_PCT 3e


Unnamed: 0_level_0,Christain_PCT,Muslim_PCT,Irreligion_PCT,Hindu_PCT,Buddhist_PCT,Folk_PCT,Other_PCT,Jewish_PCT
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Angola,0.905,0.01,0.051,0.0,0.0,0.042,0.0,0.0
Cameroon,0.703,0.3,0.053,0.0,0.0,0.033,0.027,0.0
Central African Republic,0.895,0.15,0.01,0.0,0.0,0.01,0.0,0.0


In [362]:
# Get catholicism data
link = 'https://en.wikipedia.org/wiki/Catholic_Church_by_country'
column_dict = {0:'Country',
               2:'Catholic_PCT'}

cath = get_wiki(link, 2, column_dict, False, [])

# Get eastern orthodox data
link = 'https://en.wikipedia.org/wiki/Eastern_Orthodoxy_by_country'
column_dict = {0:'Country',
               2:'Orthodox_PCT'}

orth = get_wiki(link, 1, column_dict, False, [])

# Combine and fill data
for i in relg.index:
    try:
        if cath.loc[i,'Catholic_PCT'] > 0:
            pass
        else: cath.loc[i,'Catholic_PCT'] = 0
    except:
        cath.loc[i,'Catholic_PCT'] = 0

    try:
        if orth.loc[i,'Orthodox_PCT'] > 0:
            pass
        else: orth.loc[i,'Orthodox_PCT'] = 0
    except:
        orth.loc[i,'Orthodox_PCT'] = 0

religion = join_all([relg, cath, orth], True)
religion

Unnamed: 0_level_0,Christain_PCT,Muslim_PCT,Irreligion_PCT,Hindu_PCT,Buddhist_PCT,Folk_PCT,Other_PCT,Jewish_PCT,Catholic_PCT,Orthodox_PCT
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Afghanistan,0.001,0.997,0.0,0.0009,0.0,0.0,0.0006,0.0,0.000003,0
Albania,0.0018,0.00803,0.00014,0.0,0.0,0.0,0.00002,0.0,0.1,6.75
Algeria,0.01,0.98,0.018,0.0,0.0,0.002,0.0,0.008,0.0014,0
American Samoa,0.00983,0.00001,0.00007,0.0,0.00003,0.00004,0.00003,0.0,0,0
Andorra,0.00895,0.00008,0.00088,0.00005,0.0,0.0,0.00001,0.00003,0.882,0
...,...,...,...,...,...,...,...,...,...,...
Vietnam,0.082,0.002,0.296,0.0021,0.164,0.453,0.004,0.0,0.069,0
Wallis and Futuna,0.00974,0.0,0.00006,0.0,0.0,0.00012,0.00008,0.0,0,0
Yemen,0.002,0.991,0.001,0.006,0.0,0.0,0.0,0.0,0.0002,0
Zambia,0.976,0.01,0.005,0.001,0.0,0.003,0.009,0.0,0.21,0


In [10]:
# Get a list of languages, their families, and their branches
link = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'
all_lang = pd.read_html(link)[0]

def add_lang(data, name, family, branch):
    df = pd.DataFrame({'Rank':0, 
                       'Language':name, 
                       'Native Speakers(millions)':0, 
                       'Percentageof world pop.(March 2019)[10]':0,
                       'Language family': family,
                       'Branch': branch},
                       index=[(data.shape[0] + 1)])
    return pd.concat([data,df],axis=0)

all_lang = add_lang(all_lang, 'Pashto', 'Indo-European', 'Iranian')
all_lang = add_lang(all_lang, 'Dari', 'Indo-European', 'Iranian')
all_lang = add_lang(all_lang, 'Azerbaijani', 'Turkic', 'Oghuz')
all_lang = add_lang(all_lang, 'Dzongkha', 'Sino-Tibetan', 'Tibetic')
all_lang = add_lang(all_lang, 'Bosnian', 'Indo-European', 'Balto-Slavic')
all_lang = add_lang(all_lang, 'Croatian', 'Indo-European', 'Balto-Slavic')
all_lang = add_lang(all_lang, 'Serbian', 'Indo-European', 'Balto-Slavic')
all_lang = add_lang(all_lang, 'Bulgarian', 'Indo-European', 'Balto-Slavic')
all_lang = add_lang(all_lang, 'Georgian', 'Kartvelian', 'Karto-Zan')
all_lang = add_lang(all_lang, 'Icelandic', 'Indo-European', 'Germanic')
all_lang = add_lang(all_lang, 'Lao', 'Kra–Dai', 'Tai')
all_lang = add_lang(all_lang, 'Maldivian', 'Indo-European', 'Iranian')
all_lang = add_lang(all_lang, 'Mongolian', 'Mongolic', 'Mongolian')
all_lang = add_lang(all_lang, 'Norwegian', 'Indo-European', 'Germanic')
all_lang = add_lang(all_lang, 'Swedish', 'Indo-European', 'Germanic')
all_lang = add_lang(all_lang, 'Urdu', 'Indo-European', 'Iranian')
all_lang = add_lang(all_lang, 'Irish', 'Indo-European', 'Celtic')
all_lang = add_lang(all_lang, 'Guyanese', 'Indo-European', 'Creole')
all_lang = add_lang(all_lang, 'Krio', 'Indo-European', 'Creole')
all_lang = add_lang(all_lang, 'Fante', 'Niger-Congo', 'Atlantic')
all_lang = add_lang(all_lang, 'Hausa', 'Afro-Asiatic', 'Chadic')
all_lang = add_lang(all_lang, 'Fula', 'Niger-Congo', 'Atlantic')
all_lang = add_lang(all_lang, 'Swahili', 'Niger-Congo', 'Bantu')
all_lang = add_lang(all_lang, 'Tonga', 'Niger-Congo', 'Bantu')
all_lang = add_lang(all_lang, 'Sotho', 'Niger-Congo', 'Bantu')
all_lang = add_lang(all_lang, 'Chewa', 'Niger-Congo', 'Bantu')
all_lang = add_lang(all_lang, 'Kalanga', 'Niger-Congo', 'Bantu')
all_lang = add_lang(all_lang, 'Filipino', 'Austronesian', 'Tagalog')

for i in all_lang.index:
    all_lang.loc[i,'Language'] = str(all_lang.loc[i,'Language']).split(' (')[0]
    all_lang.loc[i,'Language'] = str(all_lang.loc[i,'Language']).split('[')[0]

lang_list = all_lang['Language'].to_list()

link = 'https://en.wikipedia.org/wiki/List_of_official_languages_by_country_and_territory'
list2 = pd.read_html(link,)[1]['Language'].to_list()
lang_list = lang_list + list2 + ['Chinese', 'Arabic']

a = []
for i in lang_list:
    if i not in a:
        a.append(i)
lang_list = a

# Get tables identifying which languages belong to which family and branch
family_list = all_lang['Language family'].drop_duplicates().to_list()
branch_list = all_lang['Branch'].drop_duplicates().to_list()

families = pd.DataFrame(columns=['Languages'])
branches = pd.DataFrame(columns=['Languages'])
for i in all_lang.index:
    l = all_lang.loc[i,'Language']
    f = all_lang.loc[i,'Language family']
    b = all_lang.loc[i,'Branch']
    
    try:
        families.at[f,'Languages'] = families.loc[f,'Languages'] + [l]
    except:    
        families.at[f,'Languages'] = [l]

    try:
        branches.at[b,'Languages'] = branches.loc[b,'Languages'] + [l]
    except:    
        branches.at[b,'Languages'] = [l]
        
all_lang.tail(3)

Unnamed: 0,Rank,Language,Native Speakers(millions),Percentageof world pop.(March 2019)[10],Language family,Branch
116,0,Chewa,0,0,Niger-Congo,Bantu
117,0,Kalanga,0,0,Niger-Congo,Bantu
118,0,Filipino,0,0,Austronesian,Tagalog


In [11]:
# Import language data
link = 'https://en.wikipedia.org/wiki/List_of_official_languages_by_country_and_territory'
lang = pd.read_html(link,)[0]

for c in lang.columns[1:]:
    for i in lang.index:
        string = lang.loc[i,c]
        l_list = []
        
        try:
            for j in lang_list:
                if j in string:
                    l_list.append(j)
        except: pass
        
        lang.at[i,c] = l_list

for i in lang.index:
    lang.loc[i,'Country'] = str(lang.loc[i,'Country']).split('[')[0]

lang = lang.replace('United Kingdom and Crown dependencies etc.', 'United Kingdom')
lang.set_index('Country', inplace=True)

language = pd.DataFrame(index=climate.index)

for col in lang_list:
    for i in language.index:
        try:
            if col in lang.loc[i,'Official language']:
                language.loc[i,col] = .1
            elif col in lang.loc[i,'Regional language']:
                language.loc[i,col] = .05
            elif col in lang.loc[i,'Minority language']:
                language.loc[i,col] = .05
            elif col in lang.loc[i,'National language']:
                language.loc[i,col] = .05
            elif col in lang.loc[i,'Widely spoken']:
                language.loc[i,col] = .07
            else:
                language.loc[i,col] = 0

            if col in ['English', 'Mandarin Chinese', 'Spanish', 'Hindi', 'Bengali', 'Portuguese', 'Russian', 'Japanese', 'Arabic']:
                language.loc[i,col] = language.loc[i,col] * 2

        except: pass

for col in branch_list:
    l_list = branches.loc[col, 'Languages']
    
    for i in language.index:
        max = language.loc[i,l_list].max()
        language.loc[i,col] = max * 5

for col in family_list:
    l_list = families.loc[col, 'Languages']
    
    for i in language.index:
        max = language.loc[i,l_list].max()
        language.loc[i,col] = max * 3

link = 'https://en.wikipedia.org/wiki/List_of_countries_by_English-speaking_population'
eng = pd.read_html(link)[4]

eng = pd.DataFrame(np.asarray(eng[['Country','Total English speakers']]), columns=['Country', 'Speakers', 'Eng_pct'])
eng['Eng_pct'] = eng['Eng_pct'].astype('float')
eng.set_index('Country', inplace=True)
eng = normalize(eng, eng.columns,[1,1])

language = language.join(eng['Eng_pct'], how='left')
language.loc['Iceland','Eng_pct'] = 0.98
language.loc['Ireland','Eng_pct'] = 0.99
language['Eng_pct'].fillna(0.0, inplace=True)
language.dropna(inplace=True)

language.head(3)

  self.obj[key] = infer_fill_value(value)


Unnamed: 0_level_0,Mandarin Chinese,Spanish,English,Hindi,Bengali,Portuguese,Russian,Japanese,Punjabi language,Yue Chinese,...,Austronesian,Afroasiatic,Niger–Congo,Kra–Dai,Uralic,Kartvelian,Mongolic,Niger-Congo,Afro-Asiatic,Eng_pct
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Albania,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Algeria,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061554


In [12]:
observation_list = ['United States', 'United Kingdom', 'France', 'Canada', 'Ireland']

language[language.index.isin(observation_list)].replace(0.0,np.nan).dropna(axis=1, how='all')

Unnamed: 0_level_0,English,French,Irish,Romance,Germanic,Celtic,Indo-European,Eng_pct
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Canada,0.2,0.1,,0.5,1.0,,0.6,0.829062
France,,0.1,,0.5,,,0.3,0.38446
Ireland,0.2,,0.1,,1.0,0.5,0.6,0.99
United Kingdom,0.2,0.1,0.1,0.5,1.0,0.5,0.6,0.982846
United States,0.2,,,,1.0,,0.6,0.954591


In [363]:
# Define functions to generate clusters
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.cluster import DBSCAN

def add_k(data, name, size):
    cluster = KMeans(size)
    X = np.asarray(data)
    return pd.DataFrame(cluster.fit_predict(X), columns=[name + 'K_' + str(size)], index=data.index)

def add_spec(data, name, size):
    cluster = SpectralClustering(size)
    X = np.asarray(data)
    return pd.DataFrame(cluster.fit_predict(X), columns=[name + 'Spec_' + str(size)], index=data.index)

def add_dbs(data, name, neigh, core):
    cluster = DBSCAN(neigh, min_samples=core)
    X = np.asarray(data)
    return pd.DataFrame(cluster.fit_predict(X), columns=[name + 'DBS_' + str(core)], index=data.index)

res = pd.DataFrame(index=climate.index)

# Religion Clusters
res = res.join(add_k(religion, 'Religion', 4), how='outer')
res = res.join(add_k(religion, 'Religion', 8), how='outer')
res = res.join(add_k(religion, 'Religion', 16), how='outer')
res = res.join(add_spec(religion, 'Religion', 16), how='outer')
res = res.join(add_spec(religion, 'Religion', 32), how='outer')
res = res.join(add_spec(religion, 'Religion', 64), how='outer')






  est = KMeans(
  est = KMeans(


In [125]:



# Demographic Clusters
res = res.join(add_k(demographics, 'Demo', 4), how='outer')
res = res.join(add_k(demographics, 'Demo', 8), how='outer')
res = res.join(add_k(demographics, 'Demo', 16), how='outer')
res = res.join(add_k(demographics, 'Demo', 32), how='outer')
res = res.join(add_spec(demographics, 'Demo', 16), how='outer')
res = res.join(add_spec(demographics, 'Demo', 32), how='outer')

# Climate Clusters
res = res.join(add_k(climate, 'Climate', 4), how='outer')
res = res.join(add_k(climate, 'Climate', 8), how='outer')
res = res.join(add_k(climate, 'Climate', 16), how='outer')
res = res.join(add_spec(climate, 'Climate', 16), how='outer')
res = res.join(add_spec(climate, 'Climate', 32), how='outer')
res = res.join(add_spec(climate, 'Climate', 64), how='outer')

# Economic Clusters
res = res.join(add_k(economy, 'Economy', 4), how='outer')
res = res.join(add_k(economy, 'Economy', 8), how='outer')
res = res.join(add_k(economy, 'Economy', 16), how='outer')
res = res.join(add_spec(economy, 'Economy', 8), how='outer')
res = res.join(add_spec(economy, 'Economy', 16), how='outer')
res = res.join(add_spec(economy, 'Economy', 32), how='outer')

# Language Clusters
res = res.join(add_k(language[branch_list + ['Eng_pct']], 'Language', 4), how='outer')
res = res.join(add_k(language, 'Language', 8), how='outer')
res = res.join(add_spec(language[branch_list + ['Eng_pct']], 'Language', 16), how='outer')
res = res.join(add_spec(language, 'Language', 64), how='outer')
res = res.join(add_dbs(language, 'Language', 0.2, 3), how='outer')
res = res.join(add_dbs(language, 'Language', 0.3, 4), how='outer')


In [364]:
# Various re-maping
res.replace(-1.0, np.nan, inplace=True)
res.reset_index(inplace=True)

missing = set(res['Country']).difference(set(country_names))
fill = list(set(country_names).difference(set(res['Country'])))

select_fill = []
for i in fill:
    if i[0] == 'G':
        select_fill.append(i)

res.head(3)

Unnamed: 0,Country,ReligionK_4,ReligionK_8,ReligionK_16,ReligionSpec_16,ReligionSpec_32,ReligionSpec_64
0,Afghanistan,0.0,0.0,11.0,0.0,0.0,0.0
1,Albania,0.0,6.0,15.0,7.0,19.0,34.0
2,Algeria,0.0,0.0,11.0,0.0,0.0,0.0


In [366]:
# Evaluate the foreignness rating for each countries by how many countries and how many groups the user has been to
l = ['United States of America','France', 'Iceland!', 'Mexico', 
     'Mozambique', 'Chile', 'Guatemala', 'Monaco', 'South Africa']
n = ['United States of America', 'Canada', 'Mexico', 'Costa Rica', 'Switzerland',
     'Iceland', 'United Kingdom', 'Ireland', 'France', 'Germany', 'Austria',
     'Italy', 'Turkey', 'Kenya', 'Norway', 'Latvia', 'Estonia', 'Finland', 'Hungary']
x = ['United States of America', 'Canada']
y = ['China']

visited_list = y

visited = res[res['Country'].isin(visited_list)]
totals = pd.DataFrame(res['Country'], columns=['Country'])

for col in res.columns[1:]:
    for i in res.index:
        v_list = visited[col].to_list()
        
        if res.loc[i,col] in v_list:
            totals.loc[i,col] = 1
        else:
            totals.loc[i,col] = 0

totals['Sum'] = totals.sum(axis=1)
totals['nans'] = res.isnull().sum(axis=1)
max = totals['Sum'].max()

for i in totals.index:
    if totals.loc[i,'nans'] > 3:
        totals.loc[i,'Sum'] = np.nan
    if totals.loc[i,'Country'] in visited_list:
        totals.loc[i,'Sum'] = max * 1.3

totals.drop(totals[totals['nans'] > 3].index, axis=0, inplace=True)

m = folium.Map(location=[20, 0], zoom_start=2, scrollWheelZoom=False)
folium.TileLayer('cartodbpositron').add_to(m)

color_map = folium.Choropleth(
    geo_data = country_shapes,
    data = totals,
    columns = ['Country','Sum'],
    key_on = 'properties.ADMIN',
    fill_color = 'BrBG',
    nan_fill_color = 'grey'
).add_to(m)

color_map.geojson.add_child(
    folium.features.GeoJsonTooltip(['ADMIN'], labels=False)
)

m

KeyboardInterrupt: 

In [111]:
totals.sort_values(by='nans', ascending=False).reset_index(drop=True).head(11)

Unnamed: 0,Country,DemoK_4,DemoK_8,DemoK_16,DemoK_32,DemoSpec_16,DemoSpec_32,ReligionK_4,ReligionK_8,ReligionK_16,...,ReligionSpec_32,ReligionSpec_64,LanguageK_4,LanguageK_8,LanguageSpec_16,LanguageSpec_64,LanguageDBS_3,LanguageDBS_4,Sum,nans
0,Afghanistan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,New Zealand,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,12.0,0
2,Niger,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,Nigeria,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,4.0,0
4,North Korea,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
5,Macedonia,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0
6,Norway,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,8.0,0
7,Oman,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8,Pakistan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
9,Panama,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
