In [112]:
import folium
import os
import json
import pandas as pd
import random
import numpy as np
import math

import urllib.request
import requests

## Import the data and basic wrangling

In [113]:
P3_data = os.path.join('data', 'P3_GrantExport_wo_keyword.csv')

df = pd.read_csv(P3_data)
df = df.dropna()
df = df[['University','Approved Amount']]
df.head()

Unnamed: 0,University,Approved Amount
22671,Université de Lausanne - LA,833333.0
23857,Universität Basel - BS,663000.0
24492,Universität Basel - BS,235000.0
24641,Université de Fribourg - FR,962090.0
24784,Universität Zürich - ZH,96625.0


In [114]:
# cleaning
df = df[df['Approved Amount'] != 'data not included in P3']

# make the approved amount col a float
# remove a false positive warning
pd.options.mode.chained_assignment = None
df['Approved Amount'] = df['Approved Amount'].map(lambda x: float(x))
df = df[[university not in 'Nicht zuteilbar - NA' for university in df['University']]] # remove missing unis
df.head()

Unnamed: 0,University,Approved Amount
22671,Université de Lausanne - LA,833333.0
23857,Universität Basel - BS,663000.0
24492,Universität Basel - BS,235000.0
24641,Université de Fribourg - FR,962090.0
24784,Universität Zürich - ZH,96625.0


In [None]:
# only 74 values, we could do the mapping manually
len(list(df['University'].value_counts()))

74

In [None]:
def run_query(uni_name):
    return requests.get(url='http://api.geonames.org/search',  
    params = {
            'q': uni_name, 
            'country': 'CH',
            'maxRows': '10',
            'username': 'mgoretti',
            'type': 'json'
        })
    
def get_canton(uni_name):
    res = json.loads(run_query(uni_name).text)
#     print(res.text)
    if (res['totalResultsCount'] > 0):
        return res['geonames'][0]['adminCode1']
    else:
        return None
  
    
def parse_name(uni_name):
    names = uni_name.split(' - ')
    if len(names) == 1:
        return get_canton(names[0])
    else:
        # we prefer the result of the value before the -, as it's a more precise name
        # but we can use the second if we don't get any result for the first
        return get_canton(names[0]) or get_canton(names[1]) 
    
        
    
def fetch_canton(entries):
    uni_name = entries['University'].iloc[0]
    entries['canton'] = parse_name(uni_name)
    return entries


# print(urllib.request.urlopen(build_query('EPFL')).read()) 
grants = df.groupby('University').apply(fetch_canton)



In [None]:
# we still have 1476 entries that don't have a matching
len(grants[grants['canton'].isnull()])

In [None]:
# check if the matched cantons are correct
grants[grants['canton'].notnull()].groupby('University').first()

Add some manual mapping to complete the results based on google searches

In [None]:
#Remove private sector entries as we are interested in grant given to universities
grants = grants[grants.University != 'Firmen/Privatwirtschaft - FP']

#Remove NPO entries for same reasons
grants = grants[grants.University != 'NPO (Biblioth., Museen, Verwalt.) - NPO']

#TODO check if this should be included imo it is international help so no?
grants = grants[grants.University != 'Weitere Spitäler - ASPIT']

def manualFetch(entries):
    name = manualDict.get(entries['University'].iloc[0])
    if (name):
        entries['canton'] = name
    return entries

manualDict = {'Friedrich Miescher Institute - FMI':'BS',
              'Inst. de Hautes Etudes Internat. et du Dév - IHEID' : 'GE',
              'Hochschule Luzern - HSLU' : 'LU',
              'Zürcher Fachhochschule (ohne PH) - ZFH' : 'ZH',
              'Università della Svizzera italiana - USI' : 'TI',
              'Pädagogische Hochschule Zürich - PHZFH' : 'ZH',
              "Centre de rech. sur l'environnement alpin - CREALP": 'VS',
              'Idiap Research Institute - IDIAP' : 'VS',
              'Haute école pédagogique du canton de Vaud - HEPL': 'VD',
              'Berner Fachhochschule - BFH' : 'BE',
              'Pädagogische Hochschule Nordwestschweiz - PHFHNW' : 'AG',
              'Ente Ospedaliero Cantonale - EOC' : 'TI',
              'Swiss Institute of Bioinformatics - SIB' : 'VD',
              'Pädagogische Hochschule Luzern - PHLU' : 'LU',
             }

grants = grants.groupby('University').apply(manualFetch)

In [None]:
#Check that we reached the 95% threshold
entryLeft = len(grants[grants['canton'].isnull()])
entriesTotal = df.shape[0]
print("{} / {} = {}%".format(entryLeft, entriesTotal, (entryLeft / entries)*100))

In [None]:
# compute the sum of the grant by canton

# def sum_approved_amout(entries):
#     print(entries['Approved Amount'].apply(lambda x: float(x)))
# grants[].groupby('canton').apply(sum_approved_amout)
grants_by_canton = grants[['Approved Amount', 'canton']].groupby('canton').sum()
grants_by_canton


In [None]:
# apply a transformation
# log
trans_grants_by_canton = grants_by_canton.copy()
trans_grants_by_canton['Approved Amount'] = trans_grants_by_canton['Approved Amount'].apply(lambda x: math.log10(x))
trans_grants_by_canton

In [None]:
# get canton info
swiss_cantons = os.path.join('data', 'ch-cantons.topojson.json')

cantons = json.load(open(swiss_cantons))
cantons_df = pd.DataFrame(pd.Series(x['id']) for x in cantons['objects']['cantons']['geometries'])

cantons_df.columns = ['canton']
cantons_df

# join on canton 
# left_grants_by_canton = cantons_df.join(grants_by_canton, on = ['canton'], how = 'left')
left_grants_by_canton = cantons_df.join(trans_grants_by_canton, on = ['canton'], how = 'left')
left_grants_by_canton.fillna(0, inplace = True)
left_grants_by_canton

In [None]:

swiss_map = folium.Map([46.8, 8.5], tiles='cartodbpositron', zoom_start=8)


swiss_map.choropleth(geo_path=swiss_cantons, data=left_grants_by_canton,
               columns=['canton', 'Approved Amount'],
               key_on='feature.id',
#                threshold_scale=[0, 1e7, 5e7, 1e8, 5e8, ],
               threshold_scale=[0, 5, 7, 8, 8.5, 9],
               fill_color='YlOrRd',
               topojson='objects.cantons',
               legend_name='Unemployment Rate (%)' #doesn't work?
                    ) 
swiss_map
