In [None]:
# Install folium in the current python environement
!pip install folium
!pip install https://github.com/slimkrazy/python-google-places/zipball/master

In [None]:
import pandas as pd
import numpy as np
import urllib.parse

## Load data

We opened the file and determined the delimiter. We also saw that there were two date fields, so we can load the data correctly in one step.

In [None]:
# Load the grant export: fields are separated by ';' and the two date
# columns are parsed while reading.
df = pd.read_csv(
    'P3_GrantExport.csv',
    sep=";",
    parse_dates=['Start Date', 'End Date'],
)

In [None]:
# Work on a copy so the frame loaded from disk (`df`) stays untouched,
# then show the number of rows.
data = df.copy(deep=True)
data.shape[0]

We do a little cleanup of the data.

In [None]:
# 'University' labels that do not designate a precise location: it will not be
# possible to derive a canton from them, so they are replaced with NaN.
non_locatable_labels = [
    'Nicht zuteilbar - NA',
    'Firmen/Privatwirtschaft - FP',
    'NPO (Biblioth., Museen, Verwalt.) - NPO',
]

for label in non_locatable_labels:
    mask = data['University'] == label
    # Report how many rows carry the label that is about to be blanked.
    print("Nanify {}".format(mask.sum()))
    # Assign through `.loc` with an explicit column: this updates `data`
    # itself (an in-place method on a column selection is not guaranteed to)
    # and leaves every other column of the matching rows untouched.
    data.loc[mask, 'University'] = np.nan

The homework suggests using the Geocodename API; instead, we will use the Google Places API. We will rely on the wrapper [python-google-places](https://github.com/slimkrazy/python-google-places) built by slimkrazy.

In [None]:
from googleplaces import GooglePlaces, types, lang
import json

import os

# The Google Places API key is a credential and must not be stored in the
# notebook: it is read from the GOOGLE_PLACES_API_KEY environment variable.
# Any key that was ever committed in this file should be considered leaked
# and be revoked.
API_KEY = os.environ.get('GOOGLE_PLACES_API_KEY')
if not API_KEY:
    raise RuntimeError(
        "Set the GOOGLE_PLACES_API_KEY environment variable before running this cell."
    )
google_places = GooglePlaces(API_KEY)

def get_info(name):
    """Run a Google Places text search for `name` and resolve its first hit.

    Only the first returned place is used; a warning line is printed when the
    search returned more than one. The details of that place are fetched and
    handed to `get_area_level_1`.

    Returns whatever `get_area_level_1` yields for the first hit, or None
    when the search returned no place at all.
    """
    candidates = google_places.text_search(query=name).places
    if not candidates:
        return None
    if len(candidates) > 1:
        print("  !! More than one !!")
    best = candidates[0]
    # Details must be requested explicitly before `best.details` is read.
    best.get_details()
    return get_area_level_1(best.details)

def get_area_level_1(details):
    """Return the short name of the 'administrative_area_level_1' component.

    Parameters
    ----------
    details : dict or None
        Place details holding an 'address_components' list; each component
        is a dict with a 'types' list and a 'short_name'.

    Returns
    -------
    The 'short_name' of the first component whose types include
    'administrative_area_level_1', or None when there is no such component
    (including when `details` is empty or has no 'address_components').
    """
    interest_type = 'administrative_area_level_1'
    if not details:
        return None
    # Missing keys are treated as "no match" rather than raising.
    for component in details.get('address_components', []):
        if interest_type in component.get('types', ()):
            return component['short_name']
    return None
            

In [None]:
# Distinct, non-missing values of the 'University' column.
names = data['University'].dropna().unique()
print("Total unique 'university': {}".format(len(names)))

We will now try to find the "administrative_area_level_1", i.e. the canton, for each "University".

In [None]:
import re
def check_dict(d):
    """Split `d` into its truthy entries and the keys of its falsy entries.

    Returns a tuple `(good, empty)`: `good` is a dict holding the items of
    `d` whose value is truthy, `empty` is a list of the keys whose value is
    falsy. Both follow the iteration order of `d`.
    """
    good = {}
    empty = []
    for name, value in d.items():
        if value:
            good[name] = value
        else:
            empty.append(name)
    return good, empty

def pretty_print_get_info(name):
    """Print `name`, look it up with `get_info`, print the outcome, return it.

    The outcome line is "  -> <canton>" when `get_info` returned a truthy
    value and "  -> Unknown" otherwise. The value from `get_info` is returned
    unchanged.
    """
    print(name)
    canton = get_info(name)
    if canton:
        print("  -> " + canton)
    else:
        print("  -> Unknown")
    return canton

def parse_name(x):
    """Split a "<full name> - <abbreviation>" label into its two parts.

    The label is cut at the first " - " that is followed by at least one
    word character; the abbreviation is that run of word characters.

    Returns the tuple `(full_name, abbreviation)`, or "" when `x` does not
    follow the pattern (a falsy value, so callers can filter with `if p`).
    """
    # Raw string so that the regex escape `\w` is not interpreted as an
    # (invalid) Python string escape; a plain space needs no escaping.
    match = re.search(r'(.+?) - (\w+)', x)
    if match:
        return (match.group(1), match.group(2))
    return ""

First we do it the naive way: just take the full university name and ask Google whether it knows it.

In [None]:
# First pass: query with the complete label, with " schweiz" appended.
name2canton_first = {
    name: pretty_print_get_info("{} schweiz".format(name))
    for name in names
}

# Separate the labels that got a canton from those that did not.
name2canton, unknown = check_dict(name2canton_first)

print()
print("Still {} items without a canton".format(len(unknown)))

We then retry using only the name (without the abbreviation).

In [None]:
# Second pass: retry the unresolved labels using only the part before " - ".
names_empty = {name: parse_name(name) for name in unknown}

# Labels that `parse_name` cannot split are kept with a None result instead
# of being dropped, so they still appear in `unknown` and in the count below.
names_empty2canton = {
    name: pretty_print_get_info(p[0] + " schweiz") if p else None
    for name, p in names_empty.items()
}

# Keep only the resolved labels; `unknown` now lists what is still missing.
names_empty2canton, unknown = check_dict(names_empty2canton)
print()
print("Still {} items without a canton".format(len(unknown)))

There are still some items without a canton. There is one possibility left: we could use the abbreviation to find the canton. However, even if we tried that, we would still have to verify the results to be sure, so we will treat them by hand.

In [None]:
# Display the labels that are still unresolved after both lookup passes.
unknown

In [None]:
# Cantons assigned manually to the labels the automatic lookups did not resolve.
by_hand = {
    'Swiss Institute of Bioinformatics - SIB': 'VD',
    # 'Weitere Institute - FINST' is left out on purpose: its canton is unknown.
    'Pädag. Hochschule Tessin (Teilschule SUPSI) - ASP': 'TI',
    'Forschungsinstitut für Opthalmologie - IRO': 'VS',
    'Staatsunabh. Theologische Hochschule Basel - STHB': 'BL',
    'Physikal.-Meteorolog. Observatorium Davos - PMOD' : 'GR', 
}

We can now just merge the dictionaries. We are only missing "Weitere Institute - FINST".

In [None]:
# Merge the three sources. Later ones win on duplicate keys: manual entries
# override the second pass, which overrides the first pass.
name2canton = {**name2canton_first, **names_empty2canton, **by_hand}
print("Total unique 'university': {}".format(len(name2canton)))

Finally, we add a column to the dataframe and save it. This way, there is no need to rerun all of the above every time we restart the kernel.

In [None]:
# Map every label to its canton through the lookup table; rows whose
# 'University' is missing get a missing canton.
data['canton'] = data['University'].map(name2canton)

# index=False: the frame still has the default integer index from the initial
# load, which carries no information; writing it would add a spurious
# "Unnamed: 0" column when the file is read back.
data.to_csv('P3_GrantExport_meta.csv', index=False)

# Visualization



In [None]:
# Reload the frame saved earlier so this section can start from the cached file.
# NOTE(review): unlike the initial load, no `parse_dates` is passed here, so
# 'Start Date' / 'End Date' are not parsed as dates — confirm this is intended.
data = pd.read_csv('P3_GrantExport_meta.csv')