In [None]:
# Install the third-party packages used below into the current Python environment:
# folium (maps), python-google-places (installed from the GitHub master zipball) and simplejson.
!pip install folium
!pip install https://github.com/slimkrazy/python-google-places/zipball/master
!pip install simplejson

In [None]:
import os
import urllib.parse

import folium
import numpy as np
import pandas as pd

## Load data

We opened the file and determined the delimiter. We also saw that there were two date fields, so we can directly load the data correctly.

In [None]:
# The export is semicolon-separated; the two date columns are parsed while loading.
df = pd.read_csv(
    'P3_GrantExport.csv',
    sep=';',
    parse_dates=['Start Date', 'End Date'],
)

In [None]:
# Work on a copy so the freshly loaded frame `df` is left untouched by the cleanup below.
data = df.copy()
# Display the number of rows.
len(data)

We do a little cleanup of the data.

In [None]:
# 'University' labels that do not designate a precise place: for these rows it
# will not be possible to determine a canton, so the label is replaced by NaN.
non_locatable_labels = [
    'Nicht zuteilbar - NA',
    'Firmen/Privatwirtschaft - FP',
    'NPO (Biblioth., Museen, Verwalt.) - NPO',
    'Weitere Institute - FINST',
]

for label in non_locatable_labels:
    mask = data['University'] == label
    # Report how many rows carry this label before it is blanked.
    print("Nanify {}".format(mask.sum()))
    # Restrict the assignment to the 'University' column: assigning through the
    # row mask alone would overwrite every column of the matching rows with NaN.
    data.loc[mask, 'University'] = np.nan

The homework suggests using the Geocodename API; instead, we will use the Google API. We will rely on the wrapper [python-google-places](https://github.com/slimkrazy/python-google-places) built by slimkrazy.

In [None]:
from googleplaces import GooglePlaces, types, lang
import json

# The Google API key is read from the environment so that no credential is
# stored in the notebook. Export GOOGLE_PLACES_API_KEY before starting the kernel.
API_KEY = os.environ.get('GOOGLE_PLACES_API_KEY')
if not API_KEY:
    raise RuntimeError(
        "Google API key not found: set the GOOGLE_PLACES_API_KEY environment variable."
    )

# Client shared by the lookup helpers defined below.
google_places = GooglePlaces(API_KEY)

def get_info(name):
    """Look up `name` with a Google Places text search.

    Only the first place of the result is considered; its details are fetched
    and searched for an 'administrative_area_level_1' component.

    Returns a dict with the keys 'canton' and 'geo_location'. Both stay None
    when the search returns no place or when the first place has no such
    component; otherwise they hold the component's short name and the place's
    geo_location.
    """
    info = {
        'canton': None,
        'geo_location': None,
    }

    matches = google_places.text_search(query=name).places
    if not matches:
        return info

    if len(matches) > 1:
        # Several candidates: flag it, the first one is still used.
        print("  !! More than one !!")

    best = matches[0]
    best.get_details()

    canton = get_area_level_1(best.details)
    if canton:
        info.update(canton=canton, geo_location=best.geo_location)

    return info

def get_area_level_1(details):
    """Return the short name of the 'administrative_area_level_1' component.

    Parameters
    ----------
    details : dict or None
        Place details; its 'address_components' entry is expected to be a list
        of dicts carrying a 'types' list and a 'short_name'.

    Returns
    -------
    The 'short_name' of the first component whose 'types' contain
    'administrative_area_level_1', or None when there is no such component
    (including when `details` is empty or has no 'address_components').
    """
    wanted_type = 'administrative_area_level_1'
    # Tolerate missing details / components instead of raising KeyError.
    components = (details or {}).get('address_components') or []
    for component in components:
        if wanted_type in component.get('types', ()):
            return component.get('short_name')
    return None
            

In [None]:
# Distinct non-null values of the 'University' column: these are the names to look up.
names = data['University'].dropna().unique()
print("Total unique 'university': {}".format(len(names)))

We will now try to find the "administrative_area_level_1", i.e. the canton, for each "University".

In [None]:
import re
def check_dict(d):
    """Split a name -> info mapping by whether a canton was found.

    Returns a tuple (good, empty): `good` is a dict holding the entries whose
    'canton' value is truthy, `empty` is the list of names whose 'canton'
    value is falsy. Both keep the iteration order of `d`.
    """
    resolved = {}
    unresolved = []
    for key, entry in d.items():
        if entry['canton']:
            resolved[key] = entry
        else:
            unresolved.append(key)
    return resolved, unresolved

def pretty_print_get_info(name):
    """Run get_info(name), echoing the query and the canton found.

    Prints `name`, then either "  -> <canton>" or "  -> Unknown" when the
    'canton' entry is falsy. Returns the dict produced by get_info unchanged.
    """
    print(name)
    info = get_info(name)
    found = info['canton']
    if found:
        print("  -> " + found)
    else:
        print("  -> Unknown")
    return info

def parse_name(x):
    """Split a label of the form '<name> - <abbreviation>'.

    Returns the tuple (name, abbreviation) when `x` contains ' - ' followed by
    word characters; the name is the shortest prefix before such a separator.
    Otherwise returns the one-element list [x], so that element 0 is always
    the text to search for.
    """
    # Raw string: the pattern is unchanged, but '\ ' and '\w' are no longer
    # invalid string escapes (which recent Python versions warn about).
    match = re.search(r'(.+?)\ -\ (\w+)', x)
    if match:
        return (match.group(1), match.group(2))
    return [x]

First we do it the naive way: just take the full university name and ask Google if it knows it.

In [None]:
# First pass: query every full label (suffixed with " schweiz"), then separate
# the names that got a canton from those that did not.
name2dict_first = {
    university: pretty_print_get_info(university + " schweiz")
    for university in names
}
name2dict, unknown = check_dict(name2dict_first)
print()
print("Still {} items without a canton".format(len(unknown)))

We then try to find the remaining ones using only the name (without the abbreviation).

In [None]:
# Second pass, only for the names still unresolved: query with the part of the
# label that parse_name returns first (the name without its abbreviation).
names_empty = {university: parse_name(university) for university in unknown}
names_empty2dict = {
    university: pretty_print_get_info(parts[0] + " schweiz")
    for university, parts in names_empty.items()
    if parts
}
names_empty2dict, unknown = check_dict(names_empty2dict)
print()
print("Still {} items without a canton".format(len(unknown)))

There are still some items without a canton. There is one possibility left: we could use the abbreviation to find the canton. Even if we tried that, we would still have to verify the results to be sure, so we will treat these items by hand.

In [None]:
# Display the names that still have no canton after the second lookup pass.
unknown

In [None]:
# Manually filled entries for labels handled by hand. Each value has the same
# layout as the dicts returned by get_info: a 'canton' code and a
# 'geo_location' dict with 'lat' / 'lng'.
by_hand = {
    'Swiss Institute of Bioinformatics - SIB': {'canton': 'VD', 'geo_location': {'lat': 46.519433, 'lng': 6.574533}},
    'Pädag. Hochschule Tessin (Teilschule SUPSI) - ASP': {'canton': 'TI', 'geo_location': {'lat': 46.023528, 'lng': 8.917150}},
    'Forschungsinstitut für Opthalmologie - IRO': {'canton': 'VS', 'geo_location':  {'lat':46.233131,  'lng':7.383104}},
    'Staatsunabh. Theologische Hochschule Basel - STHB': {'canton': 'BL', 'geo_location': {'lat':47.577821, 'lng':7.650187}},
    'Physikal.-Meteorolog. Observatorium Davos - PMOD' : {'canton': 'GR', 'geo_location': {'lat':46.814241, 'lng':9.844508}}, 
    'Forschungskommission SAGW' :  {'canton': 'LU', 'geo_location': {'lat':47.050179, 'lng':8.312586}}, 
}

We can now just merge the dictionaries. We are just missing the "Weitere Institute - FINST".

In [None]:
# Combine the three lookup tables; on a duplicate name the later table wins
# (second pass over first pass, manual entries over both).
name2dict = {**name2dict, **names_empty2dict, **by_hand}
print("Total unique 'university': {}".format(len(name2dict)))

In [None]:
# Map with one marker per resolved name; the popup shows "<canton> - <name>".
# NOTE(review): the start location is presumably the centre of Switzerland — confirm.
map_1 = folium.Map(location=[46.8182, 8.2275], zoom_start=8)

for name, value in name2dict.items():
    canton = value['canton']
    geo_location = value['geo_location']
    # Coordinates may not be plain floats (the lookup table is serialised with
    # use_decimal=True when saved), so they are converted explicitly.
    position = [float(geo_location['lat']), float(geo_location['lng'])]
    folium.Marker(position, popup='{} - {}'.format(canton, name)).add_to(map_1)
map_1

Finally, we add a column to the dataframe and save it. This way there is no need to rerun all of the above every time we restart the kernel.

In [None]:
import simplejson as json
# Persist the lookup table. The dict is serialised once, straight into the
# file: dumping an already-dumped string would store a JSON string that itself
# contains JSON. `use_decimal` is a simplejson option (json is simplejson here).
with open('name2dict.txt', 'w') as outfile:
    json.dump(name2dict, outfile, use_decimal=True)

# Map each 'University' label to its canton. Mapping through a dict leaves NaN
# for null labels and for labels absent from the table instead of raising.
canton_by_name = {name: info['canton'] for name, info in name2dict.items()}
data['canton'] = data['University'].map(canton_by_name)

# Make labels that could not be mapped visible rather than silently dropping them.
unmapped = data.loc[data['University'].notnull() & data['canton'].isnull(), 'University'].unique()
if len(unmapped):
    print("No canton found for: {}".format(list(unmapped)))

data.to_csv('P3_GrantExport_meta.csv')

# Visualization



In [None]:
# Reload the enriched export. The file was written with its index as the first
# column, so that column is read back as the index rather than as a data column.
data = pd.read_csv('P3_GrantExport_meta.csv', index_col=0)