In [None]:
# Install folium and other component in the current python environement if needed
!pip install folium
!pip install https://github.com/slimkrazy/python-google-places/zipball/master
!pip install simplejson

In [None]:
import simplejson as json
import pandas as pd
import numpy as np
import urllib.parse
import folium
import pprint
%matplotlib inline  

## Load data

Just loading the data using _read_csv_ with its default arguments won't work: the function assumes that the delimiter is a comma and, unsurprisingly, this is not the delimiter used in this dataset. By opening the file, we saw that the delimiter is a semicolon (";") and that two columns contain date information. 

In [None]:
data = pd.read_csv(
    'P3_GrantExport.csv',
    sep=";",  # the export is semicolon-separated, not comma-separated
    parse_dates=['Start Date', 'End Date'],
)
print("Number of entries: ", data.shape[0])

Let's directly rename some of the columns that we will be using later on. This way there is no more need to rely on the bracket notation!

In [None]:
# Short, attribute-friendly aliases for the columns used in the rest of the notebook.
cols = dict([
    ('Project Title', 'title'),
    ('University', 'university'),
    ('Approved Amount', 'amount'),
])

data = data.rename(columns=cols)

First we do a little cleanup of the data, since some of the information is not relevant to our study of this dataset. For example, some of the values of the university field cannot be pinpointed to a specific location, thus we "nanify" them (replace them with NaN).

In [None]:
#
# Clean the 'Approved Amount' field
#

# Every value that cannot be read as a number is coerced to NaN.
data['amount'] = pd.to_numeric(data['amount'], errors='coerce')

#
# Clean the 'University' field
#

def nanify(name, df=None):
    '''Replace every occurrence of a university name by NaN, in place.

    Prints how many rows are affected before blanking them out.

    Args:
        name (str): The exact value of the 'university' column to blank out.
        df (pandas.DataFrame): The frame to modify. When omitted, the global
            ``data`` frame is used, looked up at call time.
    '''
    if df is None:
        # Resolve at call time so that a later re-assignment of `data` is honoured.
        df = data
    matches = df['university'] == name
    print("Nanify ", matches.sum(), name)
    # Assign through .loc on the frame itself; an in-place replace on the
    # `df.university` attribute may act on a temporary and leave `df` untouched.
    df.loc[matches, 'university'] = np.nan

# Labels that cannot be tied to one specific location:
#   - 'Nicht zuteilbar - NA': not even known
#   - 'Firmen/Privatwirtschaft - FP': private firms, can be anywhere in Switzerland
#   - 'NPO (Biblioth., Museen, Verwalt.) - NPO': non-profit organisations
#   - 'Weitere Institute - FINST': other institutes; we are not sure about this one,
#     but we didn't find any information about "FINST", so nanify.
for unlocatable in ('Nicht zuteilbar - NA',
                    'Firmen/Privatwirtschaft - FP',
                    'NPO (Biblioth., Museen, Verwalt.) - NPO',
                    'Weitere Institute - FINST'):
    nanify(unlocatable)

print()
print("Non null entries: ", data.university.notnull().sum())

Now that we have a "clean" dataset (at least for the "University" field), let's find the set of universities:

In [None]:
# Distinct, non-missing university labels, in order of first appearance.
university_names = data.university.dropna().unique()
print("Total unique 'university': {}".format(len(university_names)))

----------

## Utils
The homework suggests using the Geocodename API; instead we will use the Google API. We will rely on the wrapper [python-google-places](https://github.com/slimkrazy/python-google-places) built by slimkrazy. 

In [None]:
from googleplaces import GooglePlaces, types, lang
# Keep `json` bound to simplejson, exactly as in the first import cell; importing
# the standard-library module here would silently swap the JSON implementation
# used by the rest of the notebook.
import simplejson as json
import os
import re

# Never commit real API keys. To run the code, export your own key before
# starting the kernel:
#     export GOOGLE_PLACES_API_KEY=<your key>
# The placeholder below is only a fallback and is not a valid key.
API_KEY = os.environ.get('GOOGLE_PLACES_API_KEY', 'YOURBEAUTIFULGOOGLEAPIKEY')

google_places = GooglePlaces(API_KEY)


def get_info(name):
    '''Return a dictionary containing the canton and localisation information about a university.

    Only the first place returned by the Google text search is used; a warning
    is printed when the search returns more than one place.

    Args:
        name (str): The name of the place to query google about.

    Returns:
        A python dict with the keys 'canton' and 'geo_location'. Both values are
        None if google returns no result, or if the first result has no
        'administrative_area_level_1' address component.
    '''

    info = {
        'canton': None,
        'geo_location': None,
    }

    query_result = google_places.text_search(query=name)
    places = query_result.places
    if not places:
        return info

    if len(places) > 1:
        # Just a warning: the first hit is still the one that is used.
        print("  !! More than one !! ({})".format(name))

    place = places[0]
    # Fetch the detailed record; the address components are read from it.
    place.get_details()

    canton = get_area_level_1(place.details)
    if canton:
        # The location is only kept together with a canton, so a result is
        # either fully filled in or fully None.
        info['canton'] = canton
        info['geo_location'] = place.geo_location

    return info

def get_info_wrapper(name):
    '''Look a place up with get_info and report the outcome on stdout.

    The queried name is printed first, then the canton that was found
    (or "Unknown" when there is none).

    Args:
        name (str): The name of the place to query google about.

    Returns:
        The dict returned by get_info for "name", unchanged.
    '''

    print(name)
    info = get_info(name)
    canton_label = info['canton'] or "Unknown"
    print("  -> " + canton_label)
    return info

def get_area_level_1(details):
    ''' Return the short name of the administrative area level 1.

    Args:
        details (dict):  A dict matching the JSON response from Google.

    Returns:
        The 'short_name' of the first address component whose types include
        'administrative_area_level_1' (presumably the canton code), or None if
        the information is not present.
    '''

    interest_type = 'administrative_area_level_1'
    # Tolerate an empty details dict or a response without address components.
    components = (details or {}).get('address_components') or []
    for component in components:
        if interest_type in component.get('types', ()):
            return component.get('short_name')
    return None

        
def check_dict(d):
    ''' Split lookup results into those with a canton and those without.

    Args:
        d (dict):  Maps a name to a dict that has a 'canton' key.

    Returns:
        A tuple (good, empty): "good" is a dict holding the entries whose 'canton'
        is truthy, "empty" is the list of names whose 'canton' is falsy.
    '''

    good = {}
    empty = []
    for name, values in d.items():
        if values['canton']:
            good[name] = values
        else:
            empty.append(name)
    return good, empty


def parse_name(x):
    ''' Returns the parsed input.

    Almost all university names follow the format "X - Y" where X is the name of the
    university and Y is the abbreviation or the canton.

    Args:
        x (str):  The name to parse

    Returns:
        A tuple of (name, abbr), or a list containing the unchanged name if it
        couldn't be parsed. Either way, element 0 is the name to use.
    '''

    # Raw string with plain spaces: no backslash escapes are needed to match " - ".
    # The name part is non-greedy, so it stops at the first " - " separator.
    match = re.search(r'(.+?) - (\w+)', x)
    if match:
        return (match.group(1), match.group(2))
    return [x]

--------

# Canton information retrieval

First we do it the naive way: just take the full university name and ask Google if it knows it. 

In [None]:
# First attempt: query Google with the full university label plus the country.
# get_info_wrapper is the lookup helper defined in the Utils section.
name2dict_first = {name: get_info_wrapper(name + " schweiz") for name in university_names}
name2dict, unknown = check_dict(name2dict_first)
print()
print("Still {} items without a canton".format(len(unknown)))

We then try using only the name (without the abbreviation).

In [None]:
# Second attempt: retry the unresolved labels with the abbreviation stripped off.
# Element 0 of parse_name's result is always the name part.
names_empty = {name: parse_name(name) for name in unknown}
# get_info_wrapper is the lookup helper defined in the Utils section.
names_empty2dict = {name: get_info_wrapper(p[0] + " schweiz") for name, p in names_empty.items() if p}
names_empty2dict, unknown = check_dict(names_empty2dict)
print()
print("Still {} items without a canton".format(len(unknown)))

There are still some items without a canton. There is one possibility left: we could use the abbreviation to find the canton. Due to the low number of unknown fields we decided to fill them in by hand directly; indeed, we would still have had to verify them by hand to be sure of the results.

In [None]:
# Display the names for which the lookups above produced no canton.
unknown

In [None]:
# Hand-filled entries, in the same {'canton', 'geo_location': {'lat', 'lng'}} shape
# as the dictionaries produced by get_info.
by_hand = {
    'Swiss Institute of Bioinformatics - SIB': {'canton': 'VD', 'geo_location': {'lat': 46.519433, 'lng': 6.574533}},
    'Pädag. Hochschule Tessin (Teilschule SUPSI) - ASP': {'canton': 'TI', 'geo_location': {'lat': 46.023528, 'lng': 8.917150}},
    'Forschungsinstitut für Opthalmologie - IRO': {'canton': 'VS', 'geo_location':  {'lat':46.233131,  'lng':7.383104}},
    'Staatsunabh. Theologische Hochschule Basel - STHB': {'canton': 'BL', 'geo_location': {'lat':47.577821, 'lng':7.650187}},
    'Physikal.-Meteorolog. Observatorium Davos - PMOD' : {'canton': 'GR', 'geo_location': {'lat':46.814241, 'lng':9.844508}}, 
    'Forschungskommission SAGW' :  {'canton': 'LU', 'geo_location': {'lat':47.050179, 'lng':8.312586}}, 
}

We can now just merge the dictionaries.

In [None]:
# Merge the three sources into a fresh dict; on a duplicate name the later
# source wins (hand-filled over second lookup over first lookup).
name2dict = dict(name2dict)
name2dict.update(names_empty2dict)
name2dict.update(by_hand)
print("Total unique 'university': {}".format(len(name2dict)))

In [None]:
# Pretty-print the merged name -> {'canton', 'geo_location'} mapping for a visual check.
pprint.pprint(name2dict)

As we can see there is a weird canton: 'Lazio'. After investigation, it is the value attributed to _Istituto Svizzero di Roma - ISR schweiz_. The value in the table represents grants attributed to a foundation based in Italy with at least one branch in Switzerland. We will just drop those. 

In [None]:
# Show every distinct canton value that was retrieved.
print({info['canton'] for info in name2dict.values()})
# Blank out the Rome-based institute in the dataset (no trailing colon: this is
# a plain function call, not a block header).
nanify('Istituto Svizzero di Roma - ISR')

### University geolocation
To verify that our retrieved data are correct we will show them on a map.

In [None]:
map_1 = folium.Map(location=[46.484, 8.1336], zoom_start=7)

# Add a marker for each university
for name, value in name2dict.items():
    canton = value['canton']
    geo_location = value['geo_location']
    folium.Marker([geo_location['lat'], geo_location['lng']], popup='{} - {}'.format(canton, name)).add_to(map_1)

# Overlay the canton borders. The file is opened in a `with` block so that the
# handle is closed as soon as folium has been handed its content.
with open('ch-cantons.topojson.json') as canton_topo:
    folium.TopoJson(canton_topo, 'objects.cantons', name='topojson',).add_to(map_1)

map_1

Finally, we add a column to the dataframe and save it. This way there is no need to rerun all of the above every time we restart the kernel.

In [None]:
# Dump the dictionary to disk
# Serialise exactly once, straight into the file. `default=float` converts any
# value the active json module cannot encode natively (presumably
# decimal.Decimal coordinates), so this works with both simplejson and the
# standard-library json.
with open('name2dict.txt', 'w') as outfile:
    json.dump(name2dict, outfile, default=float)

# Dump the new dataset to disk
# The column is called 'university' after the rename at the top of the notebook.
# Rows without a university are skipped by dropna() and get a NaN canton when
# the result is aligned back on the index.
data['canton'] = data['university'].dropna().map(lambda x: name2dict[x]['canton'])
data.to_csv('P3_GrantExport_meta.csv')

----------

# Visualization

In [None]:
# Load back the dataset if needed
# The first CSV column holds the index that was written out by to_csv.
data = pd.read_csv('P3_GrantExport_meta.csv', index_col=[0]).rename(columns=cols)
data['amount'] = pd.to_numeric(data['amount'], errors='coerce')
data.head(2)

First we drop every row that has _NA_ in either the university, the canton, or the approved amount column. These rows won't be useful for our analysis.

In [None]:
# Keep only the rows in which all three analysis columns are present.
data = data.dropna(subset=['university', 'canton', 'amount'], how='any')
print("Total number of usable projets: ", data.shape[0])

We can now find the total amount for each canton. Some cantons are not represented; their "total amount" will thus be 0.

In [None]:
canton_file = 'ch-cantons.topojson.json'
with open(canton_file) as data_file:
    geojson = json.load(data_file)

# Canton identifiers, in the order they appear in the TopoJSON file.
swiss_canton = [c['id'] for c in geojson['objects']['cantons']['geometries']]
# Use the named aggregation 'sum' rather than the Python builtin `sum`: it goes
# straight to pandas' own groupby sum and avoids the deprecation warning that
# recent pandas versions raise for the builtin.
total_amount = data.groupby(['canton']).agg({'amount': 'sum'})
# Cantons without any grant are absent after the groupby; reindex adds them
# back and fillna turns their missing total into 0.
total_amount = total_amount.reindex(swiss_canton).fillna(0).sort_values(by=['amount'], ascending=False)
total_amount.head()

We can now visualize this total amount. The data is quite skewed, as we can see in the plot below. So instead of using a simple linear scale, we will rely on the quantiles. 

In [None]:
# Bar chart of the 'amount' column of total_amount, one bar per canton.
total_amount.plot.bar()

The map loses some style points: the borders are not perfectly aligned :(

In [None]:
# Quantiles of the per-canton totals. They are not used further in this cell:
# `scale` below is computed by folium's split_six helper instead.
quantiles = total_amount.quantile([.10, .45, .60, .85, .90, 1.0])
# Threshold values handed to the choropleth as its colour scale.
scale = folium.utilities.split_six(total_amount)# quantiles.amount.tolist()

# NOTE(review): the name `map` shadows the Python builtin of the same name.
map = folium.Map(location=[46.8,8.2], zoom_start=8, tiles='Mapbox bright')
# Colour each canton by its total amount; the rows of the data frame are matched
# to the TopoJSON geometries through 'feature.id' (the canton identifier).
map.choropleth(geo_path = 'ch-cantons.topojson.json',
                       data = total_amount.reset_index(),
                       columns = ['canton', 'amount'],
                       key_on = 'feature.id',
                       threshold_scale = scale,
                       topojson = 'objects.cantons',
                       fill_color = 'GnBu',
)
map

# Visualization Bonus



We are going to cheat and reuse the 'ch-cantons.topojson.json' file. We could have generated a new topojson file with only the separation along the röstigraben, but, well, time is of the essence. Instead we are going to group the cantons into two categories.

In [None]:
# We know, it is quiet optinated... 
french = ['GE','VD','FR','VS','NE','JU']
german = [c for c in swiss_canton if not c in french]

Let's find the total for the French and German parts of Switzerland.

In [None]:
# Flatten the index once and use a vectorised membership test instead of a
# row-wise apply.
amount_by_canton = total_amount.reset_index()
# Boolean mask: True for the cantons listed in `french`.
french_idx = amount_by_canton['canton'].isin(french)
total_french = amount_by_canton.loc[french_idx, 'amount'].sum()
total_german = amount_by_canton.loc[~french_idx, 'amount'].sum()

In [None]:
# One row per canton; the canton identifier lives in the column labelled 0.
swiss_df = pd.DataFrame(swiss_canton)
# Every canton carries the total of the group it belongs to. The vectorised
# isin/where pair replaces a row-wise apply.
swiss_df['total'] = np.where(swiss_df[0].isin(french), total_french, total_german)

In [None]:
# NOTE(review): the name `map` shadows the Python builtin of the same name.
map = folium.Map(location=[46.8,8.2], zoom_start=8, tiles='Mapbox bright')
# Colour each canton by the total of its group. Column 0 of swiss_df holds the
# canton identifier that is matched against 'feature.id'; no threshold_scale is
# passed here.
map.choropleth(geo_path = 'ch-cantons.topojson.json',
                       data = swiss_df.reset_index(),
                       columns = [0, 'total'],
                       key_on = 'feature.id',
                       topojson = 'objects.cantons',
                       fill_color = 'GnBu',
)
map