# Data Processing Scripts
@author: gvisona

In [1]:
from collections import defaultdict
from copy import deepcopy
import pickle
import json 
import os
import time
from tqdm.notebook import tqdm

import googlemaps
from opencage.geocoder import OpenCageGeocode


In [2]:
# Static list of French departments used to build the `departments` dictionary.
# Each entry is a single string of the form
#     "<department code> - <department name> - <capital city>"
# with the three fields separated by " - " (space, hyphen, space); hyphens
# without surrounding spaces are part of a name. Codes are strings because
# some are not numeric ("2A", "2B"). The last five entries (971-976) are the
# only ones with three-character codes.
departments_list = [
"01 - Ain - Bourg-en-Bresse",
"02 - Aisne - Laon",
"03 - Allier - Moulins",
"04 - Alpes-de-Haute-Provence - Digne",
"05 - Hautes-Alpes - Gap",
"06 - Alpes Maritimes - Nice",
"07 - Ardèche - Privas",
"08 - Ardennes - Charleville-Mézières",
"09 - Ariège - Foix",
"10 - Aube - Troyes",
"11 - Aude - Carcassonne",
"12 - Aveyron - Rodez",
"13 - Bouches-du-Rhône - Marseille",
"14 - Calvados - Caen",
"15 - Cantal - Aurillac",
"16 - Charente - Angoulême",
"17 - Charente-Maritime - La Rochelle",
"18 - Cher - Bourges",
"19 - Corrèze - Tulle",
"2A - Corse-du-Sud - Ajaccio",
"2B - Haute Corse - Bastia",
"21 - Côte-d'Or - Dijon",
"22 - Côtes d'Armor - St-Brieuc",
"23 - Creuse - Guéret",
"24 - Dordogne - Périgueux",
"25 - Doubs - Besançon",
"26 - Drôme - Valence",
"27 - Eure - Evreux",
"28 - Eure-et-Loir - Chartres",
"29 - Finistère - Quimper",
"30 - Gard - Nîmes",
"31 - Haute Garonne - Toulouse",
"32 - Gers - Auch",
"33 - Gironde - Bordeaux",
"34 - Hérault - Montpellier",
"35 - Ille-et-Vilaine - Rennes",
"36 - Indre - Châteauroux",
"37 - Indre-et-Loire - Tours",
"38 - Isère - Grenoble",
"39 - Jura - Lons-le-Saunier",
"40 - Landes - Mont-de-Marsan",
"41 - Loir-et-Cher - Blois",
"42 - Loire - St-Étienne",
"43 - Haute Loire - Le Puy",
"44 - Loire Atlantique - Nantes",
"45 - Loiret - Orléans",
"46 - Lot - Cahors",
"47 - Lot-et-Garonne - Agen",
"48 - Lozère - Mende",
"49 - Maine-et-Loire - Angers",
"50 - Manche - St-Lô",
"51 - Marne - Châlons-sur-Marne",
"52 - Haute Marne - Chaumont",
"53 - Mayenne - Laval",
"54 - Meurthe-et-Moselle - Nancy",
"55 - Meuse - Bar-le-Duc",
"56 - Morbihan - Vannes",
"57 - Moselle - Metz",
"58 - Nièvre - Nevers",
"59 - Nord - Lille",
"60 - Oise - Beauvais",
"61 - Orne - Alençon",
"62 - Pas-de-Calais - Arras",
"63 - Puy-de-Dôme - Clermont-Ferrand",
"64 - Pyrénées Atlantiques - Pau",
"65 - Hautes Pyrénées - Tarbes",
"66 - Pyrénées Orientales - Perpignan",
"67 - Bas-Rhin - Strasbourg",
"68 - Haut-Rhin - Colmar",
"69 - Rhône - Lyon",
"70 - Haute Saône - Vesoul",
"71 - Saône-et-Loire - Mâcon",
"72 - Sarthe - Le Mans",
"73 - Savoie - Chambéry",
"74 - Haute Savoie - Annecy",
"75 - Paris - Paris",
"76 - Seine Maritime - Rouen",
"77 - Seine-et-Marne - Melun",
"78 - Yvelines - Versailles",
"79 - Deux-Sèvres - Niort",
"80 - Somme - Amiens",
"81 - Tarn - Albi",
"82 - Tarn-et-Garonne - Montauban",
"83 - Var - Toulon",
"84 - Vaucluse - Avignon",
"85 - Vendée - La Roche-sur-Yon",
"86 - Vienne - Poitiers",
"87 - Haute Vienne - Limoges",
"88 - Vosges - Épinal",
"89 - Yonne - Auxerre",
"90 - Territoire de Belfort - Belfort",
"91 - Essonne - Evry",
"92 - Hauts-de-Seine - Nanterre",
"93 - Seine-St-Denis - Bobigny",
"94 - Val-de-Marne - Créteil",
"95 - Val-D'Oise - Pontoise",
"971 - Guadeloupe - Basse-Terre",
"972 - Martinique - Fort-de-France",
"973 - Guyane - Cayenne",
"974 - La-Reunion - Saint-Denis",
"976 - Mayotte - Mamoudzou"
]


In [3]:
# Make sure the output directory exists before any cell writes into it.
# makedirs also creates the parent "data" directory when it is missing
# (os.mkdir would raise FileNotFoundError), and exist_ok=True removes the
# check-then-create race of the exists()/mkdir() pair.
os.makedirs("data/processed_data", exist_ok=True)

## Creating the departments dictionary 

In [7]:
# OpenCage API key. Taken from the OPENCAGE_API_KEY environment variable when
# it is set, so the secret does not have to be written into the notebook;
# otherwise the placeholder is used and must be replaced by hand.
key = os.environ.get("OPENCAGE_API_KEY", "ADD_API_KEY")
geocoder = OpenCageGeocode(key)

# Values stored under "beds" below, indexed by department code.
# NOTE: pickle.load can execute arbitrary code; only load a trusted file here.
with open(os.path.join("data", "raw_data", "bed_per_dep.pkl"), "rb") as f:
    bpd = pickle.load(f)

# Build {code: {"name", "capital", "capital_coords", "beds"}} for every entry
# of departments_list, geocoding each capital once.
departments = {}
for d in departments_list:
    # Entries look like "<code> - <name> - <capital>".
    code, name, capital = d.split(" - ")

    query = capital + ", " + name + u", France"
    results = geocoder.geocode(query)
    if not results:
        # Fail with the offending query instead of an opaque indexing error
        # on results[0] when the geocoder finds nothing.
        raise ValueError(
            "No geocoding result for {!r} (department {})".format(query, code)
        )

    departments[code] = {"name": name,
                         "capital": capital,
                         # copy so the stored coordinates do not alias the API response
                         "capital_coords": results[0]["geometry"].copy(),
                         "beds": bpd[code]}
    time.sleep(0.01)  # short pause between API calls

# Persist the result both as pickle and as human-readable JSON.
with open(os.path.join("data", "processed_data", "departments.pkl"), "wb") as f:
    pickle.dump(departments, f)
with open(os.path.join("data", "processed_data", "departments.json"), "w") as f:
    json.dump(departments, f, sort_keys=True, indent=4)

## Calculating connectivity

In [12]:

# Hand-written adjacency tables: for each department code (key) the list of
# department codes it is connected to. The tables are split into six groups
# of departments only for convenience; they are merged and made symmetric in
# the next cell, so a connection only needs to be listed on one side.
#
# Fixes applied to the original tables (all in region_5 / region_6):
#   * '56' listed '03' instead of '35'  ('35' already lists '56')
#   * '86' listed itself instead of '87' ('87' already lists '86')
#   * '27' listed '74' instead of '76'  ('76' already lists '27')
#   * '95' listed '60' twice
# Because the next cell symmetrises the tables, the wrong entries also added
# spurious reverse connections (03->56, 74->27) and a self-connection for 86.

region_1 = {'08': ['51','02','55'],
            '51': ['08', '02', '77', '10', '52', '55'],
            '10': ['51', '77', '89', '21', '52'],
            '52': ['55', '88', '70', '21', '10', '51'], 
            '55':  ['08', '54', '88', '52', '51'], 
            '54': ['55',  '57', '88'],
            '57': ['67', '54'],
            '88': ['54', '68', '70', '52', '55'], 
            '67':['57', '68', '88'], 
            '68':['67', '88', '90'],
            '89': ['10', '21', '58', '45','77'], 
            '21': ['10', '52', '70', '39', '71', '58', '89'],
            '58': ['18', '89', '71', '03'], 
            '71': ['21', '39', '01', '69', '42', '03', '58'],
            '70': ['88', '52', '21', '39', '25', '90'],
            '90':['68', '70', '25'],
            '25': ['70','90', '39'],
            '39':['25', '70', '21', '71', '01']}

region_2 = {
    "87" : ["36", "86", "16", "24", "19", "23"],
    "23" : ["18", "36", "87", "19", "63", "03"],
    "19" : ["23", "87", "24", "46", "15", "63"],
    "03" : ["58", "18", "23", "63", "42", "71"],
    "63" : ["03", "23", "19", "15", "43", "42"],
    "15" : ["63", "19", "46", "12", "48", "43"],
    "43" : ["63", "15", "48", "07", "42"],
    "42" : ["71", "03", "63", "43", "07", "69"],
    "69" : ["71", "42", "38", "01"],
    "07" : ["42", "43", "48", "30", "26", "38", "69"],
    "26" : ["38", "07", "84", "04", "05"],
    "38" : ["01", "69", "42", "07", "26", "05", "73"],
    "73" : ["74", "01", "38", "05"]
}

region_3 = {'2A': ['2B'],
 '2B': ['2A'],
 '04': ['05', '06', '83', '84', '13', '26'],
 '05': ['73', '38', '26', '04'],
 '06': ['04', '83'],
 '83': ['13', '84', '04', '06'],
 '13': ['30', '84', '04', '83'],
 '84': ['13', '30', '26', '04', '83', '07'],
 '30': ['34', '12', '48', '07', '26', '84', '13'],
 '48': ['12', '15', '43', '07', '30'],
 '34': ['11', '81', '12', '30'],
 '11': ['66', '09', '31', '81', '34'],
 '66': ['09', '11']
}



region_4 = {
    # NOTE(review): "19" in the list for "12" looks suspicious ("19" does not
    # list "12" in region_2) — left unchanged, please confirm.
    "12": ["30", "48", "15", "19", "46", "82", "81", "34"],
    "46": ["12", "15", "19", "24", "47", "82"],
    "24": ["46", "19", "87", "16", "33", "47"],
    "33": ["24", "17", "40", "47"],
    "40": ["33", "64", "32", "47"],
    "64": ["40", "65", "32"],
    "65": ["64", "31", "32"],
    "31": ["65", "09", "11", "81", "82", "32"],
    "09": ["31", "66", "11"],
    "81": ["31", "11", "34", "12", "82"],
    "82": ["81", "12", "46", "47", "32", "31"],
    "47": ["82", "46", "24", "33", "40", "32"],
    "32": ["47", "40", "64", "65", "31", "82"]
}



region_5 = {
    '29': ['22', '56'],
    '22': ['29', '56', '35'],
    '56': ['29', '22', '35', '44'],  # was '03' instead of '35'
    '35': ['53', '50', '44', '22', '56'],
    '44': ['85', '49', '53', '35', '56'],
    '85': ['79', '44', '49', '17'],
    '53': ['35', '49', '72', '61', '50'],
    '49': ['44', '85', '72', '53', '79', '37'],
    '72': ['41', '37', '28', '61', '53', '49'],
    '79': ['85', '49', '86', '17', '16'],
    '17': ['16', '33', '79', '85'],
    '16': ['17', '24', '86', '79', '87'],
    '86': ['87', '16', '36', '37', '79', '49'],  # was a self-reference '86'
    '50': ['35', '14', '61'],
    '14': ['50', '61', '27'],
    '61': ['72', '53', '14', '50', '27', '28'],
    '27': ['14', '61', '76', '28', '78', '95', '60'],  # was '74' instead of '76'
    '76': ['80', '60', '27']
}

region_6 = { '62':['59','80'],
                        '59':['62','02'],
                        '80':['62','02','60','76'],
                        '60':['76','27','95','77','02','80'],
                        '02':['59','80','60','77','51','08'],
                        '95':['60','27','78','77','92','93'],  # duplicate '60' removed
                        '77':['95','93','94','91','45','89','10','51','02','60'],
                        '78':['95','27','28','91','92'],
                        '91':['92','94','78','28','45','77'],
                        '28':['78','91','45','41','72','61','27'],
                        '45':['28','91','77','89','58','18','41'],
                        '41':['28','45','18','36','37','72'],
                        '18':['45','58','03','23','36','41'],
                        '37':['72','41','36','86','49'],
                        '36':['37','41','18','23','87','86'],
                        '92':['93','75','94','95','78','91'],
                        '93':['92','75','94','95','77'],
                        '75':['92','93','94'],
                        '94':['92','93','75','91','77']}

In [13]:
def _symmetrize(adjacency):
    """Return the symmetric version of an adjacency mapping.

    Parameters
    ----------
    adjacency : dict
        Maps a department code to an iterable of connected department codes.

    Returns
    -------
    collections.defaultdict(list)
        For every code that appears as a key or as a neighbour, the sorted
        list of unique codes it is connected to. Whenever ``a`` lists ``b``,
        ``b`` also lists ``a``. A code listed as its own neighbour is ignored.
    """
    neighbour_sets = defaultdict(set)
    for dep, connections in adjacency.items():
        own = neighbour_sets[dep]  # touch the key so isolated codes are kept
        for c in connections:
            if c == dep:
                continue  # a department is not its own neighbour
            own.add(c)
            neighbour_sets[c].add(dep)

    symmetric = defaultdict(list)
    for dep, neighbours in neighbour_sets.items():
        symmetric[dep] = sorted(neighbours)
    return symmetric


# Merge the regional tables (later tables win if a code is repeated, as with
# dict.update) and enforce symmetry in a single deterministic pass. This
# replaces the earlier repeat-until-stable loop, which silently gave up
# after 1000 iterations.
_merged = {}
for _region in (region_1, region_2, region_3, region_4, region_5, region_6):
    _merged.update(_region)

# Kept as a defaultdict(list) so that looking up an unknown code yields [].
connectivity = _symmetrize(_merged)

with open(os.path.join("data", "processed_data", "connectivity.pkl"), "wb") as f:
    pickle.dump(connectivity, f)

with open(os.path.join("data", "processed_data", "connectivity.json"), "w") as f:
    json.dump(connectivity, f, sort_keys=True, indent=4)

## Calculating distances and times

In [None]:
# Google Maps API key. Taken from the GOOGLE_MAPS_API_KEY environment variable
# when it is set, so the secret does not have to be written into the notebook;
# otherwise the placeholder is used and must be replaced by hand.
google_key = os.environ.get("GOOGLE_MAPS_API_KEY", "ADD_API_KEY")
gmaps = googlemaps.Client(key=google_key)


codes_to_exclude = ["2A", "2B"] #Corsica
# Keep only two-character codes that are not explicitly excluded; the length
# test drops the three-character codes (971-976 in departments_list).
dep_codes = [d for d in sorted(departments.keys())
             if d not in codes_to_exclude and len(d) < 3]

# Parallel lists, all aligned with dep_codes.
cities = [departments[c]["capital"] for c in dep_codes]
cities_coords = [departments[c]["capital_coords"] for c in dep_codes]
# [lat, lng] pairs taken from each capital's stored "lat"/"lng" entries.
coords_list = [[city["lat"], city["lng"]] for city in cities_coords]

In [None]:
# For every department in dep_codes, query the driving time and distance from
# its capital to the capital of each connected department.
# Result: {code: [{"code": neighbour, "time": ..., "distance": ...}, ...]}
td_data = {}
# Position of each code in dep_codes / coords_list (avoids repeated list.index scans).
code_to_idx = {c: k for k, c in enumerate(dep_codes)}
for i, code in enumerate(dep_codes):
    # .get() avoids inserting new empty entries into a defaultdict.
    neighbor_idxs = [code_to_idx[c] for c in connectivity.get(code, [])]
    if not neighbor_idxs:
        # Nothing to query; do not send a request with no destinations.
        td_data[code] = []
        continue

    time.sleep(0.03)  # short pause between API calls
    neighbors = [coords_list[k] for k in neighbor_idxs]
    result = gmaps.distance_matrix([coords_list[i]], neighbors, mode='driving')

    connections = []
    # One origin was sent, so the single row holds one element per destination,
    # in the same order as `neighbors`.
    for k, element in enumerate(result['rows'][0]["elements"]):
        neighbor_code = dep_codes[neighbor_idxs[k]]
        status = element.get("status", "OK")
        if status != "OK":
            # "duration"/"distance" are read below; report the failing pair
            # explicitly instead of letting a bare KeyError surface.
            raise RuntimeError(
                "Distance matrix lookup {} -> {} failed with status {}".format(
                    code, neighbor_code, status)
            )
        neighbor = {"code": neighbor_code,
                    "time": element["duration"]["value"],
                    "distance": element["distance"]["value"]
                   }
        connections.append(neighbor)
    td_data[code] = connections

In [None]:
# Write the time/distance table to disk twice: once as a pickle and once as
# key-sorted, indented JSON.
processed_dir = os.path.join("data", "processed_data")

pickle_path = os.path.join(processed_dir, "distances_times.pkl")
with open(pickle_path, "wb") as pickle_file:
    pickle.dump(td_data, pickle_file)

json_path = os.path.join(processed_dir, "distances_times.json")
with open(json_path, "w") as json_file:
    json.dump(td_data, json_file, sort_keys=True, indent=4)