In [18]:
import pandas as pd
import os
import requests
from tqdm.notebook import tqdm
import time
import numpy as np
import re
tqdm.pandas()

In [19]:
DATA_ROOT_PATH = "../data"
STATS_GENERALES_PATH = os.path.join(DATA_ROOT_PATH, "stats_generales")
STATS_GENERALES_FILE_PATH = os.path.join(STATS_GENERALES_PATH, "stats_generales.csv")
SUMMARY_STATS_GENERALES_FILE_PATH = os.path.join(STATS_GENERALES_PATH, "summary_stats_generales.csv")
STATS_LYCEES_FILE_PATH = os.path.join(DATA_ROOT_PATH, "stats_lycees", "stats_lycees.csv")
STATS_SUMMARY_FILE_PATH = os.path.join(DATA_ROOT_PATH, "stats_summary", "summary.csv")

API_KEY = "PUT YOUR API KEY HERE"

## Enricher `stats_lycees.csv` avec les coordonnées GPS
On va utiliser l'API `https://api-adresse.data.gouv.fr/search/?q=`

### Fonctions pour récuper le code postal et les coordonnées

In [204]:
GEO_API_URL = "https://api-adresse.data.gouv.fr/search/"

In [205]:
def get_postalcode_and_coords(city : str):
    if city is None or city == "":
        return None
    response = requests.get(GEO_API_URL, params=dict(q=city, type="municipality"))
    if not response.ok:
        print(f"ERREUR : city=`{city}`")
        print(f"SERVER RESPONSE : `{response.content}`")
        return None
    geo_response = response.json()["features"]
    if len(geo_response) == 0:
        return None

    feature = geo_response[0]    
    geometry = feature.get("geometry", {})
    longitude, latitude = geometry.get("coordinates", [None,None])

    properties = feature.get("properties", {})
    postalcode = properties.get("postcode")
    if postalcode is not None:
        postalcode = int(postalcode[:2])
    return dict(longitude=longitude, latitude=latitude, postalcode=postalcode)


In [206]:
get_postalcode_and_coords("Amiens")

{'longitude': 2.29248, 'latitude': 49.903034, 'postalcode': 80}

### Lister les villes dans la colonnes `ville` pour éviter la redondance
En l'occurence, si on `apply` sur toute la DataFrame `stats_lycees`, on se retrouve à faire 42 372 appels (environ 60 minutes de traîtement).
On va éviter d'appeler en double certaine ville en faisant un `unique`.

In [207]:
stats_lycees_df = pd.read_csv(STATS_LYCEES_FILE_PATH)
stats_lycees_df.head()

Unnamed: 0,year,concours,prepa,ville,etablissement,inscrits,dont filles,admissibles,dont filles.1,classes,dont filles.2,integres,dont filles.3
0,2002,centrale-supelec,PC,AIX-EN-PROVENCE,PAUL CEZANNE,59,21,29,10,26,9,6,3
1,2002,centrale-supelec,PC,MARSEILLE,THIERS,68,13,35,5,30,5,11,1
2,2002,centrale-supelec,PC,SALON DE PROVENCE,L'EMPERI,3,1,0,0,0,0,0,0
3,2002,centrale-supelec,PC,AMIENS,THUILLIER,22,8,0,0,0,0,0,0
4,2002,centrale-supelec,PC,COMPIEGNE,PIERRE D'AILLY,2,0,0,0,0,0,0,0


In [208]:
city_df = pd.DataFrame({"ville":stats_lycees_df["ville"].unique()})
city_df

Unnamed: 0,ville
0,AIX-EN-PROVENCE
1,MARSEILLE
2,SALON DE PROVENCE
3,AMIENS
4,COMPIEGNE
...,...
525,Sète
526,Papeete
527,Yaoundé
528,Dakar


In [209]:
def apply_in_city_df(row):
    postalcode_and_coords = get_postalcode_and_coords(row["ville"])
    return pd.Series(postalcode_and_coords)

city_df[["longitude", "latitude", "postalcode"]] = city_df.progress_apply(lambda row: apply_in_city_df(row), axis=1)

  0%|          | 0/530 [00:00<?, ?it/s]

  return pd.Series(postalcode_and_coords)
  return pd.Series(postalcode_and_coords)
  return pd.Series(postalcode_and_coords)
  return pd.Series(postalcode_and_coords)
  return pd.Series(postalcode_and_coords)
  return pd.Series(postalcode_and_coords)
  return pd.Series(postalcode_and_coords)
  return pd.Series(postalcode_and_coords)
  return pd.Series(postalcode_and_coords)
  return pd.Series(postalcode_and_coords)
  return pd.Series(postalcode_and_coords)
  return pd.Series(postalcode_and_coords)
  return pd.Series(postalcode_and_coords)
  return pd.Series(postalcode_and_coords)
  return pd.Series(postalcode_and_coords)
  return pd.Series(postalcode_and_coords)
  return pd.Series(postalcode_and_coords)
  return pd.Series(postalcode_and_coords)
  return pd.Series(postalcode_and_coords)
  return pd.Series(postalcode_and_coords)
  return pd.Series(postalcode_and_coords)


#### Cas particulier (hors France ou villes particulières)

Quelque ville n'ont pas été complétée. La plupart ne sont pas en France, c'est pour ça. On va utiliser `https://geocode.xyz/{ville}?json=1` ou `https://nominatim.openstreetmap.org/search.php?q={ville}&format=jsonv2`. Ces API sont plus contraignants sur leur utilisation, c'est pour ça qu'on ne les utilisent que si c'est strictement nécessaire.

In [210]:
OSM_API_URL = "https://nominatim.openstreetmap.org/search.php?q={city}&format=jsonv2"

def get_coords_with_osm(city : str):
    if city is None or city == "":
        return None

    if city.lower().endswith("cedex"):
        city = city[:-5]
    
    #time.sleep(1)
    url = OSM_API_URL.format(city=city)

    response = requests.get(url)
    if not response.ok:
        print(f"ERREUR : city=`{city}`")
        print(f"SERVER RESPONSE : `{response.content}`")
        return None
    
    response_json = response.json()[0]
    
    if response_json["display_name"].endswith("France"):
        display_name_list = response_json["display_name"].split(",")
        postalcode = display_name_list[-2].strip()
        postalcode = postalcode[:2]
    else:
        postalcode = None
        
    longitude = float(response_json.get("lon"))
    latitude = float(response_json.get("lat"))
    
    return dict(longitude=longitude, latitude=latitude, postalcode=postalcode)

In [211]:
city_with_nan_df = city_df[city_df["postalcode"].isna()]
city_with_nan_df

Unnamed: 0,ville,longitude,latitude,postalcode
129,OUJDA,,,
132,YAMOUSSOUKRO,,,
143,MOHAMMEDIA,,,
168,NABEUL,,,
170,SFAX,,,
176,KENITRA,,,
337,KHOURIBGA,,,
343,Errachidia,,,
345,Mohammedia,,,
350,Oujda,,,


In [212]:
def apply_in_city_with_nan_df(row):
    postalcode_and_coords = get_coords_with_osm(row["ville"])
    return pd.Series(postalcode_and_coords)

city_with_nan_df[["longitude", "latitude", "postalcode"]] = city_with_nan_df.progress_apply(lambda row: apply_in_city_with_nan_df(row), axis=1)

  0%|          | 0/21 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  city_with_nan_df[["longitude", "latitude", "postalcode"]] = city_with_nan_df.progress_apply(lambda row: apply_in_city_with_nan_df(row), axis=1)


In [219]:
city_df.loc[city_df["latitude"].isna(), ["longitude", "latitude", "postalcode"]] = city_with_nan_df[["longitude", "latitude", "postalcode"]]

### Join `stats_lycees_df` avec `city_df`

In [224]:
# Convertir les colonnes dans le bon type
city_df = city_df.convert_dtypes()
stats_lycees_df = stats_lycees_df.convert_dtypes()

In [228]:
stats_lycees_df = city_df.merge(stats_lycees_df, on="ville", how="inner")
stats_lycees_df

Unnamed: 0,ville,longitude,latitude,postalcode,year,concours,prepa,etablissement,inscrits,dont filles,admissibles,dont filles.1,classes,dont filles.2,integres,dont filles.3
0,AIX-EN-PROVENCE,43.546605,5.402549,13.0,2002,centrale-supelec,PC,PAUL CEZANNE,59,21,29,10,26,9,6,3
1,AIX-EN-PROVENCE,43.546605,5.402549,13.0,2002,ccp,PC,PAUL CEZANNE,80,36,71,30,64,29,32,17
2,AIX-EN-PROVENCE,43.546605,5.402549,13.0,2002,ccp,PC,LYCEE MILITAIRE,22,2,16,1,13,1,8,0
3,AIX-EN-PROVENCE,43.546605,5.402549,13.0,2002,mines-ponts,PSI,PAUL CEZANNE,2,1,0,0,0,0,0,0
4,AIX-EN-PROVENCE,43.546605,5.402549,13.0,2002,mines-ponts,PSI,VAUVENARGUES,21,4,10,4,9,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42367,Yaoundé,11.521334,3.868987,,2019,mines-ponts,MP,Institut PrépaVogt (L0110),1,0,0,0,0,0,0,0
42368,Yaoundé,11.521334,3.868987,,2019,e3a,MP,Institut PrépaVogt (L0110),1,0,0,0,0,0,0,0
42369,Yaoundé,11.521334,3.868987,,2019,centrale-supelec,MP,Institut PrépaVogt (L0110),1,0,0,0,0,0,0,0
42370,Dakar,47.022333,5.487753,39.0,2019,e3a,MP,Cours Sainte Marie de Hann (3410005C),1,0,0,0,0,0,0,0


In [229]:
stats_lycees_df.to_csv(STATS_LYCEES_FILE_PATH)

## Convert `postecode` to str

In [231]:
stats_lycees_df = pd.read_csv(STATS_LYCEES_FILE_PATH, index_col=0)
stats_lycees_df.head()

Unnamed: 0,ville,longitude,latitude,postalcode,year,concours,prepa,etablissement,inscrits,dont filles,admissibles,dont filles.1,classes,dont filles.2,integres,dont filles.3
0,AIX-EN-PROVENCE,43.546605,5.402549,13.0,2002,centrale-supelec,PC,PAUL CEZANNE,59,21,29,10,26,9,6,3
1,AIX-EN-PROVENCE,43.546605,5.402549,13.0,2002,ccp,PC,PAUL CEZANNE,80,36,71,30,64,29,32,17
2,AIX-EN-PROVENCE,43.546605,5.402549,13.0,2002,ccp,PC,LYCEE MILITAIRE,22,2,16,1,13,1,8,0
3,AIX-EN-PROVENCE,43.546605,5.402549,13.0,2002,mines-ponts,PSI,PAUL CEZANNE,2,1,0,0,0,0,0,0
4,AIX-EN-PROVENCE,43.546605,5.402549,13.0,2002,mines-ponts,PSI,VAUVENARGUES,21,4,10,4,9,3,2,0


In [None]:
stats_lycees_df.postalcode = stats_lycees_df.postalcode.astype('Int32')

In [243]:
stats_lycees_df.head()

Unnamed: 0,ville,longitude,latitude,postalcode,year,concours,prepa,etablissement,inscrits,dont filles,admissibles,dont filles.1,classes,dont filles.2,integres,dont filles.3
0,AIX-EN-PROVENCE,43.546605,5.402549,13,2002,centrale-supelec,PC,PAUL CEZANNE,59,21,29,10,26,9,6,3
1,AIX-EN-PROVENCE,43.546605,5.402549,13,2002,ccp,PC,PAUL CEZANNE,80,36,71,30,64,29,32,17
2,AIX-EN-PROVENCE,43.546605,5.402549,13,2002,ccp,PC,LYCEE MILITAIRE,22,2,16,1,13,1,8,0
3,AIX-EN-PROVENCE,43.546605,5.402549,13,2002,mines-ponts,PSI,PAUL CEZANNE,2,1,0,0,0,0,0,0
4,AIX-EN-PROVENCE,43.546605,5.402549,13,2002,mines-ponts,PSI,VAUVENARGUES,21,4,10,4,9,3,2,0


In [244]:
stats_lycees_df.to_csv(STATS_LYCEES_FILE_PATH)

## Enrich `stats_general`

In [20]:
# Avoir la liste unique des noms d'école pour éviter d'appeler plusieurs fois l'API pour un même nom 
ecole_df = pd.DataFrame(dict(ecole=pd.read_csv(STATS_GENERALES_FILE_PATH, index_col=0)["ecole"].unique()))
ecole_df

Unnamed: 0,ecole
0,AgroParisTech Grignon
1,Montpellier Sup Agro (cursus agronome)
2,Montpellier Sup Agro (cursus SAADS)
3,Agrocampus Ouest (cursus ingénieur agronome)
4,ENSAT Toulouse
...,...
1770,EPITA campus Paris
1771,ESME SUDRIA
1772,Bordeaux INP-ENSEGID
1773,ESIX Caen Agroalimentaire


### Enrichir à partir des noms des écoles la `stats_generales` pour ajouter les coordonées GPS et le code postal

#### Utiliser Google Geocode pour deviner à partir de l'adresse ces informations de localisation
On suppose que le nom de l'école (qui des fois est composé du nom de la ville) permet à Google de déterminer ces informations.
On stocke les informations dans `ecole.csv` (temporairement)

In [32]:
def get_geocode_from_google(ecole : str):
    response = requests.get("https://maps.googleapis.com/maps/api/geocode/json", params=dict(address=ecole, key=API_KEY))
    response_json = response.json()
    results = response_json["results"]
    if len(results) != 0:
        address_components = results[0]["address_components"]
        postalcode = None
        for c in address_components:
            if c["types"] == ['postal_code']:
                postalcode = c["long_name"]

        location = results[0].get("geometry", {}).get("location", {})
        longitude = location.get("lng")
        latitude = location.get("lat")
        return dict(longitude=longitude, latitude=latitude, postalcode=postalcode)
    else:
        return dict(longitude=None, latitude=None, postalcode=None)

In [33]:
def apply_in_ecole_df(row):
    postalcode_and_coords = get_geocode_from_google(row["ecole"])
    return pd.Series(postalcode_and_coords)

ecole_df[["longitude", "latitude", "postalcode"]] = ecole_df.progress_apply(lambda row: apply_in_ecole_df(row), axis=1)

  0%|          | 0/1775 [00:00<?, ?it/s]

In [34]:
ecole_df.to_csv(os.path.join(DATA_ROOT_PATH, "ecole.csv"))

#### Utiliser l'API `textsearch` pour avoir plus d'information sur les valeurs manquantes (non-détectées par la première API)
Ici, il faudra extraire le code postal si précisé avec un regex.

In [25]:
ecole_df = pd.read_csv(os.path.join(DATA_ROOT_PATH, "ecole.csv"), index_col=0)
ecole_df

Unnamed: 0,ecole,longitude,latitude,postalcode
0,AgroParisTech Grignon,1.935013,48.846946,78850
1,Montpellier Sup Agro (cursus agronome),3.854877,43.617282,34060
2,Montpellier Sup Agro (cursus SAADS),3.854877,43.617282,34060
3,Agrocampus Ouest (cursus ingénieur agronome),-1.710232,48.126424,35000
4,ENSAT Toulouse,1.493193,43.535057,31326
...,...,...,...,...
1770,EPITA campus Paris,2.362817,48.815665,94270
1771,ESME SUDRIA,2.393000,48.814077,94200
1772,Bordeaux INP-ENSEGID,-0.607988,44.804806,33600
1773,ESIX Caen Agroalimentaire,-1.644215,49.634344,50130


In [26]:
empty_coords_and_postalcode_df = ecole_df.postalcode.isna() | ecole_df.longitude.isna() | ecole_df.latitude.isna()
ecole_df[empty_coords_and_postalcode_df]

Unnamed: 0,ecole,longitude,latitude,postalcode
9,ENITAB Bordeaux (Civil),-0.579224,44.837727,
11,VetAgro Sup Clermont-Ferrand (Civil),3.087025,45.777222,
12,Oniris Nantes (cursus agroalimentaire),,,
15,Concours commun A BIO,,,
18,Oniris Nantes (cursus vétérinaire),,,
...,...,...,...,...
1723,ESTIT V.D'ASCQ,,,
1724,EIGIP PC,,,
1725,IST Paris6 CM,2.352222,48.856614,
1728,RÚseau Eiffel PT,,,


In [27]:
def get_geocode_from_google_textsearch(ecole : str):
    response = requests.get("https://maps.googleapis.com/maps/api/place/textsearch/json", params=dict(query=ecole, key=API_KEY))
    response_json = response.json()
    results = response_json["results"]
    if len(results) != 0:
        formatted_address = results[0].get("formatted_address", "")
        postal_code_regex = r"^.*(?P<postalcode>\d{5}).*$"
        postalcode = None
        m = re.match(postal_code_regex, formatted_address)
        if m is not None:
            postalcode = m.group("postalcode")
        
        location = results[0].get("geometry", {}).get("location", {})
        longitude = location.get("lng")
        latitude = location.get("lat")
        return dict(longitude=longitude, latitude=latitude, postalcode=postalcode)
    return dict(longitude=None, latitude=None, postalcode=None)

In [28]:
def apply_in_ecole_df(row):
    postalcode_and_coords = get_geocode_from_google_textsearch(row["ecole"])
    return pd.Series(postalcode_and_coords)

ecole_df.loc[empty_coords_and_postalcode_df, ["longitude", "latitude", "postalcode"]] = ecole_df.loc[empty_coords_and_postalcode_df].progress_apply(lambda row: apply_in_ecole_df(row), axis=1)

  0%|          | 0/386 [00:00<?, ?it/s]

In [42]:
empty_coords_and_postalcode_df = ecole_df.postalcode.isna() & ecole_df.longitude.isna() & ecole_df.latitude.isna()
ecole_df[empty_coords_and_postalcode_df]

Unnamed: 0,ecole,longitude,latitude,postalcode
15,Concours commun A BIO,,,
26,Concours commun A PC BIO,,,
85,Concours Commun Mines Ponts TSI,,,
122,Concours Commun Polytechnique TSI,,,
138,ESTP Paris MECA-ELEC,,,
...,...,...,...,...
1698,ISTG 3I,,,
1711,AUTRES ECOLES DE LA BANQUE E4A,,,
1720,IAAL Lille,,,
1725,IST Paris6 CM,,,


#### Fusionner les deux réponses par les deux endpoints de l'API de Google

In [48]:
ecole_old_df = pd.read_csv(os.path.join(DATA_ROOT_PATH, "ecole.csv"))
ecole_df.loc[empty_coords_and_postalcode_df, ["longitude", "latitude", "postalcode"]] = ecole_old_df[empty_coords_and_postalcode_df]

In [49]:
empty_coords_and_postalcode_df = ecole_df.postalcode.isna() & ecole_df.longitude.isna() & ecole_df.latitude.isna()
ecole_df[empty_coords_and_postalcode_df]

Unnamed: 0,ecole,longitude,latitude,postalcode
15,Concours commun A BIO,,,
26,Concours commun A PC BIO,,,
85,Concours Commun Mines Ponts TSI,,,
122,Concours Commun Polytechnique TSI,,,
323,Concours Commun Mines Ponts PC,,,
383,concours Polytech G2E-BCPST,,,
566,concours des écoles des Mines G2E-BCPST,,,
647,ESIEE-ISMEA MP,,,
651,Concours ESTP,,,
661,ESIL Internet,,,


Les quelques valeurs manquantes peuvent se remplir à la main si on connait les écoles.

In [54]:
ecole_df.loc[ecole_df.ecole=="ISITV Toulon IMATER","postalcode"] = "83000"
ecole_df.loc[ecole_df.ecole=="FIF Civil Nancy","postalcode"] = "54000"
ecole_df.loc[ecole_df.ecole=="ESTP-TP","postalcode"] = "94230"
ecole_df.loc[ecole_df.ecole=="ESTP MECA-ELEC","postalcode"] = "94230"
ecole_df.loc[ecole_df.ecole=="ESTP PC","postalcode"] = "94230"
ecole_df.loc[ecole_df.ecole=="ESINSA Nice","postalcode"] = "06000"
ecole_df.loc[ecole_df.ecole=="ESTP MP","postalcode"] = "94230"
ecole_df.loc[ecole_df.ecole=="Concours ESTP PSI	","postalcode"] = "94230"
ecole_df.loc[ecole_df.ecole=="ESTIT V.D","postalcode"] = "59000"
ecole_df.loc[ecole_df.ecole=="EUDIL Info Mes Auto","postalcode"] = "59000"
ecole_df.loc[ecole_df.ecole=="ISTG 3I","postalcode"] = "38000"

In [58]:
empty_coords_and_postalcode_df = ecole_df.postalcode.isna() & ecole_df.longitude.isna() & ecole_df.latitude.isna()
ecole_df[empty_coords_and_postalcode_df]

Unnamed: 0,ecole,longitude,latitude,postalcode
15,Concours commun A BIO,,,
26,Concours commun A PC BIO,,,
85,Concours Commun Mines Ponts TSI,,,
122,Concours Commun Polytechnique TSI,,,
323,Concours Commun Mines Ponts PC,,,
383,concours Polytech G2E-BCPST,,,
566,concours des écoles des Mines G2E-BCPST,,,
647,ESIEE-ISMEA MP,,,
651,Concours ESTP,,,
661,ESIL Internet,,,


In [59]:
ecole_df

Unnamed: 0,ecole,longitude,latitude,postalcode
0,AgroParisTech Grignon,1.935013,48.846946,78850
1,Montpellier Sup Agro (cursus agronome),3.854877,43.617282,34060
2,Montpellier Sup Agro (cursus SAADS),3.854877,43.617282,34060
3,Agrocampus Ouest (cursus ingénieur agronome),-1.710232,48.126424,35000
4,ENSAT Toulouse,1.493193,43.535057,31326
...,...,...,...,...
1770,EPITA campus Paris,2.362817,48.815665,94270
1771,ESME SUDRIA,2.393000,48.814077,94200
1772,Bordeaux INP-ENSEGID,-0.607988,44.804806,33600
1773,ESIX Caen Agroalimentaire,-1.644215,49.634344,50130


In [60]:
ecole_df.to_csv(os.path.join(DATA_ROOT_PATH, "ecole_complet_with_blank.csv"))

#### Deviner les champs vides à partir des autres
Par exemple deviner le code postal avec les coordonnées GPS.

In [90]:
ecole_df = pd.read_csv(os.path.join(DATA_ROOT_PATH, "ecole_complet_with_blank.csv"), index_col=0)

In [91]:
ecole_df[ecole_df.postalcode.isna() & ~(ecole_df.longitude.isna() | ecole_df.latitude.isna())]

Unnamed: 0,ecole,longitude,latitude,postalcode
138,ESTP Paris MECA-ELEC,2.352222,48.856614,
139,ESTP Paris TOPOGRAPHIE,2.352222,48.856614,
140,ESTP Paris TP,2.352222,48.856614,
141,ESTP Paris,2.352222,48.856614,
406,ESTP Paris Génie Mécanique et Electrique (GME),2.352222,48.856614,
467,EIGSI Casablanca,-7.630756,33.557516,
525,CENTRALE CASABLANCA,-7.620441,33.480076,
559,CENTRALE CASABLANCA-CI,-7.620441,33.480076,
598,"ENSTIM Albi, Alès, Douai, Nantes",-1.553621,47.218371,
708,FIF Fonctionnaire Nancy,6.184417,48.692054,


##### Compute le code postal

In [92]:
def get_postalcode_with_coords(longitude, latitude):
    response = requests.get("https://nominatim.openstreetmap.org/reverse.php", params=dict(lon=longitude, lat=latitude, format="jsonv2"))
    response_json = response.json()
    if len(response_json) == 0:
        return dict(longitude=None, latitude=None)
    
    postalcode = response_json.get("address", {}).get("postcode")
    return dict(postalcode=postalcode)

In [93]:
def apply_in_ecole_df(row):
    postalcode = get_postalcode_with_coords(row["longitude"], row["latitude"])
    return pd.Series(postalcode)

empty_postalcode_df = ecole_df.postalcode.isna() & ~(ecole_df.longitude.isna() | ecole_df.latitude.isna())
ecole_df.loc[empty_postalcode_df, "postalcode"] = ecole_df[empty_postalcode_df].progress_apply(lambda row: apply_in_ecole_df(row), axis=1)

  0%|          | 0/41 [00:00<?, ?it/s]

In [95]:
empty_postalcode_df = ecole_df.postalcode.isna() & ~(ecole_df.longitude.isna() | ecole_df.latitude.isna())
ecole_df[empty_postalcode_df]

Unnamed: 0,ecole,longitude,latitude,postalcode


##### Compute les coords

In [97]:
empty_coords_df = (ecole_df.longitude.isna() | ecole_df.latitude.isna()) & ~ecole_df.postalcode.isna()
ecole_df[empty_coords_df]

Unnamed: 0,ecole,longitude,latitude,postalcode
674,ISITV Toulon IMATER,,,83000
709,FIF Civil Nancy,,,54000
889,ESTP-TP,,,94230
891,ESTP MECA-ELEC,,,94230
948,ESTP PC,,,94230
1016,ESINSA Nice,,,6000
1054,ESTP MP,,,94230
1682,ESTIT V.D,,,59000
1693,EUDIL Info Mes Auto,,,59000
1698,ISTG 3I,,,38000


In [98]:
def get_coords_with_postalcode(postalcode : str):
    response = requests.get("https://nominatim.openstreetmap.org/search.php", params=dict(country="France", postalcode=postalcode, format="jsonv2"))
    response_json = response.json()
    if len(response_json) == 0:
        return dict(longitude=None, latitude=None)
    
    result = response_json[0]
    latitude = result.get("lat")
    longitude = result.get("lon")
    return dict(longitude=longitude, latitude=latitude)

In [99]:
def apply_in_ecole_df(row):
    coords = get_coords_with_postalcode(row["postalcode"])
    return pd.Series(coords)

empty_coords_df = (ecole_df.longitude.isna() | ecole_df.latitude.isna()) & ~ecole_df.postalcode.isna()
ecole_df.loc[empty_coords_df, ["longitude", "latitude"]] = ecole_df[empty_coords_df].progress_apply(lambda row: apply_in_ecole_df(row), axis=1)

  0%|          | 0/10 [00:00<?, ?it/s]

In [100]:
empty_coords_df = (ecole_df.longitude.isna() | ecole_df.latitude.isna()) & ~ecole_df.postalcode.isna()
ecole_df[empty_coords_df]

Unnamed: 0,ecole,longitude,latitude,postalcode


In [103]:
ecole_df[ecole_df.longitude.isna() | ecole_df.latitude.isna() | ecole_df.postalcode.isna()]

Unnamed: 0,ecole,longitude,latitude,postalcode
15,Concours commun A BIO,,,
26,Concours commun A PC BIO,,,
85,Concours Commun Mines Ponts TSI,,,
122,Concours Commun Polytechnique TSI,,,
323,Concours Commun Mines Ponts PC,,,
383,concours Polytech G2E-BCPST,,,
566,concours des écoles des Mines G2E-BCPST,,,
647,ESIEE-ISMEA MP,,,
651,Concours ESTP,,,
661,ESIL Internet,,,


#### Export `ecole.csv`
Sur les 1775 noms d'école, seul 29 n'ont pas de coordonées ni même de code postal.

In [104]:
ecole_df.to_csv(os.path.join(DATA_ROOT_PATH, "ecole.csv"))

In [109]:
ecole_df = pd.read_csv(os.path.join(DATA_ROOT_PATH, "ecole.csv"), index_col=0)
ecole_df

Unnamed: 0,ecole,longitude,latitude,postalcode
0,AgroParisTech Grignon,1.935013,48.846946,78850
1,Montpellier Sup Agro (cursus agronome),3.854877,43.617282,34060
2,Montpellier Sup Agro (cursus SAADS),3.854877,43.617282,34060
3,Agrocampus Ouest (cursus ingénieur agronome),-1.710232,48.126424,35000
4,ENSAT Toulouse,1.493193,43.535057,31326
...,...,...,...,...
1770,EPITA campus Paris,2.362817,48.815665,94270
1771,ESME SUDRIA,2.393000,48.814077,94200
1772,Bordeaux INP-ENSEGID,-0.607988,44.804806,33600
1773,ESIX Caen Agroalimentaire,-1.644215,49.634344,50130


#### Merge `ecole.csv` avec `stats_generales.csv` pour avoir les coordonées et les codes postaux pour chaque école

In [115]:
ecole_df = pd.read_csv(os.path.join(DATA_ROOT_PATH, "ecole.csv"), index_col=0)
stats_generales_df = pd.read_csv(STATS_GENERALES_FILE_PATH, index_col=0)

In [119]:
enriched_stats_generales_df = stats_generales_df.merge(ecole_df, on="ecole", how="inner")
enriched_stats_generales_df

Unnamed: 0,year,filiere,banque,ecole,inscrits_nb,inscrits_filles,inscrits_cinq_demi,admissibles_nb,admissibles_filles,admissibles_cinq_demi,...,integres_nb,integres_filles,integres_cinq_demi,places,appeles,integres_rg_median,integres_rg_moyen,longitude,latitude,postalcode
0,2011,bcpst,A BIO,AgroParisTech Grignon,,,,,,,...,257.0,66.0,19.0,257.0,,,,1.935013,48.846946,78850
1,2012,bcpst,A BIO,AgroParisTech Grignon,,,,,,,...,257.0,64.0,25.0,257.0,,,,1.935013,48.846946,78850
2,2013,bcpst,A BIO,AgroParisTech Grignon,,,,,,,...,259.0,65.0,24.0,257.0,,,,1.935013,48.846946,78850
3,2014,bcpst,A BIO,AgroParisTech Grignon,,,,,,,...,251.0,62.0,16.0,250.0,,,,1.935013,48.846946,78850
4,2015,bcpst,A BIO,AgroParisTech Grignon,,,,,,,...,250.0,62.0,16.0,250.0,,,,1.935013,48.846946,78850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24169,2021,pt,EPITA,ESME SUDRIA,,,,,,,...,9.0,0.0,22.2,10,,93.0,90.0,2.393000,48.814077,94200
24170,2021,bcpst,G2E,Bordeaux INP-ENSEGID,1657.0,70.8,18.2,1136.0,68.7,23.2,...,18.0,61.1,27.8,18,,457.0,399.0,-0.607988,44.804806,33600
24171,2021,bcpst,CONCOURS POLYTECH,ESIX Caen Agroalimentaire,,,,,,,...,4.0,50.0,0.0,5,,1291.0,1188.0,-1.644215,49.634344,50130
24172,2021,tb,Concours POLYTECH A TB,ESIX Caen Agroalimentaire,,,,,,,...,0.0,0.0,0.0,1,,0.0,0.0,-1.644215,49.634344,50130


In [121]:
enriched_stats_generales_df["postalcode"] = enriched_stats_generales_df.postalcode.str[:2]
enriched_stats_generales_df

Unnamed: 0,year,filiere,banque,ecole,inscrits_nb,inscrits_filles,inscrits_cinq_demi,admissibles_nb,admissibles_filles,admissibles_cinq_demi,...,integres_nb,integres_filles,integres_cinq_demi,places,appeles,integres_rg_median,integres_rg_moyen,longitude,latitude,postalcode
0,2011,bcpst,A BIO,AgroParisTech Grignon,,,,,,,...,257.0,66.0,19.0,257.0,,,,1.935013,48.846946,78
1,2012,bcpst,A BIO,AgroParisTech Grignon,,,,,,,...,257.0,64.0,25.0,257.0,,,,1.935013,48.846946,78
2,2013,bcpst,A BIO,AgroParisTech Grignon,,,,,,,...,259.0,65.0,24.0,257.0,,,,1.935013,48.846946,78
3,2014,bcpst,A BIO,AgroParisTech Grignon,,,,,,,...,251.0,62.0,16.0,250.0,,,,1.935013,48.846946,78
4,2015,bcpst,A BIO,AgroParisTech Grignon,,,,,,,...,250.0,62.0,16.0,250.0,,,,1.935013,48.846946,78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24169,2021,pt,EPITA,ESME SUDRIA,,,,,,,...,9.0,0.0,22.2,10,,93.0,90.0,2.393000,48.814077,94
24170,2021,bcpst,G2E,Bordeaux INP-ENSEGID,1657.0,70.8,18.2,1136.0,68.7,23.2,...,18.0,61.1,27.8,18,,457.0,399.0,-0.607988,44.804806,33
24171,2021,bcpst,CONCOURS POLYTECH,ESIX Caen Agroalimentaire,,,,,,,...,4.0,50.0,0.0,5,,1291.0,1188.0,-1.644215,49.634344,50
24172,2021,tb,Concours POLYTECH A TB,ESIX Caen Agroalimentaire,,,,,,,...,0.0,0.0,0.0,1,,0.0,0.0,-1.644215,49.634344,50


In [122]:
enriched_stats_generales_df.to_csv(STATS_GENERALES_FILE_PATH)

In [127]:
enriched_stats_generales_df.postalcode.isna().sum()/enriched_stats_generales_df.shape[0]

0.00442624307106809

Moins de 0.5% (107) des lignes ont des coordonnées et des codes postaux non renseignés