In [6]:
# ===============================================
# Tâche 1 - Collecte & Préparation Multi-Source
# ANIP Challenge : Bénin
# Version optimisée et robuste
# ===============================================

import pandas as pd
from pandas_datareader import wb
import requests
from bs4 import BeautifulSoup
import os

# ----------------------------
# 0. Préparer dossiers
# ----------------------------
os.makedirs("datasets/raw", exist_ok=True)
os.makedirs("datasets/final", exist_ok=True)

# ----------------------------
# 1. Téléchargement indicateurs World Bank
# ----------------------------
start_year, end_year = 2000, 2023
wb_indicators = {
    # Démographiques
    "SP.POP.TOTL": "population",
    "SP.URB.TOTL": "population_urbaine",
    "SP.POP.GROW": "taux_croissance_pop",
    # Économiques
    "NY.GDP.MKTP.CD": "gdp_usd",
    "FP.CPI.TOTL": "inflation",
    "SL.UEM.TOTL.ZS": "taux_chomage",
    "NE.EXP.GNFS.CD": "exportations",
    # Sociaux
    "SP.DYN.LE00.IN": "esperance_vie",
    "SE.ADT.LITR.ZS": "alphabetisation",
    "SE.PRM.ENRR": "scolarisation_primaire"
}

print("Téléchargement indicateurs WDI...")
df_wdi = wb.download(indicator=list(wb_indicators.keys()), country="BJ",
                     start=start_year, end=end_year).rename(columns=wb_indicators).reset_index()
df_wdi['year'] = pd.to_numeric(df_wdi['year'], errors='coerce').astype(int)
df_wdi.to_csv("datasets/raw/benin_wb_raw.csv", index=False)

# Calcul PIB par habitant
df_wdi['gdp_per_capita'] = df_wdi['gdp_usd'] / df_wdi['population']

# ----------------------------
# 2. Scraping indicateurs sportifs Wikipedia
# ----------------------------
url = "https://fr.wikipedia.org/wiki/B%C3%A9nin_aux_Jeux_olympiques"
resp = requests.get(url)
soup = BeautifulSoup(resp.text, "html.parser")

tables = soup.find_all("table", {"class": "wikitable"})
df_sport = pd.DataFrame()

if tables:
    table = pd.read_html(str(tables[0]))[0]
    table = table.rename(columns={table.columns[0]: "year"})
    table['year'] = pd.to_numeric(table['year'], errors='coerce')
    table = table.dropna(subset=['year'])
    table['year'] = table['year'].astype(int)
    if "Participants" in table.columns:
        table = table.rename(columns={"Participants": "nb_participants_olympiques"})
    df_sport = table[['year','nb_participants_olympiques']]

# Remplir par 0 si aucune donnée
if df_sport.empty:
    df_sport['year'] = df_wdi['year']
    df_sport['nb_participants_olympiques'] = 0

# ----------------------------
# 3. Fusion WDI + sport
# ----------------------------
df = pd.merge(df_wdi, df_sport, on='year', how='left')
df['nb_participants_olympiques'].fillna(0, inplace=True)

# ----------------------------
# 4. Calculs indicateurs dérivés
# ----------------------------
df = df.sort_values('year').reset_index(drop=True)
df['pop_growth_pct'] = df['population'].pct_change() * 100
df['gdp_growth_pct'] = df['gdp_usd'].pct_change() * 100
df['gdp_per_capita'] = df['gdp_usd'] / df['population']

# ----------------------------
# 5. Nettoyage valeurs invalides
# ----------------------------
numeric_cols = ['primary_enrolment_gross', 'adult_literacy_pct',
                'pop_growth_pct','gdp_growth_pct','gdp_per_capita']
for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').round(2)

# ----------------------------
# 6. Enrichissement WHO API
# ----------------------------
who_indicators = {
    "life_expectancy": "LIFE_EXP_BOTHSEX",
    "under5_mortality_per1000": "SH_DYN_MORT"
}

df_who_list = []
for col, code in who_indicators.items():
    url = f"https://ghoapi.azureedge.net/api/{code}"
    resp = requests.get(url)
    if resp.status_code == 200:
        data = resp.json()
        df_tmp = pd.json_normalize(data['value'])
        df_tmp = df_tmp[df_tmp['SpatialDim'] == 'BEN']
        df_tmp = df_tmp[['TimeDim','NumericValue']].rename(columns={'TimeDim':'year','NumericValue':col})
        df_tmp['year'] = pd.to_numeric(df_tmp['year'], errors='coerce').astype(int)
        df_who_list.append(df_tmp)

if df_who_list:
    df_who = df_who_list[0]
    for df_tmp in df_who_list[1:]:
        df_who = df_who.merge(df_tmp, on='year', how='outer')
    df = df.merge(df_who, on='year', how='left')

# ----------------------------
# 7. Sauvegarde dataset final
# ----------------------------
out_fp = "datasets/final/benin_multi_enriched_apis.csv"
df.to_csv(out_fp, index=False)
print(f"Dataset enrichi multi-source sauvegardé : {out_fp}")

# ----------------------------
# 8. Glossaire
# ----------------------------
glossaire_fp = "datasets/final/glossaire_variables.csv"
new_rows = [
    {"Variable":"primary_enrolment_gross","Nom complet":"Scolarisation primaire (taux brut)","Définition":"Gross enrolment ratio - primaire","Unité":"%","Source":"World Bank (WDI)","Période":"2000-2023","Géographie":"Bénin"},
    {"Variable":"adult_literacy_pct","Nom complet":"Taux d'alphabétisation adulte","Définition":"% d'adultes (15+) sachant lire et écrire","Unité":"%","Source":"World Bank (WDI)","Période":"2000-2023","Géographie":"Bénin"},
    {"Variable":"life_expectancy","Nom complet":"Espérance de vie à la naissance","Définition":"Espérance de vie totale (années)","Unité":"années","Source":"WHO","Période":"2000-2023","Géographie":"Bénin"},
    {"Variable":"under5_mortality_per1000","Nom complet":"Mortalité <5 ans","Définition":"Taux de mortalité des moins de 5 ans (par 1000 naissances)","Unité":"pour 1000","Source":"WHO","Période":"2000-2023","Géographie":"Bénin"}
]
new_df = pd.DataFrame(new_rows)
if os.path.exists(glossaire_fp):
    glossaire = pd.read_csv(glossaire_fp)
    glossaire = pd.concat([glossaire, new_df], ignore_index=True)
else:
    glossaire = new_df
glossaire.to_csv(glossaire_fp, index=False)
print("Glossaire mis à jour :", glossaire_fp)

# ----------------------------
# 9. Détection anomalies
# ----------------------------
# Liste des colonnes critiques pour les anomalies
critical_cols = ['population','gdp_usd','gdp_per_capita','under5_mortality_per1000','adult_literacy_pct']

# Créer les colonnes manquantes avec NaN
for col in critical_cols:
    if col not in df.columns:
        df[col] = pd.NA

# Détection anomalies
anomalies_rows = df[
    (df['population'] <= 0) |
    (df['gdp_usd'] <= 0) |
    (df['gdp_per_capita'] <= 0) |
    ((df['under5_mortality_per1000'] < 0) & df['under5_mortality_per1000'].notna()) |
    ((df['adult_literacy_pct'] > 150) & df['adult_literacy_pct'].notna())
].copy()

anomalies_rows.to_csv("datasets/final/anomalies_2.csv", index=False)
print("Anomalies détectées et sauvegardées : datasets/final/anomalies_2.csv")



Téléchargement indicateurs WDI...


  df_wdi = wb.download(indicator=list(wb_indicators.keys()), country="BJ",
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['nb_participants_olympiques'].fillna(0, inplace=True)


Dataset enrichi multi-source sauvegardé : datasets/final/benin_multi_enriched_apis.csv
Glossaire mis à jour : datasets/final/glossaire_variables.csv
Anomalies détectées et sauvegardées : datasets/final/anomalies_2.csv
