In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from openfisca_survey_manager.survey_collections import SurveyCollection
from openfisca_survey_manager.paths import default_config_files_directory as config_files_directory

In [None]:
from openfisca_france_indirect_taxation.build_survey_data.matching_bdf_entd.step_2_homogenize_variables import \
    create_niveau_vie_quantiles
from openfisca_france_indirect_taxation.build_survey_data.utils import \
    hellinger
from openfisca_france_indirect_taxation.build_survey_data.utils import \
    histogrammes
data_bdf, data_entd = create_niveau_vie_quantiles(year_data = 2017)

In [None]:
data_entd.columns

In [None]:
from openfisca_france_indirect_taxation.build_survey_data.matching_bdf_entd.step_3_1_compute_hellinger_distance import hellinger_variable   

In [None]:
matching_varlist = ["nb_diesel", "agepr", "age_vehicule", "rural", "paris", "npers", "nactifs", "veh_tot"]

In [None]:
hellinger_distances = { var : hellinger_variable(data_bdf, data_entd, var = var, weight_col = 'pondmen')[2] for var in matching_varlist }

In [None]:
hellinger_distances


In [None]:
def histogram_cat_variable(data_bdf, data_entd, var, data_name_1='BdF', data_name_2='ENTD', weight_col='pondmen'):
    """
    Plot the weighted share of each category of ``var`` in two DataFrames.

    For every distinct non-missing value of ``var`` found in either DataFrame,
    the share is the sum of ``weight_col`` over the rows holding that value,
    divided by the sum of ``weight_col`` over all rows of the same DataFrame.
    Rows where ``var`` is missing therefore count in the denominator only.
    Categories are sorted so the bar order is stable from one run to the next.

    Parameters
    ----------
    data_bdf : pd.DataFrame
        First DataFrame; must contain ``var`` and ``weight_col``.
    data_entd : pd.DataFrame
        Second DataFrame; must contain ``var`` and ``weight_col``.
    var : str
        Name of the categorical variable to compare.
    data_name_1 : str
        Legend label for the first DataFrame (default: 'BdF').
    data_name_2 : str
        Legend label for the second DataFrame (default: 'ENTD').
    weight_col : str
        Name of the weight column used in both DataFrames (default: 'pondmen').

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module itself (not a Figure object), kept for
        backward compatibility with existing callers.
    """
    categories = set(data_bdf[var].dropna().unique()).union(data_entd[var].dropna().unique())
    # A set has no defined order: sort so the bars come out in the same order on every run.
    try:
        ordered_categories = sorted(categories)
    except TypeError:
        # Values of mixed, non-comparable types: fall back to ordering by their label.
        ordered_categories = sorted(categories, key=str)

    # The denominators do not depend on the category, so compute them once.
    total_bdf = data_bdf[weight_col].sum()
    total_entd = data_entd[weight_col].sum()

    list_keys = [str(cat) for cat in ordered_categories]
    list_values_bdf = [
        data_bdf.loc[data_bdf[var] == cat, weight_col].sum() / total_bdf
        for cat in ordered_categories
    ]
    list_values_entd = [
        data_entd.loc[data_entd[var] == cat, weight_col].sum() / total_entd
        for cat in ordered_categories
    ]

    # Drawing is delegated to the project helper `histogrammes`
    # (presumably a grouped bar chart on the current figure; its code is not visible here).
    histogrammes(list_keys, list_values_bdf, list_values_entd, data_name_1, data_name_2)

    # Axis labels and title are user-facing and intentionally left in French.
    plt.xlabel(f'Catégories de {var}')
    plt.ylabel('Proportion pondérée')
    plt.title(f'Comparaison des catégories de {var} entre {data_name_1} et {data_name_2}')

    plt.grid(True, linestyle='--', alpha=0.7)
    return plt


def boxplot_variable(data_bdf, data_entd, var, data_name_1='BdF', data_name_2='ENTD'):
    """
    Draw side-by-side boxplots of ``var`` for two DataFrames.

    The two ``var`` columns are stacked into one long frame with an extra
    ``dataset`` column holding the legend label of the source DataFrame, and
    seaborn draws one box per label on a new 10x6 figure.

    Note that no weight column is used: every row counts once.

    Parameters
    ----------
    data_bdf : pd.DataFrame
        First DataFrame; must contain ``var``.
    data_entd : pd.DataFrame
        Second DataFrame; must contain ``var``.
    var : str
        Name of the variable to compare.
    data_name_1 : str
        Legend label for the first DataFrame (default: 'BdF').
    data_name_2 : str
        Legend label for the second DataFrame (default: 'ENTD').

    Returns
    -------
    matplotlib.axes.Axes
        Axes holding the boxplot.
    """
    # ignore_index avoids duplicate index labels in the stacked frame: both inputs
    # keep their own index otherwise, and the labels may overlap.
    df_plot = pd.concat(
        [
            data_bdf[[var]].assign(dataset=data_name_1),
            data_entd[[var]].assign(dataset=data_name_2),
        ],
        ignore_index=True,
    )

    # Explicit figure/axes instead of the implicit "current axes" of pyplot.
    _, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x='dataset', y=var, data=df_plot, ax=ax)
    # Title is user-facing and intentionally left in French.
    ax.set_title(f'Comparaison des distributions de {var} entre {data_name_1} et {data_name_2}')
    ax.grid(True, linestyle='--', alpha=0.7)

    return ax

In [None]:
"nb_diesel", "agepr", "age_vehicule", "rural", "paris", "npers", "nactifs", "veh_tot"

In [None]:
# One boxplot per numeric matching variable. boxplot_variable opens a new figure
# on every call, so the four plots stay separate even though they share a cell.
for boxplot_var in ("agepr", "age_vehicule", "npers", "nactifs"):
    boxplot_variable(data_bdf, data_entd, var=boxplot_var, data_name_1='BdF', data_name_2='ENTD')

In [None]:
from openfisca_france_indirect_taxation.build_survey_data.matching_bdf_entd.step_2_homogenize_variables import homogenize_variables_definition_bdf_entd 

In [None]:
data_bdf, data_entd = homogenize_variables_definition_bdf_entd(2017)

In [None]:
data_bdf['veh_tot'] = data_bdf['veh_tot'].fillna(0)

data_bdf['part_essence'] = (
    data_bdf.essence / (data_bdf.essence + data_bdf.diesel + data_bdf.autre_carbu)
    )
data_bdf['part_diesel'] = (
    data_bdf.diesel / (data_bdf.essence + data_bdf.diesel + data_bdf.autre_carbu)
    )
data_bdf['part_autre_carbu'] = 1 - data_bdf['part_essence'] - data_bdf['part_diesel']

data

In [None]:
data_entd['part_diesel']

In [None]:
data_bdf['nb_diesel'].unique()

In [None]:
# Missing vehicle counts are treated as 0 vehicles.
data_entd['veh_tot'] = data_entd['veh_tot'].fillna(0)

# Share of petrol and diesel among the three fuel columns.
data_entd['part_essence'] = data_entd['essence'].div(
    data_entd['essence'] + data_entd['diesel'] + data_entd['autre_carbu']
)
data_entd['part_diesel'] = data_entd['diesel'].div(
    data_entd['essence'] + data_entd['diesel'] + data_entd['autre_carbu']
)

# Number of vehicles per fuel = total vehicles x share; undefined products (NaN) become 0.
data_entd['nb_essence'] = data_entd['veh_tot'].mul(data_entd['part_essence']).fillna(0)
data_entd['nb_diesel'] = data_entd['veh_tot'].mul(data_entd['part_diesel']).fillna(0)

In [None]:
data_entd['nb_essence'].unique()

In [None]:
data_bdf['nb_diesel'].unique()
data_entd['nb_diesel'].unique()

In [None]:
categories = set(data_bdf['nb_diesel'].dropna().unique()).union(set(data_entd['nb_diesel'].dropna().unique()))
categories

In [None]:
histogram_cat_variable(data_bdf, data_entd, var = 'nb_diesel', data_name_1='BdF', data_name_2='ENTD')

In [None]:
histogram_cat_variable(data_bdf, data_entd, var = 'rural', data_name_1='BdF', data_name_2='ENTD')

In [None]:
histogram_cat_variable(data_bdf, data_entd, var = 'paris', data_name_1='BdF', data_name_2='ENTD')

In [None]:
histogram_cat_variable(data_bdf, data_entd, var = 'veh_tot', data_name_1='BdF', data_name_2='ENTD')

In [None]:
from openfisca_survey_manager.survey_collections import SurveyCollection
from openfisca_survey_manager.paths import default_config_files_directory as config_files_directory

In [None]:
entd_survey_collection = SurveyCollection.load(
    collection = 'enquete_transports', config_files_directory = config_files_directory
    )
survey_entd = entd_survey_collection.get_survey('enquete_transports_{}'.format(2019))
input_entd_vehicule = survey_entd.get_values(table = 'q_voitvul_public_V2')
input_entd_menage = survey_entd.get_values(table = 'q_menage_public_V2')

In [None]:
input_entd_vehicule['energie_agrege'].isna().sum()

In [None]:
from openfisca_france_indirect_taxation.build_survey_data.matching_bdf_entd.step_1_2_build_dataframes_vehicles import load_data_vehicules_bdf_entd        
data_bdf, data_entd, data_entd_menage = load_data_vehicules_bdf_entd(2017)
data_entd_full = data_entd_menage.merge(data_entd, on = 'ident_men', how = 'left')

In [None]:
data_entd_full = data_entd_menage.merge(data_entd, on = 'ident_men', how = 'left')

In [None]:
 # One 0/1 indicator column per fuel group, derived from 'energie_agrege':
 # code 1 -> essence, code 2 -> diesel, any code above 2 -> autre_carbu.
 # A missing code compares False everywhere, so such rows get 0 in all three columns.
data_entd_full['essence'] = data_entd_full['energie_agrege'].eq(1).astype('int64')
data_entd_full['diesel'] = data_entd_full['energie_agrege'].eq(2).astype('int64')
data_entd_full['autre_carbu'] = data_entd_full['energie_agrege'].gt(2).astype('int64')

In [None]:
# Share of each fuel group in the vehicles of every household (`ident_men`).
fuel_cols = ['essence', 'diesel', 'autre_carbu']
prop_cols = ['prop_essence', 'prop_diesel', 'prop_autre_carbu']

proportions = (
    data_entd_full.groupby('ident_men')[fuel_cols]
    .sum()
    .pipe(lambda df: df.assign(tot_carbu=df.sum(axis=1)))
    .assign(
        prop_essence=lambda x: x['essence'] / x['tot_carbu'],
        prop_diesel=lambda x: x['diesel'] / x['tot_carbu'],
        # Computed from its own count rather than as 1 - prop_essence - prop_diesel:
        # the subtraction can leave a tiny negative value through float round-off
        # (e.g. 1 - 0.8 - 0.2 < 0), which is not a valid probability.
        prop_autre_carbu=lambda x: x['autre_carbu'] / x['tot_carbu'],
    )
)
# Households whose three counts are all 0 get NaN shares (0 / 0).

# Attach the shares to every row of the original frame. Any share columns left by
# a previous run of this cell are dropped first, so re-running the cell does not
# produce suffixed duplicates (prop_essence_x / prop_essence_y).
data_entd_full = data_entd_full.drop(columns=prop_cols, errors='ignore').merge(
    proportions[prop_cols],
    left_on='ident_men',
    right_index=True,
    how='left',
)



In [None]:
data_entd_full

In [None]:
import numpy as np

In [None]:
# Draw one fuel code per row from the household's fuel shares.
# Codes 1 and 2 are the 'energie_agrege' codes used for essence and diesel above;
# 6 stands for the "other fuel" group (kept from the original draw; confirm the code).
imputation_rng = np.random.default_rng(2019)  # fixed seed: the imputation is reproducible across runs
fuel_codes = np.array([1, 2, 6])
fuel_probas = data_entd_full[['prop_essence', 'prop_diesel', 'prop_autre_carbu']].to_numpy(dtype=float, copy=True)

# Rows with a missing share cannot be sampled: a probability vector containing NaN
# makes the draw raise. They keep a missing imputed value instead.
has_probas = ~np.isnan(fuel_probas).any(axis=1)

# Absorb float round-off so each vector is non-negative and sums to 1.
fuel_probas[has_probas] = np.clip(fuel_probas[has_probas], 0, None)
fuel_probas[has_probas] /= fuel_probas[has_probas].sum(axis=1, keepdims=True)

energie_imputee = np.full(len(data_entd_full), np.nan)
energie_imputee[has_probas] = [imputation_rng.choice(fuel_codes, p=p) for p in fuel_probas[has_probas]]
# The column is float because of the NaN placeholders; the drawn values are 1.0, 2.0 or 6.0.
data_entd_full['energie_imputee'] = energie_imputee

In [None]:


    # NOTE(review): this cell is an indented function body with no enclosing `def`:
    # `year_data` is never defined in the notebook and the cell ends with `return`,
    # so it cannot run as a cell. Presumably pasted from load_data_vehicules_bdf_entd
    # for reference. TODO confirm, then remove or wrap in a function.

    # Compute the vehicle age
    # Values of 'annee_1mec' that cannot be parsed as numbers become NaN.
    data_entd['anvoi'] = pd.to_numeric(data_entd['annee_1mec'], errors = 'coerce')

    data_entd['age_vehicule'] = 0
    # ENTD ages are measured from the hard-coded year 2019; rows with anvoi == 0 keep age 0.
    data_entd.loc[data_entd['anvoi'] != 0, 'age_vehicule'] = 2019 - data_entd['anvoi']

    # Missing BdF years are set to 0, which the next lines treat as "age 0".
    data_bdf['anvoi'] = data_bdf['anvoi'].fillna(0).astype(int)
    data_bdf['age_vehicule'] = 0
    data_bdf.loc[data_bdf['anvoi'] != 0, 'age_vehicule'] = year_data - data_bdf['anvoi']

    # Flag vehicles by fuel type (ENTD): code 1 -> essence, 2 -> diesel, above 2 -> autre_carbu
    data_entd['essence'] = 0
    data_entd.loc[data_entd['energie_agrege'] == 1, 'essence'] = 1
    data_entd['diesel'] = 0
    data_entd.loc[data_entd['energie_agrege'] == 2, 'diesel'] = 1
    data_entd['autre_carbu'] = 0
    data_entd.loc[data_entd['energie_agrege'] > 2, 'autre_carbu'] = 1

    if year_data == 2017:
        carbu_cols = ['carbu1', 'carbu2', 'carbu3', 'carbu4', 'carbu5']
        # 'carbu' is the digit in the name of the carbuN column holding the row maximum.
        # NOTE(review): '(\d)' is a non-raw string containing a backslash escape;
        # r'(\d)' would avoid Python's invalid-escape warning. TODO confirm and fix.
        data_bdf['carbu'] = data_bdf[carbu_cols].idxmax(axis=1).str.extract('(\d)').astype(int)
        data_bdf.drop(carbu_cols, axis = 1, inplace = True)

    # Flag vehicles by fuel type (BdF), same coding as above applied to 'carbu'
    data_bdf['essence'] = 0
    data_bdf.loc[data_bdf['carbu'] == 1, 'essence'] = 1
    data_bdf['diesel'] = 0
    data_bdf.loc[data_bdf['carbu'] == 2, 'diesel'] = 1
    data_bdf['autre_carbu'] = 0
    data_bdf.loc[data_bdf['carbu'] > 2, 'autre_carbu'] = 1

    # Distance travelled, split by fuel type (ENTD, taken from 'kvkm1anv')
    data_entd['distance_essence'] = 0.0
    data_entd.loc[data_entd['essence'] == 1, 'distance_essence'] = data_entd['kvkm1anv']
    data_entd['distance_diesel'] = 0.0
    data_entd.loc[data_entd['diesel'] == 1, 'distance_diesel'] = data_entd['kvkm1anv']
    data_entd['distance_autre_carbu'] = 0.0
    data_entd.loc[data_entd['autre_carbu'] == 1, 'distance_autre_carbu'] = data_entd['kvkm1anv']

    # Frame with the number of vehicles and the distances for each fuel type, summed per `ident_men`
    data_vehicule_entd = data_entd[
        ['essence',
        'diesel',
        'autre_carbu',
        'distance_essence',
        'distance_diesel',
        'distance_autre_carbu',
        'ident_men']
        ].groupby(by = 'ident_men').sum()
    data_vehicule_entd = data_vehicule_entd.reset_index()

    # Frame with the main vehicle's information (ENTD)
    # The row kept for each `ident_men` is the one with the highest 'kvkm1anv'.
    data_entd = data_entd.sort_values(by = 'kvkm1anv', ascending= False)
    data_entd = data_entd.drop_duplicates(['ident_men'], keep='first')
    data_entd.rename(
        columns = {
            'puis_fisc_fin': 'puissance',
            'kvcons': 'consommation',
            },
        inplace = True,
        )
    data_entd = data_entd[
        ['ident_men', 'puissance', 'consommation', 'age_vehicule']
        ]

    # Distance travelled, split by fuel type (BdF, taken from 'km_auto')
    data_bdf['km_essence'] = 0.0
    data_bdf.loc[data_bdf['essence'] == 1, 'km_essence'] = data_bdf['km_auto']
    data_bdf['km_diesel'] = 0.0
    data_bdf.loc[data_bdf['diesel'] == 1, 'km_diesel'] = data_bdf['km_auto']
    data_bdf['km_autre_carbu'] = 0.0
    data_bdf.loc[data_bdf['autre_carbu'] == 1, 'km_autre_carbu'] = data_bdf['km_auto']

    # Frame with the number of vehicles and the distances for each fuel type, summed per `ident_men`
    data_vehicule_bdf = data_bdf[
        ['essence',
        'diesel',
        'autre_carbu',
        'km_essence',
        'km_diesel',
        'km_autre_carbu',
        'ident_men']
        ].groupby(by = 'ident_men').sum()
    data_vehicule_bdf = data_vehicule_bdf.reset_index()

    # Frame with the main vehicle's information (BdF)
    # NOTE(review): the row kept for each `ident_men` is the first one after an ascending
    # sort on 'nbvehic'; whether that row is really the "main" vehicle is not visible here.
    data_bdf = data_bdf.sort_values(by = ['nbvehic'])
    data_bdf = data_bdf.drop_duplicates(['ident_men'], keep='first')
    data_bdf.rename(
        columns = {
            'acqvoi': 'etat_veh_achat',
            'expvoi1': 'vp_domicile_travail',
            'expvoi2': 'vp_deplacements_pro',
            'nbvehic': 'veh_tot',
            'privoi_d': 'prix_achat',
            },
        inplace = True,
        )
    data_bdf = data_bdf[
        ['ident_men', 'prix_achat', 'veh_tot', 'etat_veh_achat', 'age_vehicule', 'vp_domicile_travail', 'vp_deplacements_pro']
        ]

    # Household behaviour information frame
    data_entd_menage.rename(
        columns = {
            # 'v1_logdist01': 'distance_commerces',              (not in q_menage, look elsewhere?)
            # 'v1_logdist15': 'distance_transports_communs',     (not in q_menage, look elsewhere?)
            'jnbveh': 'veh_tot',
            # 'v1_jpasvoit_b': 'vp_domicile_travail',            (not in q_menage, look elsewhere?)
            # 'v1_jpasvoit_c': 'vp_deplacements_pro'             (not in q_menage, look elsewhere?)
            },
        inplace = True,
        )

    # Merge the different frames
    # Left merges: every row of the left-hand frame is kept even without a match on `ident_men`.
    data_entd_full = data_vehicule_entd.merge(data_entd, on = 'ident_men', how = 'left')
    data_entd_final = data_entd_menage.merge(data_entd_full, on = 'ident_men', how = 'left')

    data_bdf_full = data_vehicule_bdf.merge(data_bdf, on = 'ident_men', how = 'left')

    return data_bdf_full, data_entd_final