In [1]:
import numpy as np
import pandas as pd
import os
import csv
import ast
import seaborn as sns
from matplotlib import pyplot as plt
import wquantiles 

from tqdm import tqdm
from wquantiles import quantile
from openfisca_survey_manager.utils import asof

from openfisca_france_indirect_taxation import FranceIndirectTaxationTaxBenefitSystem
from openfisca_france_indirect_taxation.surveys import SurveyScenario
from openfisca_france_indirect_taxation.examples.utils_example import (
    dataframe_by_group,
    df_weighted_average_grouped,
    wavg)
from openfisca_france_indirect_taxation.build_survey_data.utils import weighted_sum
from openfisca_france_indirect_taxation.projects.TVA_Herve_IPP.new_calage_bdf_cn import new_get_inflators_by_year, new_get_cn_aggregates
from openfisca_france_indirect_taxation.projects.TVA_Herve_IPP.Utils import weighted_quantiles
from openfisca_france_indirect_taxation.utils import assets_directory, get_input_data_frame

In [2]:
# Variables requested from each simulation (passed to create_data_frame_by_entity).
simulated_variables = [
    'aise',
    'depenses_totales',
    'depenses_tot',
    'rev_disponible',
    'niveau_de_vie',
    'niveau_vie_decile',
    'decile_indiv_niveau_vie',
    'ocde10',
    'pondmen',
    'nactifs',
    'npers',
    'identifiant_menage',
    ]

In [51]:
# First simulation: raw BdF 2017 data
year = data_year = 2017
tax_benefit_system = FranceIndirectTaxationTaxBenefitSystem()

# No inflation_kwargs are passed for this scenario.
survey_scenario = SurveyScenario.create(
    tax_benefit_system = tax_benefit_system,
    year = year,
    data_year = data_year,
    )

In [44]:
def epargne(df):
    """Build the per-decile savings table for a household-level frame.

    Two indicator columns are added to ``df`` in place:
      * ``part_epargnants``: 1.0 when ``rev_disponible >= depenses_tot``, else 0.0;
      * ``aise_1_2_3``: 1.0 when ``aise <= 3``, else 0.0.

    The frame is then aggregated by ``decile_indiv_niveau_vie`` through
    ``df_weighted_average_grouped`` (weighting scheme defined by that helper —
    presumably household weights, TODO confirm), and ``taux_epargne`` is
    computed on the aggregated values as ``1 - depenses_tot / rev_disponible``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``rev_disponible``, ``depenses_tot``, ``aise``,
        ``niveau_de_vie``, ``ocde10`` and ``decile_indiv_niveau_vie``.

    Returns
    -------
    The aggregated frame returned by ``df_weighted_average_grouped``, with the
    extra ``taux_epargne`` column.
    """
    income_covers_spending = df['rev_disponible'] >= df['depenses_tot']
    df['part_epargnants'] = income_covers_spending.astype(float)

    aise_at_most_3 = df['aise'] <= 3
    df['aise_1_2_3'] = aise_at_most_3.astype(float)

    averaged_columns = [
        'depenses_tot',
        'rev_disponible',
        'niveau_de_vie',
        'ocde10',
        'part_epargnants',
        'aise_1_2_3',
        ]
    df_by_decile = df_weighted_average_grouped(
        dataframe = df,
        groupe = 'decile_indiv_niveau_vie',
        varlist = averaged_columns,
        )

    # Savings rate from the group aggregates, not the mean of individual rates.
    spending_over_income = df_by_decile['depenses_tot'] / df_by_decile['rev_disponible']
    df_by_decile['taux_epargne'] = 1 - spending_over_income
    return df_by_decile
    

In [52]:
bdf_brut_2017 = survey_scenario.create_data_frame_by_entity(simulated_variables, period = 2017)['menage']
bdf_brut_2017_by_decile = epargne(bdf_brut_2017)

In [53]:
bdf_brut_2017_by_decile

Unnamed: 0_level_0,depenses_tot,rev_disponible,niveau_de_vie,ocde10,part_epargnants,aise_1_2_3,taux_epargne
decile_indiv_niveau_vie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1.0,16618.849857,12033.365675,7771.597011,1.505684,0.364042,0.556694,-0.381064
2.0,18669.596003,18695.775843,12194.837137,1.535194,0.606756,0.646267,0.0014
3.0,20154.570616,22062.551965,14711.467432,1.499341,0.668024,0.7179,0.086481
4.0,22767.916162,25747.384567,17021.68215,1.513742,0.696474,0.800655,0.115719
5.0,24178.406425,28938.312244,19152.627451,1.510622,0.723806,0.840285,0.164485
6.0,27044.725435,32627.493487,21407.492165,1.523802,0.748634,0.875082,0.171106
7.0,29295.627583,37174.906606,23894.011292,1.55614,0.783319,0.917643,0.211952
8.0,32679.086339,42383.481753,27136.563509,1.562476,0.792963,0.927321,0.228966
9.0,35957.389789,50143.268038,32092.461224,1.563346,0.855351,0.948395,0.282907
10.0,48513.473511,76299.78235,49005.906036,1.55299,0.872814,0.980521,0.364173


In [47]:
# Second simulation: BdF data calibrated on national accounts ("compta nat") for 2017
inflators_by_year = new_get_inflators_by_year(
    rebuild = True,
    year_range = range(2017, 2025),
    data_year = data_year,
    )
inflation_kwargs = {'inflator_by_variable': inflators_by_year[year]}

# Rebinds `survey_scenario`: from here on it refers to the scenario built with inflation_kwargs.
survey_scenario = SurveyScenario.create(
    inflation_kwargs = inflation_kwargs,
    tax_benefit_system = tax_benefit_system,
    year = year,
    data_year = data_year,
    )

  data_bdf_postes_cn[poste] = 0


In [48]:
bdf_cale_2017 = survey_scenario.create_data_frame_by_entity(simulated_variables, period = 2017)['menage']
bdf_cale_2017_by_decile = epargne(bdf_cale_2017)

In [49]:
bdf_cale_2017_by_decile

Unnamed: 0_level_0,depenses_tot,rev_disponible,niveau_de_vie,ocde10,part_epargnants,aise_1_2_3,taux_epargne
decile_indiv_niveau_vie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1.0,20668.353962,14621.500769,9443.111331,1.505684,0.38263,0.556694,-0.413559
2.0,22954.577933,22716.861486,14817.701543,1.535194,0.636913,0.646267,-0.010464
3.0,24796.036146,26807.763471,17875.608407,1.499341,0.679809,0.7179,0.075043
4.0,28093.29356,31285.129565,20682.703918,1.513742,0.703334,0.800655,0.102024
5.0,30760.658627,35162.361638,23271.972713,1.510622,0.730713,0.840285,0.125182
6.0,34929.534747,39645.011599,26011.813482,1.523802,0.754121,0.875082,0.118943
7.0,35568.988561,45170.481892,29033.132894,1.55614,0.785993,0.917643,0.212561
8.0,41255.380525,51499.316872,32973.09299,1.562476,0.788073,0.927321,0.198914
9.0,43611.139633,60928.077225,38994.904632,1.563346,0.849966,0.948395,0.284219
10.0,56293.722275,92710.332169,59546.091661,1.55299,0.872776,0.980521,0.3928


In [10]:
# Comparison with ERFS (FPR 2017 csv files).
# The directory can be overridden with the ERFS_FPR_2017_DIR environment variable so the
# notebook runs on other machines; the historical location stays the default.
erfs_path = os.environ.get(
    "ERFS_FPR_2017_DIR",
    "C:/Users/veve1/OneDrive/Documents/ENSAE 3A/Memoire MiE/Data/erfs_fpr/2017/csv",
    )
erfs_menage_2017 = pd.read_csv(os.path.join(erfs_path, "fpr_menage_2017.csv"), sep = ";")
erfs_mrf17 = pd.read_csv(os.path.join(erfs_path, "fpr_mrf17e17t4.csv"), sep = ";")

# Household table enriched with the second file, matched on the household identifier.
erfs_2017 = erfs_menage_2017.merge(erfs_mrf17, how = "left", left_on = 'ident17', right_on = 'ident17')
erfs_2017.columns = erfs_2017.columns.str.lower()
# Align the weight name with the BdF tables ('wprm' -> 'pondmen').
erfs_2017 = erfs_2017.rename(columns = {'wprm': 'pondmen'})
# Individual weight = household weight x number of individuals in the household.
erfs_2017['pondindiv'] = erfs_2017['nbind'] * erfs_2017['pondmen']

# Living standard = household disposable income per consumption unit.
erfs_2017['niveau_de_vie'] = erfs_2017['revdispm'] / erfs_2017['nb_uci']
erfs_2017['decile_indiv_niveau_vie'] = weighted_quantiles(erfs_2017['niveau_de_vie'], labels = np.arange(1,11), weights = erfs_2017['pondindiv'], return_quantiles=False)
erfs_2017_by_decile = df_weighted_average_grouped(erfs_2017, groupe = 'decile_indiv_niveau_vie', varlist = ['revdispm','niveau_de_vie','nb_uci'])

In [11]:
(erfs_2017['revdispm'] * erfs_2017['pondmen']).sum() * 1E-9

1046.961731829182

In [12]:
cn_aggregates = new_get_cn_aggregates(2017)
cn_aggregates.loc[cn_aggregates.index == 'rev_disponible','conso_CN_2017'] * 1e-9

Code
rev_disponible    1198.2253
Name: conso_CN_2017, dtype: float64

In [54]:
bdf_brut_2017_by_decile

Unnamed: 0_level_0,depenses_tot,rev_disponible,niveau_de_vie,ocde10,part_epargnants,aise_1_2_3,taux_epargne
decile_indiv_niveau_vie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1.0,16618.849857,12033.365675,7771.597011,1.505684,0.364042,0.556694,-0.381064
2.0,18669.596003,18695.775843,12194.837137,1.535194,0.606756,0.646267,0.0014
3.0,20154.570616,22062.551965,14711.467432,1.499341,0.668024,0.7179,0.086481
4.0,22767.916162,25747.384567,17021.68215,1.513742,0.696474,0.800655,0.115719
5.0,24178.406425,28938.312244,19152.627451,1.510622,0.723806,0.840285,0.164485
6.0,27044.725435,32627.493487,21407.492165,1.523802,0.748634,0.875082,0.171106
7.0,29295.627583,37174.906606,23894.011292,1.55614,0.783319,0.917643,0.211952
8.0,32679.086339,42383.481753,27136.563509,1.562476,0.792963,0.927321,0.228966
9.0,35957.389789,50143.268038,32092.461224,1.563346,0.855351,0.948395,0.282907
10.0,48513.473511,76299.78235,49005.906036,1.55299,0.872814,0.980521,0.364173


In [14]:
erfs_2017_by_decile

Unnamed: 0_level_0,revdispm,niveau_de_vie,nb_uci
decile_indiv_niveau_vie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,10549.366141,6885.910338,1.460163
2.0,19171.650407,12553.453549,1.529674
3.0,22669.085425,15141.987546,1.497272
4.0,26202.206028,17419.079931,1.504354
5.0,29605.349803,19592.440636,1.510467
6.0,33656.452165,21907.3091,1.5359
7.0,38045.423074,24567.160571,1.548459
8.0,43616.957178,27997.120623,1.558526
9.0,51999.964338,33609.156754,1.546625
10.0,88050.55176,56997.497074,1.544034


In [55]:
bdf_cale_2017_by_decile

Unnamed: 0_level_0,depenses_tot,rev_disponible,niveau_de_vie,ocde10,part_epargnants,aise_1_2_3,taux_epargne
decile_indiv_niveau_vie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1.0,20668.353962,14621.500769,9443.111331,1.505684,0.38263,0.556694,-0.413559
2.0,22954.577933,22716.861486,14817.701543,1.535194,0.636913,0.646267,-0.010464
3.0,24796.036146,26807.763471,17875.608407,1.499341,0.679809,0.7179,0.075043
4.0,28093.29356,31285.129565,20682.703918,1.513742,0.703334,0.800655,0.102024
5.0,30760.658627,35162.361638,23271.972713,1.510622,0.730713,0.840285,0.125182
6.0,34929.534747,39645.011599,26011.813482,1.523802,0.754121,0.875082,0.118943
7.0,35568.988561,45170.481892,29033.132894,1.55614,0.785993,0.917643,0.212561
8.0,41255.380525,51499.316872,32973.09299,1.562476,0.788073,0.927321,0.198914
9.0,43611.139633,60928.077225,38994.904632,1.563346,0.849966,0.948395,0.284219
10.0,56293.722275,92710.332169,59546.091661,1.55299,0.872776,0.980521,0.3928


In [None]:
# Give the living-standard column a source-specific name in each decile table.
# Both renames happen in place; renaming an already-renamed table is a no-op.
bdf_brut_2017_by_decile.rename(columns = {'niveau_de_vie': 'niveau_de_vie_bdf'}, inplace = True)
erfs_2017_by_decile.rename(columns = {'niveau_de_vie': 'niveau_de_vie_erfs'}, inplace = True)

In [62]:
# Put the BdF and ERFS decile averages side by side (both tables are indexed by decile).
# `suffixes` names any column still shared by the two tables (e.g. 'niveau_de_vie' when the
# renaming cell has not been run) as *_bdf / *_erfs, so the ratio below is computable
# whatever the execution order; it has no effect when the columns are already distinct.
base_calage = bdf_brut_2017_by_decile.merge(
    erfs_2017_by_decile,
    how = 'left',
    left_index = True,
    right_index = True,
    suffixes = ('_bdf', '_erfs'),
    )
base_calage['ratio_erfs_bdf'] = base_calage['niveau_de_vie_erfs'] / base_calage['niveau_de_vie_bdf']
base_calage

Unnamed: 0_level_0,depenses_tot,rev_disponible,niveau_de_vie_bdf,ocde10,part_epargnants,aise_1_2_3,taux_epargne,revdispm,niveau_de_vie_erfs,nb_uci,ratio_erfs_bdf
decile_indiv_niveau_vie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1.0,16618.849857,12033.365675,7771.597011,1.505684,0.364042,0.556694,-0.381064,10549.366141,6885.910338,1.460163,0.886035
2.0,18669.596003,18695.775843,12194.837137,1.535194,0.606756,0.646267,0.0014,19171.650407,12553.453549,1.529674,1.029407
3.0,20154.570616,22062.551965,14711.467432,1.499341,0.668024,0.7179,0.086481,22669.085425,15141.987546,1.497272,1.029264
4.0,22767.916162,25747.384567,17021.68215,1.513742,0.696474,0.800655,0.115719,26202.206028,17419.079931,1.504354,1.023347
5.0,24178.406425,28938.312244,19152.627451,1.510622,0.723806,0.840285,0.164485,29605.349803,19592.440636,1.510467,1.022964
6.0,27044.725435,32627.493487,21407.492165,1.523802,0.748634,0.875082,0.171106,33656.452165,21907.3091,1.5359,1.023348
7.0,29295.627583,37174.906606,23894.011292,1.55614,0.783319,0.917643,0.211952,38045.423074,24567.160571,1.548459,1.028172
8.0,32679.086339,42383.481753,27136.563509,1.562476,0.792963,0.927321,0.228966,43616.957178,27997.120623,1.558526,1.031712
9.0,35957.389789,50143.268038,32092.461224,1.563346,0.855351,0.948395,0.282907,51999.964338,33609.156754,1.546625,1.04726
10.0,48513.473511,76299.78235,49005.906036,1.55299,0.872814,0.980521,0.364173,88050.55176,56997.497074,1.544034,1.163074


In [16]:
input_bdf_2017 = get_input_data_frame(2017)

# Sum the detailed 'poste_*' columns into groups keyed by the first three
# underscore-separated parts of their name.
input_bdf_2017_grouped = (
    input_bdf_2017
    .filter(like='poste_')
    .groupby(lambda col: '_'.join(col.split('_')[:3]), axis=1)
    .sum()
    )

# Groups whose name starts with poste_17 / poste_18 are excluded from the analysis.
to_drop = [
    col for col in input_bdf_2017_grouped.columns
    if col.startswith(('poste_17', 'poste_18'))
    ]
# Household descriptors first, then the remaining grouped expenditure columns.
input_bdf_2017_grouped = pd.concat(
    [
        input_bdf_2017[['pondmen','npers','ident_men','rev_disponible','ocde10','aise','stalog','typmen']],
        input_bdf_2017_grouped.drop(columns=to_drop),
    ],
    axis=1,
    )

## Méthode 1 : redressement des revenus sur la base d’un filtre de cohérence revenu/consommation

Tiré de Bellamy et al. (2009): "Le redressement du revenu basé sur le filtre de cohérence revenu - consommation est le
suivant :
    (i) sur l’ensemble du fichier de l’enquête les très faibles revenus sont redressés : le
niveau de vie d’un ménage déclarant moins de 300 € par mois et par UC est
ramené automatiquement à 300 € par mois et par UC. Ce cas concerne 16
ménages sur les 10 240 de l’échantillon.
    (ii) On définit ensuite une "consommation courante" en éliminant les
consommations exceptionnelles.
    (iii) Si cette consommation courante est supérieure de plus de 20 % au revenu
courant (revenu constaté une fois ce dernier éventuellement mis à 300 € par
mois et par UC en (i)), soit pour 15,3 % des ménages, on effectue le test
suivant, analogue à celui utilisé par Loisy (1999).

Si le ménage :
1 - déclare être « à l’aise » financièrement, que « ça va » ou que « c’est juste, mais il
faut faire attention »
et
2 - déclare mettre de l’argent de côté ou que revenus et dépenses s’équilibrent
alors on remonte le revenu calculé en (i) au niveau de la consommation courante (ii). Dans
les autres cas, on laisse le revenu (i) et la consommation inchangés. Au total 10,7 % des
ménages sont affectés par ce redressement. On notera que ce traitement touche a priori
tous les ménages, quel que soit leur niveau de vie initial."

In [17]:
# Imputed rents (poste_04_2) are removed from expenditures.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
input_bdf_2017_grouped.drop(columns='poste_04_2', errors='ignore', inplace=True)
input_bdf_2017_grouped = input_bdf_2017_grouped.apply(pd.to_numeric, errors='coerce')

# "Current" consumption: each expenditure group is capped at a weighted quantile threshold.
# NOTE(review): results[1][19] is presumably the upper bound of the 19th of the 20 weighted
# quantile groups (i.e. p95) — confirm against weighted_quantiles.
liste_poste = [col for col in input_bdf_2017_grouped.columns.tolist() if col.startswith('poste')]
for poste in liste_poste:
    results = weighted_quantiles(input_bdf_2017_grouped[poste], labels = np.arange(1,21), weights = input_bdf_2017_grouped['pondmen'], return_quantiles=True)
    input_bdf_2017_grouped['conso_courante_{}'.format(poste)] = input_bdf_2017_grouped[poste].clip(upper=results[1][19])

input_bdf_2017_grouped['depenses_tot'] = input_bdf_2017_grouped[liste_poste].sum(axis = 1)
# Sum an explicit column list: a name-based filter on 'conso_courante' would also pick up
# 'conso_courante_totale' itself when the cell is run a second time and double the total.
liste_conso_courante = ['conso_courante_{}'.format(poste) for poste in liste_poste]
input_bdf_2017_grouped['conso_courante_totale'] = input_bdf_2017_grouped[liste_conso_courante].sum(axis = 1)

# Income floor: 3600 per year (300 per month) per consumption unit.
input_bdf_2017_grouped['rev_disponible_2'] = input_bdf_2017_grouped['rev_disponible'].clip(lower= 3600 * input_bdf_2017_grouped['ocde10'])

In [18]:
len(input_bdf_2017_grouped.loc[input_bdf_2017_grouped['rev_disponible_2'] > input_bdf_2017_grouped['rev_disponible']])

137

Il y a 137 ménages dont le niveau de vie est inférieur à 300€ par mois par UC.

In [19]:
# Coherence filter: current consumption exceeds the floored income by more than 20 %.
input_bdf_2017_grouped['TEST'] = input_bdf_2017_grouped['conso_courante_totale'] > 1.2 * input_bdf_2017_grouped['rev_disponible_2']
# Flagged households with aise <= 3 get their income raised to current consumption;
# every other household keeps the floored income. A mask is used rather than a sum of
# boolean-weighted terms because, when `aise` is missing, both `aise <= 3` and `aise > 3`
# are False and such a sum would give a flagged household an income of 0.
a_redresser = input_bdf_2017_grouped['TEST'] & (input_bdf_2017_grouped['aise'] <= 3)
input_bdf_2017_grouped['rev_disponible_3'] = input_bdf_2017_grouped['rev_disponible_2'].mask(a_redresser, input_bdf_2017_grouped['conso_courante_totale'])

In [20]:
input_bdf_2017_grouped['TEST'].sum(axis = 0)

1232

Il y a 1 232 ménages (825 quand le seuil est à p90) dont la consommation courante est supérieure de plus de 20 % à leur revenu disponible (après application du plancher de 300 € par mois et par UC).

In [21]:
(input_bdf_2017_grouped['TEST']*(input_bdf_2017_grouped['aise'] <= 3)).sum()

890

Dont 890 (559 quand le seuil est à p90) qui déclarent être « à l’aise » financièrement, que « ça va » ou que « c’est juste, mais il
faut faire attention ».

In [22]:
# Saver flags: income covers total expenditure, first with the adjusted income
# (rev_disponible_3), then with the income as originally reported.
input_bdf_2017_grouped['part_epargnants'] = input_bdf_2017_grouped['rev_disponible_3'].ge(input_bdf_2017_grouped['depenses_tot'])
input_bdf_2017_grouped['part_epargnants_initiale'] = input_bdf_2017_grouped['rev_disponible'].ge(input_bdf_2017_grouped['depenses_tot'])

In [23]:
# Individual weight = household size x household weight.
input_bdf_2017_grouped['pondindiv'] = input_bdf_2017_grouped['npers'] * input_bdf_2017_grouped['pondmen']

# Living standard (income per consumption unit) for the reported, floored and adjusted incomes.
for suffix in ('', '_2', '_3'):
    income = input_bdf_2017_grouped[f'rev_disponible{suffix}']
    input_bdf_2017_grouped[f'niveau_de_vie{suffix}'] = income / input_bdf_2017_grouped['ocde10']

# Deciles of the adjusted living standard, using individual weights.
input_bdf_2017_grouped['decile_indiv_niveau_vie_3'] = weighted_quantiles(
    input_bdf_2017_grouped['niveau_de_vie_3'],
    labels = np.arange(1,11),
    weights = input_bdf_2017_grouped['pondindiv'],
    return_quantiles=False,
    )
input_bdf_2017_by_decile = df_weighted_average_grouped(
    input_bdf_2017_grouped,
    groupe = 'decile_indiv_niveau_vie_3',
    varlist = ['niveau_de_vie_3','rev_disponible_3','depenses_tot','pondmen','part_epargnants','part_epargnants_initiale'],
    )

# Savings rate computed on the decile aggregates.
depenses_sur_revenu = input_bdf_2017_by_decile['depenses_tot'] / input_bdf_2017_by_decile['rev_disponible_3']
input_bdf_2017_by_decile['taux_epargne'] = 1 - depenses_sur_revenu

In [24]:
input_bdf_2017_by_decile

Unnamed: 0_level_0,niveau_de_vie_3,rev_disponible_3,depenses_tot,pondmen,part_epargnants,part_epargnants_initiale,taux_epargne
decile_indiv_niveau_vie_3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1.0,8870.04789,13870.594505,14208.776642,3140.234542,0.548106,0.504947,-0.024381
2.0,12824.824834,19546.282103,17052.213569,3015.639235,0.681657,0.652485,0.127598
3.0,15280.530966,23403.316928,20192.046268,2928.113398,0.671712,0.663997,0.137214
4.0,17508.092291,25978.991113,21591.939618,3071.092906,0.705966,0.694074,0.168869
5.0,19638.26117,29939.650331,24209.424145,2872.515766,0.729557,0.718682,0.191393
6.0,21814.442111,33220.946586,27426.392924,2937.162837,0.730572,0.719146,0.174425
7.0,24262.404514,37595.77717,30186.9295,2994.762479,0.745023,0.739163,0.197066
8.0,27478.556999,42572.954991,33187.333283,2937.728071,0.749734,0.746818,0.22046
9.0,32334.160539,49580.00654,37403.114166,2868.578056,0.794068,0.793443,0.245601
10.0,49292.286932,76580.92828,49485.366649,2805.690502,0.858588,0.858588,0.353816


In [25]:
erfs_2017_by_decile

Unnamed: 0_level_0,revdispm,niveau_de_vie,nb_uci
decile_indiv_niveau_vie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,10549.366141,6885.910338,1.460163
2.0,19171.650407,12553.453549,1.529674
3.0,22669.085425,15141.987546,1.497272
4.0,26202.206028,17419.079931,1.504354
5.0,29605.349803,19592.440636,1.510467
6.0,33656.452165,21907.3091,1.5359
7.0,38045.423074,24567.160571,1.548459
8.0,43616.957178,27997.120623,1.558526
9.0,51999.964338,33609.156754,1.546625
10.0,88050.55176,56997.497074,1.544034


## Méthode 2 : Imputation des données de consommation par strates

### Méthode 2.1 : En utilisant le revenu disponible de BdF (brut)

In [26]:
# Columns starting with poste_17 / poste_18 are removed from the input frame (in place).
to_drop = [
    col for col in input_bdf_2017.columns.tolist()
    if col.startswith(('poste_17', 'poste_18'))
    ]
input_bdf_2017.drop(columns = to_drop, inplace = True)

# Living standard = disposable income per consumption unit, stored as float.
input_bdf_2017['niveau_de_vie'] = (input_bdf_2017['rev_disponible'] / input_bdf_2017['ocde10']).astype(float)
input_bdf_2017['pondmen'] = input_bdf_2017['pondmen'].astype(float)
input_bdf_2017['npers'] = input_bdf_2017['npers'].astype(float)

# Individual weight = household size x household weight.
input_bdf_2017['pondindiv'] = input_bdf_2017['npers'] * input_bdf_2017['pondmen']

input_bdf_2017['decile_indiv_niveau_vie'] = weighted_quantiles(
    data = input_bdf_2017['niveau_de_vie'],
    labels = np.arange(1,11),
    weights = input_bdf_2017['pondindiv'],
    return_quantiles=False,
    )
# Recode stalog: 3, 4 and 5 become 2; values absent from the mapping are left unchanged.
input_bdf_2017['stalog_2'] = input_bdf_2017['stalog'].replace({1:1, 3:2, 4:2, 5:2})

In [27]:
def assign_strate_ines(row):
    """Return the stratum label of one household row.

    The label depends on ``row['typmen']``:
      * 5            -> ``'T5'``
      * 2            -> ``'T2_D<decile>'``
      * 1, 3 or 4    -> ``'T<typmen>_D<decile>_S<stalog_2>'``
      * anything else -> ``'Other'``

    ``<decile>`` and ``<stalog_2>`` are ``row['decile_indiv_niveau_vie']`` and
    ``row['stalog_2']`` converted with ``int``; they are only read in the
    branches that need them.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``DataFrame.apply(axis=1)``)

    Returns
    -------
    str
    """
    household_type = row['typmen']

    if household_type == 5:
        return 'T5'

    if household_type == 2:
        decile = int(row["decile_indiv_niveau_vie"])
        return f'T2_D{decile}'

    if household_type not in [1, 3, 4]:
        return 'Other'

    type_code = int(household_type)
    decile = int(row["decile_indiv_niveau_vie"])
    tenure_code = int(row["stalog_2"])
    return f'T{type_code}_D{decile}_S{tenure_code}'

input_bdf_2017['strate_ines'] = input_bdf_2017.apply(assign_strate_ines, axis=1)

In [28]:
input_bdf_2017['strate_ines'].nunique()

71

In [29]:
input_bdf_2017[['decile_indiv_niveau_vie','typmen','stalog','strate_ines']]

Unnamed: 0,decile_indiv_niveau_vie,typmen,stalog,strate_ines
0,7.0,3,1,T3_D7_S1
1,10.0,4,2,T4_D10_S2
2,7.0,4,1,T4_D7_S1
3,2.0,1,3,T1_D2_S2
4,7.0,4,1,T4_D7_S1
...,...,...,...,...
12076,5.0,1,3,T1_D5_S2
12077,10.0,4,2,T4_D10_S2
12078,4.0,4,1,T4_D4_S1
12079,8.0,3,1,T3_D8_S1


In [30]:
input_bdf_2017['strate_ines'].value_counts()

T3_D10_S1    413
T2_D1        389
T2_D2        318
T1_D1_S2     309
T2_D3        306
            ... 
T4_D3_S1      54
T4_D2_S1      54
T3_D1_S1      51
T2_D10        48
T4_D1_S1      43
Name: strate_ines, Length: 71, dtype: int64

In [31]:
liste_poste_bdf = [col for col in input_bdf_2017.columns.tolist() if col.startswith('poste_')]
liste_var = ['rev_disponible'] + liste_poste_bdf

input_bdf_2017_by_strate = df_weighted_average_grouped(input_bdf_2017, groupe = 'strate_ines', varlist = liste_var)

# Budget share of each item within a stratum = stratum average of the item divided by the
# stratum average of disposable income. All shares are computed in one vectorised division
# and attached with a single concat: inserting the columns one by one in a loop fragments
# the frame (likely the source of the pandas warning this cell used to emit).
parts_by_strate = (
    input_bdf_2017_by_strate[liste_poste_bdf]
    .div(input_bdf_2017_by_strate['rev_disponible'], axis = 0)
    .add_prefix('part_')
    )
input_bdf_2017_by_strate = pd.concat([input_bdf_2017_by_strate, parts_by_strate], axis = 1)

part_conso_by_strat = input_bdf_2017_by_strate.filter(like = 'part').reset_index()

  input_bdf_2017_by_strate['part_{}'.format(poste)] = input_bdf_2017_by_strate['{}'.format(poste)] / input_bdf_2017_by_strate['rev_disponible']


In [32]:
# Attach the stratum budget shares to every household of the stratum.
new_input_bdf_2017 = input_bdf_2017.merge(part_conso_by_strat, how = 'left', left_on = 'strate_ines', right_on = 'strate_ines')

# Imputed expenditure per item = stratum budget share x the household's own disposable income.
# Computed in one vectorised product and attached with a single concat, instead of inserting
# the columns one by one (which fragments the frame and triggers a pandas warning).
postes_bdf = [col for col in input_bdf_2017.columns.tolist() if col.startswith('poste_')]
new_postes = new_input_bdf_2017[['part_{}'.format(poste) for poste in postes_bdf]].mul(new_input_bdf_2017['rev_disponible'], axis = 0)
new_postes.columns = ['new_{}'.format(poste) for poste in postes_bdf]
new_input_bdf_2017 = pd.concat([new_input_bdf_2017, new_postes], axis = 1)

liste_new_poste = [col for col in new_input_bdf_2017.columns.tolist() if col.startswith('new_poste')]
new_input_bdf_2017['depenses_tot'] = new_input_bdf_2017[liste_new_poste].sum(axis = 1)

# Decile averages and the savings rate computed on these aggregates.
new_input_bdf_2017_by_decile = df_weighted_average_grouped(dataframe = new_input_bdf_2017, groupe = 'decile_indiv_niveau_vie', varlist =['depenses_tot','rev_disponible'])
new_input_bdf_2017_by_decile['taux_epargne'] = 1 - new_input_bdf_2017_by_decile['depenses_tot'] / new_input_bdf_2017_by_decile['rev_disponible']

  new_input_bdf_2017['new_{}'.format(poste)] = new_input_bdf_2017['part_{}'.format(poste)] * new_input_bdf_2017['rev_disponible']


In [33]:
new_input_bdf_2017_by_decile

Unnamed: 0_level_0,depenses_tot,rev_disponible,taux_epargne
decile_indiv_niveau_vie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,18612.2815,11599.193347,-0.604619
2.0,21053.291203,18695.940417,-0.126089
3.0,23311.658745,22062.75479,-0.056607
4.0,26584.543601,25747.670334,-0.032503
5.0,28295.546328,28938.607224,0.022222
6.0,31657.044128,32627.717343,0.02975
7.0,34597.586832,37175.1051,0.069335
8.0,38757.98069,42383.685422,0.085545
9.0,43033.300463,50143.457903,0.141796
10.0,57509.462753,76671.035257,0.249919


In [34]:
def imputation_depenses_ines(input_df):
    """Impute household expenditures from stratum-level budget shares.

    Steps:
      1. drop the columns starting with ``poste_17`` / ``poste_18``;
      2. compute ``niveau_de_vie`` (= ``rev_disponible`` / ``ocde10``) and its weighted deciles;
      3. recode ``stalog`` into ``stalog_2`` and assign a stratum with ``assign_strate_ines``;
      4. per stratum, divide the average of each ``poste_*`` column by the average
         ``rev_disponible`` (``part_poste_*``);
      5. rebuild each household's expenditures as ``part_poste_*`` times its own
         ``rev_disponible`` (``new_poste_*``) and sum them into ``depenses_tot``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Needs ``rev_disponible``, ``ocde10``, ``pondmen``, ``stalog``, ``typmen`` and the
        ``poste_*`` columns. It is modified in place: the columns of step 1 are dropped and
        ``niveau_de_vie``, ``decile_indiv_niveau_vie``, ``stalog_2`` and ``strate_ines`` are added.

    Returns
    -------
    tuple
        ``(new_input_df, new_input_df_by_decile)``: the household frame with the imputed
        columns, and the decile table of ``depenses_tot``, ``rev_disponible`` and ``taux_epargne``.

    NOTE(review): the deciles are weighted with ``pondmen`` although the column is named
    ``decile_indiv_niveau_vie`` — confirm whether individual weights were intended.
    """
    to_drop = [col for col in input_df.columns.tolist() if col.startswith(('poste_17', 'poste_18'))]
    input_df.drop(to_drop, axis = 1, inplace = True)

    input_df['niveau_de_vie'] = (input_df['rev_disponible'] / input_df['ocde10']).astype(float)
    input_df['pondmen'] = input_df['pondmen'].astype(float)
    input_df['decile_indiv_niveau_vie'] = weighted_quantiles(data = input_df['niveau_de_vie'], labels = np.arange(1,11), weights = input_df['pondmen'], return_quantiles=False)
    # 3, 4 and 5 become 2; values absent from the mapping are left unchanged.
    input_df['stalog_2'] = input_df['stalog'].replace({1:1, 3:2, 4:2, 5:2})

    input_df['strate_ines'] = input_df.apply(assign_strate_ines, axis=1)
    liste_poste = [col for col in input_df.columns.tolist() if col.startswith('poste_')]
    liste_var = ['rev_disponible'] + liste_poste

    input_df_by_strate = df_weighted_average_grouped(input_df, groupe = 'strate_ines', varlist = liste_var)
    # Budget shares per stratum, computed in one vectorised division: inserting one column
    # per item in a loop fragments the frame and triggers a pandas warning.
    part_conso_by_strat = (
        input_df_by_strate[liste_poste]
        .div(input_df_by_strate['rev_disponible'], axis = 0)
        .add_prefix('part_')
        .reset_index()
        )

    new_input_df = input_df.merge(part_conso_by_strat, how = 'left', left_on = 'strate_ines', right_on = 'strate_ines')
    # Imputed expenditure = stratum budget share x the household's own disposable income,
    # again built in one pass and attached with a single concat.
    new_postes = new_input_df[['part_{}'.format(poste) for poste in liste_poste]].mul(new_input_df['rev_disponible'], axis = 0)
    new_postes.columns = ['new_{}'.format(poste) for poste in liste_poste]
    new_input_df = pd.concat([new_input_df, new_postes], axis = 1)

    liste_new_poste = [col for col in new_input_df.columns.tolist() if col.startswith('new_poste')]
    new_input_df['depenses_tot'] = new_input_df[liste_new_poste].sum(axis = 1)

    new_input_df_by_decile = df_weighted_average_grouped(dataframe = new_input_df, groupe = 'decile_indiv_niveau_vie', varlist =['depenses_tot','rev_disponible'])
    # Savings rate computed on the decile aggregates.
    new_input_df_by_decile['taux_epargne'] = 1 - new_input_df_by_decile['depenses_tot'] / new_input_df_by_decile['rev_disponible']

    return (new_input_df, new_input_df_by_decile)

### Méthode 2.2 : En utilisant les consos et revenus BdF calés en 2017

In [35]:
input_bdf_cale_2017 = survey_scenario.create_data_frame_by_entity(liste_var + ['ocde10','pondmen','stalog','typmen'], period = 2017)['menage']

In [36]:
new_input_bdf_cale_2017, new_input_bdf_cale_2017_by_decile = imputation_depenses_ines(input_bdf_cale_2017)

  input_df_by_strate['part_{}'.format(poste)] = input_df_by_strate['{}'.format(poste)] / input_df_by_strate['rev_disponible']
  new_input_df['new_{}'.format(poste)] = new_input_df['part_{}'.format(poste)] * new_input_df['rev_disponible']


In [37]:
new_input_bdf_cale_2017_by_decile

Unnamed: 0_level_0,depenses_tot,rev_disponible,taux_epargne
decile_indiv_niveau_vie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,20007.811761,14069.24448,-0.422096
2.0,22856.973505,22814.61709,-0.001857
3.0,24643.756769,26783.250618,0.079882
4.0,28163.371912,31276.013087,0.099522
5.0,30490.474136,34890.238635,0.126103
6.0,34603.907942,39403.562384,0.121808
7.0,35670.279512,45070.958138,0.208575
8.0,41355.604681,51198.827453,0.192255
9.0,43859.248055,60812.681652,0.278781
10.0,56652.70365,92976.841634,0.390679


In [38]:
new_input_bdf_cale_2017_by_decile[['depenses_tot','rev_disponible']].sum(axis = 0)

depenses_tot      338304.131922
rev_disponible    419296.235171
dtype: float64

In [39]:
1 - 275495/ 345077

0.2016419523758466

In [40]:
new_input_bdf_2017_by_decile

Unnamed: 0_level_0,depenses_tot,rev_disponible,taux_epargne
decile_indiv_niveau_vie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,18612.2815,11599.193347,-0.604619
2.0,21053.291203,18695.940417,-0.126089
3.0,23311.658745,22062.75479,-0.056607
4.0,26584.543601,25747.670334,-0.032503
5.0,28295.546328,28938.607224,0.022222
6.0,31657.044128,32627.717343,0.02975
7.0,34597.586832,37175.1051,0.069335
8.0,38757.98069,42383.685422,0.085545
9.0,43033.300463,50143.457903,0.141796
10.0,57509.462753,76671.035257,0.249919


### Méthode 2.3 : En utilisant les revenus calés sur ceux de TaxIPP et les dépenses calées pour 13 Mds€

In [41]:
from openfisca_france_indirect_taxation.projects.TVA_Herve_IPP.new_calage_bdf_cn import new_get_cn_aggregates

In [42]:
cn_agregates = new_get_cn_aggregates(2017)

In [43]:
cn_agregates.loc[cn_agregates.index == 'poste_06']

Unnamed: 0_level_0,2017,Solde territorial,conso_CN_2017
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
poste_06,20360640000.0,,20360640000.0
