In [1]:
import pandas as pd
import os

In [None]:
# 🔹 Chemins
input_path = "../../data/cleaning/step1_typed.parquet"
output_path = "../../data/cleaning/step2_with_monthly_payment_corrected.parquet"

# 🔹 Chargement du fichier issu de l'étape 1
try:
    df = pd.read_parquet(input_path)
except FileNotFoundError:
    print("Erreur : le fichier step1_typed.parquet n'a pas été trouvé.")
    exit()

# 🔹 Vérification des colonnes nécessaires
required_cols = ['loan_amnt', 'int_rate', 'term']
if not all(col in df.columns for col in required_cols):
    print("Erreur : colonnes manquantes pour le calcul des mensualités.")
    exit()

# 🔹 Recalcul de la mensualité selon la formule standard
M = df['loan_amnt']
t = df['int_rate'] / (100*12)  
n = df['term']

# 🔹 Formule : m = (M * t) / (1 - (1 + t)**(-n))
with pd.option_context('mode.use_inf_as_na', True):  # gérer les divisions par zéro
    m = (M * t) / (1 - (1 + t) ** (-n))
    df['monthly_payment_calculated'] = m

# 🔹 Sauvegarde au format Parquet
df.to_parquet(output_path, index=False)
print(f"✅ Fichier sauvegardé avec mensualités recalculées : {output_path}")

  with pd.option_context('mode.use_inf_as_na', True):  # gérer les divisions par zéro


✅ Fichier sauvegardé avec mensualités recalculées : ../../data/cleaning/step2_with_monthly_payment_corrected.parquet


In [5]:
df.describe()

Unnamed: 0,loan_amnt,term,int_rate,installment,emp_length,annual_inc,issue_d,dti,earliest_cr_line,open_acc,pub_rec,revol_bal,revol_util,total_acc,mort_acc,pub_rec_bankruptcies,monthly_payment_calculated
count,396030.0,396030.0,396030.0,396030.0,377729.0,396030.0,396030,396030.0,396030,396030.0,396030.0,396030.0,395754.0,396030.0,358235.0,395495.0,396030.0
mean,141138.880893,41.698053,12.501757,39314.077666,5.938578,778209.9,2013-08-23 23:26:25.898037760,1562.102119,1998-09-27 10:20:21.180213760,113.111532,1.78191,158445.4,538.331332,254.147438,18.139908,1.216476,4241.972435
min,5000.0,36.0,0.6,285.0,0.0,0.0,2007-01-06 00:00:00,0.0,1975-01-01 00:00:00,0.0,0.0,0.0,0.0,20.0,0.0,0.0,160.77313
25%,80000.0,36.0,9.67,19926.0,3.0,450000.0,2013-01-05 00:00:00,881.0,1994-01-12 00:00:00,80.0,0.0,60250.0,358.0,170.0,0.0,0.0,2456.292134
50%,120000.0,36.0,12.99,34465.0,6.0,646350.0,2014-01-04 00:00:00,1552.0,1999-01-11 00:00:00,100.0,0.0,111810.0,548.0,240.0,10.0,0.0,3681.716044
75%,200000.0,36.0,15.88,53906.0,10.0,900000.0,2015-01-03 00:00:00,2209.0,2003-01-06 00:00:00,140.0,0.0,196200.0,729.0,320.0,30.0,0.0,5575.927817
max,400000.0,60.0,30.99,153381.0,10.0,87065820.0,2016-01-12 00:00:00,99990.0,2074-01-12 00:00:00,900.0,860.0,17432660.0,9446.0,1510.0,340.0,80.0,15338.036132
std,83574.413411,10.212038,5.398287,26412.949444,3.645623,813757.3,,925.265762,,51.376488,5.306706,205918.4,248.238561,118.869907,21.479305,3.561743,2471.637239
