# Step 1 - Data prep and cleaning

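This notebook loads the raw CSV tables, normalizes a few formats, handles missing values and duplicates, and merges the auxiliary tables into a single training table.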


## 1. Load data files

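Every CSV file found in `../data/raw/` is read into a DataFrame and stored in the `datasets` dictionary under its file name (without the extension). Files that are not valid UTF-8 are re-read as ISO-8859-1.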


In [None]:
import pandas as pd
import os
from pathlib import Path

# Directory holding the raw CSV exports; each file becomes one entry in `datasets`.
raw_data_path = Path('../data/raw/')
# Sort the matches: glob order depends on the filesystem, and sorting keeps the
# load order (and the printed report) identical from one run to the next.
files = sorted(raw_data_path.glob('*.csv'))
datasets = {}

# Make an empty or missing input directory visible instead of silently
# continuing with no tables.
if not files:
    print(f"No CSV file found in {raw_data_path}")

# Try UTF-8 first and fall back to ISO-8859-1 for files that are not valid UTF-8.
for file in files:
    try:
        df = pd.read_csv(file, encoding='utf-8')
    except UnicodeDecodeError:
        df = pd.read_csv(file, encoding='ISO-8859-1')
    # Key each table by its file name without the extension.
    datasets[file.stem] = df
    print(f"{file.name}: {df.shape[0]} lignes, {df.shape[1]} colonnes")

## 2. Validate and fix formats

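For each table, the dtype distribution is printed, `DAYS_BIRTH` (when present) is replaced by its absolute value, and text columns whose name contains `date` are parsed as datetimes when possible.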


In [None]:
# Report the dtype mix of every table, then normalize a few known formats in place.
for name, df in datasets.items():
    print(f"\n==== {name.upper()} : types initiaux ====")
    print(df.dtypes.value_counts())

    # Example: store 'DAYS_BIRTH' as an absolute value when the column exists.
    if 'DAYS_BIRTH' in df.columns:
        df['DAYS_BIRTH'] = df['DAYS_BIRTH'].abs()

    # Parse text columns whose name mentions a date. Testing for 'date' also
    # matches 'datetime', so a single substring check is enough.
    for col in df.select_dtypes('object').columns:
        if 'date' not in col.lower():
            continue
        try:
            df[col] = pd.to_datetime(df[col])
        except (ValueError, TypeError, OverflowError) as err:
            # Best effort: keep the column as text, but say so rather than
            # hiding the failure (and never swallow KeyboardInterrupt).
            print(f"{col} not converted to datetime ({err})")
        else:
            print(f"{col} convertie en datetime")

    # Update the dictionary. Rebinding an existing key does not resize the
    # dict, so this is safe while iterating over it.
    datasets[name] = df

## 3. Missing values and duplicates

Details and rationale are implemented in the code cells below.


### 3.1 Visualize missing values

For each table, the columns with more than 50% missing values are listed and a `missingno` matrix shows where the gaps are located.


In [None]:
import missingno as msno
import matplotlib.pyplot as plt

# For each table: list the columns that are more than half empty, then draw
# the missingno matrix to show where the gaps sit.
for table_name, table in datasets.items():
    print(f"\n=== {table_name.upper()} ===")

    # Share of missing values per column, highest first.
    missing_share = table.isna().mean()
    missing_share = missing_share.sort_values(ascending=False)
    mostly_missing = missing_share[missing_share > 0.5]

    print("Variables with > 50% missing values:")
    display(mostly_missing)

    msno.matrix(table)
    plt.title(f"Missing values - {table_name}")
    plt.show()

### 3.2 Cleaning: drop and impute

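Columns with more than 80% missing values are dropped. Remaining gaps are filled with the column median for numeric variables and with the placeholder `"inconnu"` for categorical ones.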


In [None]:
from sklearn.impute import SimpleImputer

# Columns whose share of missing values exceeds this ratio are dropped outright.
threshold = 0.8

for name, df in datasets.items():
    print(f"\n=== Cleaning dataset : {name.upper()} ===")

    # 1. Drop the columns with more than 80% of NaN.
    missing_ratio = df.isna().mean()
    cols_to_drop = missing_ratio[missing_ratio > threshold].index
    df.drop(columns=cols_to_drop, inplace=True)
    print(f"{len(cols_to_drop)} columns dropped (>80% NaN) : {list(cols_to_drop)}")

    # 2. Impute numeric variables (median), restricted to the columns that
    #    actually contain NaN. The imputer returns a float array, so running
    #    it on complete columns would rewrite integer columns (e.g. ID keys)
    #    as floats; it also cannot be fitted on zero columns.
    numeric = df.select_dtypes(include=['int64', 'float64'])
    num_cols = numeric.columns[numeric.isna().any().to_numpy()]
    if len(num_cols) > 0:
        imputer_num = SimpleImputer(strategy='median')
        df[num_cols] = imputer_num.fit_transform(df[num_cols])

    # 3. Impute categorical variables with the literal placeholder "inconnu".
    cat_cols = df.select_dtypes(include='object').columns
    df[cat_cols] = df[cat_cols].fillna("inconnu")

    # Update the dictionary (same key, so safe while iterating).
    datasets[name] = df

    # The numeric count is the number of columns that had gaps to fill.
    print(f"{len(num_cols)} numeric columns imputed")
    print(f"{len(cat_cols)} categorical columns imputed")

### 3.3 Remove duplicates

Fully duplicated rows are removed from each table, and the number of rows removed is reported.


In [None]:
# Drop fully identical rows from every table and report what was found.
for table_name, table in datasets.items():
    n_duplicates = table.duplicated().sum()

    if n_duplicates == 0:
        print(f"{table_name}: no duplicates detected")
    else:
        # Remove the repeated rows in place so the stored table is updated.
        table.drop_duplicates(inplace=True)
        print(f"{table_name}: {n_duplicates} duplicates removed")

    # Store the table back under its key.
    datasets[table_name] = table

## 4. Identify join keys

The join keys below were identified manually from the dataset documentation rather than inferred from the data.


## Join keys

The dictionary in the next cell maps each table name to the key column used to join it with the other tables.


In [None]:
# Key column of each table, picked by hand from the dataset documentation.
# Table names match the keys of `datasets` (CSV file names without extension).
primary_keys = dict(
    application_train='SK_ID_CURR',
    application_test='SK_ID_CURR',
    bureau='SK_ID_CURR',
    bureau_balance='SK_ID_BUREAU',
    previous_application='SK_ID_CURR',
    installments_payments='SK_ID_PREV',
    credit_card_balance='SK_ID_PREV',
    POS_CASH_balance='SK_ID_PREV',
    sample_submission='SK_ID_CURR',
)

## 5. Merge datasets

Details and rationale are implemented in the code cells below.


### 5.1 Merge bureau data

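`bureau_balance` is aggregated per `SK_ID_BUREAU` (mean and sum of its numeric columns) and joined onto `bureau`; the bureau data is then left-joined onto `application_train` on `SK_ID_CURR`.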


In [None]:
def agg_numeric_only(df, group_var, df_name, agg_funcs=('mean', 'sum')):
    """
    Aggregate the numeric columns of a DataFrame by a given key.

    Non-numeric columns are ignored. The input DataFrame is not modified.

    Args:
        df: DataFrame to aggregate.
        group_var: Name of the column to group by. It is never aggregated
            itself, whether or not it is numeric.
        df_name: Prefix added to every aggregated column name to avoid
            collisions when the result is merged into another table.
        agg_funcs: Aggregation(s) applied to each numeric column, as accepted
            by ``GroupBy.agg``. Defaults to mean and sum.

    Returns:
        A DataFrame with one row per distinct value of ``group_var``
        containing ``group_var`` followed by one column per
        (numeric column, aggregation) pair, named
        ``{df_name}_{column}_{aggregation}``.
    """
    # A bare string would otherwise be split into characters by list().
    if isinstance(agg_funcs, str):
        agg_funcs = [agg_funcs]

    value_cols = [
        col for col in df.select_dtypes(include=['number']).columns
        if col != group_var
    ]

    # Nothing to aggregate: return just the distinct keys, in the same
    # (sorted, NaN-free) order a groupby would produce.
    if not value_cols:
        keys = df[[group_var]].dropna().drop_duplicates()
        return keys.sort_values(group_var).reset_index(drop=True)

    # Group on a column selection rather than on a copy of the numeric frame,
    # which avoids duplicating large tables in memory.
    agg = df.groupby(group_var)[value_cols].agg(list(agg_funcs))
    # Flatten the (column, aggregation) MultiIndex into prefixed names.
    agg.columns = [f"{df_name}_{col}_{func}" for col, func in agg.columns]
    return agg.reset_index()

In [None]:
# Merge bureau_balance into bureau
bureau_balance = datasets['bureau_balance']
bureau = datasets['bureau']

# One row per SK_ID_BUREAU, so the join below cannot multiply bureau rows.
bureau_balance_agg = agg_numeric_only(bureau_balance, 'SK_ID_BUREAU', 'BB')
bureau = bureau.merge(
    bureau_balance_agg, on='SK_ID_BUREAU', how='left', validate='many_to_one'
)

# Merge bureau into application.
# bureau can hold several rows per SK_ID_CURR (one per SK_ID_BUREAU); joining
# it as-is would repeat each application row once per bureau row. Collapse it
# to one row per SK_ID_CURR first. Only numeric bureau columns are carried
# over, as mean/sum aggregates prefixed with "bureau_".
bureau_agg = agg_numeric_only(
    bureau.drop(columns='SK_ID_BUREAU'), 'SK_ID_CURR', 'bureau'
)

app_train = datasets['application_train']
# validate= makes pandas raise if the right-hand side is not unique per key.
app_train = app_train.merge(
    bureau_agg, on='SK_ID_CURR', how='left', validate='many_to_one'
)

### 5.2 Merge previous application histories

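The three history tables (`POS_CASH_balance`, `installments_payments`, `credit_card_balance`) are aggregated per `SK_ID_PREV` and joined onto `previous_application`, which is then left-joined onto the training table on `SK_ID_CURR`.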


In [None]:
previous = datasets['previous_application']

# Aggregates for histories linked to previous applications
for table_name in ['POS_CASH_balance', 'installments_payments', 'credit_card_balance']:
    df = datasets[table_name]
    # If the history table also carries SK_ID_CURR, leave it out so the client
    # identifier is not averaged/summed as if it were a feature.
    df_agg = agg_numeric_only(
        df.drop(columns='SK_ID_CURR', errors='ignore'), 'SK_ID_PREV', table_name
    )
    # df_agg has one row per SK_ID_PREV; validate= enforces it.
    previous = previous.merge(
        df_agg, on='SK_ID_PREV', how='left', validate='many_to_one'
    )

# Merge with application.
# previous can hold several rows per SK_ID_CURR; joining it as-is would repeat
# each training row once per previous application (and multiply with any rows
# already repeated by an earlier one-to-many merge). Collapse it to one row
# per SK_ID_CURR first; numeric columns are kept as mean/sum aggregates
# prefixed with "previous_application_".
previous_agg = agg_numeric_only(
    previous.drop(columns='SK_ID_PREV'), 'SK_ID_CURR', 'previous_application'
)
app_train = app_train.merge(
    previous_agg, on='SK_ID_CURR', how='left', validate='many_to_one'
)

### 5.3 Final export

The merged training table is written to `../data/output/train_clean_merged.csv`; the output directory is created if it does not exist.


In [None]:
# Write the merged training table to the output folder, creating the folder
# (and any missing parents) on first use.
output_path = Path('../data/output')
output_path.mkdir(exist_ok=True, parents=True)
merged_csv_path = output_path.joinpath('train_clean_merged.csv')
app_train.to_csv(merged_csv_path, index=False)

## Sampling note

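The full merged dataset is very large. For local experiments, a smaller sample is used to keep notebook execution practical. The sample consists of the first eighth of the rows of the merged file (it is not a random draw), and the full merged file is deleted once the sample has been written.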


In [None]:
import pandas as pd

# Paths of the full merged export and of the reduced sample derived from it.
merged_csv = "../data/output/train_clean_merged.csv"
sample_csv = "../data/output/train_clean_sample.csv"

# Count total rows (minus header). The file is opened in binary mode because
# only newlines are counted: nothing is decoded, so the count cannot fail on
# platforms whose default text encoding differs from the one used to write it.
with open(merged_csv, "rb") as f:
    total_lines = sum(1 for _ in f) - 1

# Work out how many rows to keep (1/8 of the file); never negative.
eighth_lines = max(total_lines, 0) // 8

# Read the first 1/8 of the rows and save them as the sample.
df_sample = pd.read_csv(merged_csv, nrows=eighth_lines)
df_sample.to_csv(sample_csv, index=False)

# Delete the oversized merged file now that the sample exists.
import os
os.remove(merged_csv)

## Conclusion

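This notebook loaded the raw CSV files, normalized formats, handled missing values and duplicates, merged the bureau and previous-application histories into the training table, and wrote a reduced sample to `../data/output/train_clean_sample.csv` for the following steps.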
