In [1]:
#%pip install pandas numpy scikit-learn matplotlib seaborn

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

def prepare_and_analyze_data():
    """
    Combines two raw datasets, performs data cleaning, preprocessing, and
    exploratory data analysis (EDA).

    Returns:
        pd.DataFrame: A clean, preprocessed DataFrame.
    """

    # Define the base directory for the data files.
    data_dir = "../3.RawDataStorage/hagging/"
    data_dirk= "../3.RawDataStorage/kaggle/"

    # Define the full file paths using the base directory.
    file_path_h = os.path.join(data_dir, "h-bank_churn.csv")
    file_path_k = os.path.join(data_dirk, "k-bank_churn.csv")

    # Check if the input files exist
    if not os.path.exists(file_path_h) or not os.path.exists(file_path_k):
        print(f"Error: One or both of the files ('{file_path_h}' and '{file_path_k}') were not found.")
        return None

    print("--- Starting Data Preparation and EDA ---")

    # Load and merge the raw data
    print("Step 0: Loading and Merging Datasets")
    df_h = pd.read_csv(file_path_h)
    df_k = pd.read_csv(file_path_k)
    df = pd.concat([df_h, df_k], ignore_index=True)
    print(f"  - Successfully merged datasets. Total rows: {len(df)}")

    # 5.1 Clean and preprocess the raw data
    print("Step 1: Handling Missing and Inconsistent Values")

    # Replace empty strings with NaN
    df.replace('', np.nan, inplace=True)

    # Impute numerical missing values with the median
    numerical_cols = ['Balance', 'Tenure', 'EstimatedSalary']
    for col in numerical_cols:
        if df[col].isnull().sum() > 0:
            median_val = df[col].median()
            df[col].fillna(median_val, inplace=True)
            print(f"  - Imputed missing values in '{col}' with median: {median_val:.2f}")

    # Remove rows with missing values in critical categorical columns
    df.dropna(subset=['Surname', 'Gender'], inplace=True)
    print(f"  - Removed rows with missing 'Surname' or 'Gender'. New dataset size: {len(df)}")

    # Handle inconsistent data (e.e.g., 'unknown' or invalid values)
    df['Geography'] = df['Geography'].replace('unknown', 'France')
    df['Gender'] = df['Gender'].replace('other', np.nan)
    df['HasCrCard'] = df['HasCrCard'].replace(99, 1)

    # Remove duplicates
    initial_rows = len(df)
    df.drop_duplicates(inplace=True)
    duplicates_removed = initial_rows - len(df)
    print(f"  - Removed {duplicates_removed} duplicate rows.")

    # 5.2 Perform EDA to identify trends, distributions, and outliers.
    print("\nStep 2: Performing Exploratory Data Analysis (EDA) and generating visualizations...")
    os.makedirs('eda_visualizations', exist_ok=True)

    # Visualizations for numerical data
    print("  - Generating histograms for numerical features...")
    df.hist(figsize=(15, 10))
    plt.suptitle("Histograms of Numerical Features", y=0.95, fontsize=16)
    plt.savefig(os.path.join('eda_visualizations', 'histograms.png'))
    plt.close()

    print("  - Generating box plots for outlier detection...")
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    for i, col in enumerate(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']):
        sns.boxplot(y=df[col], ax=axes[i//3, i%3])
        axes[i//3, i%3].set_title(f'Box Plot of {col}')
    plt.suptitle("Box Plots for Outlier Detection", y=0.95, fontsize=16)
    plt.tight_layout()
    plt.savefig(os.path.join('eda_visualizations', 'boxplots.png'))
    plt.close()

    # Visualizations for categorical data
    print("  - Generating count plots for categorical features...")
    categorical_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember', 'Exited']
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    for i, col in enumerate(categorical_cols):
        sns.countplot(x=df[col], ax=axes[i//3, i%3])
        axes[i//3, i%3].set_title(f'Count Plot of {col}')
    plt.suptitle("Count Plots of Categorical Features", y=0.95, fontsize=16)
    plt.tight_layout()
    plt.savefig(os.path.join('eda_visualizations', 'count_plots.png'))
    plt.close()

    # 5.3 Standardize and encode data
    print("\nStep 3: Standardizing and Encoding Data...")

    # Identify feature types
    numeric_features = df.select_dtypes(include=np.number).columns.tolist()
    categorical_features = df.select_dtypes(include='object').columns.tolist()

    # Drop non-essential columns from feature lists before dropping them from the DataFrame
    cols_to_drop = ['RowNumber', 'CustomerId', 'Surname']
    numeric_features = [col for col in numeric_features if col not in cols_to_drop]
    categorical_features = [col for col in categorical_features if col not in cols_to_drop]

    # Drop non-essential columns for modeling
    df.drop(cols_to_drop, axis=1, inplace=True)

    # Create preprocessing pipelines for numerical and categorical data
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')), # Redundant but good practice
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Create a preprocessor using ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'
    )

    # Fit and transform the data
    df_clean = preprocessor.fit_transform(df)

    # Get feature names after one-hot encoding
    new_features = preprocessor.get_feature_names_out()

    df_clean = pd.DataFrame(df_clean, columns=new_features)

    print("--- Data Preparation and EDA Complete ---")
    print(f"Final dataset shape: {df_clean.shape}")

    return df_clean

if __name__ == "__main__":
    clean_dataset = prepare_and_analyze_data()

    if clean_dataset is not None:
        print("\nDeliverable: A clean dataset ready for transformations:")
        print(clean_dataset.head())
        # You can save this cleaned dataset if needed
        clean_dataset.to_csv('cleaned_data.csv', index=False)

--- Starting Data Preparation and EDA ---
Step 0: Loading and Merging Datasets
  - Successfully merged datasets. Total rows: 15010
Step 1: Handling Missing and Inconsistent Values
  - Imputed missing values in 'Balance' with median: 103202.74
  - Imputed missing values in 'Tenure' with median: 5.00
  - Imputed missing values in 'EstimatedSalary' with median: 154508.43
  - Removed rows with missing 'Surname' or 'Gender'. New dataset size: 14960
  - Removed 10 duplicate rows.

Step 2: Performing Exploratory Data Analysis (EDA) and generating visualizations...
  - Generating histograms for numerical features...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)


  - Generating box plots for outlier detection...
  - Generating count plots for categorical features...

Step 3: Standardizing and Encoding Data...
--- Data Preparation and EDA Complete ---
Final dataset shape: (14950, 15)

Deliverable: A clean dataset ready for transformations:
   num__CreditScore  num__Age  num__Tenure  num__Balance  num__NumOfProducts  \
0         -0.039196  0.124312    -1.013669     -1.335200           -1.394443   
1         -0.125802  0.043188    -1.350455     -0.125025           -1.394443   
2         -0.960374  0.124312     1.007046      0.970282            0.367244   
3          0.590669 -0.119059    -1.350455     -1.335200           -0.513599   
4          1.779540  0.205435    -1.013669      0.477160           -1.394443   

   num__HasCrCard  num__IsActiveMember  num__EstimatedSalary  num__Exited  \
0        0.667295             0.977510             -0.581393     1.530205   
1       -1.498588             0.977510             -0.459792    -0.653507   
2      