In [1]:
import os
import glob
import pyspark
from pyspark.sql.functions import col

In [2]:
# Initialize SparkSession
spark = pyspark.sql.SparkSession.builder \
    .appName("read_parquet") \
    .master("local[*]") \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/07 03:45:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/10/07 03:45:05 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
os.getcwd()

'/app/experiments'

In [4]:
silver_path = '/app/datamart/silver/'

# Silver layer - Parquet

## METHOD 1: Read a specific partition (single date)

In [3]:
def read_single_partition(table_name, snapshot_date_str):
    """
    Read a single partition for a specific date
    
    Args:
        table_name: One of ['loan_daily', 'feature_clickstream', 
                           'features_attributes', 'features_financials']
        snapshot_date_str: Date string in format 'YYYY-MM-DD'
    
    Returns:
        DataFrame
    """
    silver_directory = "/app/datamart/silver/"
    table_path = os.path.join(silver_directory, table_name)
    
    partition_name = f"silver_{table_name}_{snapshot_date_str.replace('-','_')}.parquet"
    filepath = os.path.join(table_path, partition_name)
    
    if os.path.exists(filepath):
        df = spark.read.parquet(filepath)
        print(f"Loaded {table_name} for {snapshot_date_str}: {df.count()} rows")
        return df
    else:
        print(f"File not found: {filepath}")
        return None

In [13]:
df_loan = read_single_partition('loan_daily', '2023-01-01')
if df_loan:
    df_loan.show(5)
    df_loan.printSchema()

                                                                                

Loaded loan_daily for 2023-01-01: 530 rows
+--------------------+-----------+---------------+------+---------------+--------+-------+--------+-----------+-------+-------------+---+-------------------+-----------------+---+
|             loan_id|Customer_ID|loan_start_date|tenure|installment_num|loan_amt|due_amt|paid_amt|overdue_amt|balance|snapshot_date|mob|installments_missed|first_missed_date|dpd|
+--------------------+-----------+---------------+------+---------------+--------+-------+--------+-----------+-------+-------------+---+-------------------+-----------------+---+
|CUS_0x1037_2023_0...| CUS_0x1037|     2023-01-01|    10|              0|   10000|    0.0|     0.0|        0.0|10000.0|   2023-01-01|  0|                  0|             NULL|  0|
|CUS_0x1069_2023_0...| CUS_0x1069|     2023-01-01|    10|              0|   10000|    0.0|     0.0|        0.0|10000.0|   2023-01-01|  0|                  0|             NULL|  0|
|CUS_0x114a_2023_0...| CUS_0x114a|     2023-01-01|    10|

## METHOD 2: Read all partitions for a table (all dates)

In [5]:
def read_all_partitions(table_name):
    """
    Read all partitions for a table across all dates
    
    Args:
        table_name: One of ['loan_daily', 'feature_clickstream', 
                           'features_attributes', 'features_financials']
    
    Returns:
        DataFrame with all historical data
    """
    silver_directory = "/app/datamart/silver/"
    table_path = os.path.join(silver_directory, table_name)
    
    # Get all parquet files
    parquet_files = glob.glob(os.path.join(table_path, "*.parquet"))
    
    if parquet_files:
        # Read all parquet files at once
        df = spark.read.parquet(*parquet_files)
        print(f"Loaded all partitions for {table_name}: {df.count()} rows")
        print(f"Date range: {df.agg({'snapshot_date': 'min'}).collect()[0][0]} to {df.agg({'snapshot_date': 'max'}).collect()[0][0]}")
        return df
    else:
        print(f"No parquet files found in {table_path}")
        return None

In [6]:
df_clickstream = read_all_partitions('loan_daily')
if df_clickstream:
    df_clickstream.show(5)
    print(f"Unique dates: {df_clickstream.select('snapshot_date').distinct().count()}")

                                                                                

Loaded all partitions for loan_daily: 104288 rows
Date range: 2023-01-01 to 2024-12-01
+--------------------+-----------+---------------+------+---------------+--------+-------+--------+-----------+-------+-------------+---+-------------------+-----------------+---+----------+
|             loan_id|Customer_ID|loan_start_date|tenure|installment_num|loan_amt|due_amt|paid_amt|overdue_amt|balance|snapshot_date|mob|installments_missed|first_missed_date|dpd| asof_date|
+--------------------+-----------+---------------+------+---------------+--------+-------+--------+-----------+-------+-------------+---+-------------------+-----------------+---+----------+
|CUS_0x100b_2024_0...| CUS_0x100b|     2024-03-01|    10|              6|   10000| 1000.0|  1000.0|        0.0| 4000.0|   2024-09-01|  6|                  0|             NULL|  0|2024-09-01|
|CUS_0x1011_2023_1...| CUS_0x1011|     2023-11-01|    10|             10|   10000| 1000.0|  1000.0|        0.0|    0.0|   2024-09-01| 10|            

## METHOD 3: Read with wildcard (Spark handles directory automatically)

In [5]:
def read_with_wildcard(table_name):
    """
    Let Spark read the entire directory - simplest approach
    
    Args:
        table_name: One of ['loan_daily', 'feature_clickstream', 
                           'features_attributes', 'features_financials']
    
    Returns:
        DataFrame with all historical data
    """
    silver_directory = "/app/datamart/silver/"
    table_path = os.path.join(silver_directory, table_name, "*.parquet")
    
    df = spark.read.parquet(table_path)
    print(f"Loaded {table_name}: {df.count()} rows")
    return df

In [16]:
df_attributes = read_with_wildcard('features_attributes')
if df_attributes:
    df_attributes.show(5)

                                                                                

Loaded features_attributes: 11974 rows
+-----------+------------+-------------+---+
|Customer_ID|  Occupation|snapshot_date|Age|
+-----------+------------+-------------+---+
| CUS_0x10ac|   Developer|   2024-08-01| 29|
| CUS_0x10c5|     _______|   2024-08-01| 24|
| CUS_0x1145|     Teacher|   2024-08-01| 24|
| CUS_0x11ac|  Journalist|   2024-08-01| 26|
| CUS_0x122c|Entrepreneur|   2024-08-01| 48|
+-----------+------------+-------------+---+
only showing top 5 rows



## METHOD 4: Read specific date range

In [18]:
def read_date_range(table_name, start_date, end_date):
    """
    Read partitions within a specific date range
    
    Args:
        table_name: Table name
        start_date: Start date string 'YYYY-MM-DD'
        end_date: End date string 'YYYY-MM-DD'
    
    Returns:
        Filtered DataFrame
    """
    # Read all partitions
    df = read_all_partitions(table_name)
    
    if df:
        # Filter by date range
        df_filtered = df.filter(
            (col("snapshot_date") >= start_date) & 
            (col("snapshot_date") <= end_date)
        )
        print(f"Filtered to {start_date} - {end_date}: {df_filtered.count()} rows")
        return df_filtered
    
    return None

In [19]:
df_financials = read_date_range('features_financials', '2023-01-01', '2023-06-01')
if df_financials:
    df_financials.show(5)

Loaded all partitions for features_financials: 11974 rows
Date range: 2023-01-01 to 2024-12-01
Filtered to 2023-01-01 - 2023-06-01: 3085 rows
+-----------+---------------------+-----------------+---------------+-------------+--------------------+-------------------+--------------------+----------+------------------------+--------------------+---------------------+-------------------+--------------------+------------------+-------------+-------------+-----------+----------------------+--------------------+----------------+-----------------------+-----------------------+------------------------+-----------+-------------------+-------------------------+-----------------------+---------------------+-------------------+----------------------+----------------------------+---------------+----------+
|Customer_ID|Monthly_Inhand_Salary|Num_Bank_Accounts|Num_Credit_Card|Interest_Rate|        Type_of_Loan|Delay_from_due_date|Num_Credit_Inquiries|Credit_Mix|Credit_Utilization_Ratio|  Credit_Histor

In [22]:
df_financials.printSchema()

root
 |-- Customer_ID: string (nullable = true)
 |-- Monthly_Inhand_Salary: double (nullable = true)
 |-- Num_Bank_Accounts: integer (nullable = true)
 |-- Num_Credit_Card: integer (nullable = true)
 |-- Interest_Rate: integer (nullable = true)
 |-- Type_of_Loan: string (nullable = true)
 |-- Delay_from_due_date: integer (nullable = true)
 |-- Num_Credit_Inquiries: double (nullable = true)
 |-- Credit_Mix: string (nullable = true)
 |-- Credit_Utilization_Ratio: double (nullable = true)
 |-- Credit_History_Age: string (nullable = true)
 |-- Payment_of_Min_Amount: string (nullable = true)
 |-- Total_EMI_per_month: double (nullable = true)
 |-- Payment_Behaviour: string (nullable = true)
 |-- Monthly_Balance: double (nullable = true)
 |-- snapshot_date: date (nullable = true)
 |-- Annual_Income: float (nullable = true)
 |-- Num_of_Loan: integer (nullable = true)
 |-- Num_of_Delayed_Payment: integer (nullable = true)
 |-- Changed_Credit_Limit: float (nullable = true)
 |-- Outstanding_Debt:

## METHOD 5: Read and convert to Pandas

In [7]:
def read_to_pandas(table_name, snapshot_date_str=None):
    """
    Read parquet and convert to Pandas DataFrame
    
    Args:
        table_name: Table name
        snapshot_date_str: Optional specific date. If None, reads all dates
    
    Returns:
        Pandas DataFrame
    """
    if snapshot_date_str:
        df_spark = read_single_partition(table_name, snapshot_date_str)
    else:
        df_spark = read_all_partitions(table_name)
    
    if df_spark:
        # Convert to Pandas
        df_pandas = df_spark.toPandas()
        print(f"Converted to Pandas: {len(df_pandas)} rows")
        return df_pandas
    
    return None


In [23]:
df_pandas = read_to_pandas('loan_daily', '2023-01-01')
if df_pandas is not None:
    display(df_pandas.head())
    print(f"\nDataFrame info:")
    df_pandas.info()

Loaded loan_daily for 2023-01-01: 530 rows
Converted to Pandas: 530 rows


Unnamed: 0,loan_id,Customer_ID,loan_start_date,tenure,installment_num,loan_amt,due_amt,paid_amt,overdue_amt,balance,snapshot_date,mob,installments_missed,first_missed_date,dpd,asof_date
0,CUS_0x1037_2023_01_01,CUS_0x1037,2023-01-01,10,0,10000,0.0,0.0,0.0,10000.0,2023-01-01,0,0,,0,2023-01-01
1,CUS_0x1069_2023_01_01,CUS_0x1069,2023-01-01,10,0,10000,0.0,0.0,0.0,10000.0,2023-01-01,0,0,,0,2023-01-01
2,CUS_0x114a_2023_01_01,CUS_0x114a,2023-01-01,10,0,10000,0.0,0.0,0.0,10000.0,2023-01-01,0,0,,0,2023-01-01
3,CUS_0x1184_2023_01_01,CUS_0x1184,2023-01-01,10,0,10000,0.0,0.0,0.0,10000.0,2023-01-01,0,0,,0,2023-01-01
4,CUS_0x1297_2023_01_01,CUS_0x1297,2023-01-01,10,0,10000,0.0,0.0,0.0,10000.0,2023-01-01,0,0,,0,2023-01-01



DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 530 entries, 0 to 529
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   loan_id              530 non-null    object 
 1   Customer_ID          530 non-null    object 
 2   loan_start_date      530 non-null    object 
 3   tenure               530 non-null    int32  
 4   installment_num      530 non-null    int32  
 5   loan_amt             530 non-null    int32  
 6   due_amt              530 non-null    float64
 7   paid_amt             530 non-null    float64
 8   overdue_amt          530 non-null    float64
 9   balance              530 non-null    float64
 10  snapshot_date        530 non-null    object 
 11  mob                  530 non-null    int32  
 12  installments_missed  530 non-null    int32  
 13  first_missed_date    0 non-null      object 
 14  dpd                  530 non-null    int32  
 15  asof_date            53

# Bronze Layer - CSV

In [7]:
import os
import glob
import pandas as pd
import pyspark
from pyspark.sql.functions import col, lit
from datetime import datetime

In [8]:
# Initialize Spark
spark = pyspark.sql.SparkSession.builder \
    .appName("read_bronze") \
    .master("local[*]") \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

In [9]:
# Configuration
BRONZE_DIR = "/app/datamart/bronze/"
TABLE_NAME = "lms_loan_daily"

## METHOD 1: Read Single Partition (One Snapshot Date)

In [10]:
def read_single_partition_csv(bronze_dir, table_name, snapshot_date_str, spark):
    """
    Read a single CSV partition for a specific snapshot date
    
    Args:
        bronze_dir: Bronze directory path
        table_name: Table name (e.g., 'lms_loan_daily')
        snapshot_date_str: Date string 'YYYY-MM-DD'
        spark: SparkSession
    
    Returns:
        DataFrame for that specific date
    """
    # Construct the filename
    partition_name = f"bronze_{table_name}_{snapshot_date_str.replace('-', '_')}.csv"
    filepath = os.path.join(bronze_dir, table_name, partition_name)
    
    print(f"\nReading file: {filepath}")
    
    if not os.path.exists(filepath):
        print(f"❌ File not found: {filepath}")
        return None
    
    # Read the CSV
    df = spark.read.csv(filepath, header=True, inferSchema=True)
    
    print(f"✓ Loaded {df.count():,} rows from {snapshot_date_str}")
    
    return df

In [32]:
snapshot_date = "2023-01-01"
df_single = read_single_partition_csv(BRONZE_DIR, TABLE_NAME, snapshot_date, spark)

if df_single:
    print("\nSample data:")
    df_single.show(5)
    print(f"\nColumns: {df_single.columns}")


Reading file: /app/datamart/bronze/lms_loan_daily/bronze_lms_loan_daily_2023_01_01.csv
✓ Loaded 0 rows from 2023-01-01

Sample data:
+-------+-----------+---------------+------+---------------+--------+-------+--------+-----------+-------+-------------+
|loan_id|Customer_ID|loan_start_date|tenure|installment_num|loan_amt|due_amt|paid_amt|overdue_amt|balance|snapshot_date|
+-------+-----------+---------------+------+---------------+--------+-------+--------+-----------+-------+-------------+
+-------+-----------+---------------+------+---------------+--------+-------+--------+-----------+-------+-------------+


Columns: ['loan_id', 'Customer_ID', 'loan_start_date', 'tenure', 'installment_num', 'loan_amt', 'due_amt', 'paid_amt', 'overdue_amt', 'balance', 'snapshot_date']


## METHOD 2: Read MULTIPLE partitions (date range)

In [12]:
def read_multiple_partitions_csv(bronze_dir, table_name, start_date, end_date, spark):
    """
    Read multiple CSV partitions for a date range
    
    Args:
        bronze_dir: Bronze directory path
        table_name: Table name
        start_date: Start date string 'YYYY-MM-DD'
        end_date: End date string 'YYYY-MM-DD'
        spark: SparkSession
    
    Returns:
        Combined DataFrame with all dates
    """
    table_path = os.path.join(bronze_dir, table_name)
    
    # Get all CSV files in the directory
    all_files = glob.glob(os.path.join(table_path, "*.csv"))
    
    print(f"\nFound {len(all_files)} total partitions")
    
    # Filter files by date range
    selected_files = []
    for filepath in all_files:
        filename = os.path.basename(filepath)
        # Extract date from filename: bronze_lms_loan_daily_2023_01_01.csv
        date_part = filename.replace("bronze_", "").replace(f"{table_name}_", "").replace(".csv", "")
        file_date = date_part.replace("_", "-")  # Convert 2023_01_01 to 2023-01-01
        
        if start_date <= file_date <= end_date:
            selected_files.append(filepath)
    
    print(f"Selected {len(selected_files)} partitions in range {start_date} to {end_date}")
    
    if not selected_files:
        print("❌ No files found in date range")
        return None
    
    # Read all selected files
    dfs = []
    for filepath in sorted(selected_files):
        df = spark.read.csv(filepath, header=True, inferSchema=True)
        
        # Extract date and add as column
        filename = os.path.basename(filepath)
        date_part = filename.replace("bronze_", "").replace(f"{table_name}_", "").replace(".csv", "")
        file_date = date_part.replace("_", "-")
        
        df = df.withColumn("snapshot_date", lit(file_date))
        dfs.append(df)
        
        print(f"  ✓ Loaded {filepath}: {df.count():,} rows")
    
    # Union all dataframes
    combined_df = dfs[0]
    for df in dfs[1:]:
        combined_df = combined_df.union(df)
    
    print(f"\n✓ Combined total: {combined_df.count():,} rows")
    
    return combined_df

In [None]:
df_multiple = read_multiple_partitions_csv(
    BRONZE_DIR, TABLE_NAME, 
    "2023-01-01", "2023-03-01", 
    spark
)

if df_multiple:
    print("\nDate distribution:")
    df_multiple.groupBy("snapshot_date").count().orderBy("snapshot_date").show()

## METHOD 3: Read ALL partitions (entire history)

In [13]:
def read_all_partitions_csv(bronze_dir, table_name, spark):
    """
    Read all CSV partitions (entire history)
    
    Args:
        bronze_dir: Bronze directory path
        table_name: Table name
        spark: SparkSession
    
    Returns:
        Combined DataFrame with all historical data
    """
    table_path = os.path.join(bronze_dir, table_name)
    
    # Get all CSV files
    all_files = glob.glob(os.path.join(table_path, "*.csv"))
    
    print(f"\nFound {len(all_files)} partitions")
    
    if not all_files:
        print("❌ No CSV files found")
        return None
    
    # Read all files
    dfs = []
    for filepath in sorted(all_files):
        df = spark.read.csv(filepath, header=True, inferSchema=True)
        
        # Extract and add snapshot_date
        filename = os.path.basename(filepath)
        date_part = filename.replace("bronze_", "").replace(f"{table_name}_", "").replace(".csv", "")
        file_date = date_part.replace("_", "-")
        
        df = df.withColumn("snapshot_date", lit(file_date))
        dfs.append(df)
    
    # Union all
    combined_df = dfs[0]
    for df in dfs[1:]:
        combined_df = combined_df.union(df)
    
    total_rows = combined_df.count()
    unique_dates = combined_df.select("snapshot_date").distinct().count()
    
    print(f"✓ Loaded {total_rows:,} total rows")
    print(f"✓ Spanning {unique_dates} unique dates")
    
    return combined_df


In [14]:
df_all = read_all_partitions_csv(BRONZE_DIR, TABLE_NAME, spark)

if df_all:
    print("\nDate range:")
    df_all.select("snapshot_date").summary("min", "max").show()


Found 24 partitions
✓ Loaded 104,288 total rows
✓ Spanning 24 unique dates

Date range:
+-------+-------------+
|summary|snapshot_date|
+-------+-------------+
|    min|   2023-01-01|
|    max|   2024-12-01|
+-------+-------------+



## METHOD 4: Using Pandas (for smaller datasets)

In [None]:
def read_partition_pandas(bronze_dir, table_name, snapshot_date_str):
    """
    Read a single partition using Pandas
    Faster for small to medium datasets
    
    Args:
        bronze_dir: Bronze directory path
        table_name: Table name
        snapshot_date_str: Date string 'YYYY-MM-DD'
    
    Returns:
        Pandas DataFrame
    """
    partition_name = f"bronze_{table_name}_{snapshot_date_str.replace('-', '_')}.csv"
    filepath = os.path.join(bronze_dir, table_name, partition_name)
    
    print(f"\nReading with Pandas: {filepath}")
    
    if not os.path.exists(filepath):
        print(f"❌ File not found")
        return None
    
    df = pd.read_csv(filepath)
    df['snapshot_date'] = snapshot_date_str
    
    print(f"✓ Loaded {len(df):,} rows")
    
    return df

In [None]:
df_pandas = read_partition_pandas(BRONZE_DIR, TABLE_NAME, "2023-01-01")

if df_pandas is not None:
    print("\nPandas DataFrame info:")
    print(df_pandas.head())
    print(f"\nShape: {df_pandas.shape}")

## METHOD 5: Read specific partitions by list of dates

In [None]:
def read_specific_dates_csv(bronze_dir, table_name, date_list, spark):
    """
    Read specific partitions by providing a list of dates
    
    Args:
        bronze_dir: Bronze directory path
        table_name: Table name
        date_list: List of date strings ['YYYY-MM-DD', ...]
        spark: SparkSession
    
    Returns:
        Combined DataFrame
    """
    print(f"\nReading {len(date_list)} specific partitions:")
    
    dfs = []
    for date_str in date_list:
        partition_name = f"bronze_{table_name}_{date_str.replace('-', '_')}.csv"
        filepath = os.path.join(bronze_dir, table_name, partition_name)
        
        if not os.path.exists(filepath):
            print(f"  ⚠ Skipping {date_str}: file not found")
            continue
        
        df = spark.read.csv(filepath, header=True, inferSchema=True)
        df = df.withColumn("snapshot_date", lit(date_str))
        dfs.append(df)
        
        print(f"  ✓ {date_str}: {df.count():,} rows")
    
    if not dfs:
        print("❌ No valid partitions found")
        return None
    
    # Union all
    combined_df = dfs[0]
    for df in dfs[1:]:
        combined_df = combined_df.union(df)
    
    print(f"\n✓ Total: {combined_df.count():,} rows")
    
    return combined_df

In [None]:
specific_dates = ["2023-01-01", "2023-03-01", "2023-06-01"]
df_specific = read_specific_dates_csv(BRONZE_DIR, TABLE_NAME, specific_dates, spark)

# Test Other Functions

In [25]:
# --- Standard libraries ---
import os
import logging
from datetime import datetime
from dateutil.relativedelta import relativedelta

# --- PySpark core & types ---
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.functions import col, regexp_replace, trim, when
from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, DecimalType

# =============================================================================
# LOGGING
# =============================================================================
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

def upcast_floats_to_double(df):
    for f in df.schema.fields:
        if isinstance(f.dataType, (FloatType, DecimalType)):
            df = df.withColumn(f.name, col(f.name).cast(DoubleType()))
    return df

def cast_to_numeric(df, exclude=("Customer_ID", "snapshot_date"), numeric_threshold=0.9):
    """
    Auto-detect numeric-looking string columns, clean, and cast to Integer/Double.
    """
    try:
        string_cols = [f.name for f in df.schema.fields if isinstance(f.dataType, StringType)]
        candidates = [c for c in string_cols if c not in exclude]

        for c in candidates:
            cleaned = regexp_replace(col(c), r"[^0-9\.\-]+", "")
            cleaned = trim(cleaned)

            cast_col = when(F.length(cleaned) == 0, None).otherwise(cleaned).cast(DoubleType())
            tmp = f"__num_{c}"

            ratio = df.withColumn(tmp, cast_col) \
                      .select((F.count(tmp) / F.count(F.lit(1))).alias("ratio")) \
                      .collect()[0]["ratio"]

            if ratio is not None and ratio >= numeric_threshold:
                df_with = df.withColumn(tmp, cast_col)
                max_frac = df_with.select(
                    F.max(F.abs(col(tmp) - F.floor(col(tmp)))).alias("max_frac")
                ).collect()[0]["max_frac"]

                is_integer = (max_frac is None) or (float(max_frac) == 0.0)
                target_type = IntegerType() if is_integer else DoubleType()

                df = df_with.drop(c).withColumn(c, col(tmp).cast(target_type)).drop(tmp)
                logger.info(
                    "Auto-cast '%s' -> %s (cleaned non-numeric chars)",
                    c, "Integer" if is_integer else "Double"
                )
            else:
                logger.info(
                    "Kept '%s' as string (only %.2f%% numeric after cleaning)",
                    c, (ratio or 0.0) * 100
                )

        return df
    except Exception as e:
        logger.error(f"Error in transformation: {str(e)}")
        raise

df_financias = cast_to_numeric(df_financials, exclude=("Customer_ID", "snapshot_date"), numeric_threshold=0.9)
df_financials = upcast_floats_to_double(df_financials)

2025-10-07 05:14:18,120 - __main__ - INFO - Kept 'Type_of_Loan' as string (only 0.00% numeric after cleaning)
2025-10-07 05:14:18,563 - __main__ - INFO - Kept 'Credit_Mix' as string (only 0.00% numeric after cleaning)
2025-10-07 05:14:19,464 - __main__ - INFO - Auto-cast 'Credit_History_Age' -> Integer (cleaned non-numeric chars)
2025-10-07 05:14:19,924 - __main__ - INFO - Kept 'Payment_of_Min_Amount' as string (only 0.00% numeric after cleaning)
2025-10-07 05:14:20,333 - __main__ - INFO - Kept 'Payment_Behaviour' as string (only 0.00% numeric after cleaning)


In [27]:
df_financials

DataFrame[Customer_ID: string, Monthly_Inhand_Salary: double, Num_Bank_Accounts: int, Num_Credit_Card: int, Interest_Rate: int, Type_of_Loan: string, Delay_from_due_date: int, Num_Credit_Inquiries: double, Credit_Mix: string, Credit_Utilization_Ratio: double, Credit_History_Age: string, Payment_of_Min_Amount: string, Total_EMI_per_month: double, Payment_Behaviour: string, Monthly_Balance: double, snapshot_date: date, Annual_Income: double, Num_of_Loan: int, Num_of_Delayed_Payment: int, Changed_Credit_Limit: double, Outstanding_Debt: double, Amount_invested_monthly: double, Credit_History_Age_Year: double, Credit_History_Age_Month: int, DTI: double, loan_type__AutoLoan: int, loan_type__Credit-Builder: int, loan_type__PersonalLoan: int, loan_type__HomeEquity: int, loan_type__Mortgage: int, loan_type__StudentLoan: int, loan_type__DebtConsolidation: int, loan_type_count: int, asof_date: date]

In [33]:
# Cleanup
spark.stop()
print("\n✓ Spark session stopped")
print("\n" + "="*80)


✓ Spark session stopped

