In [0]:
# Bronze layer: the Delta files at this Volumes path are the notebook's only input.
bronze_path = "/Volumes/loan/credit_score/prediction/bronze"
df_bronze = (
    spark.read
    .format("delta")
    .load(bronze_path)
)

In [0]:
from pyspark.sql.functions import col, sum

def _null_count(column_name):
    """Return an aggregate Column that counts the NULLs in `column_name`.

    `sum` is the pyspark.sql.functions.sum imported in this cell (it shadows
    the Python builtin), so the result is a Spark aggregate expression.
    """
    is_null_flag = col(column_name).isNull().cast("int")
    return sum(is_null_flag).alias(column_name)


# One output row: for every bronze column, the number of NULL values it holds.
bronze_null_exprs = [_null_count(name) for name in df_bronze.columns]
null_bronze = df_bronze.select(bronze_null_exprs)

display(null_bronze)


In [0]:
# Silver starts as bronze minus a positional range of columns (indexes 22..110).
# NOTE(review): the range is positional, so it silently selects different
# columns if the bronze column order ever changes — consider listing names.
drop_start, drop_stop = 22, 111
cols_to_drop = df_bronze.columns[drop_start:drop_stop]
df_silver = df_bronze.drop(*cols_to_drop)








In [0]:
# Preview the frame that remains after the positional column drop.
display(df_silver)

In [0]:
# Print column names and types of the pruned frame.
df_silver.printSchema()

In [0]:
# Per-column NULL counts for the pruned frame (one output row).
# `sum` is pyspark.sql.functions.sum, not the Python builtin.
silver_null_exprs = []
for column_name in df_silver.columns:
    null_flag = col(column_name).isNull().cast("int")
    silver_null_exprs.append(sum(null_flag).alias(column_name))

null_silver = df_silver.select(silver_null_exprs)

display(null_silver)


In [0]:
# Rows missing any of these fields are removed from the silver frame.
required_columns = [
    "emp_title",
    "emp_length",
    "dti",
    "last_pymnt_d",
    "last_credit_pull_d",
]
df_silver_clean = df_silver.na.drop(subset=required_columns)

# Inspect the surviving rows and confirm the schema is unchanged.
display(df_silver_clean)
df_silver_clean.printSchema()

In [0]:
# Recount NULLs after the row drop. This rebinds `null_silver`, so from here
# on the name refers to the counts of `df_silver_clean`, not `df_silver`.
# `sum` is pyspark.sql.functions.sum, not the Python builtin.
null_silver = df_silver_clean.select(
    list(
        map(
            lambda name: sum(col(name).isNull().cast("int")).alias(name),
            df_silver_clean.columns,
        )
    )
)

display(null_silver)

In [0]:
import pandas as pd

def _counts_to_long(counts_sdf, count_label):
    """Turn a one-row Spark frame of per-column counts into a two-column pandas frame.

    The single row is transposed so each source column becomes a row; the
    resulting columns are named `column_name` and `count_label`.
    """
    long_pdf = counts_sdf.toPandas().T.reset_index()
    return long_pdf.set_axis(["column_name", count_label], axis=1)


bronze_nulls_pd = _counts_to_long(null_bronze, "bronze_null_count")
silver_nulls_pd = _counts_to_long(null_silver, "silver_null_count")

# Left join keeps every bronze column; columns that no longer exist in silver
# have no match and therefore no silver count.
comparison = pd.merge(
    bronze_nulls_pd,
    silver_nulls_pd,
    on="column_name",
    how="left",
)

display(comparison)


In [0]:
# Keep a single row per `id`. Spark does not guarantee which of several
# duplicate rows survives, so the choice is not deterministic across runs.
df_dedup = df_silver_clean.drop_duplicates(subset=["id"])


In [0]:
# Replace remaining NULLs with 0 in the listed columns and rebind
# `df_silver_clean` to the deduplicated, filled frame.
# NOTE(review): rows with a NULL `emp_length` were already removed by the
# earlier dropna, so that entry should have nothing left to fill.
zero_fill = {
    "annual_inc": 0,
    "emp_length": 0,
    "int_rate": 0,
}
df_silver_clean = df_dedup.na.fill(zero_fill)


In [0]:
# Preview the deduplicated, zero-filled frame.
df_silver_clean.display()

In [0]:
from pyspark.sql.functions import when

# Bucket the interest rate: below 10 -> LOW, below 15 -> MEDIUM, anything
# else (including rows where the comparison is NULL) -> HIGH.
rate = col("int_rate")
interest_bucket = (
    when(rate < 10, "LOW")
    .when(rate < 15, "MEDIUM")
    .otherwise("HIGH")
)

df_features = df_silver_clean.withColumn("interest_bucket", interest_bucket)


In [0]:
# Preview the frame with the new `interest_bucket` column.
df_features.display()

In [0]:
# Binary target column: 0 = loan_status is "Fully Paid", 1 = anything else.
# The flag used to be 1 for "Fully Paid", which contradicts the column name
# `loan_default`; the labels are flipped here while class membership is kept
# the same (a NULL loan_status still falls in the non-"Fully Paid" class).
# NOTE(review): every status other than "Fully Paid" counts as a default.
# If loan_status contains in-progress values they should be filtered out or
# mapped explicitly — confirm against the distinct values in the data.
df_features = df_features.withColumn(
    "loan_default",
    when(col("loan_status") == "Fully Paid", 0).otherwise(1)
)


In [0]:
# Remove `debt_to_income` before persisting.
# NOTE(review): no cell in this notebook creates `debt_to_income`; unless it
# comes from the bronze data this drop has no effect — confirm it is needed.
df_features = df_features.drop("debt_to_income")
# Preview the final feature frame.
df_features.display()

In [0]:
# Persist the feature frame as the managed Delta table `silver_loans`,
# replacing both the data and the schema of any existing table.
(
    df_features.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("silver_loans")
)


In [0]:
%sql
-- Inspect the table just written. There is no LIMIT here; the final check in
-- this notebook uses LIMIT 10.
SELECT * FROM silver_loans;



In [0]:
# Read back the table written above so the date fix works on the persisted data.
df = spark.read.table("silver_loans")


In [0]:
from pyspark.sql.functions import expr

# Convert the textual `issue_d` (pattern 'MMM-yy') into a DATE column.
#
# Guard for re-runs: a later cell overwrites `silver_loans` with the parsed
# column, so if this cell runs again without the earlier write being re-run,
# `issue_d` read from the table is already a DATE. Applying the string pattern
# to a non-string column would not round-trip (the values would presumably
# come back NULL, or the call would fail), so parse only while it is a string.
issue_d_type = dict(df.dtypes)["issue_d"]

if issue_d_type == "string":
    df_fixed = df.withColumn(
        "issue_d",
        expr("try_to_date(issue_d, 'MMM-yy')")
    )
else:
    # Already converted on a previous run; keep the column as it is.
    df_fixed = df


In [0]:
# Spot-check the first 10 parsed dates without truncating the values.
df_fixed.select("issue_d").show(n=10, truncate=False)


In [0]:
# Overwrite `silver_loans` (data and schema) with the frame whose `issue_d`
# has been parsed. This replaces the same table that `df` was read from.
silver_writer = df_fixed.write.format("delta").mode("overwrite")
silver_writer = silver_writer.option("overwriteSchema", "true")
silver_writer.saveAsTable("silver_loans")


In [0]:
%sql
-- Final check: show 10 rows of the rewritten table.
SELECT * FROM silver_loans LIMIT 10;
