In [11]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

import pyspark
from pyspark.sql.functions import col
import pyspark.sql.functions as F

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    roc_auc_score, 
    roc_curve,
    precision_recall_curve,
    average_precision_score
)

In [12]:
# Build (or reuse) the Spark session used throughout this notebook.
session_builder = (
    pyspark.sql.SparkSession.builder
    .appName("explore_bronze")
    .master("local[*]")
    # Ask the Parquet reader to merge schemas across the files it loads.
    .config("spark.sql.parquet.mergeSchema", "true")
)
spark = session_builder.getOrCreate()

# Raise the Spark log threshold to ERROR so cell output stays readable.
spark.sparkContext.setLogLevel("ERROR")

In [13]:
# Configuration
# All bronze tables live under one root; every table directory is derived
# from it so the four constants share the same shape (trailing slash included).
BRONZE_ROOT = "/app/datamart/bronze"

LOAN_DIR = f"{BRONZE_ROOT}/lms_loan_daily/"
# Trailing slash added for consistency with the other directory constants.
ATTR_DIR = f"{BRONZE_ROOT}/features_attributes/"
FIN_DIR = f"{BRONZE_ROOT}/features_financials/"
CLICK_DIR = f"{BRONZE_ROOT}/feature_clickstream/"

In [7]:
# Load the bronze clickstream table.
df_click = spark.read.parquet(CLICK_DIR)

# Count rows per (Customer_ID, snapshot_date) key; any key seen more than
# once is a duplicate group.
click_key_counts = df_click.groupBy("Customer_ID", "snapshot_date").count()
dupes = click_key_counts.where(col("count") > 1)

# Report how many duplicate groups exist, then preview up to ten of them.
print(f"Total duplicate groups: {dupes.count()}")
dupes.show(n=10, truncate=False)


                                                                                

Total duplicate groups: 0


[Stage 7:>                                                          (0 + 7) / 7]

+-----------+-------------+-----+
|Customer_ID|snapshot_date|count|
+-----------+-------------+-----+
+-----------+-------------+-----+



                                                                                

In [14]:
# Load the bronze financials table.
df_fin = spark.read.parquet(FIN_DIR)

# Count rows per (Customer_ID, snapshot_date) key; any key seen more than
# once is a duplicate group.
fin_key_counts = df_fin.groupBy("Customer_ID", "snapshot_date").count()
dupes = fin_key_counts.where(col("count") > 1)

# Report how many duplicate groups exist, then preview up to ten of them.
print(f"Total duplicate groups: {dupes.count()}")
dupes.show(n=10, truncate=False)


                                                                                

Total duplicate groups: 0




+-----------+-------------+-----+
|Customer_ID|snapshot_date|count|
+-----------+-------------+-----+
+-----------+-------------+-----+



                                                                                

In [15]:
# Display the column names of the financials frame loaded from FIN_DIR.
df_fin.columns

['Customer_ID',
 'Annual_Income',
 'Monthly_Inhand_Salary',
 'Num_Bank_Accounts',
 'Num_Credit_Card',
 'Interest_Rate',
 'Num_of_Loan',
 'Type_of_Loan',
 'Delay_from_due_date',
 'Num_of_Delayed_Payment',
 'Changed_Credit_Limit',
 'Num_Credit_Inquiries',
 'Credit_Mix',
 'Outstanding_Debt',
 'Credit_Utilization_Ratio',
 'Credit_History_Age',
 'Payment_of_Min_Amount',
 'Total_EMI_per_month',
 'Amount_invested_monthly',
 'Payment_Behaviour',
 'Monthly_Balance',
 'snapshot_date',
 'year',
 'month']

In [9]:
# Load the bronze attributes table.
df_attr = spark.read.parquet(ATTR_DIR)

# Count rows per (Customer_ID, snapshot_date) key; any key seen more than
# once is a duplicate group.
attr_key_counts = df_attr.groupBy("Customer_ID", "snapshot_date").count()
dupes = attr_key_counts.where(col("count") > 1)

# Report how many duplicate groups exist, then preview up to ten of them.
print(f"Total duplicate groups: {dupes.count()}")
dupes.show(n=10, truncate=False)


                                                                                

Total duplicate groups: 0
+-----------+-------------+-----+
|Customer_ID|snapshot_date|count|
+-----------+-------------+-----+
+-----------+-------------+-----+



In [17]:
# Stop the Spark session; cells that use `spark` cannot be re-run after this
# until the session-initialisation cell is executed again.
spark.stop()