In [0]:
%pip install openpyxl

In [0]:
import os
import re
import pandas as pd
from datetime import datetime
from pyspark.sql import functions as F

In [0]:
# Discover the Excel exports in the holdings directory and keep, per client id,
# only the most recently modified file. The result (`file_info_df`) has one row
# per client with the columns: name, path, modificationTime, client_id,
# holding_date.
files = [
    f
    for f in dbutils.fs.ls("/Volumes/personal_finance/default/files/kite/holdings")
    if not f.isDir() and f.name.endswith((".xlsx", ".xls"))
]

# Explicit column list so that an empty listing still yields a frame with the
# expected columns (otherwise sort_values("modTimeRaw") raises KeyError).
FILE_INFO_COLUMNS = ["name", "path", "modificationTime", "client_id", "holding_date", "modTimeRaw"]

# Extract client id and formatted datetime from each file
data = []
for f in files:
    match = re.search(r'holdings-([A-Za-z0-9]+)\.xlsx', f.name)
    if match is None:
        # Files that do not follow the holdings-<client id>.xlsx naming (this
        # includes every .xls file) have no client id. Keeping them would
        # collapse all of them into a single row with client_id=None during
        # de-duplication, and the loading step reads with engine="openpyxl",
        # which does not handle legacy .xls workbooks. Skip them visibly.
        print(f"Skipping {f.name}: name does not match holdings-<client id>.xlsx")
        continue
    client_id = match.group(1)

    # modificationTime is divided by 1000 before conversion (epoch milliseconds).
    # NOTE(review): fromtimestamp() uses the cluster's local timezone - confirm
    # that is the intended basis for holding_date.
    modified_at = datetime.fromtimestamp(f.modificationTime / 1000)
    dt = modified_at.strftime('%Y-%m-%d %H:%M:%S')
    holding_date = modified_at.strftime('%Y-%m-%d')  # modification date only

    # Strip the "dbfs:" scheme so the path can be opened as a regular file path.
    path = f.path
    if path.startswith("dbfs:"):
        path = path[5:]

    data.append({"name": f.name, "path": path, "modificationTime": dt, "client_id": client_id, "holding_date": holding_date, "modTimeRaw": f.modificationTime})

file_info_df = pd.DataFrame(data, columns=FILE_INFO_COLUMNS)

# Keep only the most recently modified file for each client_id
file_info_df = (
    file_info_df.sort_values("modTimeRaw", ascending=False)
    .drop_duplicates(subset=["client_id"], keep="first")
    .drop(columns=["modTimeRaw"])
)

display(file_info_df)

In [0]:
import pandas as pd
import io

# Read every selected workbook into pandas and tag its rows with the client id
# and holding date recorded for that file in file_info_df.
# The first 22 rows of each sheet are skipped (presumably a report preamble
# above the table header - TODO confirm against a sample export).
HEADER_ROWS_TO_SKIP = 22

dfs = []
for file_row in file_info_df.to_dict("records"):
    holdings = pd.read_excel(
        file_row["path"],
        engine="openpyxl",
        skiprows=HEADER_ROWS_TO_SKIP,
    )
    dfs.append(
        holdings.assign(
            Client_Id=file_row["client_id"],
            Holding_Date=file_row["holding_date"],
        )
    )

In [0]:
# Remove the rows whose "Client ID" cell is the literal "Total" (presumably the
# statement's summary row - confirm against a sample export), then stack all
# per-file frames into a single frame.
dfs = [
    df[df["Client ID"] != "Total"] if "Client ID" in df.columns else df
    for df in dfs
]

# Always bind merged_df: leaving it undefined (fresh kernel) or stale (re-run in
# the same kernel) when nothing was loaded lets later cells fail with NameError
# or silently reuse data from a previous run.
if dfs:
    # "Unnamed: 0" is dropped when present; errors="ignore" tolerates its absence.
    merged_df = pd.concat(dfs, ignore_index=True).drop(columns=["Unnamed: 0"], errors="ignore")
    display(merged_df)
else:
    merged_df = pd.DataFrame()
    print("No holdings workbooks were loaded; merged_df is empty.")

In [0]:
# Persist the merged holdings to the bronze table, replacing its contents.
# Guarded on `dfs` (same condition that gates the merge step) so that a run
# with no input workbooks neither raises NameError on merged_df nor overwrites
# the table from a stale frame left in the kernel by an earlier run.
if dfs:
    spark_df = spark.createDataFrame(merged_df)
    # Normalise column names: spaces become underscores and parentheses are
    # removed. Other characters are left untouched to keep the table schema stable.
    spark_df = spark_df.toDF(*[c.replace(" ", "_").replace("(","").replace(")","") for c in spark_df.columns])
    spark_df.write.mode("overwrite").saveAsTable("personal_finance.bronze.kite_holding_statement")
else:
    print("No holdings data to write; personal_finance.bronze.kite_holding_statement left unchanged.")