In [6]:
import os
import glob
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StringType,BooleanType,DateType,IntegerType,DoubleType

spark = SparkSession.builder.appName("PF").config("spark.sql.caseSensitive", "True").getOrCreate()

In [7]:
#read inputs 
acc_meta = spark.read.options(inferSchema='True',header='True').csv('./data/other_input/account_metadata.csv')


#read master ledger file
df = spark.createDataFrame(pd.read_excel('./data/other_input/Master Ledger.xlsx',sheet_name="Master Ledger"))
df = df.withColumn("ID",col("ID").cast(IntegerType()))\
        .withColumn("Amount",col("Amount").cast(DoubleType()))\
        .withColumn("Subscriptions",col("Subscriptions").cast(BooleanType()))\
        .withColumn("Return",col("Return").cast(BooleanType()))\
        .withColumn("Real Amount",col("Real Amount").cast(DoubleType()))

df = df.withColumn("Date",to_date(col("Date"),"MM/dd/yyyy"))
df.printSchema()
df = df.dropna(how="all",subset= ["ID"])
df = df.replace('NaN',"")
df.orderBy("ID", ascending=False).show()

root
 |-- ID: integer (nullable = true)
 |-- Account: string (nullable = true)
 |-- Item: string (nullable = true)
 |-- Amount: double (nullable = true)
 |-- Date: date (nullable = true)
 |-- Transaction Type: string (nullable = true)
 |-- Categories: string (nullable = true)
 |-- Categories 2: string (nullable = true)
 |-- Real Amount: double (nullable = true)
 |-- Note: string (nullable = true)
 |-- Subscriptions: boolean (nullable = true)
 |-- Return: boolean (nullable = true)
 |-- Account Type: string (nullable = true)
 |-- Owner: string (nullable = true)

+----+--------------------+--------------------+------+----------+----------------+-------------+------------+-----------+-------------------+-------------+------+------------+-----+
|  ID|             Account|                Item|Amount|      Date|Transaction Type|   Categories|Categories 2|Real Amount|               Note|Subscriptions|Return|Account Type|Owner|
+----+--------------------+--------------------+------+----------+-

In [8]:
#read all csv files exported from Empower
path = glob.glob('./data/empower_input/*.csv')
df2 = spark.read.options(inferSchema='True',header='True').csv(path)


#add more columns to df2
df2 = df2.withColumn("ID",lit(1))\
.withColumn("Item",col("Description")).drop("Description")\
.withColumn("Real Amount",col("Amount"))\
.withColumn("Amount",abs(col("Amount")))\
.withColumn("Account Type",lit(None))\
.withColumn("Owner",lit(None))\
.withColumn("Transaction Type",when(col("Real Amount") <0, "Expense").otherwise("Income"))\
.drop("Category")\
.withColumn("Owner",lit(None))\
.withColumn("Subscriptions",lit(False))\
.withColumn("Return",lit(False))\
.drop("Tags")

df2.printSchema()

df2.show()

root
 |-- Date: date (nullable = true)
 |-- Account: string (nullable = true)
 |-- Amount: double (nullable = true)
 |-- ID: integer (nullable = false)
 |-- Item: string (nullable = true)
 |-- Real Amount: double (nullable = true)
 |-- Account Type: void (nullable = true)
 |-- Owner: void (nullable = true)
 |-- Transaction Type: string (nullable = false)
 |-- Subscriptions: boolean (nullable = false)
 |-- Return: boolean (nullable = false)

+----------+--------------------+------+---+--------------------+-----------+------------+-----+----------------+-------------+------+
|      Date|             Account|Amount| ID|                Item|Real Amount|Account Type|Owner|Transaction Type|Subscriptions|Return|
+----------+--------------------+------+---+--------------------+-----------+------------+-----+----------------+-------------+------+
|2023-05-20|Capital One Quick...| 41.17|  1|         Dollar Tree|     -41.17|        null| null|         Expense|        false| false|
|2023-05-19| Ro

In [12]:
# union master ledger with empower, then 
max_date = df.select(max("Date")).first()[0]
df3 = df.unionByName(df2.filter(col("Date")> max_date), allowMissingColumns=True)
df3 = df3.drop("ID","Account Type","Owner").join(acc_meta, on = 'Account').dropDuplicates(['Date','Account','Item','Real Amount','Note']).orderBy("Date")
df3.orderBy("Date", ascending=False).show()

+--------------------+--------------------+------+----------+----------------+----------+------------+-----------+----+-------------+------+------------+-----+
|             Account|                Item|Amount|      Date|Transaction Type|Categories|Categories 2|Real Amount|Note|Subscriptions|Return|Account Type|Owner|
+--------------------+--------------------+------+----------+----------------+----------+------------+-----------+----+-------------+------+------------+-----+
|Capital One Quick...|         Dollar Tree| 41.17|2023-05-20|         Expense|      null|        null|     -41.17|null|        false| false|      Credit|   PA|
|Adv Plus Banking ...|      The Home Depot|126.44|2023-05-19|         Expense|      null|        null|    -126.44|null|        false| false|       Debit|Joint|
|Checking BoA Ritu...|             Fanduel| 100.0|2023-05-19|         Expense|      null|        null|     -100.0|null|        false| false|       Debit| Ritu|
| Robinhoodinvestment|Penn $26 Call 6/2.

In [13]:
df3.toPandas().to_csv("./data/output/out.csv", index_label="ID")