In [1]:
import os
import glob

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StringType,BooleanType,DateType,IntegerType,DoubleType

spark = SparkSession.builder.appName("PF").config("spark.sql.caseSensitive", "True").getOrCreate()

In [2]:
#read inputs 
acc_meta = spark.read.options(inferSchema='True',header='True').csv('./data/other_input/account_metadata.csv')


#read master ledger file
path = glob.glob('./data/other_input/Master Ledger.xlsx - Master_Ledger.csv')
df = spark.read.options(inferSchema='True',header='True').csv(path)
df = df.withColumn("ID",col("ID").cast(IntegerType()))\
        .withColumn("Amount",col("Amount").cast(DoubleType()))\
        .withColumn("Subscriptions",col("Subscriptions").cast(BooleanType()))\
        .withColumn("Return",col("Return").cast(BooleanType()))\
        .withColumn("Real Amount",col("Real Amount").cast(DoubleType()))

df = df.withColumn("Date",to_date(col("Date"),"MM/dd/yyyy"))
df.printSchema()
df = df.dropna(how="all",subset= ["ID"])
df.orderBy("ID", ascending=True).show()

root
 |-- ID: integer (nullable = true)
 |-- Item: string (nullable = true)
 |-- Amount: double (nullable = true)
 |-- Date: date (nullable = true)
 |-- Account: string (nullable = true)
 |-- Account Type: string (nullable = true)
 |-- Owner: string (nullable = true)
 |-- Transaction Type: string (nullable = true)
 |-- Categories: string (nullable = true)
 |-- Categories 2: string (nullable = true)
 |-- Real Amount: double (nullable = true)
 |-- Note: string (nullable = true)
 |-- Subscriptions: boolean (nullable = true)
 |-- Return: boolean (nullable = true)

+---+--------------------+-------+----------+--------------------+------------+-----+----------------+-------------+------------+-----------+--------------------+-------------+------+
| ID|                Item| Amount|      Date|             Account|Account Type|Owner|Transaction Type|   Categories|Categories 2|Real Amount|                Note|Subscriptions|Return|
+---+--------------------+-------+----------+--------------------

In [3]:
#read all csv files exported from Empower
path = glob.glob('./data/empower_input/*.csv')
df2 = spark.read.options(inferSchema='True',header='True').csv(path)


#add more columns to df2
df2 = df2.withColumn("ID",lit(1))\
.withColumn("Item",col("Description")).drop("Description")\
.withColumn("Real Amount",col("Amount"))\
.withColumn("Amount",abs(col("Amount")))\
.withColumn("Account Type",lit(None))\
.withColumn("Owner",lit(None))\
.withColumn("Transaction Type",when(col("Real Amount") <0, "Expense").otherwise("Income"))\
.drop("Category")\
.withColumn("Owner",lit(None))\
.withColumn("Subscriptions",lit(False))\
.withColumn("Return",lit(False))\
.drop("Tags")
df2.printSchema()

df2.show()

root
 |-- Date: date (nullable = true)
 |-- Account: string (nullable = true)
 |-- Amount: double (nullable = true)
 |-- ID: integer (nullable = false)
 |-- Item: string (nullable = true)
 |-- Real Amount: double (nullable = true)
 |-- Account Type: void (nullable = true)
 |-- Owner: void (nullable = true)
 |-- Transaction Type: string (nullable = false)
 |-- Subscriptions: boolean (nullable = false)
 |-- Return: boolean (nullable = false)

+----------+--------------------+------+---+--------------------+-----------+------------+-----+----------------+-------------+------+
|      Date|             Account|Amount| ID|                Item|Real Amount|Account Type|Owner|Transaction Type|Subscriptions|Return|
+----------+--------------------+------+---+--------------------+-----------+------------+-----+----------------+-------------+------+
|2023-05-11| Robinhoodinvestment|  3.25|  1|Deposit From Adv ...|       3.25|        null| null|          Income|        false| false|
|2023-05-05|401

In [4]:
# union master ledger with empower, then 
min_date = df2.select(min("Date")).first()[0]
df3 = df.filter(col("Date")< lit("2023-05-01")).unionByName(df2, allowMissingColumns=True)
df3 = df3.drop("ID","Account Type","Owner").join(acc_meta, on = 'Account').orderBy("Date")
df3.show()

+--------------------+--------------------+-------+----------+----------------+-------------+------------+-----------+--------------------+-------------+------+------------+-----+
|             Account|                Item| Amount|      Date|Transaction Type|   Categories|Categories 2|Real Amount|                Note|Subscriptions|Return|Account Type|Owner|
+--------------------+--------------------+-------+----------+----------------+-------------+------------+-----------+--------------------+-------------+------+------------+-----+
|Checking BoA PA 7462|    Checking Account|2541.59|2021-03-24|         Income2|        Other|       Other|    2541.59|             Account|        false| false|       Debit|   PA|
|BoA Customized Ca...|      Credit Account|1583.95|2021-03-24|        Expense2|        Other|       Other|   -1583.95|             Account|        false| false|      Credit|   PA|
|Checking BoA PA 7462|         Vay Tùng Hà| 3000.0|2021-03-24|         Income2|       People|       

In [5]:
df3.toPandas().to_csv("./data/output/out.csv", index_label="ID")