# Validate Records

In [1]:
import findspark
findspark.init()
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.types import *

In [3]:
spark = (
    pyspark.sql
    .SparkSession
    .builder
    .appName("Python")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-azure:3.3.1,com.microsoft.azure:azure-storage:8.6.6,org.postgresql:postgresql:42.7.3")
    .getOrCreate()
)

:: loading settings :: url = jar:file:/Users/leyiauyeung/.pyenv/versions/3.12.4/envs/rabobank-env/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/leyiauyeung/.ivy2/cache
The jars for the packages stored in: /Users/leyiauyeung/.ivy2/jars
org.apache.hadoop#hadoop-azure added as a dependency
com.microsoft.azure#azure-storage added as a dependency
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-dc675932-20d1-42f8-bb45-9d1d0fca920c;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-azure;3.3.1 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.11 in central
	found org.apache.hadoop.thirdparty#hadoop-shaded-guava;1.1.1 in central
	found org.eclipse.jetty#jetty-util-ajax;9.4.40.v20210413 in central
	found org.eclipse.jetty#jetty-util;9.4.40.v20210413 in central
	found org.codehaus.jackson#jackson-mapper-asl;1.9.13 in central
	found org.codehaus.jackson#jackson-cor

## Load records CSV

In [4]:
schema = StructType([
    StructField("reference", IntegerType(), False), 
    StructField("account_number", StringType(), False),
    StructField("description", StringType(), False),
    StructField("start_balance", FloatType(), False),
    StructField("mutation", FloatType(), False),
    StructField("end_balance", FloatType(), False)
])

In [5]:
records = (
    spark.read
    .option("encoding", "ISO-8859-1")
    .schema(schema)
    .format("csv")
    .load("../assignment/records 1.csv")
)
records.show()

                                                                                

+---------+------------------+--------------------+-------------+--------+-----------+
|reference|    account_number|         description|start_balance|mutation|end_balance|
+---------+------------------+--------------------+-------------+--------+-----------+
|     NULL|    Account Number|         Description|         NULL|    NULL|       NULL|
|   176104|NL32RABO0195610843|Flowers for Peter...|       101.84|   13.76|      115.6|
|   112806|NL69ABNA0433647324|Subscription from...|         10.2|   -47.4|      -37.2|
|   109169|NL91RABO0315273637|Tickets from Vinc...|        80.53|  -23.66|      56.87|
|   156539|NL43AEGO0773393871|Candy for Jan Bakker|        88.22|    3.82|      92.04|
|   112806|NL32RABO0195610843|Flowers for Wille...|        16.59|   -7.37|       9.22|
|   112806|NL69ABNA0433647324|Flowers for Danië...|         37.1|   28.81|      65.91|
|   178261|NL27SNSB0917829871|Subscription from...|        50.75|  -21.65|       29.1|
|   100723|NL93ABNA0585619023|Tickets from 

In [6]:
# step not necessarily needed
records.printSchema()

root
 |-- reference: integer (nullable = true)
 |-- account_number: string (nullable = true)
 |-- description: string (nullable = true)
 |-- start_balance: float (nullable = true)
 |-- mutation: float (nullable = true)
 |-- end_balance: float (nullable = true)



## Validations
1. all transaction references should be unique
2. the end balance needs to be validated

In [7]:
# all transaction references should be unique
reference_count = records.groupby("reference").count().withColumnRenamed("count", "reference_count")

In [8]:
# the end balance needs to be validated
records = records.withColumn("validated_end_balance", F.round(F.col("start_balance") + F.col("mutation"), 2))

In [9]:
records = records.join(reference_count, "reference", "left")

## Connect to Database

In [14]:
mode = "append"
url = "jdbc:postgresql://127.0.0.1:5432/postgres"
properties = {"user": "postgres", "password": "postgres", "driver": "org.postgresql.Driver"}

### Insert validated records into records table

In [17]:
validated = records.where((records.reference_count == 1) & (records.validated_end_balance == records.end_balance))
validated = validated.select("reference", "account_number", "description", "start_balance", "mutation", "end_balance")
validated.write.jdbc(url=url, table="validated_records", mode=mode, properties=properties)

### Insert failed records into invalid records table

In [18]:
invalid = records.where((records.reference_count > 1) | (records.validated_end_balance != records.end_balance))
invalid = invalid.select("reference", "account_number", "description", "start_balance", "mutation", "end_balance")
invalid.write.jdbc(url=url, table="invalid_records", mode=mode, properties=properties)