In [27]:
from pyspark.sql import SparkSession

# Start a Spark session, or reuse the one already running in this kernel
spark = SparkSession.builder.getOrCreate()

# Load the transactions CSV: the first row is treated as the header and
# column types are inferred from the data
file_path = "transactions.csv"
df = spark.read.csv(path=file_path, inferSchema=True, header=True)

# Preview the loaded rows
df.show()



+--------------+-----------+--------------------+----------------+-------------------+--------+
|transaction_id|  AccountID|TransactionOperation|transaction_type|   transaction_date|  Amount|
+--------------+-----------+--------------------+----------------+-------------------+--------+
|    TR-1000001|BUS-9125383|          Withdrawal|           debit|2019-08-15 00:00:00| 23496.3|
|    TR-1000002|BUS-2381560|          Withdrawal|           debit|2017-03-26 00:00:00|87460.96|
|    TR-1000003|BUS-1809830|             Deposit|          credit|2019-09-14 00:00:00|19398.73|
|    TR-1000004|BUS-7691907|          Withdrawal|           debit|2018-09-18 00:00:00|30873.71|
|    TR-1000005|SAV-2508541|            Transfer|          credit|2019-07-20 00:00:00|97272.09|
|    TR-1000006|SAV-9339161|            Transfer|          credit|2019-10-30 00:00:00| 97621.2|
|    TR-1000007|CUR-8975490|          Withdrawal|           debit|2017-08-21 00:00:00|75643.43|
|    TR-1000008|SAV-1231816|            

In [28]:
# Declare the expected column names and types explicitly instead of relying
# on schema inference, then re-read the same CSV with that schema.
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType

# Every column is declared nullable (third argument = True)
schema = (
    StructType()
    .add("transaction_id", StringType(), True)
    .add("AccountID", StringType(), True)
    .add("TransactionOperation", StringType(), True)
    .add("transaction_type", StringType(), True)
    .add("transaction_date", TimestampType(), True)
    .add("Amount", DoubleType(), True)
)
df = spark.read.csv(file_path, schema=schema, header=True)

# Preview the typed DataFrame
df.show()


+--------------+-----------+--------------------+----------------+-------------------+--------+
|transaction_id|  AccountID|TransactionOperation|transaction_type|   transaction_date|  Amount|
+--------------+-----------+--------------------+----------------+-------------------+--------+
|    TR-1000001|BUS-9125383|          Withdrawal|           debit|2019-08-15 00:00:00| 23496.3|
|    TR-1000002|BUS-2381560|          Withdrawal|           debit|2017-03-26 00:00:00|87460.96|
|    TR-1000003|BUS-1809830|             Deposit|          credit|2019-09-14 00:00:00|19398.73|
|    TR-1000004|BUS-7691907|          Withdrawal|           debit|2018-09-18 00:00:00|30873.71|
|    TR-1000005|SAV-2508541|            Transfer|          credit|2019-07-20 00:00:00|97272.09|
|    TR-1000006|SAV-9339161|            Transfer|          credit|2019-10-30 00:00:00| 97621.2|
|    TR-1000007|CUR-8975490|          Withdrawal|           debit|2017-08-21 00:00:00|75643.43|
|    TR-1000008|SAV-1231816|            

In [29]:
# transaction_id has no duplicates exactly when the number of distinct ids
# equals the total number of rows (True => every id is unique)
unique_id_count = df.select("transaction_id").distinct().count()
total_row_count = df.count()
unique_id_count == total_row_count

True

In [30]:
# Count the null values in every column of the dataframe.
# NOTE: the wildcard import is kept because a later cell uses names it
# provides (col, upper, trim). Be aware that it also shadows Python builtins
# such as sum/min/max with their Spark counterparts.
from pyspark.sql.functions import *

# Compute all counts in ONE aggregation instead of one Spark job per column:
# when() yields 1 for a null cell and NULL otherwise, and count() skips NULLs,
# so each aggregated value is the number of nulls in that column.
# The result is a plain list ordered like df.columns.
null_counts = list(
    df.select(
        [count(when(col(c).isNull(), 1)).alias(c) for c in df.columns]
    ).first()
)
null_counts

[0, 0, 0, 0, 0, 0]

In [31]:
# Inspect the (column name, data type) pairs of the loaded DataFrame.
# The original referenced `dfs`, which is never defined in this notebook and
# raises NameError on a fresh kernel; the DataFrame is named `df`.
df.dtypes

[('transaction_id', 'string'),
 ('AccountID', 'string'),
 ('TransactionOperation', 'string'),
 ('transaction_type', 'string'),
 ('transaction_date', 'timestamp'),
 ('Amount', 'double')]

In [32]:
# Drop the ROWS (not columns) that have a null in any of the key columns below.
# Column names are spelled exactly as in the schema ("AccountID"), so the
# lookup does not depend on Spark's case-insensitive column resolution.
df_filtered = df.na.drop(subset=["transaction_id", "Amount", "AccountID"])

In [33]:
# Replace null values in the categorical text columns with "unknown".
# Build on df_filtered (not df) so the rows dropped in the previous cleaning
# step stay dropped instead of being silently reintroduced.
df_filled = df_filtered.fillna("unknown", subset=["TransactionOperation", "transaction_type"])

In [45]:
# Normalize the categorical text columns: strip leading/trailing whitespace
# and convert to upper case.
# Start from df_filled (not df) so the null handling done in the earlier
# cleaning cells is carried into the transformed result.
df_transformed = (
    df_filled
    .withColumn("transaction_type", upper(trim(col("transaction_type"))))
    .withColumn("TransactionOperation", upper(trim(col("TransactionOperation"))))
)
df_transformed.show()

+--------------+-----------+--------------------+----------------+-------------------+--------+
|transaction_id|  AccountID|TransactionOperation|transaction_type|   transaction_date|  Amount|
+--------------+-----------+--------------------+----------------+-------------------+--------+
|    TR-1000001|BUS-9125383|          WITHDRAWAL|           DEBIT|2019-08-15 00:00:00| 23496.3|
|    TR-1000002|BUS-2381560|          WITHDRAWAL|           DEBIT|2017-03-26 00:00:00|87460.96|
|    TR-1000003|BUS-1809830|             DEPOSIT|          CREDIT|2019-09-14 00:00:00|19398.73|
|    TR-1000004|BUS-7691907|          WITHDRAWAL|           DEBIT|2018-09-18 00:00:00|30873.71|
|    TR-1000005|SAV-2508541|            TRANSFER|          CREDIT|2019-07-20 00:00:00|97272.09|
|    TR-1000006|SAV-9339161|            TRANSFER|          CREDIT|2019-10-30 00:00:00| 97621.2|
|    TR-1000007|CUR-8975490|          WITHDRAWAL|           DEBIT|2017-08-21 00:00:00|75643.43|
|    TR-1000008|SAV-1231816|            

+--------------+-----------+--------------------+----------------+-------------------+--------+
|transaction_id|  AccountID|TransactionOperation|transaction_type|   transaction_date|  Amount|
+--------------+-----------+--------------------+----------------+-------------------+--------+
|    TR-1000001|BUS-9125383|          WITHDRAWAL|           debit|2019-08-15 00:00:00| 23496.3|
|    TR-1000002|BUS-2381560|          WITHDRAWAL|           debit|2017-03-26 00:00:00|87460.96|
|    TR-1000003|BUS-1809830|             DEPOSIT|          credit|2019-09-14 00:00:00|19398.73|
|    TR-1000004|BUS-7691907|          WITHDRAWAL|           debit|2018-09-18 00:00:00|30873.71|
|    TR-1000005|SAV-2508541|            TRANSFER|          credit|2019-07-20 00:00:00|97272.09|
|    TR-1000006|SAV-9339161|            TRANSFER|          credit|2019-10-30 00:00:00| 97621.2|
|    TR-1000007|CUR-8975490|          WITHDRAWAL|           debit|2017-08-21 00:00:00|75643.43|
|    TR-1000008|SAV-1231816|            