### Import Modules

In [1]:
import os

import ipywidgets as widgets
from IPython.display import display

from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, DecimalType, LongType, ArrayType
from pyspark.sql.functions import to_date, from_json, regexp_replace, col, when, lower, initcap, explode_outer, datediff, desc, lit, round, date_format

### Define environment using widgets

In [2]:
# In the case of multiple environments, widgets can be utilized as
# parameters to schedule the notebook, allowing it to execute in
# different environments with corresponding configuration files.

environment_widget = widgets.Text(value="dev", disabled = False)
display(environment_widget)

Text(value='dev')

### Execute the shared notebook to utilize the common utilities

In [3]:
%run .//shared//load_yaml_config.ipynb

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


### Load Configurations

In [4]:
env = environment_widget.value
print(env)

config = load_configs(env)

source_dir = config["data"]["source_dir"]
customer_file = config["data"]["customers_file"]
books_file = config["data"]["books_file"]
checkouts_file = config["data"]["checkouts_file"]
libraries_file = config["data"]["libraries_file"]

return_limit_in_days = config["data"]["return_limit_in_days"]
source_data_format = config["data"]["source_data_format"]

spark_source_data_read_options = config["spark_source_data_read_options"]

customers_path = os.path.join(source_dir, customer_file)
books_path = os.path.join(source_dir, books_file)
checkouts_path = os.path.join(source_dir, checkouts_file)
libraries_path = os.path.join(source_dir, libraries_file)

print(customers_path)
print(books_path)
print(checkouts_path)
print(libraries_path)
print(return_limit_in_days)
print(source_data_format)
print(spark_source_data_read_options)

dev
C:\Users\Nikola\Downloads\library_data\customers.csv
C:\Users\Nikola\Downloads\library_data\books.csv
C:\Users\Nikola\Downloads\library_data\checkouts.csv
C:\Users\Nikola\Downloads\library_data\libraries.csv
28
csv
{'header': 'true', 'delimiter': ',', 'inferSchema': 'true', 'ignoreLeadingWhiteSpace': 'true', 'ignoreTrailingWhiteSpace': 'true'}


### Build spark session.

In [5]:
spark = SparkSession.builder.appName(
    "library_book_late_analysis").master("local").getOrCreate()

### Load Raw Source Data.

In [6]:
customers_raw_df = spark.read.format(source_data_format).options(
    **spark_source_data_read_options).load(customers_path)
if customers_raw_df.isEmpty():
    raise SystemExit("Customers collection is empty!")

customers_raw_df.show()

+--------------------+------------------+--------------------+------------+----------+--------+-------------------+------+---------------+-------------------+
|                  id|              name|      street_address|        city|     state| zipcode|         birth_date|gender|      education|         occupation|
+--------------------+------------------+--------------------+------------+----------+--------+-------------------+------+---------------+-------------------+
|df83ec2d0d409395c...| Cynthia Barnfield|     44 NE Meikle Pl|    Portland|    Oregon| 97213.0|2009-09-10 00:00:00|female|    High School|               null|
|6aec7ab2ea0d67161...|   Elizabeth Smith| 7511 SE Harrison St|    Portland|    Oregon| 97215.0|1956-12-15 00:00:00|female|        College|        Blue Collar|
|0c54340672f510fdb...|     Richard Pabla|     1404 SE Pine St|    Portland|    Oregon| 97214.0|1960-12-18 00:00:00|  male|        College| Education & Health|
|f0d9ce833ddc1f73c...|     Charles Baker|12271

In [7]:
books_raw_df = spark.read.format(source_data_format).options(
    **spark_source_data_read_options).load(books_path)
if books_raw_df.isEmpty():
    raise SystemExit("Books collection is empty!")

books_raw_df.show()

+------------+--------------------+--------------------+--------------------+-------------------+--------------------+---------+-----+
|          id|               title|             authors|           publisher|      publishedDate|          categories|    price|pages|
+------------+--------------------+--------------------+--------------------+-------------------+--------------------+---------+-----+
|hVFwAAAAQBAJ|Ogilvy on Adverti...|    ['David Ogilvy']|             Vintage|2013-09-11 00:00:00|  ['Social Science']|    72.99|  320|
|bRY9AAAAYAAJ|Foreign Publicati...|['United States. ...|                null|1913-01-01 00:00:00|     ['Advertising']|   469.99|  654|
|ZapAAAAAIAAJ|Advertising and t...|['John A. Howard'...|                null|1973-01-01 00:00:00|     ['Advertising']|    372.0|  784|
|A-HthMfF5moC|Profitable Advert...|                null|                null|1894-01-01 00:00:00|     ['Advertising']|240.99USD|  559|
|4Z9JAAAAMAAJ|Report of the Fed...|['United States. ...

In [8]:
checkouts_raw_df = spark.read.format(source_data_format).options(
    **spark_source_data_read_options).load(checkouts_path)
if checkouts_raw_df.isEmpty():
    raise SystemExit("Checkouts collection is empty!")

checkouts_raw_df.show()

+------------+--------------------+-------------------+-------------+-------------+
|          id|           patron_id|         library_id|date_checkout|date_returned|
+------------+--------------------+-------------------+-------------+-------------+
|-xFj0vTLbRIC|b071c9c68228a2b1d...|225-222@5xc-jtz-hkf|   2019-01-28|   2018-11-13|
|HUX-y4oXl04C|8d3f63e1deed89d7b...|223-222@5xc-jxr-tgk|   2018-05-29|   2018-06-12|
|TQpFnkku2poC|4ae202f8de7625917...|228-222@5xc-jtz-hwk|   2018-11-23|   2019-01-24|
|OQ6sDwAAQBAJ|f9372de3c8ea50160...|23v-222@5xc-jv7-v4v|   2018-01-15|   2018-04-25|
|7T9-BAAAQBAJ|2cf3cc3b9e9f6c608...|225-222@5xc-jtz-hkf|   2018-12-31|   1804-01-23|
|iGoXAQAAMAAJ|80f93362e97d9f610...|22c-222@5xc-jwj-pvz|   2018-04-07|   1815-08-11|
|CW-7tHAaVR0C|dd9f34e9d65126a2b...|22c-222@5xc-jwj-pvz|   2018-01-10|   2018-02-04|
|Cr74DwAAQBAJ|69a2fbbf7aaad8ac0...|23v-222@5xc-jv7-v4v|   2106-02-26|   2018-12-10|
|t1e3BWziAc8C|3b85b2c7b424618f5...|222-222@5xc-jv5-nt9|   2018-06-23|   2018

In [9]:
libraries_raw_df = spark.read.format(source_data_format).options(
    **spark_source_data_read_options).load(libraries_path)
if libraries_raw_df.isEmpty():
    raise SystemExit("Libraries collection is empty!")

libraries_raw_df.show()

+-------------------+--------------------+--------------------+--------+------+-----------+
|                 id|                name|      street_address|    city|region|postal_code|
+-------------------+--------------------+--------------------+--------+------+-----------+
|226-222@5xc-kc4-fpv|Multnomah   Count...|10723 SW capitol Hwy|Portland|    OR|      97219|
|23v-222@5xc-jv7-v4v|Multnomah County ...| 2300 NW Thurman  St|    null|    or|       null|
|222-222@5xc-jvf-skf|Multnomah County ...|7510 N Charleston...|portland|    or|      97203|
|227-222@5xc-jww-btv|Multnomah   Count...| 1525 SW Sunset blvd|Portland|    or|     -97239|
|22d-222@5xc-kcy-8sq|Multnomah County ...|    7860 SE 13th AVE|Portland|    OR|      97202|
|223-222@5xc-jxr-tgk|MULTNOMAH County ...|    6008 se 49TH AVE|Portland|    OR|     -97206|
|zzw-224@5xc-jwv-2rk|Multnomah  County...|   801  SW 10th  Ave|Portland|  null|      97205|
|zzw-223@5xc-jv7-ct9|Friends OF the mu...|    522  SW 5th  Ave|    null|    OR| 

### Function definitions.

In [10]:
"""
Input arguments:
    1. total_amount (int) - Represents the total value, equivalent to 100%.
    2. part_amount (int) - The value for which the percentage will be calculated relative to total_amount.

Output:
    1. (int) - Returns percentage calculated from part_amount and total_amount.

Description:
    This function calculates percentage of part_amount relative to total_amount and
    rounds the result to the specified number of decimal places.
"""


def calculate_percentage(total_amount: int, part_amount: int):
    decimals = 3

    try:
        return round((part_amount / total_amount) * 100, decimals)
    except ZeroDivisionError:
        print("Total amount can't be zero!")

In [11]:
"""
Input arguments:
    1. df (DataFrame) - The dataframe on which operation is performed.

Output:
    1. df (DataFrame) - Returns the modified dataframe.

Description:
    This function removes all leading and trailing whitespace from a string
    and replaces multiple consecutive spaces between words with a single space.
"""


def trim_white_spaces_from_df(df):
    for field in df.dtypes:
        column_name = field[0]
        column_type = field[1]

        if column_type != "string":
            continue

        df = df.withColumn(
            column_name, regexp_replace(column_name, "\s+", " ")
        )
    return df

In [12]:
"""
Input arguments:
    1. df (DataFrame) - The dataframe on which the operation is performed.
    2. perfix (str) - A string that will be prefixed to the beginning of 
    every column name, except for the id column.

Output:
    1. df (DataFrame) - Returns modified dataframe.

Description:
    This function is employed to rename columns prior to merging data from the 
    Libraries, Checkouts and Customers tables. This step is necessary to facilitate
    data manipulation, as Libraries and Customers share identical column names in
    some instances. The only exception to this renaming process is the id column, as
    the Checkouts table already include a prefix for its foreign key (i.e., library_id)
"""


def append_prefix_to_column_name(df, prefix: str):
    for column_name in df.columns:
        if column_name == "id":
            continue

        new_column_name = f"{prefix}_{column_name}"
        df = df.withColumnRenamed(column_name, new_column_name)

    return df

### Customers collection data cleaning.

In [13]:
customers_df = trim_white_spaces_from_df(df=customers_raw_df)

# Retain only numeric characters and dots in the zipcode to extract the number without decimals
# Convert the data types of the columns and apply the initicap function to capitalize the first letter of each word
customers_df = (
    customers_df.withColumn(
        "zipcode", regexp_replace("zipcode", "[^\d\.]", ""))
    .withColumn("zipcode", col("zipcode").cast(LongType()))
    .withColumn("city", initcap("city"))
    .withColumn("name", initcap("name"))
    .withColumn("education", initcap("education"))
    .withColumn("state", initcap("state"))
    .withColumn("gender", lower(col("gender")))
    .withColumn("occupation", initcap("occupation"))
)

print(customers_df.count())
customers_df.show()

2000
+--------------------+------------------+--------------------+------------+----------+-------+-------------------+------+---------------+------------------+
|                  id|              name|      street_address|        city|     state|zipcode|         birth_date|gender|      education|        occupation|
+--------------------+------------------+--------------------+------------+----------+-------+-------------------+------+---------------+------------------+
|df83ec2d0d409395c...| Cynthia Barnfield|     44 NE Meikle Pl|    Portland|    Oregon|  97213|2009-09-10 00:00:00|female|    High School|              null|
|6aec7ab2ea0d67161...|   Elizabeth Smith| 7511 SE Harrison St|    Portland|    Oregon|  97215|1956-12-15 00:00:00|female|        College|       Blue Collar|
|0c54340672f510fdb...|     Richard Pabla|     1404 SE Pine St|    Portland|    Oregon|  97214|1960-12-18 00:00:00|  male|        College|Education & Health|
|f0d9ce833ddc1f73c...|     Charles Baker|12271 N West

### Books collection data cleaning.

In [14]:
books_df = trim_white_spaces_from_df(df=books_raw_df)

# Retain only numeric characters and dots in the price column
# Retain only numeric characters in the pages column
books_df = (
    books_df.withColumn("price", regexp_replace("price", "[^\d\.]", ""))
    .withColumn("pages", regexp_replace("pages", "[^\d]", ""))
)

books_df = books_df.withColumn(
    "categories",
    when(
        col("categories").contains("&"), regexp_replace(
            "categories", "&", "','")
    ).otherwise(col("categories")),
)

books_df = books_df.withColumn(
    "categories",
    when(
        col("categories").contains(","), regexp_replace(
            "categories", ",", "','")
    ).otherwise(col("categories")),
)

# Convert columns to their appropriate data types
# Explode the data on the authors and categories fields
# Renamae the column to follow snake_case conventions
books_df = (
    books_df.withColumn("authors", from_json(
        "authors", ArrayType(StringType())))
    .withColumn("categories", from_json("categories", ArrayType(StringType())))
    .withColumn("pages", col("pages").cast(LongType()))
    .withColumn("price", col("price").cast(DecimalType(precision=10, scale=2)))
    .withColumn("authors", explode_outer(col("authors")))
    .withColumn("categories", explode_outer(col("categories")))
    .withColumnRenamed("publishedDate", "published_date")
)

books_df = trim_white_spaces_from_df(df=books_df)

print(books_df.count())
books_df.show()

329
+------------+--------------------+--------------------+--------------------+-------------------+--------------------+------+-----+
|          id|               title|             authors|           publisher|     published_date|          categories| price|pages|
+------------+--------------------+--------------------+--------------------+-------------------+--------------------+------+-----+
|hVFwAAAAQBAJ|Ogilvy on Adverti...|        David Ogilvy|             Vintage|2013-09-11 00:00:00|      Social Science| 72.99|  320|
|bRY9AAAAYAAJ|Foreign Publicati...|United States. Bu...|                null|1913-01-01 00:00:00|         Advertising|469.99|  654|
|ZapAAAAAIAAJ|Advertising and t...|      John A. Howard|                null|1973-01-01 00:00:00|         Advertising|372.00|  784|
|ZapAAAAAIAAJ|Advertising and t...|       James Hulbert|                null|1973-01-01 00:00:00|         Advertising|372.00|  784|
|A-HthMfF5moC|Profitable Advert...|                null|                

### Checkouts collection data cleaning.

In [15]:
checkouts_df = trim_white_spaces_from_df(df=checkouts_raw_df)

# Cast string columns to date data types
checkouts_df = checkouts_df.withColumn(
    "date_checkout", to_date("date_checkout", "yyyy-MM-dd")
).withColumn("date_returned", to_date("date_returned", "yyyy-MM-dd"))

print(checkouts_df.count())
checkouts_df.show()

2000
+------------+--------------------+-------------------+-------------+-------------+
|          id|           patron_id|         library_id|date_checkout|date_returned|
+------------+--------------------+-------------------+-------------+-------------+
|-xFj0vTLbRIC|b071c9c68228a2b1d...|225-222@5xc-jtz-hkf|   2019-01-28|   2018-11-13|
|HUX-y4oXl04C|8d3f63e1deed89d7b...|223-222@5xc-jxr-tgk|   2018-05-29|   2018-06-12|
|TQpFnkku2poC|4ae202f8de7625917...|228-222@5xc-jtz-hwk|   2018-11-23|   2019-01-24|
|OQ6sDwAAQBAJ|f9372de3c8ea50160...|23v-222@5xc-jv7-v4v|   2018-01-15|   2018-04-25|
|7T9-BAAAQBAJ|2cf3cc3b9e9f6c608...|225-222@5xc-jtz-hkf|   2018-12-31|   1804-01-23|
|iGoXAQAAMAAJ|80f93362e97d9f610...|22c-222@5xc-jwj-pvz|   2018-04-07|   1815-08-11|
|CW-7tHAaVR0C|dd9f34e9d65126a2b...|22c-222@5xc-jwj-pvz|   2018-01-10|   2018-02-04|
|Cr74DwAAQBAJ|69a2fbbf7aaad8ac0...|23v-222@5xc-jv7-v4v|   2106-02-26|   2018-12-10|
|t1e3BWziAc8C|3b85b2c7b424618f5...|222-222@5xc-jv5-nt9|   2018-06-23|  

### Libraries collection data cleaning.

In [16]:
libraries_df = trim_white_spaces_from_df(df=libraries_raw_df)

# Retain only numeric characters and dots in the postal_code to extract the number without decimals
# Convert the data types of the columns and apply the initicap function to capitalize the first letter of each word
libraries_df = (
    libraries_df.withColumn("region", lower(col("region")))
    .withColumn("postal_code", regexp_replace("postal_code", "[^\d\.]", ""))
    .withColumn("postal_code", col("postal_code").cast(LongType()))
    .withColumn("city", initcap("city"))
)

print(libraries_df.count())
libraries_df.show()

18
+-------------------+--------------------+--------------------+--------+------+-----------+
|                 id|                name|      street_address|    city|region|postal_code|
+-------------------+--------------------+--------------------+--------+------+-----------+
|226-222@5xc-kc4-fpv|Multnomah County ...|10723 SW capitol Hwy|Portland|    or|      97219|
|23v-222@5xc-jv7-v4v|Multnomah County ...|  2300 NW Thurman St|    null|    or|       null|
|222-222@5xc-jvf-skf|Multnomah County ...|7510 N Charleston...|Portland|    or|      97203|
|227-222@5xc-jww-btv|Multnomah County ...| 1525 SW Sunset blvd|Portland|    or|      97239|
|22d-222@5xc-kcy-8sq|Multnomah County ...|    7860 SE 13th AVE|Portland|    or|      97202|
|223-222@5xc-jxr-tgk|MULTNOMAH County ...|    6008 se 49TH AVE|Portland|    or|      97206|
|zzw-224@5xc-jwv-2rk|Multnomah County ...|     801 SW 10th Ave|Portland|  null|      97205|
|zzw-223@5xc-jv7-ct9|Friends OF the mu...|      522 SW 5th Ave|    null|    o

### Merge data into single dataframe.

In [17]:
checkouts_df = checkouts_df.withColumnRenamed("id", "checkout_id")
libraries_df = append_prefix_to_column_name(df=libraries_df, prefix="library")

library_checkouts = checkouts_df.join(
    libraries_df, on=checkouts_df.library_id == libraries_df.id, how="inner"
).drop("id")

customers_df = append_prefix_to_column_name(df=customers_df, prefix="customer")

final_df = library_checkouts.join(
    customers_df, on=library_checkouts.patron_id == customers_df.id, how="inner").drop("id")

# Calculate how many days customer kept the book
# This will be used for late return filtering
final_df = final_df.withColumn("days_kept", datediff(
    col("date_returned"), col("date_checkout")))

print(final_df.count())
final_df.show()

2000
+------------+--------------------+-------------------+-------------+-------------+--------------------+----------------------+------------+--------------+-------------------+-----------------+-----------------------+-------------+--------------+----------------+-------------------+---------------+------------------+-------------------+---------+
| checkout_id|           patron_id|         library_id|date_checkout|date_returned|        library_name|library_street_address|library_city|library_region|library_postal_code|    customer_name|customer_street_address|customer_city|customer_state|customer_zipcode|customer_birth_date|customer_gender|customer_education|customer_occupation|days_kept|
+------------+--------------------+-------------------+-------------+-------------+--------------------+----------------------+------------+--------------+-------------------+-----------------+-----------------------+-------------+--------------+----------------+-------------------+--------------

### Filter data where books were returned late.

In [18]:
# Considering only records where customers returned books late.
# The checkout date must not exceed the return date.
# The customer's birth date must not be later then the checkout date.
late_returns_df = final_df.filter(
    (col("days_kept") >= return_limit_in_days)
    & (col("date_checkout") < col("date_returned"))
    & (col("date_checkout") > col("customer_birth_date"))
)

if late_returns_df.isEmpty():
    raise SystemExit("No data found in unified table!")

print(late_returns_df.count())
late_returns_df.show()

201
+------------+--------------------+-------------------+-------------+-------------+--------------------+----------------------+------------+--------------+-------------------+------------------+-----------------------+-------------+--------------+----------------+-------------------+---------------+------------------+-------------------+---------+
| checkout_id|           patron_id|         library_id|date_checkout|date_returned|        library_name|library_street_address|library_city|library_region|library_postal_code|     customer_name|customer_street_address|customer_city|customer_state|customer_zipcode|customer_birth_date|customer_gender|customer_education|customer_occupation|days_kept|
+------------+--------------------+-------------------+-------------+-------------+--------------------+----------------------+------------+--------------+-------------------+------------------+-----------------------+-------------+--------------+----------------+-------------------+------------

### Analysis.

In [19]:
late_returns_df.describe().show()

+-------+------------+--------------------+-------------------+--------------------+----------------------+------------+--------------+-------------------+-----------------+-----------------------+-------------+--------------+------------------+---------------+------------------+-------------------+------------------+
|summary| checkout_id|           patron_id|         library_id|        library_name|library_street_address|library_city|library_region|library_postal_code|    customer_name|customer_street_address|customer_city|customer_state|  customer_zipcode|customer_gender|customer_education|customer_occupation|         days_kept|
+-------+------------+--------------------+-------------------+--------------------+----------------------+------------+--------------+-------------------+-----------------+-----------------------+-------------+--------------+------------------+---------------+------------------+-------------------+------------------+
|  count|         201|                 2

### Analysis by Customer City

In [20]:
customer_city_df = late_returns_df.filter((col("customer_city").isNotNull()))
total_late_returns_per_customer_city = customer_city_df.count()

customer_city_df = (
    customer_city_df.groupBy("customer_city")
    .count()
    .select("customer_city", col("count").alias("late_returns"))
    .orderBy(
        desc("late_returns"),
    )
)
customer_city_df = customer_city_df.withColumn(
    "percentage_late_returns",
    lit(calculate_percentage(total_amount=total_late_returns_per_customer_city,
        part_amount=col("late_returns"))),
)

print(total_late_returns_per_customer_city)
customer_city_df.show()

190
+-------------+------------+-----------------------+
|customer_city|late_returns|percentage_late_returns|
+-------------+------------+-----------------------+
|     Portland|         142|                 74.737|
|    Vancouver|          12|                  6.316|
|    Beaverton|           9|                  4.737|
|  Lake Oswego|           7|                  3.684|
| Happy Valley|           6|                  3.158|
|  Oregon City|           5|                  2.632|
|    West Linn|           5|                  2.632|
|     Tualatin|           2|                  1.053|
|    Clackamas|           1|                  0.526|
|     Damascus|           1|                  0.526|
+-------------+------------+-----------------------+



### Analysis by Customer Occupation

In [21]:
occupation_df = late_returns_df.filter(col("customer_occupation").isNotNull())
total_late_returns_per_occupation = occupation_df.count()

occupation_df = (
    occupation_df.groupBy("customer_occupation")
    .count()
    .select("customer_occupation", col("count").alias("late_returns"))
    .orderBy(desc("late_returns"))
)

occupation_df = occupation_df.withColumn(
    "percentage_late_returns",
    lit(calculate_percentage(total_amount=total_late_returns_per_occupation,
        part_amount=col("late_returns"))),
)

print(total_late_returns_per_occupation)
occupation_df.show()

196
+-------------------+------------+-----------------------+
|customer_occupation|late_returns|percentage_late_returns|
+-------------------+------------+-----------------------+
|               Tech|          35|                 17.857|
|    Admin & Support|          28|                 14.286|
| Business & Finance|          28|                 14.286|
|             Others|          28|                 14.286|
|              Sales|          26|                 13.265|
|        Blue Collar|          26|                 13.265|
| Education & Health|          25|                 12.755|
+-------------------+------------+-----------------------+



### Analysis by Customer Education Level

In [22]:
education_df = late_returns_df.filter(col("customer_education").isNotNull())
total_late_returns_per_education = education_df.count()

education_df = (
    education_df.groupBy("customer_education")
    .count()
    .select("customer_education", col("count").alias("late_returns"))
    .orderBy(desc("late_returns"))
)

education_df = education_df.withColumn(
    "percentage_late_returns",
    lit(calculate_percentage(total_amount=total_late_returns_per_education,
        part_amount=col("late_returns"))),
)

print(total_late_returns_per_education)
education_df.show()

186
+------------------+------------+-----------------------+
|customer_education|late_returns|percentage_late_returns|
+------------------+------------+-----------------------+
|           College|          49|                 26.344|
|   Graduate Degree|          47|                 25.269|
|       High School|          46|                 24.731|
|            Others|          44|                 23.656|
+------------------+------------+-----------------------+



### Analysis by Customer Education Level and Occupation

In [23]:
education_occupation_df = late_returns_df.filter(
    (col("customer_education").isNotNull()) & (
        col("customer_occupation").isNotNull())
)
total_late_returns_per_education_occupation = education_occupation_df.count()

education_occupation_df = (
    education_occupation_df.groupBy(
        "customer_education", "customer_occupation")
    .count()
    .select(
        "customer_education",
        "customer_occupation",
        col("count").alias("late_returns"),
    )
    .orderBy(desc("late_returns"))
)

education_occupation_df = education_occupation_df.withColumn(
    "percentage_late_returns",
    lit(
        calculate_percentage(
            total_amount=total_late_returns_per_education_occupation, part_amount=col(
                "late_returns")
        )
    ),
)

print(total_late_returns_per_education_occupation)
education_occupation_df.show()

182
+------------------+-------------------+------------+-----------------------+
|customer_education|customer_occupation|late_returns|percentage_late_returns|
+------------------+-------------------+------------+-----------------------+
|       High School|               Tech|          11|                  6.044|
|           College|             Others|           9|                  4.945|
|           College|              Sales|           9|                  4.945|
|   Graduate Degree|               Tech|           8|                  4.396|
|           College| Business & Finance|           8|                  4.396|
|   Graduate Degree|    Admin & Support|           8|                  4.396|
|       High School| Business & Finance|           8|                  4.396|
|       High School|    Admin & Support|           8|                  4.396|
|            Others|               Tech|           8|                  4.396|
|            Others|        Blue Collar|           8|       

### Analysis by Customer Gender, Education Level and Occupation

In [24]:
gender_education_occupation_df = late_returns_df.filter(
    (col("customer_education").isNotNull())
    & (col("customer_occupation").isNotNull())
    & (col("customer_gender").isNotNull())
)
total_late_returns_per_gender_education_occupation = (
    gender_education_occupation_df.count()
)

gender_occupation_education_df = (
    gender_education_occupation_df.groupBy(
        "customer_gender", "customer_occupation", "customer_education"
    )
    .count()
    .select(
        "customer_gender",
        "customer_occupation",
        "customer_education",
        col("count").alias("late_returns"),
    )
    .orderBy(desc("late_returns"))
)
gender_occupation_education_df = gender_occupation_education_df.withColumn(
    "percentage_late_returns",
    lit(
        calculate_percentage(
            total_amount=total_late_returns_per_gender_education_occupation, part_amount=col(
                "late_returns")
        )
    ),
)

print(total_late_returns_per_gender_education_occupation)
gender_occupation_education_df.show()

174
+---------------+-------------------+------------------+------------+-----------------------+
|customer_gender|customer_occupation|customer_education|late_returns|percentage_late_returns|
+---------------+-------------------+------------------+------------+-----------------------+
|           male|              Sales|           College|           7|                  4.023|
|         female|               Tech|       High School|           6|                  3.448|
|           male|    Admin & Support|   Graduate Degree|           6|                  3.448|
|         female|        Blue Collar|            Others|           6|                  3.448|
|           male| Business & Finance|           College|           5|                  2.874|
|           male| Business & Finance|       High School|           5|                  2.874|
|           male|    Admin & Support|           College|           5|                  2.874|
|         female|               Tech|   Graduate Degree|

### Analysis by the Day of the Return

In [25]:
day_of_week_return_df = late_returns_df.withColumn(
    "day_of_the_week_returned", date_format("date_returned", "EEEE")
)
total_late_returns_per_day_of_week_return = day_of_week_return_df.count()

day_of_week_return_df = (
    day_of_week_return_df.groupBy("day_of_the_week_returned")
    .count()
    .select("day_of_the_week_returned", col("count").alias("late_returns"))
    .orderBy(desc("late_returns"))
)
day_of_week_return_df = day_of_week_return_df.withColumn(
    "percentage_late_returns",
    lit(
        calculate_percentage(
            total_amount=total_late_returns_per_day_of_week_return, part_amount=col(
                "late_returns")
        )
    ),
)

print(total_late_returns_per_day_of_week_return)
day_of_week_return_df.show()

201
+------------------------+------------+-----------------------+
|day_of_the_week_returned|late_returns|percentage_late_returns|
+------------------------+------------+-----------------------+
|                  Monday|          33|                 16.418|
|                Thursday|          31|                 15.423|
|                 Tuesday|          28|                  13.93|
|                  Sunday|          28|                  13.93|
|               Wednesday|          28|                  13.93|
|                  Friday|          27|                 13.433|
|                Saturday|          26|                 12.935|
+------------------------+------------+-----------------------+



### Analysis by Checkout Month

In [26]:
month_checkout_df = late_returns_df.withColumn(
    "month_checkout", date_format("date_checkout", "MMMM")
)
total_late_returns_per_month_checkout = month_checkout_df.count()

month_checkout_df = (
    month_checkout_df.groupBy("month_checkout")
    .count()
    .select("month_checkout", col("count").alias("late_returns"))
    .orderBy(desc("late_returns"))
)
month_checkout_df = month_checkout_df.withColumn(
    "percentage_late_returns",
    lit(
        calculate_percentage(
            total_amount=total_late_returns_per_month_checkout, part_amount=col(
                "late_returns")
        )
    ),
)

print(total_late_returns_per_month_checkout)
month_checkout_df.show()

201
+--------------+------------+-----------------------+
|month_checkout|late_returns|percentage_late_returns|
+--------------+------------+-----------------------+
|      November|          22|                 10.945|
|      December|          21|                 10.448|
|       January|          20|                   9.95|
|        August|          19|                  9.453|
|         April|          17|                  8.458|
|     September|          16|                   7.96|
|      February|          16|                   7.96|
|          July|          16|                   7.96|
|       October|          15|                  7.463|
|          June|          14|                  6.965|
|         March|          13|                  6.468|
|           May|          12|                   5.97|
+--------------+------------+-----------------------+



### Analysis by Return Month

In [27]:
month_returned_df = late_returns_df.withColumn(
    "month_returned", date_format("date_returned", "MMMM")
)
total_late_returns_per_month_returned = month_returned_df.count()

month_returned_df = (
    month_returned_df.groupBy("month_returned")
    .count()
    .select("month_returned", col("count").alias("late_returns"))
    .orderBy(desc("late_returns"))
)
month_returned_df = month_returned_df.withColumn(
    "percentage_late_returns",
    lit(
        calculate_percentage(
            total_amount=total_late_returns_per_month_returned, part_amount=col(
                "late_returns")
        )
    ),
)

print(total_late_returns_per_month_returned)
month_returned_df.show()

201
+--------------+------------+-----------------------+
|month_returned|late_returns|percentage_late_returns|
+--------------+------------+-----------------------+
|        August|          23|                 11.443|
|          July|          21|                 10.448|
|         March|          21|                 10.448|
|      November|          20|                   9.95|
|         April|          20|                   9.95|
|       January|          19|                  9.453|
|           May|          15|                  7.463|
|      February|          14|                  6.965|
|      December|          14|                  6.965|
|     September|          14|                  6.965|
|          June|          12|                   5.97|
|       October|           8|                   3.98|
+--------------+------------+-----------------------+

