### Imports.

In [60]:
import os

import pyspark
from pyspark.sql import SparkSession

from pyspark.sql.functions import to_date, from_json, regexp_replace, col, when, trim, lower, initcap, explode_outer, datediff, desc, split
from pyspark.sql.types import StringType, DecimalType, LongType, ArrayType

### Run shared notebook to be able to use shared utilities.

In [61]:
%run .//shared//load_yaml_config.ipynb

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


### Configurations.

In [62]:
config = load_configs()

source_dir = config["data"]["source_dir"]
customer_file = config["data"]["customers_file"]
libraries_file = config["data"]["libraries_file"]
checkouts_file = config["data"]["checkouts_file"]
books_file = config["data"]["books_file"]
csv_read_options = config["spark_csv_read_options"]
return_limit_in_days = config["data"]["return_limit_in_days"]

customers_path = os.path.join(source_dir, customer_file)
libraries_path = os.path.join(source_dir, libraries_file)
checkouts_path = os.path.join(source_dir, checkouts_file)
books_path = os.path.join(source_dir, books_file)

print(customers_path)
print(libraries_path)
print(checkouts_path)
print(books_path)
print(csv_read_options)
print(return_limit_in_days)

C:\Users\Nikola\Downloads\library_data\customers.csv
C:\Users\Nikola\Downloads\library_data\libraries.csv
C:\Users\Nikola\Downloads\library_data\checkouts.csv
C:\Users\Nikola\Downloads\library_data\books.csv
{'header': 'true', 'delimiter': ',', 'inferSchema': 'true'}
28


### Build spark session.

In [63]:
conf = pyspark.SparkConf()
spark = SparkSession.builder.appName("library_book_late_analysis").master(
    "local").config(conf=conf).getOrCreate()

### Read raw data.

In [64]:
customers_raw_df = spark.read.format("csv").options(**csv_read_options).load(customers_path)
books_raw_df = spark.read.format("csv").options(**csv_read_options).load(books_path)
checkouts_raw_df = spark.read.format("csv").options(**csv_read_options).load(checkouts_path)
libraries_raw_df = spark.read.format("csv").options(**csv_read_options).load(libraries_path)

### Function definitions.

In [65]:
"""
Input arguments:
    1. df (DataFrame) - Dataframe on which we perform action.

Output:
    1. df (DataFrame) - Returns modified dataframe.

Description:
    This function strips all white spaces on the right and the left side of a string and also,
    replace multiple whitespaces between words with a single whitespace.
"""


def trim_white_spaces_from_df(df):
    for field in df.dtypes:
        column_name = field[0]
        column_type = field[1]

        if column_type != "string":
            continue

        df = df.withColumn(column_name, trim(column_name)).withColumn(
            column_name, regexp_replace(column_name, "\s+", " ")
        )
    return df

In [66]:
"""
Input arguments:
    1. df (DataFrame) - Dataframe on which we perform action.
    2. perfix (str) - Will be added at the begining of every column name except id.

Output:
    1. df (DataFrame) - Returns modified dataframe.

Description:
    This function is used to rename columns before performing merge of the data between Libraries, Checkouts and Customers.
    In order to be able to manipulate with data, since Libraries and Customers have the same name for few columns.
    The only exception made is with 'id' column because Checkouts table has prefix already added to foreign key (library_id).
"""


def append_prefix_to_column_name(df, prefix: str):
    for column_name in df.columns:
        if column_name == "id":
            continue

        new_column_name = f"{prefix}_{column_name}"
        df = df.withColumnRenamed(column_name, new_column_name)

    return df

### Checkouts collection data cleaning.

In [67]:
checkouts_df = trim_white_spaces_from_df(checkouts_raw_df)

# Cast string to dates columns
checkouts_df = checkouts_df.withColumn(
    "date_checkout", to_date("date_checkout", "yyyy-MM-dd")
).withColumn("date_returned", to_date("date_returned", "yyyy-MM-dd"))

print(checkouts_df.count())
checkouts_df.show()

2000
+------------+--------------------+-------------------+-------------+-------------+
|          id|           patron_id|         library_id|date_checkout|date_returned|
+------------+--------------------+-------------------+-------------+-------------+
|-xFj0vTLbRIC|b071c9c68228a2b1d...|225-222@5xc-jtz-hkf|   2019-01-28|   2018-11-13|
|HUX-y4oXl04C|8d3f63e1deed89d7b...|223-222@5xc-jxr-tgk|   2018-05-29|   2018-06-12|
|TQpFnkku2poC|4ae202f8de7625917...|228-222@5xc-jtz-hwk|   2018-11-23|   2019-01-24|
|OQ6sDwAAQBAJ|f9372de3c8ea50160...|23v-222@5xc-jv7-v4v|   2018-01-15|   2018-04-25|
|7T9-BAAAQBAJ|2cf3cc3b9e9f6c608...|225-222@5xc-jtz-hkf|   2018-12-31|   1804-01-23|
|iGoXAQAAMAAJ|80f93362e97d9f610...|22c-222@5xc-jwj-pvz|   2018-04-07|   1815-08-11|
|CW-7tHAaVR0C|dd9f34e9d65126a2b...|22c-222@5xc-jwj-pvz|   2018-01-10|   2018-02-04|
|Cr74DwAAQBAJ|69a2fbbf7aaad8ac0...|23v-222@5xc-jv7-v4v|   2106-02-26|   2018-12-10|
|t1e3BWziAc8C|3b85b2c7b424618f5...|222-222@5xc-jv5-nt9|   2018-06-23|  

### Books collection data cleaning.

In [68]:
books_df = trim_white_spaces_from_df(books_raw_df)

# Remove everything but numbers and dot from price column
# Remove everything but numbers from pages column
books_df = (
    books_df.withColumn("price", regexp_replace("price", "[^\d\.]", ""))
    .withColumn("pages", regexp_replace("pages", "[^\d]", ""))
)

books_df = books_df.withColumn(
    "categories",
    when(
        col("categories").contains("&"), regexp_replace("categories", "&", "','")
    ).otherwise(col("categories")),
)

books_df = books_df.withColumn(
    "categories",
    when(
        col("categories").contains(","), regexp_replace("categories", ",", "','")
    ).otherwise(col("categories")),
)

# Cast columns to correct data type
# Explode on authors and categories
# Rename column to snake case
books_df = (
    books_df.withColumn("authors", from_json("authors", ArrayType(StringType())))
    .withColumn("categories", from_json("categories", ArrayType(StringType())))
    .withColumn("pages", col("pages").cast(LongType()))
    .withColumn("price", col("price").cast(DecimalType(precision=10, scale=2)))
    .withColumn("authors", explode_outer(col("authors")))
    .withColumn("categories", explode_outer(col("categories")))
    .withColumnRenamed("publishedDate", "published_date")
)

books_df = trim_white_spaces_from_df(books_df)

print(books_df.count())
books_df.show()

329
+------------+--------------------+--------------------+--------------------+-------------------+--------------------+------+-----+
|          id|               title|             authors|           publisher|     published_date|          categories| price|pages|
+------------+--------------------+--------------------+--------------------+-------------------+--------------------+------+-----+
|hVFwAAAAQBAJ|Ogilvy on Adverti...|        David Ogilvy|             Vintage|2013-09-11 00:00:00|      Social Science| 72.99|  320|
|bRY9AAAAYAAJ|Foreign Publicati...|United States. Bu...|                null|1913-01-01 00:00:00|         Advertising|469.99|  654|
|ZapAAAAAIAAJ|Advertising and t...|      John A. Howard|                null|1973-01-01 00:00:00|         Advertising|372.00|  784|
|ZapAAAAAIAAJ|Advertising and t...|       James Hulbert|                null|1973-01-01 00:00:00|         Advertising|372.00|  784|
|A-HthMfF5moC|Profitable Advert...|                null|                

### Customers collection data cleaning.

In [69]:
customers_df = trim_white_spaces_from_df(customers_raw_df)

# Remove everything but numbers and dots from zipcode to get number without decimals
# Cast columns and apply initcap which makes first letter upper case of every word
customers_df = (
    customers_df.withColumn("zipcode", regexp_replace("zipcode", "[^\d\.]", ""))
    .withColumn("zipcode", col("zipcode").cast(LongType()))
    .withColumn("city", initcap("city"))
    .withColumn("name", initcap("name"))
    .withColumn("education", initcap("education"))
    .withColumn("state", initcap("state"))
    .withColumn("gender", lower(col("gender")))
    .withColumn("occupation", initcap("occupation"))
)

print(customers_df.count())
customers_df.show()

2000
+--------------------+------------------+--------------------+------------+----------+-------+-------------------+------+---------------+------------------+
|                  id|              name|      street_address|        city|     state|zipcode|         birth_date|gender|      education|        occupation|
+--------------------+------------------+--------------------+------------+----------+-------+-------------------+------+---------------+------------------+
|df83ec2d0d409395c...| Cynthia Barnfield|     44 NE Meikle Pl|    Portland|    Oregon|  97213|2009-09-10 00:00:00|female|    High School|              null|
|6aec7ab2ea0d67161...|   Elizabeth Smith| 7511 SE Harrison St|    Portland|    Oregon|  97215|1956-12-15 00:00:00|female|        College|       Blue Collar|
|0c54340672f510fdb...|     Richard Pabla|     1404 SE Pine St|    Portland|    Oregon|  97214|1960-12-18 00:00:00|  male|        College|Education & Health|
|f0d9ce833ddc1f73c...|     Charles Baker|12271 N West

### Libraries collection data cleaning.

In [70]:
libraries_df = trim_white_spaces_from_df(libraries_raw_df)

# Remove everything but numbers and dots from postal_code to get number without decimals
# Cast columns and apply initcap which makes first letter upper case of every word
libraries_df = (
    libraries_df.withColumn("region", lower(col("region")))
    .withColumn("postal_code", regexp_replace("postal_code", "[^\d\.]", ""))
    .withColumn("postal_code", col("postal_code").cast(LongType()))
    .withColumn("city", initcap("city"))
)

print(libraries_df.count())
libraries_df.show()

18
+-------------------+--------------------+--------------------+--------+------+-----------+
|                 id|                name|      street_address|    city|region|postal_code|
+-------------------+--------------------+--------------------+--------+------+-----------+
|226-222@5xc-kc4-fpv|Multnomah County ...|10723 SW capitol Hwy|Portland|    or|      97219|
|23v-222@5xc-jv7-v4v|Multnomah County ...|  2300 NW Thurman St|    null|    or|       null|
|222-222@5xc-jvf-skf|Multnomah County ...|7510 N Charleston...|Portland|    or|      97203|
|227-222@5xc-jww-btv|Multnomah County ...| 1525 SW Sunset blvd|Portland|    or|      97239|
|22d-222@5xc-kcy-8sq|Multnomah County ...|    7860 SE 13th AVE|Portland|    or|      97202|
|223-222@5xc-jxr-tgk|MULTNOMAH County ...|    6008 se 49TH AVE|Portland|    or|      97206|
|zzw-224@5xc-jwv-2rk|Multnomah County ...|     801 SW 10th Ave|Portland|  null|      97205|
|zzw-223@5xc-jv7-ct9|Friends OF the mu...|      522 SW 5th Ave|    null|    o

### Merge data into single dataframe.

In [None]:
checkouts_df = checkouts_df.withColumnRenamed("id", "checkout_id")
libraries_df = append_prefix_to_column_name(df=libraries_df, prefix="library")

library_checkouts = checkouts_df.join(
    libraries_df, on=checkouts_df.library_id == libraries_df.id, how="inner"
).drop("id")

customers_df = append_prefix_to_column_name(df=customers_df, prefix="customer")

final_df = library_checkouts.join(
    customers_df, on=library_checkouts.patron_id == customers_df.id, how="inner").drop("id")
final_df = final_df.withColumn("days_kept", datediff(
    col("date_returned"), col("date_checkout")))

final_df.show()

+------------+--------------------+-------------------+-------------+-------------+--------------------+----------------------+------------+--------------+-------------------+-----------------+-----------------------+-------------+--------------+----------------+-------------------+---------------+------------------+-------------------+---------+
| checkout_id|           patron_id|         library_id|date_checkout|date_returned|        library_name|library_street_address|library_city|library_region|library_postal_code|    customer_name|customer_street_address|customer_city|customer_state|customer_zipcode|customer_birth_date|customer_gender|customer_education|customer_occupation|days_kept|
+------------+--------------------+-------------------+-------------+-------------+--------------------+----------------------+------------+--------------+-------------------+-----------------+-----------------------+-------------+--------------+----------------+-------------------+---------------+---

### Filter data where books were returned equal or above return limit in days.

In [72]:
late_returns_df = final_df.filter(col("days_kept") >= return_limit_in_days)
late_returns_df.show()

+------------+--------------------+-------------------+-------------+-------------+--------------------+----------------------+------------+--------------+-------------------+------------------+-----------------------+-------------+--------------+----------------+-------------------+---------------+------------------+-------------------+---------+
| checkout_id|           patron_id|         library_id|date_checkout|date_returned|        library_name|library_street_address|library_city|library_region|library_postal_code|     customer_name|customer_street_address|customer_city|customer_state|customer_zipcode|customer_birth_date|customer_gender|customer_education|customer_occupation|days_kept|
+------------+--------------------+-------------------+-------------+-------------+--------------------+----------------------+------------+--------------+-------------------+------------------+-----------------------+-------------+--------------+----------------+-------------------+---------------+

### Analysis.

In [73]:
full_name_df = (
    late_returns_df.select("customer_name")
    .withColumn("first_name", split(col("customer_name"), " ").getItem(0))
    .withColumn("last_name", split(col("customer_name"), " ").getItem(1))
    .drop("customer_name")
)

first_name_df = full_name_df.groupBy("first_name").count().orderBy(desc("count"))
first_name_df.show()

+-----------+-----+
| first_name|count|
+-----------+-----+
|       John|    8|
|    Richard|    6|
|      James|    6|
|      David|    6|
|   Patricia|    5|
|    William|    5|
|      Jason|    5|
|    Michael|    4|
|    Timothy|    3|
|     Joseph|    3|
|    Shirley|    3|
|      Frank|    3|
|     Willie|    3|
|     Robert|    3|
|      Linda|    3|
|     Walter|    3|
|Christopher|    2|
|       Mary|    2|
|       Luke|    2|
|      Jerry|    2|
+-----------+-----+
only showing top 20 rows



In [74]:
last_name_df = full_name_df.groupBy("last_name").count().orderBy(desc("count"))
last_name_df.show()

+---------+-----+
|last_name|count|
+---------+-----+
|    Smith|    5|
|      Lee|    3|
|   Cooper|    2|
|   Bolton|    2|
|   Watson|    2|
|  Hawkins|    2|
|   Morgan|    2|
| Williams|    2|
|   Barber|    2|
|   Dennis|    2|
|  Kennedy|    2|
|   Hughes|    2|
|  Johnson|    2|
|    Adams|    2|
|  Saldana|    2|
|  Jackson|    2|
|    Lopez|    2|
|Castaneda|    1|
|  Donnell|    1|
|     Hawk|    1|
+---------+-----+
only showing top 20 rows



In [75]:
occupation_df = late_returns_df.groupBy("customer_occupation").count().orderBy(desc("count"))
occupation_df.show()

+-------------------+-----+
|customer_occupation|count|
+-------------------+-----+
|    Admin & Support|   46|
|              Sales|   45|
|        Blue Collar|   44|
|               Tech|   44|
|             Others|   43|
| Business & Finance|   42|
| Education & Health|   36|
|               null|   14|
+-------------------+-----+



In [76]:
education_df = late_returns_df.groupBy("customer_education").count().orderBy(desc("count"))
education_df.show()

+------------------+-----+
|customer_education|count|
+------------------+-----+
|           College|   75|
|            Others|   74|
|       High School|   72|
|   Graduate Degree|   71|
|              null|   22|
+------------------+-----+



In [77]:
gender_df = late_returns_df.groupBy("customer_gender").count().orderBy(desc("count"))
gender_df.show()

+---------------+-----+
|customer_gender|count|
+---------------+-----+
|           male|  161|
|         female|  141|
|           null|   12|
+---------------+-----+



In [78]:
gender_occupation_education_df = late_returns_df.groupBy("customer_gender", "customer_occupation", "customer_education").count().orderBy(desc("count"))
gender_occupation_education_df.show()

+---------------+-------------------+------------------+-----+
|customer_gender|customer_occupation|customer_education|count|
+---------------+-------------------+------------------+-----+
|           male|              Sales|           College|   12|
|         female|              Sales|            Others|    8|
|           male|    Admin & Support|           College|    8|
|         female|        Blue Collar|            Others|    8|
|           male| Business & Finance|           College|    8|
|           male|    Admin & Support|   Graduate Degree|    8|
|           male|              Sales|   Graduate Degree|    8|
|           male| Education & Health|       High School|    7|
|         female|               Tech|   Graduate Degree|    7|
|           male|             Others|           College|    7|
|           male|        Blue Collar|       High School|    7|
|         female|        Blue Collar|       High School|    7|
|           male|             Others|       High School