# Environment Preparation
---

### Importing the required libraries

In [1]:
import re, os
from pyspark.sql.types import *
import pyspark.sql.functions as sf
from pyspark.sql import SparkSession
import logging
from datetime import date, datetime

### Set up the loggers

In [2]:
# Quieten the pyspark / py4j loggers so that only ERROR and above get through.
# Logger.setLevel() returns None, so each logger is fetched first and then
# configured in a separate statement; chaining the two calls would bind the
# names below to None instead of to the logger objects.
pyspark_log = logging.getLogger('pyspark')
pyspark_log.setLevel(logging.ERROR)
py4j_logger = logging.getLogger("py4j")
py4j_logger.setLevel(logging.ERROR)

In [3]:
# First logger for logging the spark job.
# Writes timestamped messages to Logs/SparkLogs/SparkJobsLogs-<date>.log.
spark_logger = logging.getLogger('SparkJobs')
spark_logger.setLevel(logging.DEBUG)
today_date = date.today().isoformat()
logger_file_name = f'Logs/SparkLogs/SparkJobsLogs-{today_date}.log'

# FileHandler does not create missing parent directories, so make sure the
# log directory exists before opening the file.
os.makedirs(os.path.dirname(logger_file_name), exist_ok=True)

# Re-running this cell would otherwise attach a second handler for the same
# file and every message would be written twice; drop any earlier handler
# that targets this file before adding the new one.
_spark_log_path = os.path.abspath(logger_file_name)
for _handler in list(spark_logger.handlers):
    if getattr(_handler, 'baseFilename', None) == _spark_log_path:
        spark_logger.removeHandler(_handler)
        _handler.close()

fh1 = logging.FileHandler(logger_file_name)
fh1.setLevel(logging.DEBUG)
fh1.setFormatter(logging.Formatter('%(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S'))
spark_logger.addHandler(fh1)

In [4]:
# Second logger for reporting some information about the files.
# Writes bare messages to Logs/DataLogs/<date>/DataInfo-H-<hour>.log.
data_logger = logging.getLogger('DataLogs')
current_time = datetime.now()
data_logger.setLevel(logging.DEBUG)

# exist_ok=True replaces the check-then-create sequence, which could fail if
# the directory appeared between the check and the creation.
os.makedirs(f'Logs/DataLogs/{today_date}', exist_ok=True)
data_logs_file_name = f'Logs/DataLogs/{today_date}/DataInfo-H-{current_time.hour}.log'

# Re-running this cell would otherwise attach a second handler for the same
# file and duplicate every message; drop any earlier handler for this file.
_data_log_path = os.path.abspath(data_logs_file_name)
for _handler in list(data_logger.handlers):
    if getattr(_handler, 'baseFilename', None) == _data_log_path:
        data_logger.removeHandler(_handler)
        _handler.close()

fh2 = logging.FileHandler(data_logs_file_name)
fh2.setLevel(logging.DEBUG)
fh2.setFormatter(logging.Formatter('%(message)s'))
data_logger.addHandler(fh2)

### Start Spark Session

In [5]:
# Create (or reuse) the Spark session for this notebook and log both ends of
# the start-up so the job log shows how long it took.
spark_logger.info('Starting Spark Session ...')

spark = (
    SparkSession.builder
    .master("local[4]")  # local mode
    .appName("CleaningData")
    .config("spark.eventLog.logBlockUpdates.enabled", True)
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

spark_logger.info('Spark Session Started')

---

# General Purpose Functions for Cleaning
---

### Trimming String Columns

In [6]:
def trim_columns(dataframe):
    """Return `dataframe` with every string-typed column trimmed.

    Columns whose dtype name starts with 'string' are replaced by
    `sf.trim` of themselves; all other columns are left untouched.
    The input dataframe itself is returned when it has no string columns.
    """
    trimmed = dataframe
    for name, type_name in dataframe.dtypes:
        if not type_name.startswith('string'):
            continue
        trimmed = trimmed.withColumn(name, sf.trim(sf.col(name)))
    return trimmed

### Information about the dataframe

In [7]:
def get_number_of_records(dataframe, duplicate_columns):
    """Count the rows of `dataframe` and how many of them are duplicates.

    A row is a duplicate when another row has the same values in
    `duplicate_columns`.

    Returns a two-element list: [number of records, number of duplicates].
    """
    total_rows = dataframe.count()
    distinct_rows = dataframe.dropDuplicates(duplicate_columns).count()
    return [total_rows, total_rows - distinct_rows]

### Information about the dataframe's columns

In [8]:
def get_columns_info(dataframe):
    """Return the `dtypes` of `dataframe`.

    The callers in this notebook iterate over the result as
    (column name, data type) pairs. No per-column statistics are computed
    here; unique-value counts are calculated by the callers.
    """
    column_types = dataframe.dtypes
    return column_types

---

# Files Pre-Processing

In [9]:
# Record in the job log which hourly batch this run is processing.
spark_logger.info(f'Batch in Hour: {current_time.hour}')

In [10]:
# Mark the start of the file-reading phase in the job log.
spark_logger.info('Starting reading files...\n')

In [11]:
# Opening line of the per-batch data report written by data_logger.
data_logger.info('Hi .. Here\'s some information about our batch !\n')

In [12]:
# Input (bronze layer) directory for this batch, keyed by date and batch hour
files_directory_path = '/user/itversity/q-retail-company/bronze/{}/hour-{}'.format(
    today_date, current_time.hour
)

# Output (silver layer) directory for this batch, keyed the same way
silver_layer_directory_path = '/user/itversity/q-retail-company/silver/{}/hour-{}'.format(
    today_date, current_time.hour
)

## Branches Files Processing
---

In [13]:
# Job-log marker: the branches section starts here.
spark_logger.info('Processing Branches File...')

In [14]:
# Section heading for the branches file in the data report.
data_logger.info('Branches File:\n-------------')

### Schema Handling

In [15]:
# Explicit schema applied when reading the branches CSV files.
# NOTE(review): every field is declared non-nullable; whether the CSV reader
# enforces that should be confirmed against the Spark version in use.
BranchesSchema = StructType([
    StructField(field_name, field_type, nullable=False)
    for field_name, field_type in (
        ("branch_id", IntegerType()),
        ("location", StringType()),
        ("establish_date", DateType()),
        ("class", StringType()),
    )
])

In [16]:
# Job-log marker: the branches schema has been defined.
spark_logger.info('DONE : Schema Handling')

### Reading the file

In [17]:
# Load every branches_* file of this batch using the explicit schema;
# the first line of each file is treated as a header.
branches = (
    spark.read
    .schema(BranchesSchema)
    .option('header', True)
    .csv(fr'{files_directory_path}/branches_*')
)
# Preview the first rows
branches.show(5)

+---------+-----------+--------------+-----+
|branch_id|   location|establish_date|class|
+---------+-----------+--------------+-----+
|        1|   New York|    2017-01-15|    A|
|        2|Los Angeles|    2016-07-28|    B|
|        3|    Chicago|    2015-03-10|    A|
|        4|    Houston|    2016-11-05|    D|
|        5|    Phoenix|    2017-09-20|    C|
+---------+-----------+--------------+-----+
only showing top 5 rows



In [18]:
# Report the size of the branches file and how many rows repeat the same
# (location, establish_date, class) combination.
branches_info = get_number_of_records(
    branches,
    ['location', 'establish_date', 'class'],
)
number_of_records, number_of_duplicates = branches_info
data_logger.info(f'Number of records = {number_of_records}\nNumber of duplicates = {number_of_duplicates}\n')

In [19]:
# Write one report line per branches column: position, name, data type and
# number of distinct values (each distinct count runs its own Spark job).
data_logger.info('Columns:')
for counter, (column, dtype) in enumerate(get_columns_info(branches), start=1):
    number_of_unique_values = branches.select(column).distinct().count()
    data_logger.info(f'Column {counter}: {column} - Data Type: {dtype} - Number of Unique Values: {number_of_unique_values}')

data_logger.info('\n')

In [20]:
# Job-log marker: the branches file has been read and reported on.
spark_logger.info('DONE : Reading Branches File')

### Cleaning Process

In [21]:
# Trim Branches data (for String columns only)
# The result is kept under a new name so the raw `branches` frame stays available.
branches_cleaned = trim_columns(branches)
# Preview the first rows of the trimmed frame
branches_cleaned.show(5)

+---------+-----------+--------------+-----+
|branch_id|   location|establish_date|class|
+---------+-----------+--------------+-----+
|        1|   New York|    2017-01-15|    A|
|        2|Los Angeles|    2016-07-28|    B|
|        3|    Chicago|    2015-03-10|    A|
|        4|    Houston|    2016-11-05|    D|
|        5|    Phoenix|    2017-09-20|    C|
+---------+-----------+--------------+-----+
only showing top 5 rows



In [22]:
# Job-log marker: string columns of the branches frame have been trimmed.
# NOTE(review): "Trmming" in the message is a typo for "Trimming"; it is left
# as-is here because anything matching on the log text would be affected.
spark_logger.info('DONE : Trmming String Columns')

In [23]:
# Check for duplicates
# The check is made on the trimmed frame itself rather than on the
# `number_of_duplicates` value computed earlier from the raw file: that value
# was calculated before trimming (rows differing only in surrounding
# whitespace were not counted) and it is a shared notebook variable that other
# sections overwrite. This also matches how the other sections of the
# notebook perform the same check.
branches_distinct = branches_cleaned.dropDuplicates(['location', 'establish_date', 'class'])
if branches_cleaned.count() > branches_distinct.count():
    branches_cleaned = branches_distinct

In [24]:
# Job-log marker: duplicate handling for branches is finished.
spark_logger.info('DONE : Checking for Duplicates')

### Writing to HDFS

In [25]:
# Write the cleaned branches to the silver layer with a header row,
# replacing any previous output at that path. The frame is coalesced to a
# single partition before writing.
(
    branches_cleaned
    .coalesce(1)
    .write
    .mode('overwrite')
    .option('header', 'true')
    .csv(f'{silver_layer_directory_path}/branches_cleaned.csv')
)

In [26]:
# Job-log marker: cleaned branches have been written out.
spark_logger.info('DONE : Writing to HDFS\n')

---

## Sales Agents Files Processing
---

In [27]:
# Job-log marker: the sales agents section starts here.
spark_logger.info('Processing Sales Agents File...')

In [28]:
# Section heading for the sales agents file in the data report.
data_logger.info('Sales Agents File:\n-------------')

### Schema Handling

In [29]:
# Explicit schema applied when reading the sales agents CSV files.
# NOTE(review): every field is declared non-nullable; whether the CSV reader
# enforces that should be confirmed against the Spark version in use.
SalesAgentsSchema = StructType([
    StructField(field_name, field_type, nullable=False)
    for field_name, field_type in (
        ("sales_person_id", IntegerType()),
        ("name", StringType()),
        ("hire_date", DateType()),
    )
])

In [30]:
# Job-log marker: the sales agents schema has been defined.
spark_logger.info('DONE : Schema Handling')

### Reading the file

In [31]:
# Load every sales_agents_* file of this batch using the explicit schema;
# the first line of each file is treated as a header.
salesAgents = (
    spark.read
    .schema(SalesAgentsSchema)
    .option('header', True)
    .csv(fr'{files_directory_path}/sales_agents_*')
)
# Preview the first rows
salesAgents.show(5)

+---------------+---------------+----------+
|sales_person_id|           name| hire_date|
+---------------+---------------+----------+
|              1|       John Doe|2020-09-10|
|              2|     Jane Smith|2018-12-19|
|              3|Michael Johnson|2019-04-08|
|              4|    Emily Brown|2020-07-25|
|              5|   David Wilson|2019-03-19|
+---------------+---------------+----------+
only showing top 5 rows



In [32]:
# Report the size of the sales agents file and how many rows repeat the same
# (name, hire_date) combination.
salesAgents_info = get_number_of_records(
    salesAgents,
    ['name', 'hire_date'],
)
number_of_records, number_of_duplicates = salesAgents_info
data_logger.info(f'Number of records = {number_of_records}\nNumber of duplicates = {number_of_duplicates}\n')

In [33]:
# Write one report line per sales agents column: position, name, data type
# and number of distinct values (each distinct count runs its own Spark job).
data_logger.info('Columns:')
for counter, (column, dtype) in enumerate(get_columns_info(salesAgents), start=1):
    number_of_unique_values = salesAgents.select(column).distinct().count()
    data_logger.info(f'Column {counter}: {column} - Data Type: {dtype} - Number of Unique Values: {number_of_unique_values}')

data_logger.info('\n')

In [34]:
# Job-log marker: the sales agents file has been read and reported on.
spark_logger.info('DONE : Reading Sales Agents File')

### Cleaning Process

In [35]:
# Trim Sales Agents data (for String columns only)
# The result is kept under a new name so the raw `salesAgents` frame stays available.
salesAgents_cleaned = trim_columns(salesAgents)
# Preview the first rows of the trimmed frame
salesAgents_cleaned.show(5)

+---------------+---------------+----------+
|sales_person_id|           name| hire_date|
+---------------+---------------+----------+
|              1|       John Doe|2020-09-10|
|              2|     Jane Smith|2018-12-19|
|              3|Michael Johnson|2019-04-08|
|              4|    Emily Brown|2020-07-25|
|              5|   David Wilson|2019-03-19|
+---------------+---------------+----------+
only showing top 5 rows



In [36]:
# Job-log marker: string columns of the sales agents frame have been trimmed.
# NOTE(review): "Trmming" in the message is a typo for "Trimming"; it is left
# as-is here because anything matching on the log text would be affected.
spark_logger.info('DONE : Trmming String Columns')

In [37]:
# Check for duplicates
# Rows sharing the same (name, hire_date) are collapsed to one; the
# de-duplicated frame is only adopted when it is actually smaller.
sales_agents_distinct = salesAgents_cleaned.dropDuplicates(['name', 'hire_date'])
if salesAgents_cleaned.count() > sales_agents_distinct.count():
    salesAgents_cleaned = sales_agents_distinct

In [38]:
# Job-log marker: duplicate handling for sales agents is finished.
spark_logger.info('DONE : Checking for Duplicates')

### Writing to HDFS

In [39]:
# Write the cleaned sales agents to the silver layer with a header row,
# replacing any previous output at that path. The frame is coalesced to a
# single partition before writing.
(
    salesAgents_cleaned
    .coalesce(1)
    .write
    .mode('overwrite')
    .option('header', 'true')
    .csv(f'{silver_layer_directory_path}/salesAgents_cleaned.csv')
)

In [40]:
# Job-log marker: cleaned sales agents have been written out.
spark_logger.info('DONE : Writing to HDFS\n')

---

## Transactions Files Processing
---

In [41]:
# Job-log marker: the transactions section starts here.
spark_logger.info('Processing Transactions File...')

In [42]:
# Section heading for the transactions file in the data report.
data_logger.info('Transactions File:\n-------------')

### Schema Handling

In [43]:
# Explicit schema applied when reading the sales transactions CSV files.
# Each entry is (column name, data type, nullable); only the offer flags and
# the shipping address are declared nullable.
# NOTE(review): whether the CSV reader enforces the non-nullable flags should
# be confirmed against the Spark version in use.
TransactionsSchema = StructType([
    StructField(field_name, field_type, nullable=is_nullable)
    for field_name, field_type, is_nullable in (
        ("transaction_date", DateType(), False),
        ("transaction_id", StringType(), False),
        ("customer_id", IntegerType(), False),
        ("customer_fname", StringType(), False),
        ("customer_lname", StringType(), False),
        ("customer_email", StringType(), False),
        ("sales_agent_id", IntegerType(), False),
        ("branch_id", IntegerType(), False),
        ("product_id", IntegerType(), False),
        ("product_name", StringType(), False),
        ("product_category", StringType(), False),
        ("offer_1", BooleanType(), True),
        ("offer_2", BooleanType(), True),
        ("offer_3", BooleanType(), True),
        ("offer_4", BooleanType(), True),
        ("offer_5", BooleanType(), True),
        ("units", IntegerType(), False),
        ("unit_price", FloatType(), False),
        ("is_online", StringType(), False),
        ("payment_method", StringType(), False),
        ("shipping_address", StringType(), True),
    )
])

In [44]:
# Job-log marker: the transactions schema has been defined.
spark_logger.info('DONE : Schema Handling')

### Reading the file

In [45]:
# Load every sales_transactions_* file of this batch using the explicit
# schema; the first line of each file is treated as a header.
transactions = (
    spark.read
    .schema(TransactionsSchema)
    .option('header', True)
    .csv(fr'{files_directory_path}/sales_transactions_*')
)
# Preview the first rows
transactions.show(5)

+----------------+----------------+-----------+--------------+--------------+--------------------+--------------+---------+----------+-----------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+
|transaction_date|  transaction_id|customer_id|customer_fname|customer_lname|      customer_email|sales_agent_id|branch_id|product_id|     product_name|product_category|offer_1|offer_2|offer_3|offer_4|offer_5|units|unit_price|is_online|payment_method|shipping_address|
+----------------+----------------+-----------+--------------+--------------+--------------------+--------------+---------+----------+-----------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+
|      2022-12-13|trx-675189967400|      85552|         James|         Smith|james.smith@hotma...|             8|        3|        20|            Heels|        Footwear|   null|   null|   true|

In [46]:
# Report the size of the transactions file and how many rows repeat the same
# (transaction_date, transaction_id) combination.
transactions_info = get_number_of_records(
    transactions,
    ['transaction_date', 'transaction_id'],
)
number_of_records, number_of_duplicates = transactions_info
data_logger.info(f'Number of records = {number_of_records}\nNumber of duplicates = {number_of_duplicates}\n')

In [47]:
# Write one report line per transactions column: position, name, data type
# and number of distinct values (each distinct count runs its own Spark job).
data_logger.info('Columns:')
for counter, (column, dtype) in enumerate(get_columns_info(transactions), start=1):
    number_of_unique_values = transactions.select(column).distinct().count()
    data_logger.info(f'Column {counter}: {column} - Data Type: {dtype} - Number of Unique Values: {number_of_unique_values}')

data_logger.info('\n')

In [48]:
# Job-log marker: the transactions file has been read and reported on.
spark_logger.info('DONE : Reading Transactions File')

### Cleaning Process

In [49]:
# Trim Transactions data (for String columns only)
# The result is kept under a new name so the raw `transactions` frame stays available.
transactions_cleaned = trim_columns(transactions)
# Preview the first rows without truncating long values, so full cell
# contents (e.g. the whole customer_email) are visible.
transactions_cleaned.show(5, truncate=False)

+----------------+----------------+-----------+--------------+--------------+--------------------------+--------------+---------+----------+-----------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+
|transaction_date|transaction_id  |customer_id|customer_fname|customer_lname|customer_email            |sales_agent_id|branch_id|product_id|product_name     |product_category|offer_1|offer_2|offer_3|offer_4|offer_5|units|unit_price|is_online|payment_method|shipping_address|
+----------------+----------------+-----------+--------------+--------------+--------------------------+--------------+---------+----------+-----------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+
|2022-12-13      |trx-675189967400|85552      |James         |Smith         |james.smith@hotmail.com^  |8             |3        |20        |Heels            |Footwear        |

In [50]:
# Job-log marker: string columns of the transactions frame have been trimmed.
# NOTE(review): "Trmming" in the message is a typo for "Trimming"; it is left
# as-is here because anything matching on the log text would be affected.
spark_logger.info('DONE : Trmming String Columns')

In [51]:
# Clean the customer_email column
# Keep everything up to and including the first '.com', which drops trailing
# junk characters such as the '^' in 'james.smith@hotmail.com^'.
# locate() returns 0 when '.com' does not occur; in that case the value is
# left untouched, because substring(email, 1, 0 + 3) would otherwise cut the
# address down to its first three characters. NULL emails stay NULL.
transactions_cleaned = transactions_cleaned.withColumn(
    'customer_email',
    sf.expr(
        "CASE WHEN locate('.com', customer_email) > 0 "
        "THEN substring(customer_email, 1, locate('.com', customer_email) + 3) "
        "ELSE customer_email END"
    )
)
transactions_cleaned.show(5, truncate=False)

+----------------+----------------+-----------+--------------+--------------+-------------------------+--------------+---------+----------+-----------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+
|transaction_date|transaction_id  |customer_id|customer_fname|customer_lname|customer_email           |sales_agent_id|branch_id|product_id|product_name     |product_category|offer_1|offer_2|offer_3|offer_4|offer_5|units|unit_price|is_online|payment_method|shipping_address|
+----------------+----------------+-----------+--------------+--------------+-------------------------+--------------+---------+----------+-----------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+
|2022-12-13      |trx-675189967400|85552      |James         |Smith         |james.smith@hotmail.com  |8             |3        |20        |Heels            |Footwear        |null

In [52]:
# Job-log marker: the customer_email column has been cleaned.
spark_logger.info('DONE : Cleaning "customer_email" Column')

In [53]:
# Check for duplicates
# Rows sharing the same (transaction_date, transaction_id) are collapsed to
# one; the de-duplicated frame is only adopted when it is actually smaller.
transactions_distinct = transactions_cleaned.dropDuplicates(['transaction_date', 'transaction_id'])
if transactions_cleaned.count() > transactions_distinct.count():
    transactions_cleaned = transactions_distinct

In [54]:
# Job-log marker: duplicate handling for transactions is finished.
spark_logger.info('DONE : Checking for Duplicates')

### Writing to HDFS

In [55]:
# Write the cleaned transactions to the silver layer with a header row,
# replacing any previous output at that path. The frame is coalesced to a
# single partition before writing.
(
    transactions_cleaned
    .coalesce(1)
    .write
    .mode('overwrite')
    .option('header', 'true')
    .csv(f'{silver_layer_directory_path}/transactions_cleaned.csv')
)

In [56]:
# Job-log marker: cleaned transactions have been written out.
spark_logger.info('DONE : Writing to HDFS\n')

---

## Stop Spark context

In [57]:
# Shut down the Spark session now that all three files have been processed.
spark.stop()

In [58]:
# Final job-log entry for this run; the dashed line presumably serves as a
# visual separator between runs that share the same daily log file.
spark_logger.info('Spark Session Stopped\n--------------------------------------------------------------------------------------------------------------------')

---