[REDACTED]


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql.functions import col

In [0]:
# Obtain the Spark session for this notebook (getOrCreate returns the active
# session when one already exists), registered under the app name "data_profiling".
spark = (
    SparkSession.builder
    .appName("data_profiling")
    .getOrCreate()
)

In [0]:
# Read the metadata CSV from the landing container; the first line of the file
# is treated as the header row.
metadata = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("wasbs://dlbikestorelanding@adlsbikestoreinterns.blob.core.windows.net/Mostafa_Landing/metadata/metadata_mostafa.csv")
)

In [0]:
# Render the metadata table that was read from the landing CSV.
# NOTE(review): in the saved output the `table_column` values look like table
# names ("brands") while the `table_name` values look like column names
# ("brand_id") - the two headers may be swapped in the CSV. `primary_key` also
# reads "categorie_id" where the column is listed as "category_id". Confirm
# against the source file.
display(metadata)

table_id,source_type,source_filename,destination_filename,destination_path,primary_key,table_column,table_name,is_PK?,can_be_nulled,data_type,unique?
brands_stg,stg,brands,brands,Mostafa_Landing/brands,brand_id,brands,brand_id,Y,N,int,Y
brands_stg,stg,brands,brands,Mostafa_Landing/brands,brand_id,brands,brand_name,N,Y,string,N
categories_stg,stg,categories,categories,Mostafa_Landing/categories,categorie_id,categories,category_id,Y,N,int,Y
categories_stg,stg,categories,categories,Mostafa_Landing/categories,categorie_id,categories,category_name,N,Y,string,N
customers_stg,stg,customers,customers,Mostafa_Landing/customers,customer_id,customers,customer_id,Y,N,int,Y
customers_stg,stg,customers,customers,Mostafa_Landing/customers,customer_id,customers,first_name,N,Y,string,N
customers_stg,stg,customers,customers,Mostafa_Landing/customers,customer_id,customers,last_name,N,Y,string,N
customers_stg,stg,customers,customers,Mostafa_Landing/customers,customer_id,customers,phone,N,Y,int,N
customers_stg,stg,customers,customers,Mostafa_Landing/customers,customer_id,customers,email,N,Y,string,Y
customers_stg,stg,customers,customers,Mostafa_Landing/customers,customer_id,customers,street,N,Y,string,N


In [0]:
def load_source_data(
    spark,
    metadata_df,
    base_path="wasbs://dlbikestorelanding@adlsbikestoreinterns.blob.core.windows.net/Mostafa_Landing",
):
    """Load one Parquet dataset per distinct ``destination_filename`` in the metadata.

    Args:
        spark: SparkSession used to read the files.
        metadata_df: DataFrame with a ``destination_filename`` column. Every
            distinct, non-null value is used as a folder name under
            ``base_path``.
        base_path: Root location that contains one sub-folder per dataset.
            A trailing "/" is tolerated. Defaults to the landing location that
            was previously hard-coded in this function.

    Returns:
        dict mapping each folder name to the DataFrame read from
        ``{base_path}/{folder_name}/``.

    Side effects:
        Every loaded DataFrame is also bound to ``<folder_name>_df`` in the
        globals of the module that defines this function, so later notebook
        cells can refer to e.g. ``brands_df`` directly.
    """
    # Gather the distinct folder names on the driver. A plain collect() is used
    # rather than the RDD API (``.rdd.flatMap``), which is not available on
    # Spark Connect / shared-access clusters.
    rows = metadata_df.select("destination_filename").distinct().collect()
    folder_names = [
        row["destination_filename"]
        for row in rows
        if row["destination_filename"] is not None  # a null name cannot be a folder
    ]

    root = base_path.rstrip("/")
    dataframes = {}
    for folder_name in folder_names:
        file_path = f"{root}/{folder_name}/"
        # Parquet files carry their own schema, so no ``inferSchema`` option
        # (a CSV/JSON reader setting) is passed.
        dataframes[folder_name] = spark.read.parquet(file_path)

    # Publish `<folder_name>_df` names for the cells that follow.
    namespace = globals()
    for folder_name, df in dataframes.items():
        namespace[f"{folder_name}_df"] = df

    return dataframes


In [0]:
# Load every dataset named in the metadata. The returned dict is echoed as this
# cell's output; the `<name>_df` globals that load_source_data() publishes are
# what the following display cells rely on.
load_source_data(spark, metadata)

{'brands': DataFrame[brand_id: string, brand_name: string, time_stamp: string, process_id: string],
 'order_items': DataFrame[order_id: string, item_id: string, product_id: string, quantity: string, list_price: string, discount: string, time_stamp: string, process_id: string],
 'staffs': DataFrame[staff_id: string, first_name: string, last_name: string, email: string, phone: string, active: string, store_id: string, manager_id: string, time_stamp: string, process_id: string],
 'categories': DataFrame[category_id: string, category_name: string, time_stamp: string, process_id: string],
 'customers': DataFrame[customer_id: string, first_name: string, last_name: string, phone: string, email: string, street: string, city: string, state: string, zip_code: string, time_stamp: string, process_id: string],
 'order': DataFrame[order_id: int, customer_id: int, order_status: int, order_date: string, required_date: string, shipped_date: string, store_id: int, staff_id: int, time_stamp: string, proc

In [0]:
# `customers_df` is one of the `<name>_df` globals published by
# load_source_data() (from the "customers" folder).
display(customers_df)

In [0]:
# NOTE(review): `products_df` is not assigned in any visible cell - presumably
# it is published by load_source_data(); the saved schema listing is truncated,
# so confirm the "products" folder is present in the metadata.
display(products_df)

In [0]:
# Preview the brands table. In the saved output brand_id 4 ("Pure Cycles")
# appears twice.
display(brands_df)

brand_id,brand_name,time_stamp,process_id
1,Electra,2024-08-29T19:53:29.5252651Z,7bd3dd10-00eb-45b6-9e68-6ca32ed8f8e6
2,Haro,2024-08-29T19:53:29.5252651Z,7bd3dd10-00eb-45b6-9e68-6ca32ed8f8e6
3,Heller,2024-08-29T19:53:29.5252651Z,7bd3dd10-00eb-45b6-9e68-6ca32ed8f8e6
4,Pure Cycles,2024-08-29T19:53:29.5252651Z,7bd3dd10-00eb-45b6-9e68-6ca32ed8f8e6
5,Ritchey,2024-08-29T19:53:29.5252651Z,7bd3dd10-00eb-45b6-9e68-6ca32ed8f8e6
6,Strider,2024-08-29T19:53:29.5252651Z,7bd3dd10-00eb-45b6-9e68-6ca32ed8f8e6
7,Sun Bicycles,2024-08-29T19:53:29.5252651Z,7bd3dd10-00eb-45b6-9e68-6ca32ed8f8e6
8,Surly,2024-08-29T19:53:29.5252651Z,7bd3dd10-00eb-45b6-9e68-6ca32ed8f8e6
9,Trek,2024-08-29T19:53:29.5252651Z,7bd3dd10-00eb-45b6-9e68-6ca32ed8f8e6
4,Pure Cycles,2024-08-29T19:53:29.5252651Z,7bd3dd10-00eb-45b6-9e68-6ca32ed8f8e6


Databricks data profile. Run in Databricks to view.

In [0]:
# Preview the categories table. In the saved output category_id 3 and
# category_id 5 each appear twice, with different category names.
display(categories_df)

category_id,category_name,time_stamp,process_id
1,Children Bicycles,2024-08-29T19:53:29.5553813Z,60eb0385-b23a-43ae-9a8a-7afc7104ad93
2,Comfort Bicycles,2024-08-29T19:53:29.5553813Z,60eb0385-b23a-43ae-9a8a-7afc7104ad93
3,Cruisers Bicycles,2024-08-29T19:53:29.5553813Z,60eb0385-b23a-43ae-9a8a-7afc7104ad93
4,Cyclocross Bicycles,2024-08-29T19:53:29.5553813Z,60eb0385-b23a-43ae-9a8a-7afc7104ad93
5,Electric Bikes,2024-08-29T19:53:29.5553813Z,60eb0385-b23a-43ae-9a8a-7afc7104ad93
5,Cruisers Bicycles,2024-08-29T19:53:29.5553813Z,60eb0385-b23a-43ae-9a8a-7afc7104ad93
6,Mountain Bikes,2024-08-29T19:53:29.5553813Z,60eb0385-b23a-43ae-9a8a-7afc7104ad93
7,Road Bikes,2024-08-29T19:53:29.5553813Z,60eb0385-b23a-43ae-9a8a-7afc7104ad93
8,PwC Cars,2024-08-29T19:53:29.5553813Z,60eb0385-b23a-43ae-9a8a-7afc7104ad93
3,Comfort Bicycles,2024-08-29T19:53:29.5553813Z,60eb0385-b23a-43ae-9a8a-7afc7104ad93


Databricks data profile. Run in Databricks to view.

In [0]:
# Preview the stores table. In the saved output store_id 4 has an empty phone
# and an empty state.
# NOTE(review): `stores_df` is not assigned in any visible cell - presumably
# published by load_source_data(); confirm.
display(stores_df)

store_id,store_name,phone,email,street,city,state,zip_code,time_stamp,process_id
1,Santa Cruz Bikes,(831) 476-4321,santacruz@bikes.shop,3700 Portola Drive,Santa Cruz,CA,95060,2024-08-29T19:54:49.7832203Z,914fbcbb-bfb6-48fd-a12f-48b89a59f9e2
2,Baldwin Bikes,(516) 379-8888,baldwin@bikes.shop,4200 Chestnut Lane,Baldwin,NY,11432,2024-08-29T19:54:49.7832203Z,914fbcbb-bfb6-48fd-a12f-48b89a59f9e2
3,Rowlett Bikes,(972) 530-5555,rowlett@bikes.shop,8000 Fairway Avenue,Rowlett,TX,75088,2024-08-29T19:54:49.7832203Z,914fbcbb-bfb6-48fd-a12f-48b89a59f9e2
4,Mohamed Ali,,Mohamed.d.ali@pwc.com,101 Good Morning,Cairo,,11835,2024-08-29T19:54:49.7832203Z,914fbcbb-bfb6-48fd-a12f-48b89a59f9e2


Databricks data profile. Run in Databricks to view.

In [0]:
# Preview the staffs table. In the saved output staff_id 2 appears twice (one
# row with a "(xxx) xxx-xxx" phone), and staff_id 11 repeats the name, email
# and phone of staff_id 4.
# NOTE(review): the saved output contains names and e-mail addresses - review
# before sharing the notebook.
display(staffs_df)

staff_id,first_name,last_name,email,phone,active,store_id,manager_id,time_stamp,process_id
1,Fabiola,Jackson,fabiola.jackson@bikes.shop,(831) 555-5554,1,1,,2024-08-29T19:53:49.1301191Z,779c8c5b-b3cf-416d-873f-af7c02f6c3e5
2,Mireya,Copeland,mireya.copeland@bikes.shop,(831) 555-5555,1,1,1.0,2024-08-29T19:53:49.1301191Z,779c8c5b-b3cf-416d-873f-af7c02f6c3e5
2,Mohamed,Ali,Mohamed.d.ali@pwc.com,(xxx) xxx-xxx,1,1,1.0,2024-08-29T19:53:49.1301191Z,779c8c5b-b3cf-416d-873f-af7c02f6c3e5
3,Genna,Serrano,genna.serrano@bikes.shop,(831) 555-5556,1,1,2.0,2024-08-29T19:53:49.1301191Z,779c8c5b-b3cf-416d-873f-af7c02f6c3e5
4,Virgie,Wiggins,virgie.wiggins@bikes.shop,(831) 555-5557,1,1,2.0,2024-08-29T19:53:49.1301191Z,779c8c5b-b3cf-416d-873f-af7c02f6c3e5
5,Jannette,David,jannette.david@bikes.shop,(516) 379-4444,1,2,1.0,2024-08-29T19:53:49.1301191Z,779c8c5b-b3cf-416d-873f-af7c02f6c3e5
6,Marcelene,Boyer,marcelene.boyer@bikes.shop,(516) 379-4445,1,2,5.0,2024-08-29T19:53:49.1301191Z,779c8c5b-b3cf-416d-873f-af7c02f6c3e5
7,Venita,Daniel,venita.daniel@bikes.shop,(516) 379-4446,1,2,5.0,2024-08-29T19:53:49.1301191Z,779c8c5b-b3cf-416d-873f-af7c02f6c3e5
11,Virgie,Wiggins,virgie.wiggins@bikes.shop,(831) 555-5557,1,1,2.0,2024-08-29T19:53:49.1301191Z,779c8c5b-b3cf-416d-873f-af7c02f6c3e5
8,Kali,Vargas,kali.vargas@bikes.bop,(972) 530-5555,1,3,1.0,2024-08-29T19:53:49.1301191Z,779c8c5b-b3cf-416d-873f-af7c02f6c3e5


Databricks data profile. Run in Databricks to view.

In [0]:
# NOTE(review): `customers_df` is already displayed by an earlier cell of this
# notebook - this repeat looks redundant; confirm and remove one of them.
display(customers_df)

In [0]:
# `order_df` comes from the "order" folder loaded by load_source_data(). In the
# saved (truncated) schema listing it is the only table shown with int-typed
# columns; the other visible tables are all-string.
display(order_df)

In [0]:
# `order_items_df` comes from the "order_items" folder loaded by
# load_source_data(); every column is listed as string in the saved schema.
display(order_items_df)

In [0]:
# NOTE(review): display() is called without an argument - this looks like a
# leftover scratch cell. Confirm, then either pass the intended DataFrame or
# delete the cell.
display()

In [0]:
def read_files_from_metadata(metadata_path: str, base_path: str = None) -> "DataFrame | None":
    """Read every file listed in a metadata CSV and stack them into one DataFrame.

    The metadata CSV is read with a header row and must provide two columns:
    ``file_path`` (location of the file) and ``file_type`` (``csv``,
    ``parquet`` or ``json``; case and surrounding whitespace are ignored).

    Args:
        metadata_path: Location of the metadata CSV.
        base_path: Optional prefix for every ``file_path``. The two parts are
            joined with exactly one "/", whether or not ``base_path`` ends with
            a slash or ``file_path`` starts with one.

    Returns:
        The union of all files, in metadata row order, or ``None`` when the
        metadata has no rows.

    Raises:
        ValueError: If a row has no ``file_path``, or its ``file_type`` is
            missing or not one of the supported types.
    """
    # getOrCreate() hands back the already-running session when there is one.
    spark = SparkSession.builder.appName("ReadFilesFromMetadata").getOrCreate()

    # One reader per supported file type; CSV data files are read with a header.
    readers = {
        "csv": lambda path: spark.read.csv(path, header=True),
        "parquet": lambda path: spark.read.parquet(path),
        "json": lambda path: spark.read.json(path),
    }

    metadata = spark.read.csv(metadata_path, header=True)

    combined_df = None
    for row in metadata.collect():
        file_path = row["file_path"]
        if not file_path:
            raise ValueError("Metadata row is missing file_path")
        if base_path:
            # Normalise the seam so the result never contains "//".
            prefix = base_path.rstrip("/")
            suffix = file_path.lstrip("/")
            file_path = f"{prefix}/{suffix}"

        # A null file_type is reported through the same ValueError as an
        # unknown one instead of failing on ``None.lower()``.
        raw_type = row["file_type"]
        file_type = raw_type.strip().lower() if raw_type is not None else None
        reader = readers.get(file_type)
        if reader is None:
            raise ValueError(f"Unsupported file type: {file_type}")

        df = reader(file_path)

        # NOTE: union() pairs columns by position, not by name, so all listed
        # files are expected to share the same column layout.
        combined_df = df if combined_df is None else combined_df.union(df)

    return combined_df


In [0]:
# Read the brands Parquet folder directly. This rebinds `brands_df`, a name
# that load_source_data() also publishes for the "brands" folder.
brands_df = (
    spark.read
    .format("parquet")
    .load("wasbs://dlbikestorelanding@adlsbikestoreinterns.blob.core.windows.net/Mostafa_Landing/brands")
)

In [0]:
# Preview the brands table again after the direct Parquet read.
# NOTE(review): the saved output below carries an older time_stamp/process_id
# (2024-08-26) than the earlier brands preview in this notebook (2024-08-29),
# so the two outputs were saved from different runs - re-run the notebook top
# to bottom before relying on either.
display(brands_df)

brand_id,brand_name,time_stamp,process_id
1,Electra,2024-08-26T17:36:27.3510467Z,d1a875a5-45ef-4b49-aae6-1a434596676a
2,Haro,2024-08-26T17:36:27.3510467Z,d1a875a5-45ef-4b49-aae6-1a434596676a
3,Heller,2024-08-26T17:36:27.3510467Z,d1a875a5-45ef-4b49-aae6-1a434596676a
4,Pure Cycles,2024-08-26T17:36:27.3510467Z,d1a875a5-45ef-4b49-aae6-1a434596676a
5,Ritchey,2024-08-26T17:36:27.3510467Z,d1a875a5-45ef-4b49-aae6-1a434596676a
6,Strider,2024-08-26T17:36:27.3510467Z,d1a875a5-45ef-4b49-aae6-1a434596676a
7,Sun Bicycles,2024-08-26T17:36:27.3510467Z,d1a875a5-45ef-4b49-aae6-1a434596676a
8,Surly,2024-08-26T17:36:27.3510467Z,d1a875a5-45ef-4b49-aae6-1a434596676a
9,Trek,2024-08-26T17:36:27.3510467Z,d1a875a5-45ef-4b49-aae6-1a434596676a
4,Pure Cycles,2024-08-26T17:36:27.3510467Z,d1a875a5-45ef-4b49-aae6-1a434596676a


Databricks data profile. Run in Databricks to view.