In [1]:
import pandas as pd
import csv

In [4]:
import pandas as pd
import numpy as np
from faker import Faker

def create_customers(num_customers=10000, filename='customers.csv'):
    """Generate fake customer records with Faker and write them to a CSV file.

    Args:
        num_customers: Number of rows to generate; CustomerID runs from 1 to
            ``num_customers``.
        filename: Path of the CSV file to write (the index is not written).
    """
    fake = Faker()
    rows = range(num_customers)

    # Columns are built one after another, in the same order as the CSV header.
    ids = np.arange(1, num_customers + 1)
    first_names = [fake.first_name() for _ in rows]
    last_names = [fake.last_name() for _ in rows]
    emails = [fake.email() for _ in rows]
    # Addresses containing line breaks are flattened onto one line.
    addresses = [fake.address().replace('\n', ', ') for _ in rows]

    customers = pd.DataFrame({
        'CustomerID': ids,
        'FirstName': first_names,
        'LastName': last_names,
        'Email': emails,
        'Address': addresses,
    })
    customers.to_csv(filename, index=False)
    print(f'Data saved to {filename}')

# Generate 100,000 fake customers in a single process and write them to customers2.csv
create_customers(num_customers=100000, filename='customers2.csv')

Data saved to customers2.csv


In [5]:
import pandas as pd
import numpy as np
from faker import Faker
from multiprocessing import Pool, cpu_count

def generate_customer_data(num_customers):
    """Build a DataFrame of ``num_customers`` fake customers.

    CustomerID always runs from 1 to ``num_customers``, so frames produced by
    separate calls share the same ID range.

    Args:
        num_customers: Number of rows to generate.

    Returns:
        pandas.DataFrame with columns CustomerID, FirstName, LastName, Email
        and Address.
    """
    fake = Faker()

    # One value factory per generated column, in output-column order.
    factories = {
        'FirstName': fake.first_name,
        'LastName': fake.last_name,
        'Email': fake.email,
        # Addresses containing line breaks are flattened onto one line.
        'Address': lambda: fake.address().replace('\n', ', '),
    }

    columns = {'CustomerID': np.arange(1, num_customers + 1)}
    for column_name, make_value in factories.items():
        columns[column_name] = [make_value() for _ in range(num_customers)]

    return pd.DataFrame(columns)

def create_customers_multiprocessing(num_customers=100_000, filename='customers3.csv'):
    """Generate fake customers in parallel worker processes and save them as CSV.

    The requested row count is split across one worker per CPU core; each
    worker runs ``generate_customer_data`` and the partial frames are
    concatenated into one file.

    Args:
        num_customers: Total number of customer rows to generate.
        filename: Path of the CSV file to write (the index is not written).
    """
    num_processes = cpu_count()

    # Hand the remainder to the first workers so exactly ``num_customers`` rows
    # are produced; plain floor division silently dropped the leftover rows.
    base_size, leftover = divmod(num_customers, num_processes)
    chunk_sizes = [base_size + (1 if i < leftover else 0) for i in range(num_processes)]

    # The context manager tears the pool down even if a worker raises.
    with Pool(processes=num_processes) as pool:
        dataframes = pool.map(generate_customer_data, chunk_sizes)

    df = pd.concat(dataframes, ignore_index=True)

    # Every worker numbers its rows from 1, so renumber the combined frame to
    # keep CustomerID unique across the whole file.
    df['CustomerID'] = np.arange(1, len(df) + 1)

    df.to_csv(filename, index=False)
    print(f'Data saved to {filename}')

# Run the parallel generator with its defaults (100,000 rows -> customers3.csv)
create_customers_multiprocessing()

Data saved to customers3.csv


In [None]:
import pandas as pd
import numpy as np
from faker import Faker
from multiprocessing import Process, Value, Lock
from random import randint
from datetime import datetime
import os

fake = Faker()

def generate_sales_data(
    n_orders_per_process,
    order_date,
    max_customer_id_shared,
    start_sales_order_id,
    lock,
    process_id
):
    """Generate one worker's share of sales data and write it to per-process CSVs.

    For every order, either a brand-new customer is created (taking the next ID
    from the shared counter) or an existing customer ID is picked at random.
    Each order gets one header row and 1-5 detail rows.

    Args:
        n_orders_per_process: Number of orders this worker generates.
        order_date: Value stored in the OrderDate column of every header row.
        max_customer_id_shared: Shared integer holding the highest customer ID
            handed out so far; it is incremented for each new customer.
        start_sales_order_id: SalesOrderID of this worker's first order.
        lock: Lock guarding access to ``max_customer_id_shared``.
        process_id: Worker index, used as the suffix of the output file names.

    NOTE(review): the module-level ``fake`` is created before the workers start;
    with the fork start method each worker may inherit identical Faker RNG state
    and emit the same names/emails. Consider seeding per process; confirm.
    """
    # Rows are collected in plain lists and turned into DataFrames once at the
    # end: DataFrame.append no longer exists in pandas 2.x and copied the whole
    # frame on every call.
    customer_rows = []
    header_rows = []
    detail_rows = []

    for i in range(n_orders_per_process):
        # randint(1, 10) > 8 is true for 9 and 10, i.e. 2 out of 10 draws.
        if randint(1, 10) > 8:
            # Increment and read under the lock so no two workers get the same ID.
            with lock:
                max_customer_id_shared.value += 1
                curr_customer_id = max_customer_id_shared.value

            customer_rows.append({
                "CustomerID": curr_customer_id,
                "Name": fake.name(),
                "Email": fake.email(),
                "Age": randint(18, 90),
                "ModifiedDate": datetime.now()
            })
        else:
            with lock:
                curr_customer_id = randint(1, max_customer_id_shared.value)

        sales_order_id = start_sales_order_id + i
        header_rows.append({
            "SalesOrderID": sales_order_id,
            "OrderDate": order_date,
            "CustomerID": curr_customer_id
        })

        n_lines = randint(1, 5)
        for line_number in range(n_lines):
            detail_rows.append({
                "SalesOrderID": sales_order_id,
                "SalesOrderLineNumber": line_number + 1,
                "ProductKey": randint(1, 1000),
                "Qty": randint(1, 20),
                "ModifiedDate": datetime.now()
            })

    # Passing ``columns`` keeps the header row even when a list is empty.
    customers_df = pd.DataFrame(
        customer_rows, columns=["CustomerID", "Name", "Email", "Age", "ModifiedDate"])
    sales_order_header_df = pd.DataFrame(
        header_rows, columns=["SalesOrderID", "OrderDate", "CustomerID"])
    sales_order_detail_df = pd.DataFrame(
        detail_rows,
        columns=["SalesOrderID", "SalesOrderLineNumber", "ProductKey", "Qty", "ModifiedDate"])

    # Save DataFrames to CSV files
    customers_df.to_csv(f'customers_{process_id}.csv', index=False)
    sales_order_header_df.to_csv(f'sales_order_header_{process_id}.csv', index=False)
    sales_order_detail_df.to_csv(f'sales_order_detail_{process_id}.csv', index=False)

def main(
    n_orders: int,
    order_date: str,
    max_customer_id: int,
    max_sales_order_id: int,
    n_processes: int
):
    """Spawn worker processes that generate sales data in parallel.

    Args:
        n_orders: Total number of sales orders to generate.
        order_date: Order date passed through to every worker.
        max_customer_id: Initial value of the shared customer-ID counter.
        max_sales_order_id: SalesOrderID given to the first generated order;
            subsequent orders count up from it.
        n_processes: Number of worker processes to start.
    """
    # Split the orders so that nothing is lost when n_orders is not a multiple
    # of n_processes: the first ``extra_orders`` workers take one more order.
    base_orders, extra_orders = divmod(n_orders, n_processes)
    max_customer_id_shared = Value('i', max_customer_id)
    lock = Lock()
    processes = []

    next_sales_order_id = max_sales_order_id
    for i in range(n_processes):
        order_count = base_orders + (1 if i < extra_orders else 0)
        args = (order_count, order_date, max_customer_id_shared, next_sales_order_id, lock, i)
        p = Process(target=generate_sales_data, args=args)
        p.start()
        processes.append(p)
        # The next worker starts right after this worker's ID range.
        next_sales_order_id += order_count

    # Wait for every worker to finish writing its files.
    for p in processes:
        p.join()

# Small trial run: 100 orders dated 2024-05-21 spread over 4 worker processes.
if __name__ == "__main__":
    main(
        n_orders=100, 
        order_date="2024-05-21", 
        max_customer_id=1000, 
        max_sales_order_id=5000, 
        n_processes=4
    )


In [6]:
# Scratch check: add one row to an empty DataFrame by assigning a dict to
# .loc[len(df)] (the next positional label).
sales_order_header_df = pd.DataFrame(columns=["SalesOrderID", "OrderDate", "CustomerID"])
new_sales_order_header = {
            "SalesOrderID": 1,
            "OrderDate": "asd",
            "CustomerID": 3
        }
sales_order_header_df.loc[len(sales_order_header_df)] = new_sales_order_header
# Display the resulting one-row frame.
sales_order_header_df

Unnamed: 0,SalesOrderID,OrderDate,CustomerID
0,1,asd,3


In [13]:
import pandas as pd
import numpy as np
from faker import Faker
from multiprocessing import Process, Value, Lock
from random import randint

def generate_sales_data(
    n_orders_per_process,
    order_date,
    max_customer_id_shared,
    start_sales_order_id,
    lock,
    process_id
):
    """Generate one worker's share of sales data and write it to ``data/`` CSVs.

    For every order, either a brand-new customer is created (taking the next ID
    from the shared counter) or an existing customer ID is picked at random.
    Each order gets one header row and 1-10 detail rows.

    Args:
        n_orders_per_process: Number of orders this worker generates.
        order_date: Value stored as OrderDate / ModifiedDate on every row.
        max_customer_id_shared: Shared integer holding the highest customer ID
            handed out so far; it is incremented for each new customer.
        start_sales_order_id: SalesOrderID of this worker's first order.
        lock: Lock guarding the increment of ``max_customer_id_shared``.
        process_id: Worker index, used as the suffix of the output file names.
    """
    import os

    fake = Faker()

    # Rows are collected in lists and converted to DataFrames once at the end;
    # growing a DataFrame one row at a time re-allocates it on every insert.
    customer_rows = []
    header_rows = []
    detail_rows = []

    for i in range(n_orders_per_process):
        # randint(1, 10) > 8 is true for 9 and 10, i.e. 2 out of 10 draws.
        if randint(1, 10) > 8:
            # Increment and read under the lock so no two workers get the same ID.
            with lock:
                max_customer_id_shared.value += 1
                curr_customer_id = max_customer_id_shared.value
            # Generate the new customer's attributes.
            customer_rows.append({
                "CustomerID": curr_customer_id,
                "Name": fake.name(),
                "Email": fake.email(),
                "Age": randint(18, 90),
                "ModifiedDate": order_date
            })
        else:
            # Reuse a random customer ID from 1 up to the current maximum.
            curr_customer_id = randint(1, max_customer_id_shared.value)

        # One SalesOrderHeader row per order.
        sales_order_id = start_sales_order_id + i
        header_rows.append({
            "SalesOrderID": sales_order_id,
            "OrderDate": order_date,
            "CustomerID": curr_customer_id
        })

        # Between 1 and 10 SalesOrderDetail rows per order.
        n_lines = randint(1, 10)
        for line_number in range(n_lines):
            detail_rows.append({
                "SalesOrderID": sales_order_id,
                "SalesOrderLineNumber": line_number + 1,
                "ProductKey": randint(1, 606),
                "Qty": randint(1, 20),
                "ModifiedDate": order_date
            })

    # Passing ``columns`` keeps the header row even when a list is empty.
    customers_df = pd.DataFrame(
        customer_rows, columns=["CustomerID", "Name", "Email", "Age", "ModifiedDate"])
    sales_order_header_df = pd.DataFrame(
        header_rows, columns=["SalesOrderID", "OrderDate", "CustomerID"])
    sales_order_detail_df = pd.DataFrame(
        detail_rows,
        columns=["SalesOrderID", "SalesOrderLineNumber", "ProductKey", "Qty", "ModifiedDate"])

    # Write each file once, after all orders are generated. The writes used to
    # sit inside the order loop, rewriting all three files on every iteration.
    os.makedirs('data', exist_ok=True)
    customers_df.to_csv(f'data/customers_{process_id}.csv', index=False)
    sales_order_header_df.to_csv(f'data/sales_order_header_{process_id}.csv', index=False)
    sales_order_detail_df.to_csv(f'data/sales_order_detail_{process_id}.csv', index=False)

def main(
    n_orders: int,
    order_date: str,
    max_customer_id,
    max_sales_order_id,
    n_processes: int
):
    """Spawn worker processes that generate sales data in parallel.

    Args:
        n_orders: Total number of sales orders to generate.
        order_date: Date string passed through to every worker.
        max_customer_id: Initial value of the shared customer-ID counter.
        max_sales_order_id: SalesOrderID given to the first generated order;
            subsequent orders count up from it.
        n_processes: Number of worker processes to start.
    """
    # Split the orders so that nothing is lost when n_orders is not a multiple
    # of n_processes: the first ``extra_orders`` workers take one more order.
    base_orders, extra_orders = divmod(n_orders, n_processes)
    max_customer_id_shared = Value('i', max_customer_id)
    lock = Lock()
    processes = []

    next_sales_order_id = max_sales_order_id
    for i in range(n_processes):
        order_count = base_orders + (1 if i < extra_orders else 0)
        args = (order_count, order_date, max_customer_id_shared, next_sales_order_id, lock, i)
        p = Process(target=generate_sales_data, args=args)
        p.start()
        processes.append(p)
        # The next worker starts right after this worker's ID range.
        next_sales_order_id += order_count

    # Wait for every worker to finish writing its files.
    for p in processes:
        p.join()

# Generate 10,000 orders dated 2022-10-15 across 8 worker processes.
main(
    n_orders=10000,
    order_date="2022-10-15",
    max_customer_id=100,
    max_sales_order_id=20000,
    n_processes=8
)

In [9]:
# Same run scaled up to 100,000 orders; relies on `main` from a previously executed cell.
main(
    n_orders=100000,
    order_date="2022-10-15",
    max_customer_id=100,
    max_sales_order_id=20000,
    n_processes=8
)

In [11]:
import pandas as pd
import numpy as np
from faker import Faker
from multiprocessing import Process, Value, Lock, Manager
from random import randint
import os

def generate_sales_data(
    n_orders_per_process,
    order_date,
    max_customer_id_shared,
    start_sales_order_id,
    lock,
    process_id,
    result_queue
):
    """Generate one worker's share of sales data and put it on ``result_queue``.

    For every order, either a brand-new customer is created (taking the next ID
    from the shared counter) or an existing customer ID is picked at random.
    Each order gets one header tuple and 1-10 detail tuples.

    Args:
        n_orders_per_process: Number of orders this worker generates.
        order_date: Value stored as OrderDate / ModifiedDate on every row.
        max_customer_id_shared: Shared integer holding the highest customer ID
            handed out so far; it is incremented for each new customer.
        start_sales_order_id: SalesOrderID of this worker's first order.
        lock: Lock guarding the increment of ``max_customer_id_shared``.
        process_id: Worker index, sent along with the generated rows.
        result_queue: Queue receiving one tuple
            ``(process_id, customers, order_headers, order_details)``.
    """
    fake = Faker()
    customer_data = []
    sales_order_header_data = []
    sales_order_detail_data = []

    for i in range(n_orders_per_process):
        # randint(1, 10) > 8 is true for 9 and 10, i.e. 2 out of 10 draws.
        if randint(1, 10) > 8:
            # ``with`` releases the lock even if the body raises, so a failing
            # worker cannot leave the others blocked on acquire().
            with lock:
                max_customer_id_shared.value += 1
                curr_customer_id = max_customer_id_shared.value
            customer_data.append((
                curr_customer_id,
                fake.name(),
                fake.email(),
                randint(18, 90),
                order_date
            ))
        else:
            # Reuse a random customer ID from 1 up to the current maximum.
            curr_customer_id = randint(1, max_customer_id_shared.value)

        sales_order_id = start_sales_order_id + i
        sales_order_header_data.append((sales_order_id, order_date, curr_customer_id))

        n_lines = randint(1, 10)
        for line_number in range(n_lines):
            sales_order_detail_data.append((
                sales_order_id,
                line_number + 1,      # line numbers are 1-based
                randint(1, 606),      # ProductKey
                randint(1, 20),       # Qty
                order_date
            ))

    result_queue.put((process_id, customer_data, sales_order_header_data, sales_order_detail_data))

def save_to_csv(result_queue, n_processes):
    """Collect every worker's rows from the queue and write three CSV files.

    Args:
        result_queue: Queue holding one
            ``(process_id, customers, order_headers, order_details)`` tuple per
            worker.
        n_processes: Number of tuples to read from the queue.

    Writes ``data/customers.csv``, ``data/sales_order_header.csv`` and
    ``data/sales_order_detail.csv``; the ``data`` directory must already exist.
    """
    all_customer_data = []
    all_sales_order_header_data = []
    all_sales_order_detail_data = []

    for _ in range(n_processes):
        _process_id, customer_data, sales_order_header_data, sales_order_detail_data = result_queue.get()
        # The accumulators need names distinct from the unpacked per-worker
        # lists; sharing a name made each queue item overwrite the accumulator
        # and then extend it with itself.
        all_customer_data.extend(customer_data)
        all_sales_order_header_data.extend(sales_order_header_data)
        all_sales_order_detail_data.extend(sales_order_detail_data)

    customers_df = pd.DataFrame(
        all_customer_data, columns=["CustomerID", "Name", "Email", "Age", "ModifiedDate"])
    sales_order_header_df = pd.DataFrame(
        all_sales_order_header_data, columns=["SalesOrderID", "OrderDate", "CustomerID"])
    sales_order_detail_df = pd.DataFrame(
        all_sales_order_detail_data,
        columns=["SalesOrderID", "SalesOrderLineNumber", "ProductKey", "Qty", "ModifiedDate"])

    customers_df.to_csv('data/customers.csv', index=False)
    sales_order_header_df.to_csv('data/sales_order_header.csv', index=False)
    sales_order_detail_df.to_csv('data/sales_order_detail.csv', index=False)

def main(
    n_orders: int,
    order_date: str,
    max_customer_id,
    max_sales_order_id,
    n_processes: int
):
    """Generate sales data in parallel workers and write the combined CSV files.

    Args:
        n_orders: Total number of sales orders to generate.
        order_date: Date string passed through to every worker.
        max_customer_id: Initial value of the shared customer-ID counter.
        max_sales_order_id: SalesOrderID given to the first generated order;
            subsequent orders count up from it.
        n_processes: Number of worker processes to start.
    """
    # Split the orders so that nothing is lost when n_orders is not a multiple
    # of n_processes: the first ``extra_orders`` workers take one more order.
    base_orders, extra_orders = divmod(n_orders, n_processes)
    max_customer_id_shared = Value('i', max_customer_id)
    lock = Lock()
    processes = []

    # The context manager shuts the manager's server process down on exit.
    with Manager() as manager:
        result_queue = manager.Queue()

        next_sales_order_id = max_sales_order_id
        for i in range(n_processes):
            order_count = base_orders + (1 if i < extra_orders else 0)
            args = (order_count, order_date, max_customer_id_shared,
                    next_sales_order_id, lock, i, result_queue)
            p = Process(target=generate_sales_data, args=args)
            p.start()
            processes.append(p)
            # The next worker starts right after this worker's ID range.
            next_sales_order_id += order_count

        for p in processes:
            p.join()

        # Drain the queue while the manager is still alive.
        save_to_csv(result_queue, n_processes)

if __name__ == "__main__":
    # save_to_csv writes into data/, so make sure the directory exists first.
    if not os.path.exists('data'):
        os.makedirs('data')
    
    # Generate 100,000 orders dated 2022-10-15 across 8 worker processes.
    main(
        n_orders=100000,
        order_date="2022-10-15",
        max_customer_id=100,
        max_sales_order_id=20000,
        n_processes=8
    )

In [8]:
import pandas as pd
from faker import Faker
from multiprocessing import Process, Value, Lock, Manager
from random import randint
import os
import csv
import time

def generate_sales_data(
    n_orders_per_process,
    order_date,
    max_customer_id_shared,
    start_sales_order_id,
    lock,
    process_id,
    result_queue
):
    """Generate one worker's share of sales data and put it on ``result_queue``.

    Each order is assigned either a newly created customer (next ID from the
    shared counter) or a random existing customer ID, and gets one header tuple
    plus 1-10 detail tuples.

    Args:
        n_orders_per_process: Number of orders this worker generates.
        order_date: Value stored as OrderDate / ModifiedDate on every row.
        max_customer_id_shared: Shared integer holding the highest customer ID
            handed out so far; it is incremented for each new customer.
        start_sales_order_id: SalesOrderID of this worker's first order.
        lock: Lock guarding the increment of ``max_customer_id_shared``.
        process_id: Worker index, sent along with the generated rows.
        result_queue: Queue receiving one tuple
            ``(process_id, customers, order_headers, order_details)``.

    NOTE(review): with the fork start method each worker may inherit identical
    Faker RNG state and emit the same names/emails; consider seeding the Faker
    instance per process. Confirm against the Faker documentation.
    """
    fake = Faker()
    customer_data = []
    sales_order_header_data = []
    sales_order_detail_data = []

    for order_offset in range(n_orders_per_process):
        sales_order_id = start_sales_order_id + order_offset

        # randint(1, 10) > 8 is true for 9 and 10, i.e. 2 out of 10 draws.
        is_new_customer = randint(1, 10) > 8
        if is_new_customer:
            # ``with`` releases the lock even if the body raises, so a failing
            # worker cannot leave the others blocked on acquire().
            with lock:
                max_customer_id_shared.value += 1
                curr_customer_id = max_customer_id_shared.value
            new_customer = (
                curr_customer_id,
                fake.name(),
                fake.email(),
                randint(18, 90),
                order_date
            )
            customer_data.append(new_customer)
        else:
            # Reuse a random customer ID from 1 up to the current maximum.
            curr_customer_id = randint(1, max_customer_id_shared.value)

        sales_order_header_data.append((sales_order_id, order_date, curr_customer_id))

        n_lines = randint(1, 10)
        for line_number in range(1, n_lines + 1):
            new_sales_order_detail = (
                sales_order_id,
                line_number,
                randint(1, 606),      # ProductKey
                randint(1, 20),       # Qty
                order_date
            )
            sales_order_detail_data.append(new_sales_order_detail)

    # Put data into the Queue
    result_queue.put((process_id, customer_data, sales_order_header_data, sales_order_detail_data))

def save_to_csv(result_queue, n_processes):
    """Merge every worker's rows from the queue and write three sorted CSV files.

    Args:
        result_queue: Queue holding one
            ``(process_id, customers, order_headers, order_details)`` tuple per
            worker.
        n_processes: Number of tuples to read from the queue.

    Customers are sorted by CustomerID, headers by SalesOrderID and details by
    (SalesOrderID, SalesOrderLineNumber). All fields are quoted in the output.
    The ``data`` directory must already exist.
    """
    print("Start writing data to .csv files")

    merged_customers, merged_headers, merged_details = [], [], []
    for _ in range(n_processes):
        _worker, worker_customers, worker_headers, worker_details = result_queue.get()
        merged_customers += worker_customers
        merged_headers += worker_headers
        merged_details += worker_details

    customers_df = pd.DataFrame(
        merged_customers,
        columns=["CustomerID", "Name", "Email", "Age", "ModifiedDate"],
    ).sort_values(by='CustomerID', ascending=True)
    sales_order_header_df = pd.DataFrame(
        merged_headers,
        columns=["SalesOrderID", "OrderDate", "CustomerID"],
    ).sort_values(by='SalesOrderID', ascending=True)
    sales_order_detail_df = pd.DataFrame(
        merged_details,
        columns=["SalesOrderID", "SalesOrderLineNumber", "ProductKey", "Qty", "ModifiedDate"],
    ).sort_values(by=['SalesOrderID', 'SalesOrderLineNumber'], ascending=[True, True])

    # One shared set of writer options so all three files use the same dialect.
    csv_options = dict(index=False, sep=',', quotechar='"', lineterminator='\n', quoting=csv.QUOTE_ALL)
    outputs = (
        (customers_df, 'data/customers.csv'),
        (sales_order_header_df, 'data/sales_order_header.csv'),
        (sales_order_detail_df, 'data/sales_order_detail.csv'),
    )
    for frame, path in outputs:
        frame.to_csv(path, **csv_options)

    print("Finish writing data to .csv files")

def main(
    n_orders: int,
    order_date: str,
    max_customer_id,
    max_sales_order_id,
    n_processes: int
):
    """Generate sales data in parallel workers and write the combined CSV files.

    Args:
        n_orders: Total number of sales orders to generate.
        order_date: Date string passed through to every worker.
        max_customer_id: Initial value of the shared customer-ID counter.
        max_sales_order_id: SalesOrderID given to the first generated order;
            subsequent orders count up from it.
        n_processes: Number of worker processes to start.
    """
    n_orders_per_process, remainder_orders = divmod(n_orders, n_processes)
    max_customer_id_shared = Value('i', max_customer_id)
    lock = Lock()
    processes = []

    # The context manager shuts the manager's server process down on exit.
    with Manager() as manager:
        result_queue = manager.Queue()

        # NOTE(review): the first order reuses max_sales_order_id itself, while
        # new customers start at max_customer_id + 1. If the argument is the
        # current MAX(SalesOrderId) in the database, this collides with the
        # existing row; confirm the intended convention.
        next_sales_order_id = max_sales_order_id
        for i in range(n_processes):
            # The first ``remainder_orders`` workers take one extra order.
            order_count = n_orders_per_process + (1 if i < remainder_orders else 0)
            args = (order_count, order_date, max_customer_id_shared,
                    next_sales_order_id, lock, i, result_queue)
            p = Process(target=generate_sales_data, args=args)
            p.start()
            processes.append(p)
            # Advance by the orders actually assigned. Computing the start as
            # i * (per-process + extra) made ID ranges overlap for the workers
            # that receive no extra order.
            next_sales_order_id += order_count

        for p in processes:
            p.join()

        # Drain the queue while the manager is still alive.
        save_to_csv(result_queue, n_processes)


# save_to_csv writes into data/, so make sure the directory exists first.
if not os.path.exists('data'):
    os.makedirs('data')

# Generate 100,000 orders dated 2022-11-06 across 8 worker processes.
# The starting IDs are hard-coded here — presumably taken from the current
# database maxima; verify before re-running.
main(
    n_orders=100000,
    order_date="2022-11-06",
    max_customer_id=47029,
    max_sales_order_id=154733,
    n_processes=8
)

Process 3 start sending generated data into the queue.

Process 1 start sending generated data into the queue.

Process 7 start sending generated data into the queue.

Process 4 start sending generated data into the queue.

Process 1 finish sending generated data into the queue.
Process 0 start sending generated data into the queue.


Process 2 start sending generated data into the queue.

Process 3 finish sending generated data into the queue.

Process 5 start sending generated data into the queue.

Process 6 start sending generated data into the queue.

Process 0 finish sending generated data into the queue.

Process 7 finish sending generated data into the queue.

Process 4 finish sending generated data into the queue.

Process 5 finish sending generated data into the queue.

Process 6 finish sending generated data into the queue.

Process 2 finish sending generated data into the queue.

Start writing data to .csv files
Finish writing data to .csv files


In [2]:
from src.utils.db_connection import DBConnection

# Smoke-test the database connection: connect, list the schemas, then close.
conn = DBConnection()
try:
    conn.connect()
    conn.list_schemas()
finally:
    # Close the connection whether or not the calls above succeeded.
    conn.close()

Connection to MySQL database is successful
information_schema
performance_schema
retail_db
MySQL connection is closed


In [9]:
def load_csv_to_database(csv_file_path: str, table_name: str, conn: "DBConnection"):
    """Bulk-load a CSV file into a table with ``LOAD DATA LOCAL INFILE``.

    The statement expects comma-separated fields enclosed in double quotes,
    newline-terminated lines, and skips the first (header) line.

    Args:
        csv_file_path: Path of the CSV file to load.
        table_name: Target table. It is interpolated into the statement
            verbatim, so only pass trusted identifiers.
        conn: Connection object whose ``execute_query`` runs the statement.
    """
    # Escape backslashes and single quotes so the path remains a single SQL
    # string literal (assumes the server's default backslash-escape mode).
    escaped_path = csv_file_path.replace("\\", "\\\\").replace("'", "\\'")
    load_query = f"""
    LOAD DATA LOCAL INFILE '{escaped_path}'
    INTO TABLE {table_name}
    FIELDS TERMINATED BY ','  -- specify the delimiter used in your CSV file
    ENCLOSED BY '"'           -- specify if the fields are enclosed by a specific character
    LINES TERMINATED BY '\n'  -- specify the line terminator
    IGNORE 1 LINES            -- skip the header row if your CSV has a header
    """
    print(f"Start loading csv file from {csv_file_path} into table {table_name}")
    conn.execute_query(load_query)
    print(f"Finish loading csv file from {csv_file_path} into table {table_name}")

# Load the generated customers CSV into the Customer table.
# NOTE(review): the path is absolute and machine-specific; consider deriving it
# from a configurable data directory.
try:
    conn.connect()
    load_csv_to_database("/home/phinguyen/ETL_Pipeline_with_Spark_01/notebooks/data/customers.csv", "Customer", conn)
finally:
    # Errors propagate with their original type and traceback; the connection
    # is closed either way.
    conn.close()

Connection to MySQL database is successful
Start loading csv file from /home/phinguyen/ETL_Pipeline_with_Spark_01/notebooks/data/customers.csv into table Customer
Finish loading csv file from /home/phinguyen/ETL_Pipeline_with_Spark_01/notebooks/data/customers.csv into table Customer
MySQL connection is closed


In [10]:
# Load the generated order headers CSV into the SalesOrderHeader table.
try:
    conn.connect()
    load_csv_to_database("/home/phinguyen/ETL_Pipeline_with_Spark_01/notebooks/data/sales_order_header.csv",
                         "SalesOrderHeader",
                         conn)
finally:
    # Errors propagate with their original type and traceback; the connection
    # is closed either way.
    conn.close()

Connection to MySQL database is successful
Start loading csv file from /home/phinguyen/ETL_Pipeline_with_Spark_01/notebooks/data/sales_order_header.csv into table SalesOrderHeader
Finish loading csv file from /home/phinguyen/ETL_Pipeline_with_Spark_01/notebooks/data/sales_order_header.csv into table SalesOrderHeader
MySQL connection is closed


In [11]:
# Load the generated order details CSV into the SalesOrderDetail table.
try:
    conn.connect()
    load_csv_to_database("/home/phinguyen/ETL_Pipeline_with_Spark_01/notebooks/data/sales_order_detail.csv",
                         "SalesOrderDetail",
                         conn)
finally:
    # Errors propagate with their original type and traceback; the connection
    # is closed either way.
    conn.close()

Connection to MySQL database is successful
Start loading csv file from /home/phinguyen/ETL_Pipeline_with_Spark_01/notebooks/data/sales_order_detail.csv into table SalesOrderDetail
Finish loading csv file from /home/phinguyen/ETL_Pipeline_with_Spark_01/notebooks/data/sales_order_detail.csv into table SalesOrderDetail
MySQL connection is closed


In [12]:
# Look up the highest customer and sales-order IDs currently stored.
try:
    conn.connect()
    max_customer_id_query = "SELECT MAX(CustomerId) FROM Customer"
    max_sales_order_id_query = "SELECT MAX(SalesOrderId) FROM SalesOrderHeader"
    max_customer_id = conn.execute_query(max_customer_id_query)
    max_sales_order_id = conn.execute_query(max_sales_order_id_query)
finally:
    # Errors propagate with their original type and traceback; the connection
    # is closed either way.
    conn.close()

Connection to MySQL database is successful
MySQL connection is closed


In [16]:
# First column of the first result row — presumably execute_query returns a
# sequence of row tuples; confirm against DBConnection.execute_query.
max_customer_id[0][0]

67180