In [1]:
# Set JAVA_HOME environment variable
import os
os.environ['JAVA_HOME'] = r'C:\Program Files\Java\jre1.8.0_451'

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, DoubleType, DateType
from datetime import datetime
import logging

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Data Transformation") \
    .getOrCreate()

In [3]:
# Define paths - use absolute paths to avoid confusion
current_dir = os.path.dirname(os.path.abspath("__file__"))
base_dir = os.path.dirname(current_dir)  # Go up one level to reach the root directory

# Get the latest date from bronze layer
bronze_base_path = os.path.join(base_dir, "output", "bronzeLayer")
silver_base_path = os.path.join(base_dir, "output", "silverLayer")

# Use current date for silver layer
date_str = datetime.now().strftime("%Y-%m-%d")

# Log path organized by date
log_dir = os.path.join(base_dir, "logs", "data_transformation", date_str)
log_path = os.path.join(log_dir, "data_transformation.log")

# Ensure the logs directory exists
os.makedirs(log_dir, exist_ok=True)

# Configure logging to write to the log file
logging.basicConfig(filename=log_path, level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

def log_message(message, level="info"):
    """Logs messages with the specified level."""
    if level == "info":
        logging.info(message)
    elif level == "error":
        logging.error(message)
    print(message)

# Find the latest date folder in bronze layer
import glob
bronze_date_folders = glob.glob(os.path.join(bronze_base_path, "*"))
if not bronze_date_folders:
    raise Exception(f"No date folders found in {bronze_base_path}")

latest_bronze_folder = max(bronze_date_folders)
bronze_path = latest_bronze_folder
silver_path = os.path.join(silver_base_path, date_str)

print(f"Bronze path: {bronze_path}")
print(f"Silver path: {silver_path}")

Bronze path: c:\Users\oussema\OneDrive\Bureau\FSEGN\DW-Odoo\output\bronzeLayer\2025-05-08
Silver path: c:\Users\oussema\OneDrive\Bureau\FSEGN\DW-Odoo\output\silverLayer\2025-05-08


In [None]:
#audit report


In [4]:
# Read data from bronze layer
df = spark.read.parquet(bronze_path)

# Display initial data stats
print(f"Initial row count: {df.count()}")
print(f"Initial columns: {df.columns}")

# Display sample data
display(df.limit(5).toPandas())

Initial row count: 1734
Initial columns: ['Component Status', 'Product', 'Reference', 'Source', 'Responsible', 'Start', 'End', 'Deadline', 'State', 'Quantity Producing', 'Quantity To Produce', 'Total Quantity', 'Product/Cost', 'Product/Sales Price']


Unnamed: 0,Component Status,Product,Reference,Source,Responsible,Start,End,Deadline,State,Quantity Producing,Quantity To Produce,Total Quantity,Product/Cost,Product/Sales Price
0,,[FURN_8855] Drawer,WH/MO/01679,,OdooBot,2024-01-03 01:00:00,2024-02-19 01:00:00,2024-01-25 01:00:00,Cancelled,12.0,17.0,17.0,100.0,110.5
1,,[FURN_8522] Table Top,WH/MO/07299,SO-8299,Default User Template,2024-01-03 01:00:00,2024-02-27 01:00:00,2024-01-22 01:00:00,Cancelled,4.0,13.0,13.0,240.0,380.0
2,Available,[FURN_9666] Table,WH/MO/02264,,Public user,2024-01-06 01:00:00,2024-02-06 01:00:00,2024-03-29 01:00:00,,19.0,19.0,19.0,290.0,520.0
3,Available,[FURN_7023] Wood Panel,WH/MO/03734,,Mitchell Admin,2024-01-06 01:00:00,2024-02-17 01:00:00,2024-03-20 01:00:00,,10.0,10.0,10.0,80.0,100.0
4,Available,[FURN_7023] Wood Panel,WH/MO/03614,SO-4614,Mitchell Admin,2024-01-07 01:00:00,2024-02-03 01:00:00,2024-03-11 01:00:00,,15.0,15.0,15.0,80.0,100.0


In [6]:
#remove unused columns
df = df.drop("Component Status", "Source")

In [13]:
# count Responsible with null values and print the result
print(f"Number of rows with null Responsible: {df.filter(F.col('Responsible').isNull()).count()}")
# check responsible values
df.select("Responsible").distinct().show()

Number of rows with null Responsible: 0
+--------------------+
|         Responsible|
+--------------------+
|Default User Temp...|
|         Public user|
|             OdooBot|
|      Mitchell Admin|
|Portal User Template|
+--------------------+



In [16]:
# Change values in responsible column
df = df.withColumn("Responsible", 
                  F.when(F.col("Responsible") == "OdooBot", "Department 1")
                   .when(F.col("Responsible") == "Mitchell Admin", "Department 2")
                   .when(F.col("Responsible") == "Default User Template", "Department 3")
                   .when(F.col("Responsible") == "Public user", "Department 4")
                   .otherwise(F.col("Responsible"))  # Keep original value for other cases
                  )
# Display sample data
display(df.limit(5).toPandas())

Unnamed: 0,Product,Reference,Responsible,Start,End,Deadline,State,Quantity Producing,Quantity To Produce,Total Quantity,Product/Cost,Product/Sales Price
0,[FURN_8855] Drawer,WH/MO/01679,Department 1,2024-01-03 01:00:00,2024-02-19 01:00:00,2024-01-25 01:00:00,Cancelled,12.0,17.0,17.0,100.0,110.5
1,[FURN_8522] Table Top,WH/MO/07299,Department 3,2024-01-03 01:00:00,2024-02-27 01:00:00,2024-01-22 01:00:00,Cancelled,4.0,13.0,13.0,240.0,380.0
2,[FURN_9666] Table,WH/MO/02264,Department 4,2024-01-06 01:00:00,2024-02-06 01:00:00,2024-03-29 01:00:00,,19.0,19.0,19.0,290.0,520.0
3,[FURN_7023] Wood Panel,WH/MO/03734,Department 2,2024-01-06 01:00:00,2024-02-17 01:00:00,2024-03-20 01:00:00,,10.0,10.0,10.0,80.0,100.0
4,[FURN_7023] Wood Panel,WH/MO/03614,Department 2,2024-01-07 01:00:00,2024-02-03 01:00:00,2024-03-11 01:00:00,,15.0,15.0,15.0,80.0,100.0


In [None]:
# 1. Data Cleaning
# ----------------

# Handle missing values for important columns
df_cleaned = df.na.fill({
    "Quantity To Produce": 0,
    "Quantity Produced": 0,
    "Quantity Producing": 0,
    "Total Quantity": 0,
    "Product/Cost": 0,
    "Product/Sales Price": 0
})

# Display cleaned data
display(df_cleaned.limit(5).toPandas())

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `Quantity Produced` cannot be resolved. Did you mean one of the following? [`Component Status`, `Product`, `Reference`, `Source`, `Responsible`, `Start`, `End`, `Deadline`, `State`, `Quantity Producing`, `Quantity To Produce`, `Total Quantity`, `Product/Cost`, `Product/Sales Price`].

In [None]:
# 2. Feature Engineering
# ----------------------

# Calculate production duration in days
df_transformed = df_cleaned.withColumn(
    "Production_Duration_Days",
    F.when(
        (F.col("End").isNotNull() & F.col("Start").isNotNull()),
        F.datediff(F.col("End"), F.col("Start"))
    ).otherwise(None)
)

# Calculate delay (positive means late, negative means early)
df_transformed = df_transformed.withColumn(
    "Delay_Days",
    F.when(
        (F.col("End").isNotNull() & F.col("Deadline").isNotNull()),
        F.datediff(F.col("End"), F.col("Deadline"))
    ).otherwise(None)
)

# Add a flag for late productions
df_transformed = df_transformed.withColumn(
    "Is_Late",
    F.when(F.col("Delay_Days") > 0, True).otherwise(False)
)

# Calculate production efficiency (Quantity Produced / Quantity To Produce)
df_transformed = df_transformed.withColumn(
    "Production_Efficiency",
    F.when(
        (F.col("Quantity To Produce") > 0),
        F.round(F.col("Quantity Produced") / F.col("Quantity To Produce"), 2)
    ).otherwise(None)
)

# Display transformed data with new features
display(df_transformed.select(
    "Reference", "Product", "Start", "End", "Deadline", 
    "Quantity To Produce", "Quantity Produced",
    "Production_Duration_Days", "Delay_Days", "Is_Late", "Production_Efficiency"
).limit(5).toPandas())

In [None]:
# 3. Add Time Dimensions
# ----------------------

# Add year, month, day, weekday for Start date
df_transformed = df_transformed.withColumn("Start_Year", F.year(F.col("Start")))
df_transformed = df_transformed.withColumn("Start_Month", F.month(F.col("Start")))
df_transformed = df_transformed.withColumn("Start_Day", F.dayofmonth(F.col("Start")))
df_transformed = df_transformed.withColumn("Start_Weekday", F.dayofweek(F.col("Start")))

# Add year, month, day, weekday for End date
df_transformed = df_transformed.withColumn("End_Year", F.year(F.col("End")))
df_transformed = df_transformed.withColumn("End_Month", F.month(F.col("End")))
df_transformed = df_transformed.withColumn("End_Day", F.dayofmonth(F.col("End")))
df_transformed = df_transformed.withColumn("End_Weekday", F.dayofweek(F.col("End")))

# Display data with time dimensions
display(df_transformed.select(
    "Reference", "Start", "Start_Year", "Start_Month", "Start_Day", "Start_Weekday",
    "End", "End_Year", "End_Month", "End_Day", "End_Weekday"
).limit(5).toPandas())

In [None]:
# 4. Categorize Products
# ----------------------

# Create a cost category based on Product/Cost
df_transformed = df_transformed.withColumn(
    "Cost_Category",
    F.when(F.col("Product/Cost") < 50, "Low Cost")
     .when(F.col("Product/Cost") < 100, "Medium Cost")
     .otherwise("High Cost")
)

# Create a profit margin column and category
df_transformed = df_transformed.withColumn(
    "Profit_Margin",
    F.when(
        (F.col("Product/Sales Price") > 0) & (F.col("Product/Cost") > 0),
        F.round((F.col("Product/Sales Price") - F.col("Product/Cost")) / F.col("Product/Cost"), 2)
    ).otherwise(None)
)

df_transformed = df_transformed.withColumn(
    "Profit_Category",
    F.when(F.col("Profit_Margin") < 0.2, "Low Margin")
     .when(F.col("Profit_Margin") < 0.5, "Medium Margin")
     .otherwise("High Margin")
)

# Display categorized data
display(df_transformed.select(
    "Product", "Product/Cost", "Product/Sales Price", 
    "Cost_Category", "Profit_Margin", "Profit_Category"
).limit(5).toPandas())

In [None]:
# 5. Basic Analysis
# ----------------

# Count by State
state_counts = df_transformed.groupBy("State").count().orderBy(F.desc("count"))
display(state_counts.toPandas())

# Average production duration by product
avg_duration = df_transformed.groupBy("Product") \
    .agg(F.avg("Production_Duration_Days").alias("Avg_Duration")) \
    .orderBy(F.desc("Avg_Duration"))
display(avg_duration.limit(10).toPandas())

# Efficiency by responsible person
efficiency_by_responsible = df_transformed.groupBy("Responsible") \
    .agg(
        F.avg("Production_Efficiency").alias("Avg_Efficiency"),
        F.count("*").alias("Production_Count")
    ) \
    .filter(F.col("Production_Count") > 5) \
    .orderBy(F.desc("Avg_Efficiency"))
display(efficiency_by_responsible.toPandas())

# Late production percentage
late_percentage = df_transformed.filter(F.col("End").isNotNull() & F.col("Deadline").isNotNull()) \
    .agg(
        (F.sum(F.when(F.col("Is_Late") == True, 1).otherwise(0)) / F.count("*") * 100).alias("Late_Percentage")
    )
display(late_percentage.toPandas())

In [None]:
# Ensure the silver path exists
os.makedirs(silver_path, exist_ok=True)

# Write to Silver layer in Parquet format
print(f"Writing transformed data to {silver_path}...")
df_transformed.write.mode("overwrite").parquet(silver_path)

# Create a sample of the transformed data for quick analysis
sample_path = os.path.join(silver_path, "sample")
os.makedirs(sample_path, exist_ok=True)
df_transformed.sample(0.1).write.mode("overwrite").parquet(sample_path)
print(f"Created sample data at {sample_path}")

# Stop Spark session
spark.stop()
print("Data transformation process completed.")