In [0]:
%run "../source_bronze/source_bronze_utils"

In [0]:
## CREATE TABLE

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

# Target schema for the employee dimension. Column names are snake_case and
# every field is declared nullable (third StructField argument is True).
employee_schema = StructType([
    StructField("employee_id", IntegerType(), True),
    StructField("employee_name", StringType(), True),
    StructField("department", StringType(), True),
    StructField("country", StringType(), True),
    StructField("salary", IntegerType(), True),
    StructField("age", IntegerType(), True),
    StructField("load_date", DateType(), True)
])

# Zero-row DataFrame that carries only the schema; it is used to lay down the
# initial (empty) Delta files for the table.
df_employee = spark.createDataFrame([], schema=employee_schema)

# Database, table name, and storage location of the table.
db_name = "employee_info_abu"
table_name = "dim_employee"
table_path = f"/FileStore/silver/employee/{db_name}/{table_name}"

# Create the database (schema) if it does not exist yet.
spark.sql(f"CREATE schema IF NOT EXISTS {db_name}")

# Create the table only if it is not registered in the catalog yet.
# NOTE(review): `_jsparkSession` is a private handle; if the runtime's PySpark
# exposes `spark.catalog.tableExists`, that public call could replace it.
if not spark._jsparkSession.catalog().tableExists(f"{db_name}.{table_name}"):
    # Write the empty Delta files first, then register a table over that path.
    df_employee.write.format("delta").mode("overwrite").save(table_path)
    spark.sql(f"""
        CREATE TABLE {db_name}.{table_name}
        USING DELTA
        LOCATION '{table_path}'
    """)
    # Report creation only when it actually happened. Previously this message
    # was printed unconditionally, claiming the table was created even on
    # re-runs where the guard skipped the work.
    print(f"✅ Empty table `{db_name}.{table_name}` created at {table_path}")


✅ Empty table `employee_info_abu.dim_employee` created at /FileStore/silver/employee/employee_info_abu/dim_employee


In [0]:
import re
from pyspark.sql.functions import col, current_date
from pyspark.sql.types import IntegerType, StringType

# Function to convert camelCase to snake_case
def camel_to_snake(name):
    """Convert a camelCase / PascalCase identifier to snake_case.

    Two regex passes insert underscores, then the result is lower-cased:
      1. before a capitalised word (an upper-case letter followed by
         lower-case letters) that is preceded by any character;
      2. between a lower-case letter or digit and a following upper-case letter.

    Args:
        name: The identifier to convert.

    Returns:
        The lower-cased identifier with underscores at the word boundaries.
    """
    boundary_patterns = (
        r'(.)([A-Z][a-z]+)',
        r'([a-z0-9])([A-Z])',
    )
    snake = name
    for pattern in boundary_patterns:
        snake = re.sub(pattern, r'\1_\2', snake)
    return snake.lower()

def convert_column_names_to_snake_case(df):
    """Return ``df`` with every column renamed through ``camel_to_snake``.

    Args:
        df: A DataFrame exposing ``columns`` and ``withColumnRenamed``.

    Returns:
        A DataFrame in which each original column name has been replaced by
        its snake_case form.
    """
    renamed = df
    # Iterate over the ORIGINAL column list; each step renames one column.
    for original_name in df.columns:
        renamed = renamed.withColumnRenamed(original_name, camel_to_snake(original_name))
    return renamed

# 1. Load the bronze employee CSV, using the first row as the header.
df_employee = (
    spark.read
    .option("header", True)
    .csv("/FileStore/source_to_bronze/employee")
)

# 2. Normalise the column names to snake_case.
df_employee = convert_column_names_to_snake_case(df_employee)

# 3. Show the resulting column names as a sanity check.
print("✅ Columns after rename:", df_employee.columns)

# 4. Project the columns in target-table order, casting each to its target type.
employee_column_types = {
    "employee_id": IntegerType(),
    "employee_name": StringType(),
    "department": StringType(),
    "country": StringType(),
    "salary": IntegerType(),
    "age": IntegerType(),
}
df_employee = df_employee.select(
    *[col(column_name).cast(column_type)
      for column_name, column_type in employee_column_types.items()]
)

# 5. Stamp the load date and keep a single row per employee_id
#    (which of several duplicate rows survives is not controlled here).
df_employee = (
    df_employee
    .withColumn("load_date", current_date())
    .dropDuplicates(["employee_id"])
)

# 6. Overwrite the contents of the Delta table with this load.
(
    df_employee.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("employee_info_abu.dim_employee")
)


✅ Columns after rename: ['employee_id', 'employee_name', 'department', 'country', 'salary', 'age']


In [0]:
%sql
-- Show every row and column of the employee dimension table.
SELECT *
FROM employee_info_abu.dim_employee

employee_id,employee_name,department,country,salary,age,load_date
1,James,D101,IN,9000,25,2025-04-19
2,Michel,D102,SA,8000,26,2025-04-19
3,James son,D101,IN,10000,35,2025-04-19
4,Robert,D103,MY,11000,34,2025-04-19
5,Scott,D104,MA,6000,36,2025-04-19
6,Gen,D105,JA,21345,24,2025-04-19
7,John,D102,MY,87654,40,2025-04-19
8,Maria,D105,SA,38144,38,2025-04-19
9,Soffy,D103,IN,23456,29,2025-04-19
10,Amy,D103,CN,21345,24,2025-04-19


In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
# Target schema for the department dimension; all three fields are nullable.
depart_schema = (
    StructType()
    .add("department_id", StringType(), True)
    .add("department_name", StringType(), True)
    .add("load_date", DateType(), True)
)

# Zero-row DataFrame carrying only the schema.
df_depart = spark.createDataFrame([], schema=depart_schema)

# Database, table name, and storage location of the table.
db_name, table_name = "employee_info_abu", "dim_depart"
table_path = f"/FileStore/silver/department/{db_name}/{table_name}"

# Make sure the database (schema) exists.
spark.sql(f"CREATE schema IF NOT EXISTS {db_name}")

# Only create the table when the catalog does not know it yet.
table_is_registered = spark._jsparkSession.catalog().tableExists(f"{db_name}.{table_name}")
if not table_is_registered:
    # Lay down the empty Delta files, then register a table over that path.
    (
        df_depart.write
        .format("delta")
        .mode("overwrite")
        .save(table_path)
    )
    spark.sql(f"""
        CREATE TABLE {db_name}.{table_name}
        USING DELTA
        LOCATION '{table_path}'
    """)
    print(f"✅ Empty table `{db_name}.{table_name}` created at {table_path}")




✅ Empty table `employee_info_abu.dim_depart` created at /FileStore/silver/department/employee_info_abu/dim_depart


In [0]:
import re
from pyspark.sql.functions import col, current_date
from pyspark.sql.types import IntegerType, StringType

# Function to convert camelCase to snake_case
def camel_to_snake(name):
    """Turn a camelCase / PascalCase name into its snake_case form.

    An underscore is inserted before each capitalised word that follows
    another character, then between any lower-case letter or digit and the
    upper-case letter after it; finally the whole string is lower-cased.

    Args:
        name: The identifier to convert.

    Returns:
        The snake_case version of ``name``.
    """
    words_split = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', words_split).lower()

def convert_column_names_to_snake_case(df):
    """Rename all columns of ``df`` to snake_case.

    Args:
        df: A DataFrame exposing ``columns`` and ``withColumnRenamed``.

    Returns:
        The DataFrame obtained after renaming each original column to the
        value ``camel_to_snake`` produces for it.
    """
    # Pair every existing column name with its snake_case replacement up front.
    rename_pairs = [(existing, camel_to_snake(existing)) for existing in df.columns]
    for existing, snake in rename_pairs:
        df = df.withColumnRenamed(existing, snake)
    return df

# 1. Load the bronze department CSV, using the first row as the header.
df_depart = (
    spark.read
    .option("header", True)
    .csv("/FileStore/source_to_bronze/department_df")
)

# 2. Normalise the column names to snake_case.
df_depart = convert_column_names_to_snake_case(df_depart)

# 3. Show the resulting column names as a sanity check.
print("✅ Columns after rename:", df_depart.columns)

# 4. Project the columns in target-table order, casting each to its target type.
depart_column_types = {
    "department_id": StringType(),
    "department_name": StringType(),
}
df_depart = df_depart.select(
    *[col(column_name).cast(column_type)
      for column_name, column_type in depart_column_types.items()]
)

# 5. Stamp the load date (no de-duplication is performed for this table).
df_depart = df_depart.withColumn("load_date", current_date())

# 6. Overwrite the contents of the Delta table with this load.
(
    df_depart.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable("employee_info_abu.dim_depart")
)

✅ Columns after rename: ['department_id', 'department_name']


In [0]:
%sql
-- Show every row and column of the department dimension table.
SELECT *
FROM employee_info_abu.dim_depart

department_id,department_name,load_date
D101,Sales,2025-04-19
D102,Marketing,2025-04-19
D103,Finance,2025-04-19
D104,Support,2025-04-19
D105,HR,2025-04-19


In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
# Target schema for the country dimension; all three fields are nullable.
country_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("country_code", StringType()),
        ("country_name", StringType()),
        ("load_date", DateType()),
    )
])

# Zero-row DataFrame carrying only the schema.
df_country = spark.createDataFrame([], schema=country_schema)

# Database, table name, and storage location of the table.
db_name, table_name = "employee_info_abu", "dim_country"
table_path = f"/FileStore/silver/country/{db_name}/{table_name}"

# Make sure the database (schema) exists.
spark.sql(f"CREATE schema IF NOT EXISTS {db_name}")

# Only create the table when the catalog does not know it yet.
table_is_registered = spark._jsparkSession.catalog().tableExists(f"{db_name}.{table_name}")
if not table_is_registered:
    # Lay down the empty Delta files, then register a table over that path.
    (
        df_country.write
        .format("delta")
        .mode("overwrite")
        .option("mergeSchema", "true")
        .save(table_path)
    )
    spark.sql(f"""
        CREATE TABLE {db_name}.{table_name}
        USING DELTA
        LOCATION '{table_path}'
    """)
    print(f"✅ Empty table `{db_name}.{table_name}` created at {table_path}")

✅ Empty table `employee_info_abu.dim_country` created at /FileStore/silver/country/employee_info_abu/dim_country


In [0]:
import re
from pyspark.sql.functions import col
from pyspark.sql.types import StringType

# Function to convert camelCase to snake_case
def camel_to_snake(name):
    """Convert a camelCase / PascalCase identifier to snake_case.

    Uses the same two passes as the other ``camel_to_snake`` definitions in
    this notebook, so all three behave identically:
      1. insert an underscore before a capitalised word (upper-case letter
         followed by lower-case letters) that is preceded by any character,
         which splits an acronym prefix from the word after it
         (``HTTPResponse`` -> ``HTTP_Response``);
      2. insert an underscore between a lower-case letter or digit and the
         upper-case letter that follows it.
    The result is then lower-cased.

    Args:
        name: The identifier to convert.

    Returns:
        The snake_case version of ``name``.
    """
    # Pass 1 was missing from this copy, so acronym-prefixed names were
    # lower-cased without any separator.
    name = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', name)
    name = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', name)  # lowercase/digit followed by uppercase
    return name.lower()

def convert_column_names_to_snake_case(df):
    """Apply ``camel_to_snake`` to every column name of ``df``.

    Args:
        df: A DataFrame exposing ``columns`` and ``withColumnRenamed``.

    Returns:
        The DataFrame with each original column renamed to its snake_case form.
    """
    source_columns = df.columns
    snake_columns = map(camel_to_snake, source_columns)
    for source, target in zip(source_columns, snake_columns):
        df = df.withColumnRenamed(source, target)
    return df

# current_date is imported here because this cell's own import list does not
# include it, and the load below needs it to populate load_date.
from pyspark.sql.functions import current_date

# 1. Read source data (first CSV row is the header)
df_country = spark.read.option("header", True).csv("/FileStore/source_to_bronze/country_df")

# 2. Convert column names to snake_case (renaming camelCase columns)
df_country = convert_column_names_to_snake_case(df_country)
# Explicit renames for the un-separated spellings; presumably a fallback for
# headers that arrive all lower-case, which the regex conversion cannot split.
df_country = df_country.withColumnRenamed("countrycode", "country_code") \
                       .withColumnRenamed("countryname", "country_name")
# 3. Check the new column names
print("✅ Columns after rename:", df_country.columns)

# 4. Cast to match target table schema (now that the columns are in snake_case)
df_country = df_country.select(
    col("country_code").cast(StringType()),
    col("country_name").cast(StringType())
)

# 5. Show the source columns (optional, for debugging)
df_country.show()

# 6. Stamp the load date. The dim_country schema declares a load_date column
#    and the employee and department loads both populate theirs; this load
#    previously wrote no value for it.
df_country = df_country.withColumn("load_date", current_date())

# 7. Overwrite the contents of the Delta table with this load
df_country.write.format("delta").mode("overwrite").option("mergeSchema","true").saveAsTable("employee_info_abu.dim_country")


✅ Columns after rename: ['country_code', 'country_name']
+------------+------------+
|country_code|country_name|
+------------+------------+
|          CN|       China|
|          IN|       India|
|          SA|South Africa|
|          JA|       Japan|
|          MY|    Malaysia|
|          MA|     Morocco|
+------------+------------+



In [0]:
%sql
-- Show every row and column of the country dimension table.
SELECT *
FROM employee_info_abu.dim_country

