# Employee Data

- employee_id
- first_name
- last_name
- phone_num
    - country_code
    - area_code
    - exchange_code
    - subscriber_number
- email_id
- address
    - street_name
    - unit_number
    - city
    - county
    - state
    - zip_code
    - extended_zip_code
        - sector
        - segment

In [23]:
# Import required modules/libraries
from pyspark.sql.types import (StructType, StructField, StringType, IntegerType)
from pyspark.sql import SparkSession, Row
from datetime import date, datetime
from pyspark.sql.functions import col, struct
import random

In [24]:
# Create a spark connection
spark = SparkSession.builder.appName("Employee Analysis").getOrCreate()

In [25]:
#Employee Schema

extended_zip_code_schema = StructType([
    StructField("sector", IntegerType()),  # Example: 54
    StructField("segment", IntegerType()) # Example: 01
])

address_schema = StructType([
    StructField("street_name", StringType(), False),
    StructField("unit_number", StringType()),
    StructField("city", StringType(), False),
    StructField("county", StringType()),
    StructField("state", StringType(), False),
    StructField("zip_code", IntegerType(), False),
    StructField("extended_zip_code", extended_zip_code_schema) # Nesting the previous structure
])

phone_num_schema = StructType([                                 # e.g., 1 (555)-123-4567
    StructField("country_code", IntegerType(), False),          # e.g., "1"
    StructField("area_code", IntegerType(), False),             # e.g., "555"
    StructField("exchange_code", IntegerType(), False),         # e.g., "123"
    StructField("subscriber_number", IntegerType(), False)      # e.g., "4567"
])

employee_schema = StructType([
    StructField("employee_id", IntegerType(), False),
    StructField("dept_id", IntegerType(), False),    
    StructField("first_name", StringType(), False),
    StructField("middle_name", StringType()),
    StructField("last_name", StringType(), False),
    StructField("phone_num", phone_num_schema, False),         # Nested Phone Structure
    StructField("email_id", StringType(), False),
    StructField("address", address_schema, False)              # Nested Address Structure
])

In [26]:
# Department Schema

dept_schema = StructType([
    StructField("dept_id", IntegerType(), False),
    StructField("dept_name", StringType(), False),
    StructField("dept_head_emp_id", IntegerType(), False),
    StructField("loc_address", address_schema, False)
])

In [27]:
# Generate synthetic data

FIRST_NAMES = ["Alice", "Bob", "Charlie", "Diana", "Ethan", "Fiona", "George", "Hannah", "Ivy", "Jack"]
LAST_NAMES = ["Smith", "Jones", "Williams", "Brown", "Davis", "Miller", "Wilson", "Moore", "Taylor", "Anderson"]
CITIES = ["New York", "Chicago", "Boston", "Seattle", "Austin", "Denver"]
STATES = ["NY", "IL", "MA", "WA", "TX", "CO"]
DEPARTMENTS = [
    (10, "Engineering"),
    (20, "Sales"),
    (30, "HR"),
    (40, "Finance"),
    (50, "Marketing"),
]

In [28]:
# Create a list of department IDs that we will assign to employees
DEPT_IDS = [id for id, name in DEPARTMENTS]

In [29]:
def generate_address(city, state):
    """Generates a nested address Row consistent with address_schema."""
    zip_code_base = random.randint(10000, 99999)
    return Row(
        street_name=f"{random.randint(100, 999)} {random.choice(['Oak', 'Pine', 'Main'])} St",
        unit_number=random.choice([None, str(random.randint(1, 200))]),
        city=city,
        county=f"{city} County",
        state=state,
        zip_code=zip_code_base,
        extended_zip_code=Row(
            sector=random.randint(10, 99),
            segment=random.randint(10, 99)
        )
    )

In [30]:
dept_data = []
# Ensure each department gets a location
for dept_id, dept_name in DEPARTMENTS:
    city = random.choice(CITIES)
    state = STATES[CITIES.index(city)]
    
    # Generate a unique employee ID for the department head (ensuring it's not the same as a future employee ID)
    dept_head_emp_id = 1000 + dept_id
    
    dept_data.append(Row(
        dept_id=dept_id,
        dept_name=dept_name,
        dept_head_emp_id=dept_head_emp_id,
        loc_address=generate_address(city, state)
    ))

In [31]:
# Create a test data
employee_data = []
for i in range(1, 101): # 100 employees
    emp_id = 1000 + i
    dept_id = random.choice(DEPT_IDS) # Ensure dept_id is valid
    
    first = random.choice(FIRST_NAMES)
    last = random.choice(LAST_NAMES)
    
    city = random.choice(CITIES)
    state = STATES[CITIES.index(city)]
    
    employee_data.append(Row(
        employee_id=emp_id,
        dept_id=dept_id,
        first_name=first,
        middle_name=random.choice([None, "Xavier", "Yancy"]), # Allow for null middle_name
        last_name=last,
        
        # Nested Phone Structure (Non-Nullable fields generated)
        phone_num=Row(
            country_code=1,
            area_code=random.randint(200, 999),
            exchange_code=random.randint(100, 999),
            subscriber_number=random.randint(1000, 9999)
        ),
        
        email_id=f"{first.lower()}.{last.lower()}@company.com",
        
        # Nested Address Structure (Non-Nullable fields generated)
        address=generate_address(city, state)
    ))

print(employee_data)

[Row(employee_id=1001, dept_id=30, first_name='Diana', middle_name='Xavier', last_name='Taylor', phone_num=Row(country_code=1, area_code=527, exchange_code=318, subscriber_number=9046), email_id='diana.taylor@company.com', address=Row(street_name='843 Pine St', unit_number=None, city='Boston', county='Boston County', state='MA', zip_code=13735, extended_zip_code=Row(sector=91, segment=34))), Row(employee_id=1002, dept_id=50, first_name='Ethan', middle_name='Yancy', last_name='Wilson', phone_num=Row(country_code=1, area_code=319, exchange_code=375, subscriber_number=4104), email_id='ethan.wilson@company.com', address=Row(street_name='694 Pine St', unit_number='4', city='Boston', county='Boston County', state='MA', zip_code=24219, extended_zip_code=Row(sector=90, segment=82))), Row(employee_id=1003, dept_id=40, first_name='George', middle_name='Yancy', last_name='Smith', phone_num=Row(country_code=1, area_code=454, exchange_code=504, subscriber_number=2259), email_id='george.smith@compan

In [32]:
df_employee = spark.createDataFrame(employee_data, schema=employee_schema)

df_employee.show()

+-----------+-------+----------+-----------+---------+-------------------+--------------------+--------------------+
|employee_id|dept_id|first_name|middle_name|last_name|          phone_num|            email_id|             address|
+-----------+-------+----------+-----------+---------+-------------------+--------------------+--------------------+
|       1001|     30|     Diana|     Xavier|   Taylor|{1, 527, 318, 9046}|diana.taylor@comp...|{843 Pine St, NUL...|
|       1002|     50|     Ethan|      Yancy|   Wilson|{1, 319, 375, 4104}|ethan.wilson@comp...|{694 Pine St, 4, ...|
|       1003|     40|    George|      Yancy|    Smith|{1, 454, 504, 2259}|george.smith@comp...|{561 Main St, 25,...|
|       1004|     10|       Bob|      Yancy| Williams|{1, 205, 339, 6275}|bob.williams@comp...|{827 Oak St, 166,...|
|       1005|     40|     Fiona|       NULL|    Davis|{1, 703, 719, 1300}|fiona.davis@compa...|{162 Main St, 24,...|
|       1006|     30|       Bob|      Yancy|    Moore|{1, 773, 8

In [33]:
df_department = spark.createDataFrame(dept_data, schema=dept_schema)

df_department.show()

+-------+-----------+----------------+--------------------+
|dept_id|  dept_name|dept_head_emp_id|         loc_address|
+-------+-----------+----------------+--------------------+
|     10|Engineering|            1010|{732 Oak St, 37, ...|
|     20|      Sales|            1020|{647 Pine St, 139...|
|     30|         HR|            1030|{558 Main St, 60,...|
|     40|    Finance|            1040|{812 Main St, 127...|
|     50|  Marketing|            1050|{236 Oak St, 89, ...|
+-------+-----------+----------------+--------------------+



In [34]:
df_employee.filter(col("employee_id") == 1010).show()

+-----------+-------+----------+-----------+---------+-------------------+--------------------+--------------------+
|employee_id|dept_id|first_name|middle_name|last_name|          phone_num|            email_id|             address|
+-----------+-------+----------+-----------+---------+-------------------+--------------------+--------------------+
|       1010|     10|       Bob|      Yancy|    Davis|{1, 360, 570, 2918}|bob.davis@company...|{367 Pine St, 24,...|
+-----------+-------+----------+-----------+---------+-------------------+--------------------+--------------------+



In [35]:
df_employee.alias("e").join(df_department.alias("d"), on=(col("e.employee_id") == col("d.dept_head_emp_id")), how="inner").select(col("e.employee_id")).show()

+-----------+
|employee_id|
+-----------+
|       1010|
|       1020|
|       1030|
|       1040|
|       1050|
+-----------+



In [39]:
df_employee.write.csv("emp.csv")

UnsupportedOperationException: getSubject is not supported

In [38]:
configs = spark.sparkContext.getConf().getAll()

for key, value in configs:
    # Use the second line if you used spark.conf.getAll() which returns a dictionary:
    # for key, value in configs.items():
    
    # We are specifically looking for file system and security configs
    if 'fs.' in key or 'security' in key or 'impl' in key:
        print(f"** {key}: {value} **")
    else:
        print(f"{key}: {value}")

spark.app.id: local-1767492962386
spark.rdd.compress: True
** spark.hadoop.fs.s3a.vectored.read.min.seek.size: 128K **
spark.app.name: Employee Analysis
spark.app.submitTime: 1767492961148
spark.sql.artifact.isolation.enabled: false
spark.executor.extraJavaOptions: -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-modules=jdk.incubator.vector --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAME

In [36]:
# spark.stop()