In [2]:
# Declare constant paths
INPUT_DIR = "/home/jovyan/output/final_df.parquet"

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder \
                    .appName("SF-Data-Analysis-Work-Sample") \
                    .master("local[*]") \
                    .config("spark.driver.memory", "4g") \
                    .config("spark.executor.memory", "8g") \
                    .getOrCreate()
spark

In [4]:
df = spark.read.parquet(INPUT_DIR).filter(F.col("active_hh") == 1)   # filter to read only active households
df.printSchema()
df.show(truncate=False)

root
 |-- car_id: integer (nullable = true)
 |-- cust_id: integer (nullable = true)
 |-- hh_id: integer (nullable = true)
 |-- active_hh: integer (nullable = true)
 |-- hh_start_date: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- country: string (nullable = true)
 |-- referral_source: string (nullable = true)
 |-- date_of_birth: string (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- employment_type: string (nullable = true)
 |-- income: string (nullable = true)
 |-- status: string (nullable = true)
 |-- model_year: integer (nullable = true)
 |-- make: string (nullable = true)
 |-- body_style: string (nullable = true)
 |-- vehicle_value: double (nullable = true)
 |-- annual_miles_driven: integer (nullable = true)
 |-- business_use: integer (nullable = true)
 |-- antique_vehicle: integer (nullable = true)
 |-- lien: integer (nullable = true)
 |-- lease: integer (nullable = true)
 |-- driver_safety_discou

***<span style="color: orange;">1. Finding average number of cars per household:</span>***

In [5]:
from pyspark.sql import functions as F

average_cars_per_household = (
    df.groupBy("hh_id")                              # Group by household ID
    .agg(F.count("car_id").alias("car_count"))       # Count the number of cars per household
    .agg(F.avg("car_count").alias("avg_car_count"))  # Calculate the average number of cars
)

average_cars_per_household.show()
print(f"Average number of cars per ACTIVE household: {average_cars_per_household.collect()[0][0]}")

+-----------------+
|    avg_car_count|
+-----------------+
|3.808907159335808|
+-----------------+

Average number of cars per ACTIVE household: 3.808907159335808


***<span style="color: orange;">2. Finding count of cars by model year:</span>***

In [6]:
cars_by_model_year = (
                        df.groupBy("model_year")                      # Group by model year
                        .agg(F.count("car_id").alias("car_count"))  # Count the number of cars per model_year
                        .orderBy("model_year")                      # Order by model year
                        .collect()
                    )

for item in cars_by_model_year:
    print(f"model_year: {item[0]}, count: {item[1]}")

model_year: 1952, count: 2025
model_year: 1953, count: 2022
model_year: 1954, count: 2069
model_year: 1955, count: 2059
model_year: 1956, count: 2007
model_year: 1957, count: 2013
model_year: 1958, count: 2078
model_year: 1959, count: 2008
model_year: 1960, count: 2017
model_year: 1961, count: 2012
model_year: 1962, count: 2034
model_year: 1963, count: 1995
model_year: 1964, count: 2008
model_year: 1965, count: 2018
model_year: 1966, count: 1966
model_year: 1967, count: 2040
model_year: 1968, count: 1945
model_year: 1969, count: 2003
model_year: 1970, count: 2025
model_year: 1971, count: 2039
model_year: 1972, count: 2038
model_year: 1973, count: 1967
model_year: 1974, count: 1977
model_year: 1975, count: 2047
model_year: 1976, count: 2069
model_year: 1977, count: 1993
model_year: 1978, count: 2026
model_year: 1979, count: 2009
model_year: 1980, count: 1898
model_year: 1981, count: 2046
model_year: 1982, count: 2088
model_year: 1983, count: 1993
model_year: 1984, count: 2026
model_year

<u style="color: green;">***OBSERVATION:***</u> &nbsp;

- **The number of cars produced per year between 1952 and 2001 remains consistent, averaging around 2,000 cars annually, with no major fluctuations.**
- **Starting from the year 2002, there is a significant spike in car counts, reaching around 13,000 cars annually. This trend continues steadily up to 2024.**
- **The highest production volume is observed in the year 2021, with a count of 13,248 cars.**

**This suggests a sharp increase in car production starting in the early 2000s, continuing consistently through the 2020s.**

***<span style="color: orange;">3. Finding count of cars by make:</span>***

In [7]:
cars_by_make = ( df.groupBy("make")
                    .agg({"car_id": "count"})
                    .orderBy("count(car_id)", ascending=False)
                    .collect()
                )
                    
for item in cars_by_make:
    print(f"make: {item[0]}, count: {item[1]}")

make: Manufacturer3, count: 80293
make: Manufacturer2, count: 80235
make: Manufacturer1, count: 80166
make: Manufacturer6, count: 52369
make: Manufacturer7, count: 52338
make: Manufacturer5, count: 27929
make: Manufacturer4, count: 27866


<u style="color: green;">**OBSERVATION:**</u> &nbsp;

- **Manufacturer3 has the highest count of cars, with 80,293 vehicles.**
- **Manufacturer2 and Manufacturer1 follow closely behind, with 80,235 and 80,166 cars, respectively.**
- **Manufacturer6 and Manufacturer7 have moderate car counts, with 52,369 and 52,338 vehicles, respectively.**
- **Manufacturer5 and Manufacturer4 have the lowest counts among the group, with 27,929 and 27,866 cars, respectively.**

**This suggests that Manufacturers 1, 2, and 3 dominate in terms of car production, while Manufacturers 5 and 4 have significantly smaller outputs.**

***<span style="color: orange;">4. Finding the safest cars:</span>***

**There could be 2 definitions of "safe car" in this context:**  
  
*&nbsp;1. Intrinsic safety of the car, where a car is defined by (model_year, make, body_style)*  
*&nbsp;2. Safety of each individual car viewed holistically from an insurance standpoint (includes safety rating of car and the driver's track record)*

In [8]:
from pyspark.sql import functions as F

# Query to identify the safest cars based on average vehicle safety discount, with a minimum threshold of 0.75.
intrinsic_safety = (
    df.groupBy("model_year", "make", "body_style")
    .agg(
        F.count("car_id").alias("car_count"),
        F.avg("vehicle_safety_discount").alias("avg_vehicle_safety_discount")
    )
)

# Total count of cars after the groupBy operation
total_cars_count = intrinsic_safety.count()
print(f"Total Count of Cars: {total_cars_count}\n")
print("-"*20)

# Filter for safety discount >= 0.75
filtered_intrinsic_safety = intrinsic_safety.filter(F.col("avg_vehicle_safety_discount") >= 0.75) 
# Order by average vehicle safety discount
filtered_intrinsic_safety = filtered_intrinsic_safety.orderBy("avg_vehicle_safety_discount", ascending=False)

# Overall count of distinct cars that meet the safety criteria
distinct_cars_count = filtered_intrinsic_safety.count()
print(f"Total Count of Distinct Cars with Average Vehicle Safety Discount >= 0.75: {distinct_cars_count}\n")
print("-"*20)
print("DETAILS:")
print("-"*20)
filtered_intrinsic_safety.show(distinct_cars_count)


Total Count of Cars: 2044

--------------------
Total Count of Distinct Cars with Average Vehicle Safety Discount >= 0.75: 27

--------------------
DETAILS:
--------------------
+----------+-------------+----------+---------+---------------------------+
|model_year|         make|body_style|car_count|avg_vehicle_safety_discount|
+----------+-------------+----------+---------+---------------------------+
|      1998|Manufacturer4|     Truck|        1|                        1.0|
|      2023|Manufacturer7|     Truck|      114|         0.8333333333333334|
|      2000|Manufacturer5|     Truck|        6|         0.8333333333333334|
|      2021|Manufacturer7|     Truck|      129|         0.7984496124031008|
|      2024|Manufacturer6|    2 door|      509|          0.793713163064833|
|      2023|Manufacturer6|    4 door|      532|         0.7875939849624061|
|      2020|Manufacturer6|     Truck|      128|                    0.78125|
|      2024|Manufacturer6|     Truck|      128|               

<u style="color: green;">**OBSERVATION:**</u> &nbsp;

- **Out of 2044 cars in the active household, 27 cars are the safest with an avg_vehicle_safety_discount over 0.75**

In [9]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from datetime import datetime

# Exclude records where annual_miles_driven = 0, and ensure both discounts are enabled
filtered_df = (df.filter((F.col("annual_miles_driven") > 0) & 
                         (F.col("driver_safety_discount") == 1) & 
                         (F.col("vehicle_safety_discount") == 1)))

# Perform aggregation and calculate claims per mile
safety_indicators = (filtered_df.groupBy("car_id", "model_year", "make", "body_style")
                     .agg(
                         F.sum("claim_payout").alias("total_claim_payout"),                      # Sum of claim payouts
                         F.avg("vehicle_safety_discount").alias("avg_vehicle_safety_discount"),  # Avg vehicle safety discount
                         F.avg("driver_safety_discount").alias("avg_driver_safety_discount"),    # Avg driver safety discount
                         F.sum("annual_miles_driven").alias("total_annual_miles_driven")         # Sum of annual miles driven
                     )
                     .withColumn("risk_exposure_per_mile", 
                                        F.col("total_claim_payout") / F.col("total_annual_miles_driven"))
                     .orderBy("risk_exposure_per_mile", "total_annual_miles_driven")
                    )

# Get total record count
record_count = safety_indicators.count()
print(f"Total record count: {record_count}")

# Calculate top 10% of records
top_10_percent_count = int(record_count * 0.1)

# Get the top 10% of records based on claims_per_mile
top_10_percent_records = safety_indicators.limit(top_10_percent_count)

# Show the top 10% of records with the required columns
top_10_percent_records.select("car_id", "model_year", "make", "body_style", 
                              "avg_vehicle_safety_discount", "avg_driver_safety_discount", 
                              "total_claim_payout", "total_annual_miles_driven", 
                              "risk_exposure_per_mile").show(truncate=False)

Total record count: 34169
+------+----------+-------------+----------+---------------------------+--------------------------+------------------+-------------------------+----------------------+
|car_id|model_year|make         |body_style|avg_vehicle_safety_discount|avg_driver_safety_discount|total_claim_payout|total_annual_miles_driven|risk_exposure_per_mile|
+------+----------+-------------+----------+---------------------------+--------------------------+------------------+-------------------------+----------------------+
|614639|1989      |Manufacturer3|SUV       |1.0                        |1.0                       |0                 |2                        |0.0                   |
|904603|2011      |Manufacturer7|2 door    |1.0                        |1.0                       |0                 |2                        |0.0                   |
|422559|2024      |Manufacturer4|SUV       |1.0                        |1.0                       |0                 |2               

<u style="color: red;">**NOTE:**</u> &nbsp;

- **The top 10% of cars with the lowest risk exposure per mile are identified as the safest cars.**
- **The number of safest cars is determined by the value of 'n' in the top n% of cars with the least risk exposure per mile.**
- **The logic based on risk exposure per mile is essential, as average vehicle safety discount and average driver safety discount alone are insufficient to determine the safest cars.**
- **For instance, consider the below record where both average vehicle safety discount and average driver safety discount are 1, yet the risk exposure per mile exceeds $3,100.**

In [10]:
last_record = safety_indicators.orderBy(F.col("risk_exposure_per_mile").desc(), F.col("total_annual_miles_driven").desc()).limit(1)

# Step 4: Show the last record
last_record.select("car_id", "model_year", "make", "body_style", 
                   "avg_vehicle_safety_discount", "avg_driver_safety_discount", 
                   "total_claim_payout", "total_annual_miles_driven", 
                   "risk_exposure_per_mile").show(truncate=False)

+------+----------+-------------+----------+---------------------------+--------------------------+------------------+-------------------------+----------------------+
|car_id|model_year|make         |body_style|avg_vehicle_safety_discount|avg_driver_safety_discount|total_claim_payout|total_annual_miles_driven|risk_exposure_per_mile|
+------+----------+-------------+----------+---------------------------+--------------------------+------------------+-------------------------+----------------------+
|170245|2013      |Manufacturer3|4 door    |1.0                        |1.0                       |990176            |317                      |3123.583596214511     |
+------+----------+-------------+----------+---------------------------+--------------------------+------------------+-------------------------+----------------------+



***<span style="color: orange;">5. Finding states that have the largest households (defined as number of customers in a household):</span>***

In [11]:
from pyspark.sql import functions as F

# Group by hh_id to aggregate household data
household_data = (df.groupBy("hh_id")  
                  .agg(
                      F.countDistinct("cust_id").alias("household_size"),  # Count distinct cust_id for household size
                      F.first("state").alias("state")                      # Retain the state (first occurrence)
                  )
                 )

# Group by state and calculate average household size
state_average_household_size = (household_data.groupBy("state")
                                 .agg(F.avg("household_size").alias("average_household_size"))
                                 .orderBy(F.desc("average_household_size")))

state_average_household_size.show(50)

+-----+----------------------+
|state|average_household_size|
+-----+----------------------+
|   AR|    3.8767441860465115|
|   MA|     3.861652739090065|
|   NE|    3.8559322033898304|
|   MD|    3.8551724137931034|
|   WA|     3.853885528152629|
|   ME|     3.853099072718399|
|   HI|    3.8526513374002818|
|   AL|     3.849606663581675|
|   MI|     3.849462365591398|
|   NV|    3.8488372093023258|
|   VA|    3.8472086190009795|
|   IA|    3.8444553483807655|
|   KS|    3.8362745098039217|
|   AK|     3.833788706739526|
|   UT|     3.832289156626506|
|   ND|    3.8296438883541866|
|   IN|     3.829003306565895|
|   GA|    3.8277959756668225|
|   SC|    3.8128681468056187|
|   MO|     3.811294117647059|
|   TN|      3.81115017579106|
|   ID|    3.8103870651641354|
|   VT|    3.8091942659416707|
|   IL|      3.80863309352518|
|   MT|    3.8058766859344892|
|   NY|    3.8053097345132745|
|   KY|    3.8037718904355637|
|   LA|    3.8008827856792546|
|   MN|     3.800182898948331|
|   WY| 

<u style="color: green;">**OBSERVATION:**</u> &nbsp;

- **Arkansas (AR) has the largest average household size, with approximately 3.88 customers per household.**
- **Following closely are Massachusetts (MA) and Nebraska (NE), both with average household sizes around 3.86.**
- **Overall, there isn't a significant difference in average household sizes across these states, indicating a relatively uniform pattern in household composition.**

***<span style="color: orange;">6. Finding number of active households as of 1/1/2021:</span>***

In [14]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col

total_households = df.select("hh_id").distinct().count()
print(f"Total Count of Households: {total_households}")

active_households = (df.filter(F.to_date(col("hh_start_date"), "M/d/yy") <= F.to_date(F.lit("1/1/21"), "M/d/yy")) 
                     .select("hh_id") 
                     .distinct() 
                     .count())

print(f"Active households as of 1/1/2021: {active_households}")

Total Count of Households: 105331
Active households as of 1/1/2021: 63279


<u style="color: green;">**OBSERVATION:**</u> &nbsp;

- **Out of 105331 active households, 63279 of them were active as of 1/1/2021.**

***<span style="color: orange;">7. Finding average age of customers:</span>***

In [36]:
from pyspark.sql import functions as F
from datetime import datetime

# UDF to parse date_of_birth and handle the two-digit years in M/d/yy format
def parse_dob(dob):
    if not dob:
        return None
    
    # Split the date string and handle the parsing
    try:
        month, day, year = map(int, dob.split('/'))
        current_year = datetime.now().year
        cutoff_year = current_year % 100             # Get the last two digits of the current year

        # Determine the full year based on the logic for two-digit years
        if year >= 0 and year <= cutoff_year:        # 00 to cutoff_year => 2000-current_year
            year += 2000
        else:                                        # (cutoff_year+1) to 99 => 1925-1999
            year += 1900
        
        return f"{year}-{month:02}-{day:02}"
    except Exception:
        return None

parse_dob_udf = F.udf(parse_dob)
df_with_dob = df.withColumn("birth_date", parse_dob_udf(F.col("date_of_birth")))

# Show sample birth dates and parsed birth dates
df_with_dob.select("date_of_birth", "birth_date").show(truncate=False)

# Calculate age based on birth_date
current_date = datetime.now()
df_with_age = df_with_dob.withColumn("age", F.year(F.lit(current_date)) - F.year(F.col("birth_date")))

print(f"The average age of customers is: {average_age:.2f}")

+-------------+----------+
|date_of_birth|birth_date|
+-------------+----------+
|11/22/01     |2001-11-22|
|9/17/25      |1925-09-17|
|5/28/92      |1992-05-28|
|11/16/89     |1989-11-16|
|7/17/31      |1931-07-17|
|3/9/52       |1952-03-09|
|7/4/93       |1993-07-04|
|10/10/31     |1931-10-10|
|8/19/61      |1961-08-19|
|6/28/35      |1935-06-28|
|2/6/92       |1992-02-06|
|5/2/25       |1925-05-02|
|9/15/88      |1988-09-15|
|6/10/80      |1980-06-10|
|1/2/99       |1999-01-02|
|7/1/63       |1963-07-01|
|4/5/74       |1974-04-05|
|7/2/05       |2005-07-02|
|8/16/05      |2005-08-16|
|5/24/96      |1996-05-24|
+-------------+----------+
only showing top 20 rows

The average age of customers is: 49.86


***<span style="color: orange;">8. Variations in age of customers by region:</span>***

In [43]:
age_by_region = df_with_age.groupBy("state").agg(
    F.avg("age").alias("average_age"),
    F.min("age").alias("min_age"),
    F.max("age").alias("max_age"),
    F.count("*").alias("count")
)

age_by_region.orderBy(F.desc("average_age")).show(50) # show for all 50 states

+-----+------------------+-------+-------+-----+
|state|       average_age|min_age|max_age|count|
+-----+------------------+-------+-------+-----+
|   AZ| 50.65067466266866|      0|     99| 8004|
|   ME| 50.56493095147599|      0|     99| 7893|
|   KS| 50.40227534194043|      0|     99| 7823|
|   CO| 50.30515683147262|      0|     99| 7524|
|   IL| 50.27376904671956|      0|     99| 7941|
|   PA| 50.24231724859422|      0|     99| 7647|
|   DE| 50.21062764728533|      0|     99| 7791|
|   AR| 50.15776844631074|      0|     99| 8335|
|   FL| 50.15564738292011|      0|     99| 7986|
|   HI|50.051522533495735|      0|     99| 8210|
|   CT| 50.04209507681142|      0|     99| 8267|
|   MD| 50.04100664282064|      0|     99| 7828|
|   MS| 50.00397268777157|      0|     99| 8055|
|   CA| 50.00353892821031|      0|     99| 7912|
|   TN| 49.96718502899315|      0|     99| 7588|
|   IN| 49.96669133974834|      0|     99| 8106|
|   UT| 49.96554325955734|      0|     99| 7952|
|   NM|49.9653987730

<u style="color: green;">**OBSERVATION:**</u> &nbsp;

- **Arizona (AZ) has the highest average age at approximately 50.65 years, indicating a potentially aging population.**
- **States like Maine (ME), Kansas (KS), and Colorado (CO) also show average ages above 50 years, suggesting similar demographic trends.**
- **Conversely, states like Texas (TX) and Louisiana (LA) have lower average ages (around 49.32 to 49.30 years), reflecting a younger population in these regions.**
- **The minimum age recorded across all states is 0, while the maximum age reaches up to 99 years, showing a significant range in age distribution.**
- **The average ages across states tend to cluster closely.**

***<span style="color: orange;">9. Finding age group with the most expensive claims:</span>***

In [48]:
from pyspark.sql import functions as F

# Defining age groups: (0-18, 19-35, 36-50, 51-65, 66-80, 80+)
df_with_age_group = df_with_age.withColumn(
    "age_group",
    F.when(F.col("age") <= 18, "0-18")
    .when((F.col("age") > 18) & (F.col("age") <= 35), "19-35")
    .when((F.col("age") > 35) & (F.col("age") <= 50), "36-50")
    .when((F.col("age") > 50) & (F.col("age") <= 65), "51-65")
    .when((F.col("age") > 65) & (F.col("age") <= 80), "66-80")
    .otherwise("80+")
)

# Calculate average claim amount per age group
avg_claim_by_age_group = df_with_age_group.groupBy("age_group").agg(
    F.avg("claim_payout").alias("average_claim_amount"),
    F.count("claim_payout").alias("claim_count")
)

avg_claim_by_age_group.orderBy(F.desc("average_claim_amount")).show()

# Find the age group with the most expensive claims
most_expensive_age_group = avg_claim_by_age_group.orderBy(F.desc("average_claim_amount")).first()

if most_expensive_age_group:
    print(f"Age group with the most expensive claims: {most_expensive_age_group['age_group']}")
    print(f"Average claim amount: {most_expensive_age_group['average_claim_amount']:.2f}")

+---------+--------------------+-----------+
|age_group|average_claim_amount|claim_count|
+---------+--------------------+-----------+
|    51-65|   1404.081490853279|      48214|
|    19-35|   1254.098253557568|     123680|
|     0-18|   1199.234251686038|      27876|
|    66-80|  1187.6606105978162|      54766|
|      80+|  1178.7362523851198|      69703|
|    36-50|  1159.3664643892043|      76957|
+---------+--------------------+-----------+

Age group with the most expensive claims: 51-65
Average claim amount: 1404.08


<u style="color: green;">**OBSERVATION:**</u> &nbsp;

- **While the 51-65 age group has the highest average claim amount at $1404.08, they have a relatively low claim count (48,214) compared to other age groups.**
- **The 19-35 age group has a significantly higher claim count (123,680) but a lower average claim amount of $1254.10.**
- **This suggests that although fewer claims are made by the 51-65 age group, their claims tend to be more expensive.**
- **Conversely, the 19-35 age group files a large number of claims, but these tend to be less costly.**