In [0]:
# ==============================================================================
# CELL 1: SETUP, LOAD, AND PREPARE USED CARS DATA (Corrected for "null" strings)
# ==============================================================================

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, when, split

# Initialize Spark Session
spark = SparkSession.builder.appName("UsedCarAnalytics").getOrCreate()

# Load the data from the Databricks table you created.
table_name = "workspace.default.used_cars_data" 
df_raw = spark.table(table_name)

# --- Feature Engineering & Cleaning ---
print("Cleaning and transforming data...")

# 1. Clean string columns to extract numeric values
#    We add a 'when' condition to handle the literal string "null" before casting.
df_cleaned = df_raw.withColumn("Mileage_temp", regexp_replace(col("Mileage"), r"(\s*kmpl|\s*km/kg)", "")) \
                   .withColumn("Engine_temp", regexp_replace(col("Engine"), r"\s*CC", "")) \
                   .withColumn("Power_temp", regexp_replace(col("Power"), r"\s*bhp", ""))

# 2. Convert string "null" to actual NULL values
df_cleaned = df_cleaned.withColumn("Mileage_num", when(col("Mileage_temp") == "null", None).otherwise(col("Mileage_temp")).cast("double")) \
                       .withColumn("Engine_num", when(col("Engine_temp") == "null", None).otherwise(col("Engine_temp")).cast("double")) \
                       .withColumn("Power_num", when(col("Power_temp") == "null", None).otherwise(col("Power_temp")).cast("double"))

# 3. Extract the car's brand name from the 'Name' column
df_cleaned = df_cleaned.withColumn("Brand", split(col("Name"), " ")[0])

# 4. Handle null values. This will now correctly drop rows where 'Power' was "null bhp".
cols_to_check = ["Kilometers_Driven", "Mileage_num", "Engine_num", "Power_num", "Seats", "Price"]
df_cleaned = df_cleaned.na.drop(subset=cols_to_check)

# 5. Select the final columns we will work with.
final_df = df_cleaned.select(
    "Brand", "Location", "Year", "Kilometers_Driven", "Fuel_Type", "Transmission", 
    "Owner_Type", "Seats",
    col("Mileage_num").alias("Mileage"),
    col("Engine_num").alias("Engine"),
    col("Power_num").alias("Power"),
    col("Price").cast("double") # Ensure our target variable is a double
)

# --- Verification ---
print("Data preparation complete.")
final_df.printSchema()
display(final_df)

Brand,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,Mileage,Engine,Power,Price
Maruti,Mumbai,2010,72000,CNG,Manual,First,5,26.6,998.0,58.16,1.75
Hyundai,Pune,2015,41000,Diesel,Manual,First,5,19.67,1582.0,126.2,12.5
Honda,Chennai,2011,46000,Petrol,Manual,First,5,18.2,1199.0,88.7,4.5
Maruti,Chennai,2012,87000,Diesel,Manual,First,7,20.77,1248.0,88.76,6.0
Audi,Coimbatore,2013,40670,Diesel,Automatic,Second,5,15.2,1968.0,140.8,17.74
Hyundai,Hyderabad,2012,75000,LPG,Manual,First,5,21.1,814.0,55.2,2.35
Nissan,Jaipur,2013,86999,Diesel,Manual,First,5,23.08,1461.0,63.1,3.5
Toyota,Mumbai,2016,36000,Diesel,Automatic,First,8,11.36,2755.0,171.5,17.5
Volkswagen,Pune,2013,64430,Diesel,Manual,First,5,20.54,1598.0,103.6,5.2
Tata,Chennai,2012,65932,Diesel,Manual,Second,5,22.3,1248.0,74.0,1.95


Databricks visualization. Run in Databricks to view.

In [0]:
# ==============================================================================
# CELL 4: CASE 1 - BRAND PERFORMANCE ANALYSIS (Corrected for Used Cars Data)
# ==============================================================================
from pyspark.sql.functions import avg, count, desc

# Problem: Determine which car brands hold their value the best and have the highest market presence.
# Methodology: Use Spark SQL aggregations to group key performance metrics by 'Brand'.

print("--- Analyzing Car Brand Performance ---")

# 1. Aggregate data by Brand to calculate key metrics.
#    We will filter for brands with more than 20 cars listed to ensure a meaningful comparison.
brand_counts = final_df.groupBy("Brand").count()
popular_brands = brand_counts.filter(col("count") > 20).select("Brand")
popular_brands_df = final_df.join(popular_brands, "Brand", "inner")

brand_performance_df = popular_brands_df.groupBy("Brand").agg(
    count("*").alias("Number_of_Listings"),
    avg("Price").alias("Average_Resale_Price"),
    avg("Kilometers_Driven").alias("Average_Kms_Driven"),
    avg("Year").alias("Average_Age_of_Car")
).orderBy(desc("Average_Resale_Price"))

# 2. Prepare for Dashboarding
print("\n--- Output for Dashboard (Brand Performance Analysis) ---")
display(brand_performance_df)

Brand,Number_of_Listings,Average_Resale_Price,Average_Kms_Driven,Average_Age_of_Car
Land,57,39.86105263157896,59527.40350877193,2014.421052631579
Jaguar,40,37.63225,36380.725,2014.4
Mercedes-Benz,316,26.917848101265815,48445.648734177215,2013.5981012658228
Mini,26,26.896923076923077,24890.30769230769,2015.076923076923
Audi,235,25.56978723404255,52879.289361702125,2013.6212765957448
BMW,262,25.43961832061068,79471.38167938932,2013.0839694656488
Volvo,21,18.802857142857142,70539.66666666667,2013.2857142857144
Toyota,394,11.907969543147187,83412.10913705584,2013.1040609137056
Mitsubishi,27,11.05888888888889,82250.62962962964,2011.111111111111
Mahindra,268,8.058955223880597,69510.72014925373,2013.7910447761196


Databricks visualization. Run in Databricks to view.

In [0]:
# ==============================================================================
# CELL 2: SAFETY ANALYSIS (HIGH MILEAGE/AGE HOTSPOTS)
# ==============================================================================
from pyspark.sql.functions import avg, count, desc

# Problem: Identify locations with a higher concentration of potentially "less safe" cars.
# Methodology: Define "high-risk" cars as those older than a certain year or with high kilometers.
#              Then, aggregate by location to find hotspots.

print("--- Analyzing Safety Hotspots (Older/High-Mileage Cars) ---")

# 1. Define thresholds for what we consider "high-risk".
YEAR_THRESHOLD = 2010
KMS_THRESHOLD = 100000

# 2. Filter for cars that meet either of these high-risk criteria.
risky_cars_df = final_df.filter(
    (col("Year") < YEAR_THRESHOLD) | (col("Kilometers_Driven") > KMS_THRESHOLD)
)

# 3. Aggregate by location to find where these cars are most common.
safety_hotspot_df = risky_cars_df.groupBy("Location").agg(
    count("*").alias("High_Risk_Car_Count")
).orderBy(desc("High_Risk_Car_Count"))

# 4. Prepare for Dashboarding
print(f"\n--- Output for Dashboard (Hotspots for cars older than {YEAR_THRESHOLD} or >{KMS_THRESHOLD} kms) ---")
display(safety_hotspot_df)

Location,High_Risk_Car_Count
Hyderabad,189
Pune,170
Chennai,160
Jaipur,104
Mumbai,100
Kolkata,60
Bangalore,55
Delhi,48
Ahmedabad,28
Coimbatore,26


Databricks visualization. Run in Databricks to view.

In [0]:
# ==============================================================================
# CELL 2: CASE 3 - MARKET DEMAND ANALYSIS (PASSENGER PREDICTION PROXY)
# ==============================================================================
from pyspark.sql.functions import count, desc

# Problem: Predict passenger demand. Re-interpreted as: "Analyze what drives market demand for used cars."
# Methodology: Use Spark SQL aggregations to count listings by Brand and Fuel Type.

print("--- Analyzing Market Demand by Brand and Fuel Type ---")

# 1. Aggregate data to find the most listed car brands.
demand_by_brand_df = final_df.groupBy("Brand").agg(
    count("*").alias("Number_of_Listings")
).orderBy(desc("Number_of_Listings"))

# 2. Prepare for Dashboarding
print("\n--- Output for Dashboard (Demand by Brand) ---")
display(demand_by_brand_df.limit(15))

Brand,Number_of_Listings
Maruti,1175
Hyundai,1058
Honda,600
Toyota,394
Mercedes-Benz,316
Volkswagen,314
Ford,294
Mahindra,268
BMW,262
Audi,235


Databricks visualization. Run in Databricks to view.

In [0]:
# ==============================================================================
# CELL 5: CASE 4 - FEATURE CORRELATION ANALYSIS (NAVIGATION PROXY)
# ==============================================================================
from pyspark.sql.functions import corr

# Problem: Automatic vehicle navigation. Re-interpreted as: "Automatically identify the strongest drivers of car price."
# Methodology: Calculate the correlation coefficient between key numeric features and the 'Price'.

print("--- Analyzing Feature Correlation with Price ---")

# 1. Calculate the correlation for each key feature against the Price
corr_year = final_df.select(corr("Year", "Price")).first()[0]
corr_kms = final_df.select(corr("Kilometers_Driven", "Price")).first()[0]
corr_mileage = final_df.select(corr("Mileage", "Price")).first()[0]
corr_engine = final_df.select(corr("Engine", "Price")).first()[0]
corr_power = final_df.select(corr("Power", "Price")).first()[0]

# 2. Create a DataFrame for visualization
correlation_data = [
    ("Year", corr_year),
    ("Kilometers_Driven", corr_kms),
    ("Mileage", corr_mileage),
    ("Engine", corr_engine),
    ("Power", corr_power)
]
correlation_df = spark.createDataFrame(correlation_data, ["Feature", "Correlation_with_Price"])

# 3. Prepare for Dashboarding
print("\n--- Output for Dashboard (Feature Correlation) ---")
display(correlation_df)

Feature,Correlation_with_Price
Year,0.2994754501870621
Kilometers_Driven,-0.0082485362784922
Mileage,-0.3416519550661074
Engine,0.6580472262886926
Power,0.7728428987389121


Databricks visualization. Run in Databricks to view.

In [0]:
# ==============================================================================
# USE CASE 5: FUEL EFFICIENCY ANALYSIS
# ==============================================================================
from pyspark.sql.functions import avg, count, desc

# Problem: Determine which fuel types offer the best and worst fuel efficiency.
# Methodology: Use Spark SQL aggregations (`groupBy` and `agg`) to calculate the
#              average mileage for each fuel type. This is a robust and efficient
#              method that is not blocked by cluster security policies.

print("--- Analyzing Fuel Efficiency by Fuel Type ---")

# 1. Calculate performance metrics for each fuel type.
#    We group by 'Fuel_Type' and calculate the average mileage and the number of cars.
fuel_efficiency_df = final_df.groupBy("Fuel_Type").agg(
    avg("Mileage").alias("Average_Mileage_kmpl"),
    count("*").alias("Number_of_Cars")
).orderBy(desc("Average_Mileage_kmpl"))

# 2. Prepare for Dashboarding.
#    The `display()` command will generate an interactive table that is ready for visualization.
print("\n--- Output for Dashboard (Fuel Efficiency Analysis) ---")
display(fuel_efficiency_df)

Fuel_Type,Average_Mileage_kmpl,Number_of_Cars
CNG,25.571090909090906,55
LPG,19.385,10
Diesel,18.673464467005147,3152
Petrol,17.65290018832386,2655


Databricks visualization. Run in Databricks to view.