In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import math

# Initialize Spark.
# The getOrCreate() entry points reuse a context/session that is already
# running (for example when this cell is re-run after an earlier failure left
# the old context alive) instead of raising because a second SparkContext
# cannot be started in the same process.
conf = SparkConf().setAppName("CarAnalysis").setMaster("local[4]")
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Load the dataset as an RDD of raw text lines
car_rdd = sc.textFile("CAR DETAILS FROM CAR DEKHO.csv")

# Remove header and filter out empty lines (single pass over each line)
header = car_rdd.first()
cars_data = car_rdd.filter(lambda line: bool(line) and line != header)

# Task 1: Split rows into columns and create tuples (Car_Name, Year, Selling_Price, Fuel_Type)
def parse_car_line(line):
    """Parse one CSV data row into ``(name, year, selling_price, fuel)``.

    The row is read with the ``csv`` module so that quoted fields containing
    commas (e.g. a car name such as ``"Maruti 800, AC"``) stay in one column.

    Args:
        line: A single text line of the CSV file (no header).

    Returns:
        A tuple ``(parts[0], int(parts[1]), float(parts[2]), parts[4])``, or
        ``None`` when the row has fewer than five columns or its year/price
        columns are not numeric. Callers are expected to filter out ``None``.
    """
    # Function-scope import: keeps this block usable without touching the
    # file-level import list.
    import csv

    try:
        parts = next(csv.reader([line]))
    except (csv.Error, StopIteration):
        return None

    if len(parts) < 5:
        return None

    try:
        # name, year, price, fuel
        return (parts[0], int(parts[1]), float(parts[2]), parts[4])
    except ValueError:
        # A malformed numeric column should drop the row, not abort the job.
        return None

# Parse every row and drop the ones parse_car_line rejected (returned None).
# Many separate actions below run against this RDD; cache() keeps the parsed
# tuples in memory so the CSV is not re-read and re-parsed for each of them.
cars_tuples = (
    cars_data
    .map(parse_car_line)
    .filter(lambda x: x is not None)
    .cache()
)

# Task 2: Display first 10 records using take()
print("=== First 10 Car Records ===")
first_10_cars = cars_tuples.take(10)
for i, car in enumerate(first_10_cars, 1):
    print(f"{i}. {car}")

# Transformations
# Task 3: Extract list of unique fuel types
# (index 3 of each tuple is the fuel type)
fuel_column = cars_tuples.map(lambda record: record[3])
unique_fuel_types = fuel_column.distinct()
print("\n=== Unique Fuel Types ===")
print(unique_fuel_types.collect())

# Task 4: Count the number of cars per fuel type using reduceByKey
# Emit (fuel, 1) per car, then sum the ones for each fuel.
fuel_ones = cars_tuples.map(lambda record: (record[3], 1))
cars_per_fuel = fuel_ones.reduceByKey(lambda left, right: left + right)
print("\n=== Number of Cars per Fuel Type ===")
for fuel, count in cars_per_fuel.collect():
    print(f"{fuel}: {count} cars")

# Task 5: Find the top 5 most expensive cars
# takeOrdered sorts ascending, so the price (index 2) is negated to get the
# highest prices first.
top_5_expensive = cars_tuples.takeOrdered(5, key=lambda record: -record[2])
print("\n=== Top 5 Most Expensive Cars ===")
for i, car in enumerate(top_5_expensive, 1):
    print(f"{i}. {car[0]} - ₹{car[2]:,.0f}")

# Aggregation
# Task 6: Compute total selling price per fuel type
fuel_and_price = cars_tuples.map(lambda record: (record[3], record[2]))
total_price_per_fuel = fuel_and_price.reduceByKey(lambda left, right: left + right)
print("\n=== Total Selling Price per Fuel Type ===")
for fuel, total in total_price_per_fuel.collect():
    print(f"{fuel}: ₹{total:,.0f}")

# Task 7: Compute average selling price per fuel type
# Build (fuel_type, (total_price, count)) by summing both components pairwise,
# then divide total by count for each fuel.
fuel_price_count = (
    cars_tuples
    .map(lambda record: (record[3], (record[2], 1)))
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
)
avg_price_per_fuel = fuel_price_count.mapValues(
    lambda total_and_count: total_and_count[0] / total_and_count[1]
)
print("\n=== Average Selling Price per Fuel Type ===")
for fuel, avg in avg_price_per_fuel.collect():
    print(f"{fuel}: ₹{avg:,.0f}")

# Task 8: Find the fuel type with the highest average selling price
# Each element is (fuel, average); compare on the average.
highest_avg_fuel = avg_price_per_fuel.max(key=lambda pair: pair[1])
print("\n=== Fuel Type with Highest Average Selling Price ===")
print(f"{highest_avg_fuel[0]}: ₹{highest_avg_fuel[1]:,.0f}")

# Additional analysis: Cars per year
print("\n=== Number of Cars Listed per Year ===")
# Emit (year, 1) per car, sum per year, then order the result by year.
year_ones = cars_tuples.map(lambda record: (record[1], 1))
year_totals = year_ones.reduceByKey(lambda left, right: left + right)
cars_per_year = year_totals.sortByKey()
for year, count in cars_per_year.collect():
    print(f"{year}: {count} cars")

# Additional analysis: Price statistics by fuel type
print("\n=== Price Statistics by Fuel Type ===")
def calculate_stats(iterable):
    """Summarise the prices found at index 2 of each record in *iterable*.

    Returns:
        A tuple ``(average, maximum, minimum, count)`` of those prices, or
        ``(0, 0, 0, 0)`` when *iterable* yields no records.
    """
    selling_prices = [record[2] for record in iterable]

    # Guard clause: nothing to summarise.
    if not selling_prices:
        return (0, 0, 0, 0)

    n_records = len(selling_prices)
    mean_price = sum(selling_prices) / n_records
    return (mean_price, max(selling_prices), min(selling_prices), n_records)

# Group the car tuples by fuel type (index 3) and summarise each group with
# calculate_stats, which yields (average, maximum, minimum, count).
cars_by_fuel = cars_tuples.groupBy(lambda record: record[3])
fuel_stats = cars_by_fuel.mapValues(calculate_stats)
for fuel, stats in fuel_stats.collect():
    avg_price, max_price, min_price, n_cars = stats
    print(f"{fuel}:")
    print(f"  Average: ₹{avg_price:,.0f}")
    print(f"  Maximum: ₹{max_price:,.0f}")
    print(f"  Minimum: ₹{min_price:,.0f}")
    print(f"  Count: {n_cars} cars")

# Stop Spark context
sc.stop()

=== First 10 Car Records ===
1. ('Maruti 800 AC', 2007, 60000.0, 'Petrol')
2. ('Maruti Wagon R LXI Minor', 2007, 135000.0, 'Petrol')
3. ('Hyundai Verna 1.6 SX', 2012, 600000.0, 'Diesel')
4. ('Datsun RediGO T Option', 2017, 250000.0, 'Petrol')
5. ('Honda Amaze VX i-DTEC', 2014, 450000.0, 'Diesel')
6. ('Maruti Alto LX BSIII', 2007, 140000.0, 'Petrol')
7. ('Hyundai Xcent 1.2 Kappa S', 2016, 550000.0, 'Petrol')
8. ('Tata Indigo Grand Petrol', 2014, 240000.0, 'Petrol')
9. ('Hyundai Creta 1.6 VTVT S', 2015, 850000.0, 'Petrol')
10. ('Maruti Celerio Green VXI', 2017, 365000.0, 'CNG')

=== Unique Fuel Types ===
['Diesel', 'CNG', 'Electric', 'Petrol', 'LPG']

=== Number of Cars per Fuel Type ===
Diesel: 2153 cars
CNG: 40 cars
Electric: 1 cars
Petrol: 2123 cars
LPG: 23 cars

=== Top 5 Most Expensive Cars ===
1. Audi RS7 2015-2019 Sportback Performance - ₹8,900,000
2. Mercedes-Benz S-Class S 350d Connoisseurs Edition - ₹8,150,000
3. Mercedes-Benz GLS 2016-2020 350d 4MATIC - ₹5,500,000
4. BMW X5 xD