In [16]:
import pandas as pd
import numpy as np

# Read the raw training data into a DataFrame.
source_csv = "/content/train.csv"
cars = pd.read_csv(source_csv)

# Number of missing entries in each column (displayed as the cell output).
cars.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
Name,0
Location,0
Year,0
Kilometers_Driven,0
Fuel_Type,0
Transmission,0
Owner_Type,0
Mileage,2
Engine,36


In [17]:
import re

# Strategy:
# - Numerical: median (robust to skew)
# - Categorical: mode (most frequent category)
# - Drop only if >60% missing

# Helper function to extract numeric values from strings that might contain units
def clean_numeric_with_unit(value):
    """Extract the first numeric value from a possibly unit-suffixed entry.

    Parameters
    ----------
    value : object
        A raw cell value such as ``"18.9 kmpl"``, an already-numeric value,
        or a missing value. It is converted with ``str()`` before matching.

    Returns
    -------
    float
        The first integer or decimal number found in the text, or ``np.nan``
        when ``value`` is missing or its text contains no digits.
    """
    if pd.isna(value):
        return np.nan
    # Single backslashes are required inside the raw string: the earlier
    # r'\\d' form matched a literal backslash followed by "d", so no number
    # was ever found and every value was converted to NaN.
    match = re.search(r'(\d+\.?\d*)', str(value))
    # The pattern only captures digits with an optional decimal point, so
    # float() cannot fail on the captured text.
    return float(match.group(1)) if match else np.nan

# Strip unit suffixes from the columns that are expected to be numeric.
# A column that is absent from the frame is skipped.
for unit_col in ('Mileage', 'Engine', 'Power', 'New_Price'):
    if unit_col in cars.columns:
        cars[unit_col] = cars[unit_col].apply(clean_numeric_with_unit)

# Columns to impute, grouped by the kind of fill value they receive.
# Names that are not present in the frame are skipped.
# NOTE(review): the ">60% missing -> drop" rule from the strategy comment is
# not applied anywhere in this cell; only filling is performed.
num_cols = [
    'Year', 'Kilometers_Driven', 'Mileage', 'Engine',
    'Power', 'Seats', 'New_Price', 'Price',
]
cat_cols = ['Fuel_Type', 'Transmission', 'Location', 'Owner_Type']

# Numeric gaps are filled with the column median.
for col in [name for name in num_cols if name in cars.columns]:
    cars[col] = cars[col].fillna(cars[col].median())

# Categorical gaps are filled with the first value returned by mode().
for col in [name for name in cat_cols if name in cars.columns]:
    cars[col] = cars[col].fillna(cars[col].mode()[0])



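# Justification:
- The median is robust to skew and outliers, so filling numeric features with it avoids distorting their distribution.
- The mode fills categorical gaps with a category that already exists, so no new category is introduced.
- A column is dropped only if it is overwhelmingly missing (e.g., >60%).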


In [18]:
import re

def extract_num(x):
    """Return the first number embedded in ``x`` as a float.

    ``x`` is converted with ``str()`` and searched for an integer or decimal
    number. Missing values, and values whose text contains no digits, give
    ``np.nan``.
    """
    if pd.isna(x):
        return np.nan
    found = re.search(r'(\d+(\.\d+)?)', str(x))
    if found is None:
        return np.nan
    return float(found.group(1))

# Convert unit-suffixed columns to floats; columns absent from the frame are
# skipped. 'New_Price' is spelled the way the column is named elsewhere in
# this notebook -- the earlier 'New_price' never passed the membership check,
# so that column was silently left unconverted.
for col in ['Mileage', 'Engine', 'Power', 'New_Price']:
    if col in cars.columns:
        cars[col] = cars[col].apply(extract_num)


In [19]:
# One-hot encode the two categorical columns; drop_first=True omits the
# indicator for the first level of each column.
encode_cols = ['Fuel_Type', 'Transmission']
cars = pd.get_dummies(cars, columns=encode_cols, drop_first=True)


In [20]:
from datetime import datetime
# Car age in years, measured against the calendar year at run time, so the
# values shift when the notebook is re-run in a later year.
current_year = datetime.now().year
cars['Car_Age'] = cars['Year'].rsub(current_year)  # current_year - Year


In [22]:
import re

# Function to extract Make and Model from the 'Name' column
def get_make_and_model(name):
    """Split a listing name into its make and model tokens.

    Parameters
    ----------
    name : object
        Value from the 'Name' column; converted with ``str()`` before
        splitting.

    Returns
    -------
    tuple
        ``(make, model)`` where ``make`` is the first whitespace-separated
        token and ``model`` the second. A token that is not present is
        ``np.nan``; a missing or blank ``name`` gives ``(np.nan, np.nan)``.
    """
    if pd.isna(name):
        return np.nan, np.nan
    # Splitting on arbitrary whitespace (sep=None) ignores leading/trailing
    # blanks and collapses repeated spaces, and returns no tokens for an
    # empty string -- so a blank name is reported as missing rather than as
    # an empty-string make.
    tokens = str(name).split(None, 2)  # at most: make, model, rest
    make = tokens[0] if tokens else np.nan
    model = tokens[1] if len(tokens) > 1 else np.nan
    return make, model

# Create the 'Make' and 'Model' columns from 'Name'. The (make, model) tuples
# are collected in one pass and expanded into a two-column frame in a single
# step, rather than constructing a separate pandas Series for every row.
make_model_pairs = cars['Name'].map(get_make_and_model)
cars[['Make', 'Model']] = pd.DataFrame(
    make_model_pairs.tolist(), index=cars.index, columns=['Make', 'Model']
)

# Select the desired subset after creating 'Make' and 'Model' columns
subset = cars[['Make', 'Model', 'Location', 'Price']]

In [24]:
# Filter: model year 2015 or later (2015 itself is included), and rows whose
# Transmission_Manual indicator is 0 (i.e. automatic transmission)
from_2015_on = cars['Year'].ge(2015)
not_manual = cars['Transmission_Manual'].eq(0)
filtered = cars[from_2015_on & not_manual]

In [25]:
# Rename the kilometres-driven column to 'Km_Driven'. The column in this data
# is 'Kilometers_Driven'; the earlier key 'Odometer' matched no column, and
# rename() ignores unmatched keys by default, so nothing was renamed.
filtered = filtered.rename(columns={'Kilometers_Driven': 'Km_Driven'})



In [26]:
# Mutate: add price divided by car age as a new column
filtered = filtered.assign(
    Price_per_Age=filtered['Price'].div(filtered['Car_Age'])
)


In [27]:
# Arrange: order the filtered rows from highest to lowest price
arranged = filtered.sort_values('Price', ascending=False)


In [29]:
# Summarize: one row per location with the mean price, the median of
# Kilometers_Driven, and the number of rows in the group.
location_aggregations = {
    'Avg_Price': ('Price', 'mean'),
    'Median_Odo': ('Kilometers_Driven', 'median'),
    'Count': ('Price', 'size'),
}
by_location = cars.groupby('Location')
summary = by_location.agg(**location_aggregations).reset_index()