In [1]:
import warnings
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas FutureWarning/DeprecationWarning messages;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# Remove the cap on the number of columns pandas shows when displaying a frame.
pd.set_option('display.max_columns', None)

In [2]:
def load_data(file_path, data_dir="C:/Users/mayan/OneDrive/Desktop/Car Dekho Project/Data"):
    """Read a delimited text file, guessing the separator.

    The file is parsed with each of ``,``, ``|``, ``;`` and tab in turn; the
    first parse that yields more than one column is returned.  If no separator
    produces multiple columns, the file is read once more with pandas'
    defaults (so a genuine single-column file is still returned).

    Parameters
    ----------
    file_path : str or path-like
        File to read.  It is used as given when it exists; otherwise a
        relative path is looked up inside ``data_dir``.
    data_dir : str or path-like, optional
        Directory searched when a relative ``file_path`` does not exist as
        given.  The default is the location this notebook originally
        hard-coded; pass your own directory to run elsewhere.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    FileNotFoundError
        If the file exists neither as given nor inside ``data_dir``.
    """
    path = Path(file_path)
    if not path.exists() and not path.is_absolute():
        # Fall back to the project data directory for bare file names.
        candidate = Path(data_dir) / path
        if candidate.exists():
            path = candidate

    for sep in (',', '|', ';', '\t'):
        try:
            df = pd.read_csv(path, sep=sep)
        except pd.errors.ParserError:
            # This separator produced ragged rows; try the next one.  Other
            # errors (missing file, empty file, bad encoding) are not
            # separator-specific, so they are allowed to propagate.
            continue
        if df.shape[1] > 1:
            return df

    # No separator split the file into several columns: use pandas' defaults.
    return pd.read_csv(path)

# Load the raw dataset.
df = load_data("cardekho_dataset.csv")

# Header clean-up in one pass: remove a leading "<digits>|" prefix from each
# column name, then trim surrounding whitespace.
df.columns = df.columns.str.replace(r'^\d+\|', '', regex=True).str.strip()

# Discard the first column when it looks like a saved index: either its header
# starts with 'Unnamed' or its first cell is the string '0'.
if df.columns[0].startswith('Unnamed') or df.iloc[0, 0] == '0':
    df = df.drop(columns=[df.columns[0]])

print(f"Dataset shape: {df.shape}")

Dataset shape: (15411, 13)


In [3]:
def clean_data(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Steps, in order:

    1. Impute missing values column by column: text and categorical columns
       get their most frequent value (``'Unknown'`` if the column has no
       mode, i.e. is entirely missing); every other column gets its median.
    2. Drop fully duplicated rows.
    3. Convert text columns to numeric when more than 90% of their values
       parse as numbers.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
    """
    def _is_text(dtype):
        # Object columns plus pandas' dedicated string dtype.
        return pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)

    df_cleaned = df.copy()

    for col in df_cleaned.columns:
        series = df_cleaned[col]
        if not series.isnull().any():
            continue
        if _is_text(series.dtype) or isinstance(series.dtype, pd.CategoricalDtype):
            modes = series.mode()
            fill_value = modes.iloc[0] if not modes.empty else 'Unknown'
        else:
            fill_value = series.median()
        # Assign the result back instead of calling fillna(inplace=True) on the
        # selected column: the in-place form acts on a temporary object and
        # does not update the frame when pandas Copy-on-Write is active.
        df_cleaned[col] = series.fillna(fill_value)

    df_cleaned = df_cleaned.drop_duplicates()

    for col in df_cleaned.columns:
        if _is_text(df_cleaned[col].dtype):
            numeric_conversion = pd.to_numeric(df_cleaned[col], errors='coerce')
            # Share of values that parsed as numbers.  Values that fail to
            # parse become NaN in the converted column (errors='coerce').
            if numeric_conversion.notna().mean() > 0.9:
                df_cleaned[col] = numeric_conversion

    return df_cleaned

# Run the cleaning step on the loaded frame and report the resulting shape.
df_cleaned = clean_data(df)
print(f"Cleaned shape: {df_cleaned.shape}")

Cleaned shape: (15244, 13)


In [4]:
def transform_data(df):
    """Return a copy of ``df`` with derived category and encoded columns added.

    * ``vehicle_age``, ``km_driven`` and ``selling_price`` (each only if
      present) are binned into ``age_category``, ``km_category`` and
      ``price_category``.  Bins are right-inclusive and the lowest edge (0) is
      included, so a value of exactly 0 lands in the first bucket.
    * Every object-dtype column with more than 10 distinct values gets an
      integer ``<col>_encoded`` column; codes follow the sorted order of the
      values' string form.
    * Every other object-dtype column is one-hot encoded with the first level
      dropped, and the indicator columns are appended.

    The original columns are kept.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
    """
    df_transformed = df.copy()

    # (source column, derived column, bin edges, bucket labels)
    binning_rules = [
        ('vehicle_age', 'age_category',
         [0, 3, 7, 12, float('inf')],
         ['New', 'Medium', 'Old', 'Very_Old']),
        ('km_driven', 'km_category',
         [0, 20000, 50000, 100000, float('inf')],
         ['Low', 'Medium', 'High', 'Very_High']),
        ('selling_price', 'price_category',
         [0, 300000, 700000, 1500000, float('inf')],
         ['Budget', 'Mid_Range', 'Premium', 'Luxury']),
    ]
    for source_col, target_col, edges, labels in binning_rules:
        if source_col in df_transformed.columns:
            # include_lowest=True: without it the first interval is (0, edge],
            # and rows equal to 0 (e.g. a brand-new vehicle) become NaN.
            df_transformed[target_col] = pd.cut(
                df_transformed[source_col],
                bins=edges,
                labels=labels,
                include_lowest=True,
            )

    # Snapshot the object columns before the loop adds new columns.
    categorical_cols = df_transformed.select_dtypes(include=['object']).columns.tolist()
    for col in categorical_cols:
        if df_transformed[col].nunique() > 10:
            # High-cardinality column: integer codes over the sorted unique
            # string values (pandas-only, no external encoder needed).
            codes, _ = pd.factorize(df_transformed[col].astype(str), sort=True)
            df_transformed[f'{col}_encoded'] = codes
        else:
            dummies = pd.get_dummies(df_transformed[col], prefix=col, drop_first=True)
            df_transformed = pd.concat([df_transformed, dummies], axis=1)

    return df_transformed

# Run the feature-engineering step on the cleaned frame and report the resulting shape.
df_transformed = transform_data(df_cleaned)
print(f"Transformed shape: {df_transformed.shape}")

Transformed shape: (15244, 26)


In [5]:
# Descriptive statistics for every numeric column (skipped when there are none).
numerical_cols = df_transformed.select_dtypes(include=[np.number]).columns
if not numerical_cols.empty:
    print(df_transformed[numerical_cols].describe())

# Most frequent values for up to the first three object/category columns.
categorical_cols = df_transformed.select_dtypes(include=['object', 'category']).columns
for col in categorical_cols[:3]:
    print(f"\n{col}:")
    print(df_transformed[col].value_counts().head())

        vehicle_age     km_driven       mileage        engine     max_power  \
count  15244.000000  1.524400e+04  15244.000000  15244.000000  15244.000000   
mean       6.041131  5.563958e+04     19.697333   1486.171543    100.607652   
std        3.016228  5.176630e+04      4.169307    520.419390     42.915687   
min        0.000000  1.000000e+02      4.000000    793.000000     38.400000   
25%        4.000000  3.000000e+04     17.000000   1197.000000     74.000000   
50%        6.000000  5.000000e+04     19.670000   1248.000000     88.500000   
75%        8.000000  7.000000e+04     22.700000   1582.000000    117.300000   
max       29.000000  3.800000e+06     33.540000   6592.000000    626.000000   

              seats  selling_price  car_name_encoded  brand_encoded  \
count  15244.000000   1.524400e+04      15244.000000   15244.000000   
mean       5.326161   7.747014e+05         60.770008      15.230714   
std        0.808760   8.946761e+05         30.634237       8.058158   
min 

In [8]:
# Export the transformed frame to CSV. The run timestamp is embedded in the
# file name so repeated runs do not overwrite earlier exports (previously the
# timestamp was computed but never used in the path).
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_filename = f"C:/Users/mayan/OneDrive/Desktop/Car Dekho Project/Data/cardekho_processed_{timestamp}.csv"
df_transformed.to_csv(output_filename, index=False)
print(f"Data exported to: {output_filename}")
print(f"Final dataset shape: {df_transformed.shape}")

Data exported to: C:/Users/mayan/OneDrive/Desktop/Car Dekho Project/Data/cardekho_processed_20250826_130107.csv
Final dataset shape: (15244, 26)
