In [None]:
# Imports and path setup
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configure plotting style. `set_theme` is seaborn's preferred entry point;
# `sns.set` is only kept as an alias for it.
sns.set_theme(style="whitegrid")


def _find_project_root(start):
    """Return the nearest directory at or above `start` that contains `data/raw`.

    The kernel's working directory may be the notebook's own folder rather than
    the repository root, so walk upwards until the raw-data folder is found.
    Falls back to `start` itself when no ancestor contains `data/raw`.
    """
    for candidate in (start, *start.parents):
        if (candidate / 'data' / 'raw').is_dir():
            return candidate
    return start


# Define project-relative paths (works from the repository root or any folder below it)
ROOT = _find_project_root(Path.cwd())
DATA_RAW = ROOT / 'data' / 'raw'
DATA_PROCESSED = ROOT / 'data' / 'processed'
OUTPUTS = ROOT / 'outputs'
GRAPHS = OUTPUTS / 'graphs'
RESULTS = OUTPUTS / 'results'

# Ensure output directories exist (idempotent: safe to re-run this cell)
for p in [DATA_PROCESSED, GRAPHS, RESULTS]:
    p.mkdir(parents=True, exist_ok=True)

CSV_PATH = DATA_RAW / 'car_data.csv'

print('Root directory:', ROOT)
print('Looking for dataset at:', CSV_PATH)

In [None]:
# Load the raw dataset.
# A missing file is re-raised with an actionable message; any other read error
# propagates unchanged so its full traceback stays visible. (Calling sys.exit()
# inside a notebook kernel only raises SystemExit and hides that traceback.)
try:
    df = pd.read_csv(CSV_PATH, low_memory=False)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f'Data file not found at {CSV_PATH}. Please place car_data.csv there.'
    ) from err

print('Loaded dataset with shape:', df.shape)

# Preview the first rows
display(df.head())

In [None]:
# Basic info and type-safe column checks
def col_exists(df, col):
    """Return True when `col` is one of the column labels of `df`, else False."""
    available = df.columns
    return col in available

# Summary information: overall shape and the list of column names
info = {
    'n_rows': df.shape[0],
    'n_cols': df.shape[1],
    'columns': list(df.columns)
}
# Only a cell's last expression is rendered automatically, so this mid-cell
# table has to be shown explicitly; otherwise it is built and thrown away.
display(pd.DataFrame([info]).T.rename(columns={0: 'value'}))

# Data types and missing-value counts, one row per column
dtypes = df.dtypes.astype(str).to_frame('dtype')
missing = df.isnull().sum().to_frame('missing_count')
summary_table = dtypes.join(missing)
summary_path = RESULTS / 'eda_columns_summary.csv'
summary_table.to_csv(summary_path)
print('Saved column summary to', summary_path)
display(summary_table.head(40))

In [None]:
# Descriptive statistics for the numeric columns (one row per column),
# written to CSV and displayed; any failure is reported instead of raised.
try:
    numeric_only = df.select_dtypes(include=[np.number])
    numeric_desc = numeric_only.describe().transpose()
    desc_path = RESULTS / 'eda_numeric_description.csv'
    numeric_desc.to_csv(desc_path)
    print('Saved numeric description to', desc_path)
    display(numeric_desc)
except Exception as err:
    print('Could not compute numeric description:', err)

In [None]:
# Frequency tables for whichever of the expected categorical columns exist.
# Each table (missing values included) is saved to its own CSV and its first
# ten rows are displayed; a failure on one column does not stop the others.
wanted_cols = ['Car_Name', 'Fuel_Type', 'Transmission', 'Owner']
cat_cols = [name for name in wanted_cols if name in df.columns]
for col_name in cat_cols:
    try:
        counts = df[col_name].value_counts(dropna=False)
        vc = counts.rename_axis(col_name).reset_index(name='count')
        vc_path = RESULTS / f'eda_value_counts_{col_name}.csv'
        vc.to_csv(vc_path, index=False)
        print(f'Saved value counts for {col_name} to', vc_path)
        display(vc.head(10))
    except Exception as err:
        print('Could not compute value counts for', col_name, err)

In [None]:
# Histogram with a KDE overlay for the target column, saved as a PNG.
# Skipped with a message when the column is absent.
target_col = 'Selling_Price'
if target_col not in df.columns:
    print(f'Column {target_col} not found; skipping target distribution plot.')
else:
    try:
        plt.figure(figsize=(8, 5))
        price_values = df[target_col].dropna()  # drop missing prices before binning
        ax = sns.histplot(price_values, bins=40, kde=True)
        ax.set_title('Distribution of Selling_Price')
        ax.set_xlabel(target_col)
        plt.tight_layout()
        outpath = GRAPHS / 'selling_price_distribution.png'
        plt.savefig(outpath, dpi=150)
        plt.show()
        print('Saved figure to', outpath)
    except Exception as err:
        print('Could not plot Selling_Price distribution:', err)

In [None]:
# Scatter plot of the target column against Mileage, saved as a PNG.
# Runs only when both columns exist; `target_col` comes from an earlier cell.
x_col = 'Mileage'
if all(name in df.columns for name in (target_col, x_col)):
    try:
        plt.figure(figsize=(8, 5))
        x_values, y_values = df[x_col], df[target_col]
        ax = sns.scatterplot(x=x_values, y=y_values, alpha=0.6)
        ax.set_title(f'{target_col} vs {x_col}')
        ax.set_xlabel(x_col)
        ax.set_ylabel(target_col)
        plt.tight_layout()
        outpath = GRAPHS / 'selling_price_vs_mileage.png'
        plt.savefig(outpath, dpi=150)
        plt.show()
        print('Saved figure to', outpath)
    except Exception as err:
        print('Could not create scatter plot:', err)
else:
    print(f'Either {target_col} or {x_col} not found; skipping scatter plot.')

In [None]:
# Box plot of the target column grouped by Transmission, saved as a PNG.
# Runs only when both columns exist; `target_col` comes from an earlier cell.
group_col = 'Transmission'
if not (target_col in df.columns and group_col in df.columns):
    print(f'Either {target_col} or {group_col} not found; skipping boxplot.')
else:
    try:
        plt.figure(figsize=(8, 5))
        groups, prices = df[group_col], df[target_col]
        ax = sns.boxplot(x=groups, y=prices)
        ax.set_title(f'{target_col} by {group_col}')
        plt.xticks(rotation=45)
        plt.tight_layout()
        outpath = GRAPHS / 'price_by_transmission.png'
        plt.savefig(outpath, dpi=150)
        plt.show()
        print('Saved figure to', outpath)
    except Exception as err:
        print('Could not create boxplot:', err)

In [None]:
# Pairwise correlations between numeric columns: annotated heatmap saved as a
# PNG, and the matrix itself saved as a CSV. Needs at least two numeric columns.
num_df = df.select_dtypes(include=[np.number])
if len(num_df.columns) < 2:
    print('Not enough numeric columns for correlation matrix; skipping.')
else:
    try:
        corr = num_df.corr()
        plt.figure(figsize=(10, 8))
        heatmap_options = dict(annot=True, fmt='.2f', cmap='coolwarm', square=True)
        sns.heatmap(corr, **heatmap_options)
        plt.title('Correlation matrix (numeric features)')
        plt.tight_layout()
        outpath = GRAPHS / 'correlation_matrix.png'
        plt.savefig(outpath, dpi=150)
        plt.show()
        corr_path = RESULTS / 'eda_correlation_matrix.csv'
        corr.to_csv(corr_path)
        print('Saved correlation matrix to', corr_path)
    except Exception as err:
        print('Could not compute or plot correlation matrix:', err)

In [None]:
# Write a reproducible random sample of the raw data (at most 100 rows,
# fixed seed) to the results folder for quick reference.
try:
    sample_size = min(len(df), 100)
    sample = df.sample(n=sample_size, random_state=42)
    sample_path = RESULTS / 'eda_sample_100rows.csv'
    sample.to_csv(sample_path, index=False)
    print('Saved sample to', sample_path)
except Exception as err:
    print('Could not save sample file:', err)

## End of EDA

This notebook saved summary CSVs to `outputs/results/` and figures to `outputs/graphs/`.
Proceed to `02_data_preprocessing.ipynb` to clean data and save `data/processed/cleaned_car_data.csv`.