In [6]:
# Imports and path setup
import sys
from pathlib import Path
import re
import pandas as pd
import numpy as np
from datetime import datetime

# Every path below is derived from the directory the notebook is launched from.
ROOT = Path.cwd()
DATA_RAW = ROOT / 'data' / 'raw'
DATA_PROCESSED = ROOT / 'data' / 'processed'
OUTPUTS = ROOT / 'outputs'
RESULTS = OUTPUTS / 'results'
GRAPHS = OUTPUTS / 'graphs'
for p in [DATA_PROCESSED, RESULTS, GRAPHS]:
    p.mkdir(parents=True, exist_ok=True)

# Locate the raw CSV relative to the working directory instead of through a
# machine-specific absolute path. The notebook may be launched from the
# project root or from a sub-directory of it, so both <cwd>/data/raw and
# <cwd>/../data/raw are checked.
RAW_FILENAME = 'car data.csv'
_raw_candidates = [
    DATA_RAW / RAW_FILENAME,
    ROOT.parent / 'data' / 'raw' / RAW_FILENAME,
]
# When no candidate exists, keep the first one so that reading it later fails
# with a "not found" error that names the expected location.
CSV_IN = next((c for c in _raw_candidates if c.is_file()), _raw_candidates[0])
CSV_OUT = DATA_PROCESSED / 'cleaned_car_data.csv'
REPORT_TXT = RESULTS / 'preprocessing_report.txt'
SAMPLE_OUT = RESULTS / 'cleaned_sample.csv'

print('Root:', ROOT)
print('Input file:', CSV_IN)

Root: d:\Livstream\ Car Price Prediction with Machine Learning\notebooks
Input file: D:\Livstream\ Car Price Prediction with Machine Learning\data\raw\car data.csv


In [4]:
# Utility functions for cleaning
def extract_number(val):
    """Return the first number found in ``val`` as a float.

    Parameters
    ----------
    val : object
        A scalar cell value: missing, a real number, or anything whose
        ``str()`` form may contain digits (e.g. ``'18.2 kmpl'``, ``'1,248 CC'``).

    Returns
    -------
    float
        * ``np.nan`` when ``val`` is missing or contains no digits.
        * ``float(val)`` when ``val`` is already an int/float (Python or NumPy).
        * Otherwise the first run of digits with an optional decimal part,
          taken from the text after commas are removed. A sign or any
          surrounding text is ignored.
    """
    if pd.isna(val):
        return np.nan
    # Real numbers are returned directly: going through str() would mangle
    # scientific notation ('1e-05' -> 1.0) and drop the sign of negatives.
    # bool is excluded so True/False still take the text path (-> NaN).
    if isinstance(val, (int, float, np.integer, np.floating)) and not isinstance(val, bool):
        try:
            return float(val)
        except (OverflowError, ValueError):
            return np.nan
    text = str(val).replace(',', '')
    m = re.search(r'([0-9]+\.?[0-9]*)', text)
    # The pattern only matches text that float() can parse, so no guard is needed.
    return float(m.group(1)) if m else np.nan

def safe_to_numeric(series):
    """Run ``extract_number`` on each element of ``series`` and return the result.

    ``series`` must provide ``.apply`` (the notebook passes DataFrame columns).
    """
    converted = series.apply(extract_number)
    return converted

def compute_age(year_col, reference_year=None):
    """Return the age in years for each entry of ``year_col``.

    Parameters
    ----------
    year_col : pandas.Series
        Year values as ints, floats or numeric strings.
    reference_year : int, optional
        Year the age is measured against. Defaults to the current calendar
        year; pass a fixed value to make the result reproducible.

    Returns
    -------
    pandas.Series
        ``reference_year - year`` for entries holding a non-negative whole
        number, ``NaN`` for everything else (missing, non-numeric, negative
        or fractional values). The index of ``year_col`` is preserved.
    """
    if reference_year is None:
        reference_year = datetime.now().year
    # Work on numeric values rather than on str(y).isdigit(): a column that
    # contains any NaN is float64, and '2014.0'.isdigit() is False, which
    # would otherwise turn every age in such a column into NaN.
    years = pd.to_numeric(year_col, errors='coerce')
    valid = years.notna() & (years >= 0) & (years % 1 == 0)
    return (reference_year - years).where(valid)

In [7]:
# Load raw data; stop with a readable message if the file cannot be read
try:
    df = pd.read_csv(CSV_IN, low_memory=False)
    print('Loaded data with shape:', df.shape)
except FileNotFoundError:
    # Take the file name from CSV_IN so the hint matches the file that is
    # actually looked up (the old hint named a different file).
    sys.exit(f'ERROR: {CSV_IN} not found. Place {Path(CSV_IN).name} in data/raw.')
except Exception as e:
    # Any other read problem (parse error, permissions, ...) ends the run too.
    sys.exit(f'ERROR reading CSV: {e}')

# Work on a copy so the raw frame `df` stays untouched for later comparison
df_clean = df.copy()
# Human-readable log of every preprocessing step
report_lines = []
report_lines.append(f'Raw rows,cols: {df.shape}')

Loaded data with shape: (301, 9)


In [8]:
# Standard cleaning steps (applied only if columns exist)
# 1) Strip whitespace from string/object columns
strip_failures = []
for c in df_clean.select_dtypes(include=['object']).columns:
    try:
        # Strip only real strings, so missing values stay missing and are not
        # turned into text by a str() round trip; a cell that is literally
        # 'nan' after stripping is still treated as missing.
        df_clean[c] = (
            df_clean[c]
            .map(lambda v: v.strip() if isinstance(v, str) else v)
            .replace({'nan': np.nan})
        )
    except Exception as e:
        # Best effort: keep going, but record the failure instead of hiding it.
        strip_failures.append(c)
        report_lines.append(f'Could not strip whitespace from {c}: {e}')
if strip_failures:
    report_lines.append(
        f'Stripped whitespace from object columns ({len(strip_failures)} column(s) failed)'
    )
else:
    report_lines.append('Stripped whitespace from object columns')

# 2) Derive an Age column from Year when that column is available
if 'Year' not in df_clean.columns:
    report_lines.append('Year column not present; skipped Age computation')
else:
    try:
        # Coerce Year to numeric first; unparseable entries become NaN.
        df_clean['Year'] = pd.to_numeric(df_clean['Year'], errors='coerce')
        df_clean['Age'] = compute_age(df_clean['Year'])
        report_lines.append('Computed Age from Year')
    except Exception as err:
        report_lines.append(f'Could not compute Age: {err}')

# 3) Add a numeric "<name>_num" companion for Mileage, Engine and Power
for col in ('Mileage', 'Engine', 'Power'):
    if col not in df_clean.columns:
        report_lines.append(f'{col} not present; skipped')
        continue
    try:
        # The source column is kept; the parsed numbers go into a new column.
        df_clean[f'{col}_num'] = safe_to_numeric(df_clean[col])
        report_lines.append(f'Converted {col} to {col}_num')
    except Exception as err:
        report_lines.append(f'Could not convert {col}: {err}')

# 4) Make sure the target column Selling_Price is numeric
if 'Selling_Price' not in df_clean.columns:
    report_lines.append('Selling_Price not present in data')
else:
    try:
        # Values that cannot be parsed become NaN.
        df_clean['Selling_Price'] = pd.to_numeric(df_clean['Selling_Price'], errors='coerce')
        report_lines.append('Converted Selling_Price to numeric')
    except Exception as err:
        report_lines.append(f'Could not convert Selling_Price: {err}')

# 5) Remove rows that are exact duplicates of an earlier row
try:
    before = df_clean.shape[0]
    # ignore_index=True renumbers the surviving rows 0..n-1.
    df_clean = df_clean.drop_duplicates(ignore_index=True)
    after = df_clean.shape[0]
    report_lines.append(f'Dropped duplicates: {before-after} rows')
except Exception as err:
    report_lines.append(f'Could not drop duplicates: {err}')

# 6) Remove rows whose target (Selling_Price) is missing
if 'Selling_Price' not in df_clean.columns:
    report_lines.append('Did not drop rows for missing target since Selling_Price absent')
else:
    before = df_clean.shape[0]
    df_clean = df_clean.dropna(subset=['Selling_Price']).reset_index(drop=True)
    after = df_clean.shape[0]
    report_lines.append(f'Dropped rows with missing Selling_Price: {before-after}')

In [10]:
# Save cleaned data and report with error handling
try:
    df_clean.to_csv(CSV_OUT, index=False)
    report_lines.append(f'Wrote cleaned CSV to {CSV_OUT}')
except Exception as e:
    report_lines.append(f'Failed to write cleaned CSV: {e}')
    # Print the failure as well; otherwise it is only visible inside the report.
    print('Could not write cleaned CSV:', e)

try:
    with open(REPORT_TXT, 'w', encoding='utf-8') as f:
        # One entry per line; joining without a separator ran all entries together.
        f.write('\n'.join(report_lines) + '\n')
    print('Saved preprocessing report to', REPORT_TXT)
except Exception as e:
    print('Could not write report:', e)

# Write a small random sample of the cleaned frame for quick inspection
try:
    # At most 100 rows; the fixed seed keeps the sample identical across runs.
    sample_size = min(100, len(df_clean))
    sample = df_clean.sample(n=sample_size, random_state=42)
    sample.to_csv(SAMPLE_OUT, index=False)
    print('Saved cleaned sample to', SAMPLE_OUT)
except Exception as err:
    print('Could not save sample:', err)

print('Preprocessing finished. Cleaned shape:', df_clean.shape)

Saved preprocessing report to d:\Livstream\ Car Price Prediction with Machine Learning\notebooks\outputs\results\preprocessing_report.txt
Saved cleaned sample to d:\Livstream\ Car Price Prediction with Machine Learning\notebooks\outputs\results\cleaned_sample.csv
Preprocessing finished. Cleaned shape: (299, 10)


## Notes and next steps
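- The notebook writes cleaned data to `data/processed/cleaned_car_data.csv`. This path is relative to the directory the notebook is run from (`ROOT = Path.cwd()`), so when it is launched from `notebooks/` the files are written under `notebooks/`.
- It also creates `outputs/results/preprocessing_report.txt` and `outputs/results/cleaned_sample.csv` under that same directory.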
- Proceed to `03_feature_engineering.ipynb` to transform categorical variables and prepare training features.