In [1]:
# Imports and paths
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Project layout: every location is resolved against the current working directory.
ROOT = Path.cwd()
DATA_PROCESSED = ROOT.joinpath('data', 'processed')
OUTPUTS = ROOT.joinpath('outputs')
RESULTS = OUTPUTS.joinpath('results')
GRAPHS = OUTPUTS.joinpath('graphs')
MODELS = ROOT.joinpath('models')

# Make sure the working directories exist (OUTPUTS is created implicitly as a parent).
for directory in (DATA_PROCESSED, RESULTS, GRAPHS, MODELS):
    directory.mkdir(parents=True, exist_ok=True)

# Input file read by this notebook.
CSV_IN = DATA_PROCESSED.joinpath('cleaned_car_data.csv')

# Artefacts written by this notebook.
X_TRAIN_OUT = DATA_PROCESSED.joinpath('X_train.csv')
X_TEST_OUT = DATA_PROCESSED.joinpath('X_test.csv')
Y_TRAIN_OUT = DATA_PROCESSED.joinpath('y_train.csv')
Y_TEST_OUT = DATA_PROCESSED.joinpath('y_test.csv')
FEATURES_OUT = RESULTS.joinpath('features_list.csv')
PICKLE_PREPROC = MODELS.joinpath('preprocessor.pkl')

print('Input cleaned CSV:', CSV_IN)

Input cleaned CSV: d:\Livstream\ Car Price Prediction with Machine Learning\notebooks\data\processed\cleaned_car_data.csv


In [2]:
# Read the cleaned dataset; stop with a readable message if it cannot be loaded.
try:
    df = pd.read_csv(CSV_IN, low_memory=False)
    print('Loaded cleaned data with shape:', df.shape)
except FileNotFoundError:
    sys.exit(f'ERROR: cleaned data not found at {CSV_IN}. Run preprocessing first.')
except Exception as read_error:
    sys.exit(f'ERROR reading cleaned CSV: {read_error}')

# The regression target must be present before any feature work starts.
target_present = 'Selling_Price' in df.columns
if not target_present:
    sys.exit('ERROR: Target column Selling_Price not found in cleaned data.')

# Feature engineering operates on an independent copy; `df` stays as loaded.
df_feat = df.copy(deep=True)

Loaded cleaned data with shape: (299, 10)


In [None]:
# Derive a Brand feature from the first whitespace-separated token of Car_Name,
# then drop the raw name column so no free-text column reaches model training.
if 'Car_Name' in df_feat.columns:
    try:
        df_feat['Brand'] = (
            df_feat['Car_Name']
            .astype(str)
            .str.split()
            .str[0]
            .str.upper()
            # astype(str) renders missing names as 'nan'; map them back to NaN
            .replace({'NAN': np.nan})
        )
        print('Extracted Brand from Car_Name')
    except Exception as e:
        print('Could not extract Brand:', e)

    # Reassign instead of inplace=True so re-running the cell stays predictable.
    df_feat = df_feat.drop(columns=['Car_Name'])
    print('Dropped Car_Name to avoid string features')
else:
    print('Car_Name not present; skipping Brand extraction')

# Choose numeric columns (prefer *_num variants created in preprocessing)
candidate_numeric = [c for c in ['Mileage_num', 'Engine_num', 'Power_num', 'Age'] if c in df_feat.columns]
# Also include any other numeric columns except the target
other_numeric = df_feat.select_dtypes(include=[np.number]).columns.tolist()
other_numeric = [c for c in other_numeric if c not in candidate_numeric and c != 'Selling_Price']
# dict.fromkeys preserves order while removing duplicates
numeric_cols = list(dict.fromkeys(candidate_numeric + other_numeric))
print('Numeric columns used:', numeric_cols)

# Categorical columns: object dtype, never the raw Car_Name or the target
cat_cols = [
    c for c in df_feat.select_dtypes(include=['object']).columns
    if c not in ('Car_Name', 'Selling_Price')
]
# Ensure Brand is included if present (e.g. when its dtype is not object)
if 'Brand' in df_feat.columns and 'Brand' not in cat_cols:
    cat_cols.append('Brand')
print('Categorical columns used:', cat_cols)

Extracted Brand from Car_Name
Numeric columns used: ['Age', 'Year', 'Present_Price', 'Driven_kms', 'Owner']
Categorical columns used: ['Fuel_Type', 'Selling_type', 'Transmission', 'Brand']


In [4]:
# Fill numeric missing values with the column median.
# An all-missing column yields a NaN median without raising, so the fallback
# to 0 has to be checked explicitly rather than relying on an exception.
for c in numeric_cols:
    try:
        med = df_feat[c].median()
    except Exception as e:
        # e.g. a *_num column that still holds strings
        print(f'Could not compute median for {c} ({e}); filling missing values with 0')
        med = 0
    if pd.isna(med):
        med = 0
    df_feat[c] = df_feat[c].fillna(med)

# For categoricals, fill missing with 'Unknown'. Failures are reported instead
# of silently ignored, because unfilled NaNs become all-zero dummy rows below.
for c in cat_cols:
    try:
        df_feat[c] = df_feat[c].fillna('Unknown')
    except Exception as e:
        print(f'Could not fill missing values in {c}: {e}')

# One-hot encode categoricals (use get_dummies on full dataset to avoid mismatch later)
try:
    if len(cat_cols) > 0:
        df_encoded = pd.get_dummies(df_feat, columns=cat_cols, drop_first=True)
    else:
        df_encoded = df_feat.copy()
    print('Applied one-hot encoding; new shape:', df_encoded.shape)
except Exception as e:
    sys.exit(f'ERROR during encoding: {e}')

Applied one-hot encoding; new shape: (299, 54)


In [5]:
# Prepare X and y
target = 'Selling_Price'
if target not in df_encoded.columns:
    sys.exit('ERROR: target missing after encoding')
X = df_encoded.drop(columns=[target])
y = df_encoded[target]

# Identify final numeric columns in X (for scaling)
final_numeric = [c for c in X.columns if c in numeric_cols]
print('Final numeric columns to scale:', final_numeric)

# Scale numeric columns with StandardScaler.
# The scaler is fitted on the training rows only, so the test rows do not leak
# into the mean/std estimates. The row positions are obtained with the same
# test_size and random_state as the train/test split performed later in this
# notebook; keep the two calls in sync, otherwise the fitted rows and the
# actual training rows diverge.
scaler = StandardScaler()
if len(final_numeric) > 0:
    train_pos, holdout_pos = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)
    scaler.fit(X.iloc[train_pos][final_numeric])
    X[final_numeric] = scaler.transform(X[final_numeric])
else:
    # NOTE: the scaler saved below is unfitted in this case.
    print('No numeric columns found for scaling; skipping scaler fit')

# Save features list and scaler
features_list = X.columns.tolist()
pd.DataFrame({'feature': features_list}).to_csv(FEATURES_OUT, index=False)
with open(PICKLE_PREPROC, 'wb') as f:
    pickle.dump({'scaler': scaler, 'features': features_list, 'numeric_features': final_numeric}, f)
print('Saved preprocessor to', PICKLE_PREPROC)

Final numeric columns to scale: ['Year', 'Present_Price', 'Driven_kms', 'Owner', 'Age']
Saved preprocessor to d:\Livstream\ Car Price Prediction with Machine Learning\notebooks\models\preprocessor.pkl


In [6]:
# Hold out 20% of the rows for testing (fixed seed) and persist all four parts.
try:
    split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split_parts

    # Feature matrices
    X_train.to_csv(X_TRAIN_OUT, index=False)
    X_test.to_csv(X_TEST_OUT, index=False)

    # Targets (written with a header row)
    y_train.to_csv(Y_TRAIN_OUT, index=False, header=True)
    y_test.to_csv(Y_TEST_OUT, index=False, header=True)

    print('Saved train/test splits to', DATA_PROCESSED)
except Exception as split_error:
    sys.exit(f'ERROR saving train/test splits: {split_error}')

Saved train/test splits to d:\Livstream\ Car Price Prediction with Machine Learning\notebooks\data\processed


In [7]:
# Write up to 50 randomly chosen training rows (fixed seed) for quick inspection.
# Best effort: a failure here is reported but does not stop the notebook.
try:
    sample_path = RESULTS / 'feature_sample_X_train.csv'
    sample_size = min(50, len(X_train))
    X_train.sample(n=sample_size, random_state=42).to_csv(sample_path, index=False)
    print('Saved feature sample to', sample_path)
except Exception as sample_error:
    print('Could not save feature sample:', sample_error)

Saved feature sample to d:\Livstream\ Car Price Prediction with Machine Learning\notebooks\outputs\results\feature_sample_X_train.csv


## End of Feature Engineering

Saved outputs:
- `data/processed/X_train.csv`, `X_test.csv`, `y_train.csv`, `y_test.csv`
- `models/preprocessor.pkl` (pickle of a dict with keys `scaler`, `features`, and `numeric_features`)
- `outputs/results/features_list.csv` and `outputs/results/feature_sample_X_train.csv`

Next: run `04_model_training.ipynb` to train `RandomForestRegressor` using these files.