In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# imports and check files
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

import joblib

# show files in the dataset folder
# (`!` is the IPython shell escape; `ls -la` prints a long listing that includes hidden entries)
!ls -la /kaggle/input/car-price-prediction


In [None]:
# Read the car-price CSV from the attached Kaggle dataset into `df`
# and preview the first rows as the cell output.
csv_path = (
    "/kaggle/input/car-price-prediction/CarPrice_Assignment.csv"
)
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head()


In [None]:
# quick EDA: frame size, column dtypes / non-null counts, numeric summary, missing values
print("rows, cols:", df.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() would only add a stray "None" to the cell output.
df.info()
# transpose so each feature is a row, which is easier to read for wide frames
display(df.describe().T)
print("missing values per column:")
display(df.isnull().sum())


In [None]:
# Derive a lower-cased `brand` feature from the first space-separated word of CarName,
# then normalise the known misspellings / abbreviations listed in `typos`.
typos = {
    'maxda': 'mazda',
    'porcshce': 'porsche',
    'toyouta': 'toyota',
    'vokswagen': 'volkswagen',
    'vw': 'volkswagen',
}
df['brand'] = df['CarName'].map(lambda name: str(name).split(' ')[0].lower())
df['brand'] = df['brand'].replace(typos)

# The row ID and the raw CarName are no longer needed once `brand` exists.
df = df.drop(columns=['car_ID', 'CarName'])

df.head()


In [None]:
# Collect the names of all object-dtype columns; these are the ones encoded in the next step.
cat_cols = list(df.select_dtypes(include='object').columns)
cat_cols


In [None]:
# One-hot encode the categorical columns; drop_first=True removes one level per
# feature so the dummy columns are not perfectly collinear.
df = pd.get_dummies(
    data=df,
    columns=cat_cols,
    drop_first=True,
)
print("after encoding shape:", df.shape)


In [None]:
# Separate the target (`price`) from the features, then hold out 20% of the rows
# for testing with a fixed seed so the split is reproducible.
y = df["price"]
X = df.drop(columns=["price"])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)
print("train shape:", X_train.shape, "test shape:", X_test.shape)


In [None]:
# Standardise features: the scaler statistics are learned from the training
# split only and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Linear Regression baseline, trained and evaluated on the standardised features
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

y_pred_lr = lr.predict(X_test_scaled)

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6, so it fails on current versions.
# The builtin round() is used instead of `.round()` because it works whether the metric
# comes back as a NumPy scalar or a plain Python float (which has no `.round` method).
lr_r2 = float(r2_score(y_test, y_pred_lr))
lr_rmse = float(np.sqrt(mean_squared_error(y_test, y_pred_lr)))
print("LR R2:", round(lr_r2, 4))
print("LR RMSE:", round(lr_rmse, 2))


In [None]:
# Random Forest, trained on the unscaled features with a fixed seed for reproducibility
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6, so it fails on current versions.
# The builtin round() is used instead of `.round()` because it works whether the metric
# comes back as a NumPy scalar or a plain Python float (which has no `.round` method).
rf_r2 = float(r2_score(y_test, y_pred_rf))
rf_rmse = float(np.sqrt(mean_squared_error(y_test, y_pred_rf)))
print("RF R2:", round(rf_r2, 4))
print("RF RMSE:", round(rf_rmse, 2))


In [None]:
# Make sure the output folder for saved figures exists.
os.makedirs('/kaggle/working/results', exist_ok=True)

# Random Forest: predicted vs. actual test prices. The dashed red diagonal spans the
# actual price range and marks where prediction equals the true value.
price_span = [y_test.min(), y_test.max()]
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, y_pred_rf, alpha=0.5)
ax.plot(price_span, price_span, 'r--')
ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.set_title("Actual vs Predicted (Random Forest)")
fig.tight_layout()
fig.savefig('/kaggle/working/results/pred_vs_actual_rf.png')
plt.show()

# Rank the Random Forest feature importances and keep the 20 largest.
all_importances = pd.Series(rf.feature_importances_, index=X.columns)
feat_imp = all_importances.sort_values(ascending=False).iloc[:20]

# Horizontal bars are drawn bottom-up, so reverse the order to show the most
# important feature at the top of the chart.
fig, ax = plt.subplots(figsize=(8, 6))
feat_imp.iloc[::-1].plot(kind='barh', ax=ax)
ax.set_title("Top 20 Feature Importances (RF)")
fig.tight_layout()
fig.savefig('/kaggle/working/results/feature_importances.png')
plt.show()


In [None]:
# Persist both fitted models and the fitted scaler under /kaggle/working.
for fitted_object, target_path in (
    (rf, "/kaggle/working/rf_model.joblib"),
    (lr, "/kaggle/working/lr_model.joblib"),
    (scaler, "/kaggle/working/scaler.joblib"),
):
    joblib.dump(fitted_object, target_path)

# list files to confirm the dumps above were written
# (`!` is the IPython shell escape; `ls -la` prints a long listing that includes hidden entries)
!ls -la /kaggle/working


In [None]:
# Write README.md and requirements.txt for the accompanying repository.
# The file list includes every artifact this notebook saves, including the
# Linear Regression model and the scaler it depends on.
readme_text = """# Car Price Prediction

This repository contains a Kaggle notebook that:
- loads the Car Price dataset (CarPrice_Assignment.csv),
- does basic EDA and cleaning,
- trains Linear Regression and Random Forest models to predict car price,
- saves results and models.

Files:
- `used_car_price_prediction.ipynb` — the notebook.
- `results/` — saved plots.
- `rf_model.joblib` — trained Random Forest model.
- `lr_model.joblib` — trained Linear Regression model (expects scaled features).
- `scaler.joblib` — fitted StandardScaler used for the Linear Regression inputs.
- `requirements.txt` — minimal packages to run locally.

Dataset: https://www.kaggle.com/datasets/hellbuoy/car-price-prediction
"""

# The README contains non-ASCII characters (em dashes), so the encoding is set
# explicitly instead of relying on the platform default.
with open('/kaggle/working/README.md', 'w', encoding='utf-8') as f:
    f.write(readme_text)

requirements = """pandas
numpy
scikit-learn
matplotlib
seaborn
joblib
"""
with open('/kaggle/working/requirements.txt', 'w', encoding='utf-8') as f:
    f.write(requirements)

# list the working directory to confirm README.md and requirements.txt were written
!ls -la /kaggle/working
