In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import clone
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder,OrdinalEncoder,PolynomialFeatures
from sklearn.linear_model import LinearRegression , Ridge , Lasso , ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
import warnings
# NOTE(review): this silences every warning category for the rest of the kernel
# session, including deprecation and convergence warnings raised by later cells.
warnings.filterwarnings('ignore')

In [None]:
# CSV locations of the two dataset splits inside the Kaggle input directory.
train_data, test_data = (
    '/kaggle/input/used-cars-price-prediction/train-data.csv',
    '/kaggle/input/used-cars-price-prediction/test-data.csv',
)

# Exploratory Data Analysis (EDA)

In [None]:
# Read the training split from the `train_data` CSV path into the working frame `df`.
df = pd.read_csv(train_data)
# Preview the first rows as the cell output.
df.head()

In [None]:
# Read the second split from the `test_data` CSV path and preview it.
# NOTE(review): `df_test` is not referenced again in the cells visible here.
df_test = pd.read_csv(test_data)
df_test.head()

In [None]:
# Normalise the header to lower case; every later cell addresses columns by lower-case name.
df.columns = [label.lower() for label in df.columns]

In [None]:
# (rows, columns) of the training frame.
df.shape

In [None]:
# Per-column dtype and non-null count overview.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove two columns that are not used as features: 'unnamed: 0' (presumably the
# row index saved into the CSV -- confirm) and 'new_price'.
# Rebinding `df` instead of using inplace=True, together with errors='ignore',
# makes this cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
df = df.drop(columns=['unnamed: 0', 'new_price'], errors='ignore')
df.head()

In [None]:
# Column groupings used by the plotting and preprocessing cells.
# They are kept as DataFrame snapshots (not plain lists) because later cells
# read `.columns` from them.
_categorical_names = ['name', 'location', 'year', 'fuel_type', 'transmission', 'owner_type', 'seats']
_numerical_names = ['kilometers_driven', 'mileage', 'engine', 'power']
categorical_features = df[_categorical_names]
numerical_features = df[_numerical_names]

In [None]:
# Histogram grid of the frame's numeric columns; the returned axes array is echoed as cell output.
df.hist()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Summary statistics (count / unique / top / freq) for the object-dtype columns.
df.describe(include = 'O')

In [None]:
# Keep only the first whitespace-separated token of each car name so the
# 'name' axis has a manageable number of levels.
df['name'] = df['name'].str.split().str[0]

count_columns = categorical_features.columns
fig, axs = plt.subplots(4, 2, figsize=(50, 50), constrained_layout=True)
axs = axs.flatten()  # 1-D view of the 4x2 grid so panels can be walked in order

# One count plot per categorical column.
for ax, column in zip(axs, count_columns):
    sns.countplot(data=df, x=column, ax=ax)
    ax.set_title('{} count plot'.format(column), fontsize=35)
    ax.tick_params(axis='x', rotation=70, labelsize=20)
    ax.tick_params(axis='y', labelsize=20)

# The grid has more panels than columns; remove the unused ones.
for spare_ax in axs[len(count_columns):]:
    fig.delaxes(spare_ax)

plt.suptitle('Categorical Features Count Plots', x=0.5, y=1.03, fontsize=40)
plt.show()


In [None]:
# Frequency of each distinct seat count.
df['seats'].value_counts()

In [None]:
# Remove rows whose seat count is exactly 0; rows with a missing seat count are kept.
zero_seat_rows = df.index[df['seats'] == 0]
df.drop(index=zero_seat_rows, inplace=True)

In [None]:
price_columns = categorical_features.columns
fig, axs = plt.subplots(4, 2, figsize=(50, 50), constrained_layout=True)
axs = axs.ravel()

# One bar plot of price per categorical column, bars sorted from the
# highest to the lowest mean price.
for ax, column in zip(axs, price_columns):
    mean_price = df.groupby(column)['price'].mean()
    order = mean_price.sort_values(ascending=False).index
    sns.barplot(data=df, x=column, y='price', ax=ax, order=order)

    ax.set_title('{} vs price'.format(column), fontsize=35)
    ax.tick_params(axis='x', rotation=70, labelsize=20)
    ax.tick_params(axis='y', labelsize=20)

# Remove the panels of the 4x2 grid that received no plot.
for spare_ax in axs[len(price_columns):]:
    fig.delaxes(spare_ax)

plt.suptitle('Categorical Features vs Price', x=0.5, y=1.03, fontsize=40)
plt.show()



In [None]:
# Distribution of the target before any transformation.
price_fig, price_ax = plt.subplots()
sns.histplot(df['price'], ax=price_ax)
plt.show()

In [None]:
# Replace the target with its natural logarithm and re-plot its distribution.
# NOTE: this overwrites 'price' in place, so the cell is not idempotent --
# running it twice applies the log twice. Everything downstream (model
# targets, predictions, scores) is therefore on the log-price scale.
log_price = np.log(df['price'])
df['price'] = log_price

log_fig, log_ax = plt.subplots()
sns.histplot(df['price'], ax=log_ax)
plt.show()

# Preprocessing

In [None]:
# Preview the frame before the preprocessing steps below.
df.head()

In [None]:
def mileage_convert(x):
    """Convert a raw mileage entry such as ``'19.67 kmpl'`` to a float.

    Parameters
    ----------
    x : object
        A string of the form ``'<number> <unit>'`` or a missing value.

    Returns
    -------
    float
        * ``'<number> kmpl'``  -> the number unchanged.
        * ``'<number> km/kg'`` -> the number multiplied by 1.40 (the
          conversion factor this notebook uses for km/kg readings).
        * anything else (non-string, empty string, unknown unit,
          non-numeric value) -> ``np.nan``.
    """
    if not isinstance(x, str):
        return np.nan

    tokens = x.split()
    if len(tokens) < 2:
        # Empty string or a lone token: there is no "<number> <unit>" pair to parse.
        return np.nan

    try:
        value = float(tokens[0])
    except ValueError:
        return np.nan

    unit = tokens[-1]
    if unit == 'km/kg':
        return value * 1.40
    if unit == 'kmpl':
        return value
    # Unknown unit: report as missing explicitly instead of falling through to None.
    return np.nan

# Parse every raw mileage entry with mileage_convert, element by element.
df['mileage'] = df['mileage'].map(mileage_convert)
df.head()

In [None]:
# For every object-dtype column keep only the first whitespace-separated token
# (e.g. the numeric part of a "<number> <unit>" entry).
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].str.split().str[0]

# Parse the measurement columns as floats. Entries that cannot be parsed become
# NaN and are deliberately left missing: replacing them with 0 would inject
# impossible values into the distributions and bypass the median imputer used
# in the numeric preprocessing pipeline.
for col in ['mileage', 'engine', 'power']:
    df[col] = pd.to_numeric(df[col], errors='coerce').astype(float)

# Treat 'year' as a string label rather than a number.
df['year'] = df['year'].astype(str)
df.info()

In [None]:
# Preview the frame after the type conversions.
df.head()

# Outlier Handling

In [None]:
box_columns = numerical_features.columns
fig, axs = plt.subplots(2, 2, figsize=(20, 10), constrained_layout=True)
axs = axs.flatten()

# One box plot per numerical column, to eyeball outliers before filtering.
for i, c in enumerate(box_columns):
    sns.boxplot(data=df, x=c, ax=axs[i])
    axs[i].set_title('{} box plot'.format(c), fontsize=15)

# Remove any panel that received no plot. The count must come from the
# numerical columns being plotted here, not from the categorical ones.
for j in range(len(box_columns), len(axs)):
    fig.delaxes(axs[j])

plt.suptitle('Numerical Features Box Plots', x=0.5, y=1.06, fontsize=20)
plt.show()

In [None]:
def outliers_handling(x, p1=0.25 , p2=0.75):
    """Return the values of ``x`` that lie within the 1.5 * IQR fences.

    Parameters
    ----------
    x : pandas.Series
        Numeric series to filter.
    p1, p2 : float
        Lower and upper quantile levels used to measure the spread
        (defaults: first and third quartile).

    Returns
    -------
    pandas.Series
        The entries of ``x`` inside ``[q_low - 1.5*IQR, q_high + 1.5*IQR]``,
        with their original index labels. Missing values are not returned.

    Notes
    -----
    The fences are inclusive. With strict inequalities a series whose IQR is 0
    (for example a constant column) has identical lower and upper fences and
    would lose every value.
    """
    q_low = x.quantile(p1)
    q_high = x.quantile(p2)
    iqr = q_high - q_low
    min_limit = q_low - 1.5 * iqr
    max_limit = q_high + 1.5 * iqr
    return x[(x >= min_limit) & (x <= max_limit)]

In [None]:
# Filter each numerical column independently. The filtered series is shorter
# than the frame, so assigning it back aligns on the index and leaves NaN in
# the rows whose value was rejected.
for col in numerical_features.columns:
    kept_values = outliers_handling(df[col])
    df[col] = kept_values

In [None]:
# Same 2x2 box-plot grid as before, redrawn after the outlier filter.
fig, axs = plt.subplots(2, 2, figsize=(20, 10), constrained_layout=True)
axs = axs.ravel()
for ax, column in zip(axs, numerical_features.columns):
    sns.boxplot(data=df, x=column, ax=ax)
    ax.set_title('{} box plot'.format(column), fontsize=15)

plt.suptitle('Numerical Features Box Plots', x=0.5, y=1.06, fontsize=20)
plt.show()

In [None]:
# Preview the frame after outlier filtering.
df.head()

# Data Splitting

In [None]:
# Target is the (log-transformed) price column; every other column is a feature.
y = df['price']
x = df.drop(columns=['price'])

In [None]:
# Hold out 20% of the rows for the final evaluation. A fixed random_state makes
# the split, and therefore every score computed from it, reproducible across
# kernel restarts.
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size = 0.2, random_state = 42)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

In [None]:
# Plain-list versions of the two column groupings, for the ColumnTransformer.
cal_cols = list(categorical_features.columns)
num_cols = list(numerical_features.columns)
for column_group in (cal_cols, num_cols):
    print(column_group)

In [None]:
# Categorical columns split by encoding strategy: nominal columns are one-hot
# encoded, ordinal columns are mapped to integer codes.
nominal_cols = ['name', 'location', 'fuel_type', 'transmission']
# NOTE(review): the ordinal encoder is created without an explicit category
# order, so codes follow the sorted order of the observed values -- confirm
# that this matches the intended ranking, in particular for 'owner_type'.
ordinal_cols = ['year', 'owner_type', 'seats']

In [None]:
# Numeric branch: median-impute, expand to degree-3 polynomial terms, standardise.
# The step names ('poly', ...) are referenced by the hyperparameter grids, so
# they must not be renamed.
num_pipeline = Pipeline(steps=[
    ('num_imputer', SimpleImputer(strategy='median')),
    ('poly', PolynomialFeatures(degree=3)),
    ('scaler', StandardScaler()),
])

# Ordinal branch: fill gaps with the mode, then map levels to integer codes.
# Levels unseen during fit are encoded as -1.
ordinal_pipeline = Pipeline(steps=[
    ('ord_imputer', SimpleImputer(strategy='most_frequent')),
    ('ord_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Nominal branch: fill gaps with the mode, then one-hot encode (first level
# dropped, dense output, unknown levels ignored at transform time).
nominal_pipeline = Pipeline(steps=[
    ('nom_imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
])

# Route each column group to its branch; emit DataFrames instead of arrays.
preprocessor = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, num_cols),
    ('ordinal_pipeline', ordinal_pipeline, ordinal_cols),
    ('nominal_pipeline', nominal_pipeline, nominal_cols),
])
preprocessor = preprocessor.set_output(transform='pandas')

In [None]:
# Fit the preprocessing transformer on the training split only, then apply the
# already-fitted transformer to the hold-out split.
x_train_prep = preprocessor.fit_transform(x_train)
x_test_prep = preprocessor.transform(x_test)

In [None]:
# Display the transformed training features (a DataFrame, because the
# transformer is configured for pandas output).
x_train_prep

# Model Training

In [None]:
# Candidate regressors, all with default hyperparameters. The tree-based models
# are seeded so that their cross-validation scores are reproducible run to run.
models = {
    'LinearRegression': LinearRegression(),
    'Ridge' : Ridge(),
    'Lasso' : Lasso(),
    'ElasticNet' : ElasticNet(),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'SVR' : SVR(),
}

# Mean 5-fold cross-validated R² on the preprocessed training split, per model.
model_results = {}
for model_name, model in models.items():
    scores = cross_val_score(model, x_train_prep, y_train, cv=5, scoring='r2')
    model_results[model_name] = scores.mean()

print("\nFinal Results:")
for model_name, score in model_results.items():
    print(f'{model_name}: {score:.4f}')

In [None]:
# Bar chart of the mean cross-validated R² of each baseline model.
model_names = [*model_results.keys()]
r2_scores = [*model_results.values()]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(model_names, r2_scores)

ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('R² Score', fontsize=12)
ax.set_title('R² Scores of Regression Models', fontsize=14)
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()

In [None]:
# Rank the baseline models by mean R² (best first) and show the top two.
sorted_results = sorted(model_results.items(), key=lambda item: item[1], reverse=True)
top_two_models = sorted_results[:2]
# The loop variable is deliberately NOT called `r2_score`: that name is the
# metric function imported from sklearn.metrics, and rebinding it to a float
# would break any later call to it.
for model_name, score in top_two_models:
    print(f'{model_name}: {score:.4f}')

# Hyperparameter Tuning

In [None]:
best_model = None  # Fitted pipeline of the best model found so far
best_score = -float('inf')  # Best cross-validated R² found so far
best_model_name = ''  # Name of the best model found so far
best_params = {}  # Hyperparameters of the best model found so far
model_results = {}  # Full cv_results_ table of each model's grid search

# Parameter name of the polynomial degree inside the preprocessing pipeline.
poly_degree_param = 'preprocessor__num_pipeline__poly__degree'

# Search space per model: (polynomial degrees, model-specific grid).
# Models that are not listed only tune the polynomial degree over [2, 3].
search_spaces = {
    'Ridge': ([2, 3, 4, 5], {'alpha': [0.1, 1, 10]}),
    'Lasso': ([2, 3, 4, 5], {'alpha': [0.1, 1, 10]}),
    'ElasticNet': ([2, 3, 4, 5], {'alpha': [0.1, 1, 10], 'l1_ratio': [0.1, 0.5, 0.9]}),
    'RandomForestRegressor': ([2, 3], {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]}),
    'DecisionTreeRegressor': ([2, 3], {'max_depth': [None, 10, 20]}),
    'KNeighborsRegressor': ([2, 3], {'n_neighbors': [3, 5, 7]}),
    'SVR': ([2, 3], {'C': [0.1, 1, 10], 'epsilon': [0.01, 0.1, 1]}),
}
default_space = ([2, 3], {})

# Grid-search every model on the raw training split; preprocessing is part of
# the pipeline so it is re-fitted inside each CV fold.
for model_name, model in models.items():
    print(f"Running GridSearchCV for {model_name}...")

    step_name = model_name.lower()
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        (step_name, model)
    ])

    degrees, model_grid = search_spaces.get(model_name, default_space)
    params = {poly_degree_param: degrees}
    for param_name, values in model_grid.items():
        params[f'{step_name}__{param_name}'] = values

    grid_search = GridSearchCV(pipeline, params, cv=3, scoring='r2')
    grid_search.fit(x_train, y_train)

    # Keep the full CV table for later inspection.
    model_results[model_name] = pd.DataFrame(grid_search.cv_results_)

    print(f"Best parameters for {model_name}: {grid_search.best_params_}\n")

    # Update the running best here, inside the loop: `grid_search` is rebound
    # on every iteration, so after the loop only the last model's search is
    # still reachable and the other fitted estimators are gone.
    if grid_search.best_score_ > best_score:
        best_score = grid_search.best_score_
        best_model = grid_search.best_estimator_
        best_model_name = model_name
        best_params = grid_search.best_params_

# To view the results for each model:
for model_name, result_df in model_results.items():
    print(f"Results for {model_name}:")
    print(result_df[['params', 'mean_test_score']].sort_values(by='mean_test_score', ascending=False))
    print("\n")


In [None]:
# Pick the overall winner from the CV tables of *every* tuned model.
# The `grid_search` object left over from the tuning loop only describes the
# model that was tuned last, so comparing it alone cannot find the best model.
best_cv_by_model = pd.Series({
    name: cv_table['mean_test_score'].max()
    for name, cv_table in model_results.items()
})
winner_name = best_cv_by_model.idxmax()
winner_table = model_results[winner_name]
winner_row = winner_table.loc[winner_table['mean_test_score'].idxmax()]

# Fit the winning configuration on the training split, unless a fitted
# estimator for that same model is already held in `best_model`.
if best_model is None or best_model_name != winner_name:
    best_model = clone(Pipeline([
        ('preprocessor', preprocessor),
        (winner_name.lower(), models[winner_name])
    ]))
    best_model.set_params(**winner_row['params'])
    best_model.fit(x_train, y_train)

best_model_name = winner_name
best_score = float(winner_row['mean_test_score'])
best_params = winner_row['params']

# Print the best model and its score
print(f"Best Model: {best_model_name} with R² Score: {best_score}")
print(f"Best Parameters: {best_params}")

# Save Model

In [None]:
import pickle

# Persist the selected estimator. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [None]:
# Restore the estimator from disk.
# SECURITY NOTE: unpickling executes arbitrary code -- only load a file that
# this notebook itself produced.
with open('model.pkl', 'rb') as model_file:
    best_model = pickle.load(model_file)

# Predict on the hold-out features and show the predictions.
y_pred = best_model.predict(x_test)
y_pred

In [None]:
# Hold-out targets, for side-by-side comparison with the predictions.
y_test

In [None]:
# Sanity check: targets and predictions must have the same length.
for label, values in (('y_test shape:', y_test), ('y_pred shape:', y_pred)):
    print(label, values.shape)

In [None]:
# Coefficient of determination computed by hand: R² = 1 - SS_res / SS_tot.
# Both y_test and y_pred are on the log-price scale, so this score describes
# the fit in log space.
mean_y = y_test.mean()

# Residual and total sums of squares
SS_res = ((y_test - y_pred) ** 2).sum()
SS_tot = ((y_test - mean_y) ** 2).sum()

r2_manual = 1 - SS_res / SS_tot

print('R² Score (manual calculation):', r2_manual)