In [48]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.metrics import make_scorer 
from sklearn.feature_selection import RFECV
from sklearn.kernel_ridge import KernelRidge
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV

import warnings

# Silence every warning category for the rest of the session so that library
# notices do not clutter cell output.
warnings.filterwarnings(action='ignore')

# Use seaborn's "white" theme for all figures drawn below.
sns.set_theme(style="white")

In [2]:
import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

**Load data with pandas**

In [10]:
# Read the car listing CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/vehicle-dataset-from-cardekho/car data.csv")

In [11]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

**Order data by Year**

In [13]:
# Reorder the rows by ascending Year (original index labels are kept), then
# print the column dtypes and non-null counts.
df = df.sort_values("Year")
df.info()

In [14]:
# Number of missing values in each column.
df.isna().sum()


In [15]:
# Remove the Car_Name column; it is not used as a feature below.
df = df.drop(columns=['Car_Name'])


**Distribution of Some Features.**

In [17]:
# Kernel density estimate of the target variable.
# `sns.distplot` is deprecated; with hist=False it drew only the KDE curve (so
# the `bins` argument had no effect). `sns.kdeplot` is the supported way to
# draw that curve.
sns.kdeplot(data=df, x="Selling_Price")

In [19]:
# 2x2 grid holding one percentage histogram per discrete feature.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))

# (column, bar colour) pairs listed in row-major panel order.
panels = [
    ("Fuel_Type", "skyblue"),
    ("Seller_Type", "olive"),
    ("Transmission", "gold"),
    ("Owner", "gold"),
]
for ax, (column, colour) in zip(axs.flat, panels):
    sns.histplot(data=df, x=column, stat="percent", color=colour, discrete=True, ax=ax)

plt.show()

In [20]:
# One-hot encode every object-dtype column (Car_Name is explicitly skipped).
# Every category level gets its own indicator column because drop_first=False.
binary_cols = [
    name
    for name, dtype in df.dtypes.items()
    if dtype == "O" and name != "Car_Name"
]

df = pd.get_dummies(data=df, columns=binary_cols, drop_first=False)

df.head(n=5)


**Feature Selection**

In [24]:
# Work on a copy so the encoded frame `df` itself is left untouched.
df1 = df.copy()

# Features are every column except the target; the target is Selling_Price.
X = df1.drop(columns=["Selling_Price"])
y = df1["Selling_Price"]


In [25]:
# Fit a default-parameter XGBoost regressor on the full feature matrix and
# rank the columns by the importance score the fitted model reports.
xgb = XGBRegressor()
xgb.fit(X, y)

imp = pd.DataFrame(
    {'Importance': xgb.feature_importances_},
    index=X.columns,
).sort_values(by='Importance', ascending=False)
print(imp)

In [33]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as a test set. random_state is fixed so that the
# shuffle -- and therefore the selected features and every score reported
# below -- is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42
)

# Report the resulting split sizes.
print("Shape of X_train: {}".format(X_train.shape))
print("Shape of y_train: {}".format(y_train.shape))
print("Shape of X_test: {}".format(X_test.shape))
print("Shape of y_test: {}".format(y_test.shape))

In [34]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values (list, tuple, ndarray, pandas Series, ...).
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y_true - y_pred) ** 2))``.
    """
    # Coerce both inputs to plain float arrays so the subtraction is always
    # positional: plain lists work, and two pandas Series with different index
    # labels are not aligned by label (which would yield NaN or wrong pairs).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

def nrmse(y_true, y_pred):
    """Return the RMSE of the predictions with its sign flipped, for use as a score."""
    return -rmse(y_true, y_pred)

# Wrap the negated RMSE as a scikit-learn scorer (higher value = better fit).
neg_rmse = make_scorer(nrmse)

# Recursive feature elimination with 3-fold cross-validation around a
# default-parameter XGBoost regressor, fitted on the training split only.
estimator = XGBRegressor()
selector = RFECV(estimator, cv=3, n_jobs=-1, scoring=neg_rmse).fit(X_train, y_train)

print("The number of selected features is: {}".format(selector.n_features_))

# Keep only the columns flagged by the selector's boolean support mask,
# in both the training and the test split.
features_kept = X_train.columns.to_numpy()[selector.support_]
X_train, X_test = X_train[features_kept], X_test[features_kept]

In [35]:
# Display the column names retained by the RFECV selector.
features_kept

In [50]:
from sklearn.linear_model import Lasso

# Lasso regression with a very small L1 penalty (alpha=0.0001) and a high
# iteration cap, fitted on the RFECV-selected training features.
lasso001 = Lasso(alpha=0.0001, max_iter=100000)
lasso001.fit(X_train, y_train)

# Score both splits with the estimator's own .score() method, and count how
# many coefficients the L1 penalty left non-zero.
train_score = lasso001.score(X_train, y_train)
test_score = lasso001.score(X_test, y_test)
n_features_used = np.count_nonzero(lasso001.coef_)

print("Training set score: {:.2f}".format(train_score))
print("Test set score: {:.2f}".format(test_score))
print("Number of features used: {}".format(n_features_used))