In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import re
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Notebook-wide display defaults: show every column, cap printed rows at 150.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)

# Apply the seaborn theme first, then layer the matplotlib overrides on top of it.
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',  # 8-digit hex colour with zero alpha
})

In [None]:
# Input path is kept exactly as written, including the doubled ".csv.csv" suffix
# -- TODO confirm it matches the dataset file name.
csv_path = "../input/car-pricecar-dekho/cardekho.csv.csv"
cars = pd.read_csv(csv_path)
cars.head(n=10)

In [None]:
# (rows, columns) of the frame as loaded, before any cleaning.
cars.shape

In [None]:
# Number of missing values in each column.
cars.isna().sum()

In [None]:
# Discard every row that has at least one missing value (the defaults, spelled out).
cars = cars.dropna(axis=0, how='any')

# Shape after the drop, for comparison with the shape shown earlier.
cars.shape

In [None]:
# Raw `torque` column values as a plain Python list, in row order.
torque_list = cars['torque'].tolist()

# Module-level accumulators filled by extracting_torque(); one entry per input item.
torque_rpm = []
torque_nm = []
def extracting_torque(x):
  """Parse free-text torque values and append the results to the global lists.

  For every item in ``x`` all numbers are extracted (thousands separators
  removed first). The largest number is appended to ``torque_rpm`` and the
  smallest to ``torque_nm``; when the text mentions "kgm" (any letter case)
  the smallest number is multiplied by 9.80665 and rounded to 2 decimals.

  Items that contain no number at all (including non-string values such as
  NaN) append ``nan`` to both lists instead of raising, so the output lists
  always stay aligned with the input.

  NOTE(review): when a value holds a single number, that number is used for
  both lists -- confirm this is acceptable for entries without an rpm figure.

  Parameters
  ----------
  x : iterable
      Raw torque values, typically strings.

  Returns
  -------
  None
      Results are appended to the module-level ``torque_rpm`` / ``torque_nm``.
  """
  for item in x:
    # str() lets missing / non-string values fall through to the "no numbers" branch.
    text = str(item).replace(',', '')
    # Raw string: '\d' inside a normal literal is an invalid escape sequence.
    numbers = [float(s) for s in re.findall(r'\d*\.?\d+', text)]
    if not numbers:
      torque_rpm.append(float('nan'))
      torque_nm.append(float('nan'))
      continue

    torque_rpm.append(max(numbers))

    smallest = min(numbers)
    # Case-insensitive unit check so "KGM" is converted just like "kgm".
    if 'kgm' in text.lower():
      torque_nm.append(round(smallest * 9.80665, 2))
    else:
      torque_nm.append(smallest)

# Fill the two torque lists, then preview the first ten raw and parsed values.
extracting_torque(torque_list)
for preview in (torque_list, torque_rpm, torque_nm):
    print(preview[:10])

In [None]:
# Attach the parsed torque lists as new columns (lists are assigned positionally).
for column, values in (('torque_rpm', torque_rpm), ('torque_nm', torque_nm)):
    cars[column] = values

cars.head(2)

In [None]:
# Raw `mileage` column values as a plain Python list, in row order.
mil_list = cars['mileage'].tolist()

# Module-level accumulator filled by extracting_mileage(); one entry per input item.
mileages = []
def extracting_mileage(x):
  """Parse the leading number(s) of each mileage string into ``mileages``.

  Each item is split on single spaces and tokens are converted to float until
  the first token that is not a number; the largest converted value is
  appended to the global ``mileages`` list. Items with no leading number, and
  items that are not strings, append ``nan`` instead of raising, so the
  output stays aligned with the input.

  Parameters
  ----------
  x : iterable
      Raw mileage values, typically strings.

  Returns
  -------
  None
      Results are appended to the module-level ``mileages`` list.
  """
  for item in x:
    temp = []
    if isinstance(item, str):
      for s in item.split(" "):
        try:
          temp.append(float(s))
        except ValueError:
          # Stop at the first non-numeric token (e.g. the unit).
          break
    mileages.append(max(temp) if temp else float('nan'))

# Fill `mileages`, then show the first two raw and parsed values.
extracting_mileage(mil_list)
for preview in (mil_list, mileages):
    print(preview[:2])

In [None]:
# Attach the parsed mileage values as a new column (the list is assigned positionally).
cars['mileage_num'] = mileages

# Quick look at the first two rows to confirm the new column.
cars.head(2)

In [None]:
# Raw `engine` column values as a plain Python list, in row order.
engine_list = cars['engine'].tolist()
# Module-level accumulator filled by extractingEngine(); one entry per input item.
engine_cc = []
def extractingEngine(x):
  """Parse the leading number(s) of each engine string into ``engine_cc``.

  Each item is split on single spaces and tokens are converted to float until
  the first token that is not a number; the largest converted value is
  appended to the global ``engine_cc`` list. Items with no leading number,
  and items that are not strings, append ``nan`` instead of raising, so the
  output stays aligned with the input.

  Parameters
  ----------
  x : iterable
      Raw engine values, typically strings.

  Returns
  -------
  None
      Results are appended to the module-level ``engine_cc`` list.
  """
  for item in x:
    temp = []
    if isinstance(item, str):
      for s in item.split(" "):
        try:
          temp.append(float(s))
        except ValueError:
          # Keep only the numeric prefix; stop at the first non-numeric token.
          break
    engine_cc.append(max(temp) if temp else float('nan'))

# Fill `engine_cc`, then show the first two raw and parsed values.
extractingEngine(engine_list)

for preview in (engine_list, engine_cc):
    print(preview[:2])

In [None]:
# Attach the parsed engine values as a new column (the list is assigned positionally).
cars['engine_cc'] = engine_cc

# Quick look at the first two rows to confirm the new column.
cars.head(2)

In [None]:
# Raw `max_power` column values as a plain Python list, in row order.
power_list = cars['max_power'].tolist()
# Module-level accumulator filled by extractingPower(); one entry per input item.
max_power = []
def extractingPower(x):
  """Parse the leading number(s) of each max-power string into ``max_power``.

  Each item is split on single spaces and tokens are converted to float until
  the first token that is not a number; the largest converted value is
  appended to the global ``max_power`` list. Items with no leading number
  (for example a bare unit), and items that are not strings, append ``nan``
  instead of raising, so the output stays aligned with the input.

  Parameters
  ----------
  x : iterable
      Raw max-power values, typically strings.

  Returns
  -------
  None
      Results are appended to the module-level ``max_power`` list.
  """
  for item in x:
    temp = []
    if isinstance(item, str):
      for s in item.split(" "):
        try:
          temp.append(float(s))
        except ValueError:
          # Stop at the first non-numeric token (e.g. the unit).
          break
    max_power.append(max(temp) if temp else float('nan'))

# Fill `max_power`, then show the first two raw and parsed values.
extractingPower(power_list)
for preview in (power_list, max_power):
    print(preview[:2])

In [None]:
# Attach the parsed power values under a new name; the raw `max_power` text column
# is still present at this point.
cars['max_power_new'] = max_power

# Quick look at the first two rows to confirm the new column.
cars.head(2)

In [None]:
# The raw text columns have numeric replacements now, so remove them.
cars = cars.drop(columns=['mileage', 'engine', 'max_power', 'torque'])

In [None]:
# Summary statistics for the numeric columns.
cars.describe()

In [None]:
# Correlate numeric columns only: the frame still contains text columns at this point,
# and DataFrame.corr() raises on non-numeric data in pandas >= 2.0 (older versions
# silently skipped those columns, so the plotted matrix is the same).
sns.heatmap(cars.select_dtypes(include='number').corr(), annot=True, cmap='plasma', linewidths=.5);

In [None]:
# Selling-price distribution coloured by fuel type, drawn from a random subset of rows.
# The seed is fixed so the figure is identical on every run, and the sample size is
# capped at the number of rows so a small frame does not raise.
px.histogram(cars.sample(min(1000, len(cars)), random_state=42), x='selling_price', color='fuel')

In [None]:
# Selling-price distribution coloured by transmission, drawn from a random subset of rows.
# The seed is fixed so the figure is identical on every run, and the sample size is
# capped at the number of rows so a small frame does not raise.
px.histogram(cars.sample(min(1000, len(cars)), random_state=42), x='selling_price', color='transmission')

In [None]:
# Selling-price distribution coloured by seller type, drawn from a random subset of rows.
# The seed is fixed so the figure is identical on every run, and the sample size is
# capped at the number of rows so a small frame does not raise.
px.histogram(cars.sample(min(1000, len(cars)), random_state=42), x='selling_price', color="seller_type")

In [None]:
# Selling price against parsed torque, drawn from a random subset of rows.
# The seed is fixed so the figure is identical on every run, and the sample size is
# capped at the number of rows so a small frame does not raise.
px.scatter(cars.sample(min(1000, len(cars)), random_state=42), x='selling_price', y='torque_nm')

In [None]:
# Selling price against parsed mileage, drawn from a random subset of rows.
# The seed is fixed so the figure is identical on every run, and the sample size is
# capped at the number of rows so a small frame does not raise.
px.scatter(cars.sample(min(1000, len(cars)), random_state=42), x='selling_price', y='mileage_num')

In [None]:
# Inspect the categorical columns one at a time: distinct values and their
# frequencies, starting with `fuel`.

cars['fuel'].value_counts()

In [None]:
# Distinct seller types and how often each occurs.
cars['seller_type'].value_counts()

In [None]:
# Distinct transmission labels and how often each occurs.
cars['transmission'].value_counts()

In [None]:
# Distinct owner labels and how often each occurs.
cars['owner'].value_counts()

In [None]:
# Binary-encode the transmission label; any label missing from the mapping becomes NaN.
transmission_vals = dict(Manual=1, Automatic=0)
cars['transmission'] = cars['transmission'].map(transmission_vals)

cars.head(2)

In [None]:
# Fit a one-hot encoder on `seller_type` and convert the encoded result to a dense array.
enc = OneHotEncoder()
one_hot = enc.fit_transform(cars[['seller_type']]).toarray()

# Category labels, in the same order as the columns of `one_hot`.
enc.categories_

In [None]:
# Name the indicator columns from the encoder's own category list instead of a
# hand-typed copy, so the labels always match the column order (and count) of `one_hot`.
seller_type_columns = list(enc.categories_[0])
cars[seller_type_columns] = one_hot
# The original text column is now redundant.
cars.drop(['seller_type'], axis=1, inplace = True)
cars.head(2)

In [None]:
# Re-fit the shared encoder on `fuel` and convert the encoded result to a dense array.
one_hot = enc.fit_transform(cars[['fuel']]).toarray()

# Category labels, in the same order as the columns of `one_hot`.
enc.categories_

In [None]:
# Name the indicator columns from the encoder's own category list instead of a
# hand-typed copy, so the labels always match the column order (and count) of `one_hot`.
fuel_columns = list(enc.categories_[0])
cars[fuel_columns] = one_hot
# The original text column is now redundant.
cars.drop(['fuel'], axis=1, inplace = True)
cars.head(2)

In [None]:
# Re-fit the shared encoder on `owner` and convert the encoded result to a dense array.
one_hot = enc.fit_transform(cars[['owner']]).toarray()

# Category labels, in the same order as the columns of `one_hot`.
enc.categories_

In [None]:
# Name the indicator columns from the encoder's own category list instead of a
# hand-typed copy, so the labels always match the column order (and count) of `one_hot`.
owner_columns = list(enc.categories_[0])
cars[owner_columns] = one_hot
# The original text column is now redundant.
cars.drop(['owner'], axis=1, inplace=True)
cars.head(2)

In [None]:
# Columns to standardise.
numeric_cols = ['year', 'km_driven', 'seats', 'torque_rpm', 'torque_nm', 'mileage_num', 'max_power_new', 'engine_cc']

# NOTE(review): the scaler is fitted on the full frame, i.e. before the later
# train/validation/test split, so held-out rows influence the scaling statistics.
scaler = StandardScaler()
scaled_inputs = scaler.fit_transform(cars[numeric_cols])

# Feature names the scaler saw during fitting.
scaler.get_feature_names_out()

In [None]:
# Swap the raw numeric columns for their standardised versions. Dropping first and
# then assigning places these columns at the end of the frame.
cars = cars.drop(columns=numeric_cols)
cars[numeric_cols] = scaled_inputs

cars.head()

In [None]:
# `input_df` is another name for the same frame object as `cars` (no copy is made).
input_df = cars

In [None]:
# Peek at the fully encoded and scaled frame before separating features from the target.
input_df.head(2)

In [None]:
# Target is the selling price; the features are everything else except the `name` column.
non_feature_cols = ['selling_price', 'name']
targets = input_df['selling_price']
input_df = input_df.drop(columns=non_feature_cols)

In [None]:
# 70% of the rows go to training; the remaining 30% is halved into validation and
# test sets (15% each). random_state is pinned so the splits -- and every score and
# tuned hyper-parameter that depends on them -- are reproducible across runs.
train_inputs, val_inputs, train_targets, val_targets = train_test_split(input_df, targets, test_size=0.3, random_state=42)

val_inputs, test_inputs, val_targets, test_targets = train_test_split(val_inputs, val_targets, test_size=0.5, random_state=42)

In [None]:
def train_val_test(model):
    """Fit ``model`` on the training split and print R2 and RMSE for every split.

    Reads the module-level ``train_inputs``/``train_targets``,
    ``val_inputs``/``val_targets`` and ``test_inputs``/``test_targets``.
    The model is fitted in place; results are printed in the order
    train, val, test (R2 first, then RMSE).

    RMSE is computed as the square root of ``mean_squared_error`` rather than
    through the ``squared=False`` argument, which is deprecated and absent
    from recent scikit-learn releases.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``score``.

    Returns
    -------
    None
    """
    model.fit(train_inputs, train_targets)
    splits = (
        ('train', train_inputs, train_targets),
        ('val', val_inputs, val_targets),
        ('test', test_inputs, test_targets),
    )
    for label, inputs, y_true in splits:
        y_pred = model.predict(inputs)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        print(f'R2 score for {label} dataset: ', model.score(inputs, y_true))
        print(f'RMSE for {label} dataset: ', rmse)

In [None]:
# Baseline: ordinary least-squares regression with default settings.
lin_reg = LinearRegression()

# Fit and print R2 / RMSE for the train, validation and test splits.
train_val_test(lin_reg)

In [None]:
# Ridge regression with regularisation strength alpha=0.01.
ridge_model = Ridge(alpha = 0.01)

# Fit and print R2 / RMSE for the train, validation and test splits.
train_val_test(ridge_model)

In [None]:
# Lasso regression with regularisation strength alpha=0.01.
lasso_model = Lasso(alpha = 0.01)

# Fit and print R2 / RMSE for the train, validation and test splits.
train_val_test(lasso_model)

In [None]:
# First decision tree, with hand-picked depth and leaf limits and a fixed seed.
dec_tree = DecisionTreeRegressor(max_depth=5, max_leaf_nodes=128, random_state=42)

# Fit and print R2 / RMSE for the train, validation and test splits.
train_val_test(dec_tree)

### Hyperparameter tuning for decision tree

In [None]:
def param_error(md, mln):
    """Fit a decision tree with the given limits and report 1 - R2 on train and validation.

    Parameters
    ----------
    md : int
        Value passed as ``max_depth``.
    mln : int
        Value passed as ``max_leaf_nodes``.

    Returns
    -------
    dict
        Keys 'Max Depth', 'Max Leaf Nodes', 'Training Error' and
        'Validation Error'; each error is ``1 - model.score(...)`` on the
        module-level training / validation split.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=mln, max_depth=md, random_state=42)
    tree.fit(train_inputs, train_targets)
    train_error = 1 - tree.score(train_inputs, train_targets)
    val_error = 1 - tree.score(val_inputs, val_targets)
    return {
        'Max Depth': md,
        'Max Leaf Nodes': mln,
        'Training Error': train_error,
        'Validation Error': val_error,
    }

In [None]:
# Grid of (max_depth, max_leaf_nodes): depths 9..15, even leaf counts 180..250.
tree_grid = [(md, mln) for md in range(9, 16) for mln in range(180, 251, 2)]
errors_df = pd.DataFrame([param_error(md, mln) for md, mln in tree_grid])

# Lift the row cap so the whole grid is displayed.
pd.set_option('display.max_rows', None)
errors_df

In [None]:
# Decision tree with max_depth=14 and max_leaf_nodes=242 -- presumably chosen from the
# grid of validation errors computed above; TODO confirm.
dec_tree = DecisionTreeRegressor(random_state=42, max_depth=14, max_leaf_nodes=242)

# Fit and print R2 / RMSE for the train, validation and test splits.
train_val_test(dec_tree)

In [None]:
# First random forest: 300 trees with hand-picked depth and leaf limits, a fixed seed,
# and n_jobs=-1 to train on all available cores.
random_forest = RandomForestRegressor(n_estimators=300, max_depth=15, max_leaf_nodes=200 , random_state = 42, n_jobs = -1)

# Fit and print R2 / RMSE for the train, validation and test splits.
train_val_test(random_forest)

In [None]:
def param_error(md, mln):
    """Fit a 300-tree random forest with the given limits and report 1 - R2.

    This definition replaces the decision-tree ``param_error`` defined earlier
    in the notebook; from this cell on the name refers to the forest version.

    ``n_jobs=-1`` trains the trees on all available cores, consistent with the
    other random-forest cell that sets it; with a fixed ``random_state`` the
    fitted model does not depend on the number of jobs.

    Parameters
    ----------
    md : int
        Value passed as ``max_depth``.
    mln : int
        Value passed as ``max_leaf_nodes``.

    Returns
    -------
    dict
        Keys 'Max Depth', 'Max Leaf Nodes', 'Training Error' and
        'Validation Error'; each error is ``1 - model.score(...)`` on the
        module-level training / validation split.
    """
    model = RandomForestRegressor(
        n_estimators=300,
        max_leaf_nodes=mln,
        max_depth=md,
        random_state=42,
        n_jobs=-1,
    )
    model.fit(train_inputs, train_targets)
    train_error = 1 - model.score(train_inputs, train_targets)
    val_error = 1 - model.score(val_inputs, val_targets)
    return {'Max Depth': md, 'Max Leaf Nodes': mln, 'Training Error': train_error, 'Validation Error': val_error}

In [None]:
# Grid of (max_depth, max_leaf_nodes): depths 16..20, leaf counts 500, 600 and 700.
forest_grid = [(md, mln) for md in range(16, 21) for mln in range(500, 800, 100)]
errors_df = pd.DataFrame([param_error(md, mln) for md, mln in forest_grid])

# Lift the row cap so the whole grid is displayed.
pd.set_option('display.max_rows', None)
errors_df

In [None]:
# Random forest with max_leaf_nodes=700 and max_depth=20 -- presumably chosen from the
# grid of validation errors computed above; TODO confirm.
# n_jobs=-1 matches the earlier random-forest cell and trains the 300 trees on all
# available cores; with random_state fixed the fitted model is unaffected.
random_forest = RandomForestRegressor(n_estimators=300, max_leaf_nodes=700, max_depth=20, random_state=42, n_jobs=-1)

# Fit and print R2 / RMSE for the train, validation and test splits.
train_val_test(random_forest)