In [198]:
import pandas as pd
import numpy as np 
import warnings
from summarytools import dfSummary
from sklearn.model_selection import train_test_split

In [199]:
warnings.filterwarnings('ignore')

## data cleaning

In [200]:
pd.set_option("display.max_columns",100)

In [201]:
path = "https://frenzy86.s3.eu-west-2.amazonaws.com/python/data/cars.csv"

In [202]:
df = pd.read_csv(path)

In [203]:
df

Unnamed: 0,name,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,2014,575000,"36,000 kms",Diesel
...,...,...,...,...,...
887,Ta,zest,310000,,
888,Tata Zest XM Diesel,2018,260000,"27,000 kms",Diesel
889,Mahindra Quanto C8,2013,390000,"40,000 kms",Diesel
890,Honda Amaze 1.2 E i VTEC,2014,180000,Petrol,


In [204]:
list(df["name"])

['Hyundai Santro Xing XO eRLX Euro III',
 'Mahindra Jeep CL550 MDI',
 'Maruti Suzuki Alto 800 Vxi',
 'Hyundai Grand i10 Magna 1.2 Kappa VTVT',
 'Ford EcoSport Titanium 1.5L TDCi',
 'Ford EcoSport Titanium 1.5L TDCi',
 'Ford Figo',
 'Hyundai Eon',
 'Ford EcoSport Ambiente 1.5L TDCi',
 'Maruti Suzuki Alto K10 VXi AMT',
 'Skoda Fabia Classic 1.2 MPI',
 'Maruti Suzuki Stingray VXi',
 'Hyundai Elite i20 Magna 1.2',
 'Mahindra Scorpio SLE BS IV',
 'Hyundai Santro Xing XO eRLX Euro III',
 'Mahindra Jeep CL550 MDI',
 'Audi A8',
 'Audi Q7',
 'Mahindra Scorpio S10',
 'Maruti Suzuki Alto 800',
 'Mahindra Scorpio S10',
 'Mahindra Scorpio S10',
 'Maruti Suzuki Alto 800 Vxi',
 'Hyundai i20 Sportz 1.2',
 'Hyundai i20 Sportz 1.2',
 'Hyundai i20 Sportz 1.2',
 'Maruti Suzuki Alto 800 Lx',
 'Maruti Suzuki Vitara Brezza ZDi',
 'Maruti Suzuki Alto LX',
 'Mahindra Bolero DI',
 'Maruti Suzuki Swift Dzire ZDi',
 'Mahindra Scorpio S10 4WD',
 'Maruti Suzuki Swift Vdi BSIII',
 'Maruti Suzuki Wagon R VXi BS III',

In [205]:
df["name"].nunique()

525

In [206]:
df = df[df['Price']!='Ask For Price'].reset_index(drop=True)
df['Price'] = df['Price'].str.replace(',', '').astype(int)/100

mask = df['year'].value_counts().index[:20].to_list()
df = df[df['year'].isin(mask)].reset_index(drop=True)
df['year'] = df['year'].astype(int)

df.at[816, 'fuel_type'] = 'Petrol'
df.at[816, 'kms_driven'] = '50000'
df.at[815, 'fuel_type'] = 'Petrol'
df.at[815, 'kms_driven'] = '50000'

df.at[128, 'fuel_type'] = 'Petrol'

df['kms_driven'] = df['kms_driven'].str.replace(',', '').str.replace(' kms', '').astype(int)


In [207]:
df

Unnamed: 0,name,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,2007,800.0,45000,Petrol
1,Mahindra Jeep CL550 MDI,2006,4250.0,40,Diesel
2,Hyundai Grand i10 Magna 1.2 Kappa VTVT,2014,3250.0,28000,Petrol
3,Ford EcoSport Titanium 1.5L TDCi,2014,5750.0,36000,Diesel
4,Ford Figo,2012,1750.0,41000,Diesel
...,...,...,...,...,...
812,Toyota Corolla Altis,2009,3000.0,132000,Petrol
813,Tata Zest XM Diesel,2018,2600.0,27000,Diesel
814,Mahindra Quanto C8,2013,3900.0,40000,Diesel
815,Honda Amaze 1.2 E i VTEC,2014,1800.0,50000,Petrol


In [208]:
df['brand'] = df['name'].str.split().str[0]

In [209]:
df['old'] = 2024 - df['year']

In [210]:
df = df.drop(columns=['name','year'])

In [211]:
dfSummary(df)

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,Price [float64],Mean (sd) : 4110.1 (4747.3) min < med < max: 300.0 < 3000.0 < 85000.0 IQR (CV) : 3150.0 (0.9),272 distinct values,,0 (0.0%)
2,kms_driven [int32],Mean (sd) : 46260.5 (34281.3) min < med < max: 0.0 < 41000.0 < 400000.0 IQR (CV) : 29758.0 (1.3),247 distinct values,,0 (0.0%)
3,fuel_type [object],1. Petrol 2. Diesel 3. LPG,429 (52.5%) 386 (47.2%) 2 (0.2%),,0 (0.0%)
4,brand [object],1. Maruti 2. Hyundai 3. Mahindra 4. Tata 5. Honda 6. Toyota 7. Chevrolet 8. Renault 9. Ford 10. Volkswagen 11. other,220 (26.9%) 139 (17.0%) 98 (12.0%) 65 (8.0%) 61 (7.5%) 37 (4.5%) 35 (4.3%) 33 (4.0%) 30 (3.7%) 19 (2.3%) 80 (9.8%),,0 (0.0%)
5,old [int32],Mean (sd) : 11.5 (3.9) min < med < max: 5.0 < 11.0 < 24.0 IQR (CV) : 5.0 (2.9),20 distinct values,,0 (0.0%)


In [212]:
df

Unnamed: 0,Price,kms_driven,fuel_type,brand,old
0,800.0,45000,Petrol,Hyundai,17
1,4250.0,40,Diesel,Mahindra,18
2,3250.0,28000,Petrol,Hyundai,10
3,5750.0,36000,Diesel,Ford,10
4,1750.0,41000,Diesel,Ford,12
...,...,...,...,...,...
812,3000.0,132000,Petrol,Toyota,15
813,2600.0,27000,Diesel,Tata,6
814,3900.0,40000,Diesel,Mahindra,11
815,1800.0,50000,Petrol,Honda,10


In [213]:
df = pd.get_dummies(df, columns= ["fuel_type","brand"], drop_first=True)

## scelta del modello migliore

In [214]:
df

Unnamed: 0,Price,kms_driven,old,fuel_type_LPG,fuel_type_Petrol,brand_BMW,brand_Chevrolet,brand_Datsun,brand_Fiat,brand_Force,brand_Ford,brand_Hindustan,brand_Honda,brand_Hyundai,brand_Jaguar,brand_Jeep,brand_Land,brand_Mahindra,brand_Maruti,brand_Mercedes,brand_Mini,brand_Mitsubishi,brand_Nissan,brand_Renault,brand_Skoda,brand_Tata,brand_Toyota,brand_Volkswagen,brand_Volvo
0,800.0,45000,17,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,4250.0,40,18,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
2,3250.0,28000,10,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,5750.0,36000,10,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,1750.0,41000,12,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
812,3000.0,132000,15,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
813,2600.0,27000,6,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
814,3900.0,40000,11,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
815,1800.0,50000,10,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [215]:
X_ = df.drop("Price", axis=1)
y = df["Price"]

In [216]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X_)

In [217]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=667)

### creazione random forest

In [218]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=200)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.21271795690284734

### creazione regressione lineare

In [219]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

-1.3931711056920273

### creazione decision tree regressor

In [220]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

0.15835534204590263

In [221]:
from sklearn.linear_model import ElasticNet
en = ElasticNet()
en.fit(X_train, y_train)
en.score(X_test, y_test)

0.16730073005307755

### creazione ridge

In [222]:
from sklearn.linear_model import Ridge
rid = Ridge()
rid.fit(X_train, y_train)
rid.score(X_test, y_test)

0.23178219402173506

### creazione lasso

In [None]:
from sklearn.linear_model import Lasso
las = Lasso()
las.fit(X_train, y_train)
las.score(X_test, y_test)

0.23239771040976476

In [224]:
from sklearn.model_selection import cross_val_predict