# Autos Prediction using Decision Trees
Use Random Forest / Gradient Boosted Regression Tree / Regression Trees algorithms to perform the following:
1. Predict the prices of cars using the provided auto dataset.

## Step 1: Import all necessary libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

## Step 2: Load the Dataset

In [2]:
autoData = pd.read_csv("Auto Dataset.csv", encoding="latin-1")
autoData.info()
autoData.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   dateCrawled          50000 non-null  object
 1   name                 50000 non-null  object
 2   seller               50000 non-null  object
 3   offerType            50000 non-null  object
 4   price                50000 non-null  object
 5   abtest               50000 non-null  object
 6   vehicleType          44905 non-null  object
 7   yearOfRegistration   50000 non-null  int64 
 8   gearbox              47320 non-null  object
 9   powerPS              50000 non-null  int64 
 10  model                47242 non-null  object
 11  odometer             50000 non-null  object
 12  monthOfRegistration  50000 non-null  int64 
 13  fuelType             45518 non-null  object
 14  brand                50000 non-null  object
 15  notRepairedDamage    40171 non-null  object
 16  date

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,odometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,3/26/2016 17:47,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,"$5,000",control,bus,2004,manuell,158,andere,"150,000km",3,lpg,peugeot,nein,3/26/2016 0:00,0,79588,4/6/2016 6:45
1,4/4/2016 13:38,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,"$8,500",control,limousine,1997,automatik,286,7er,"150,000km",6,benzin,bmw,nein,4/4/2016 0:00,0,71034,4/6/2016 14:45
2,3/26/2016 18:57,Volkswagen_Golf_1.6_United,privat,Angebot,"$8,990",test,limousine,2009,manuell,102,golf,"70,000km",7,benzin,volkswagen,nein,3/26/2016 0:00,0,35394,4/6/2016 20:15
3,3/12/2016 16:58,Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...,privat,Angebot,"$4,350",control,kleinwagen,2007,automatik,71,fortwo,"70,000km",6,benzin,smart,nein,3/12/2016 0:00,0,33729,3/15/2016 3:16
4,4/1/2016 14:38,Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...,privat,Angebot,"$1,350",test,kombi,2003,manuell,0,focus,"150,000km",7,benzin,ford,nein,4/1/2016 0:00,0,39218,4/1/2016 14:38


## Step 3: Preprocess the Dataset

In [3]:
columns_to_drop = ['dateCrawled', 'name', 'seller', 'offerType', 'abtest', 'dateCreated', 'nrOfPictures', 'lastSeen']
autoData = autoData.drop(columns=columns_to_drop)

In [4]:
autoData.head()

Unnamed: 0,price,vehicleType,yearOfRegistration,gearbox,powerPS,model,odometer,monthOfRegistration,fuelType,brand,notRepairedDamage,postalCode
0,"$5,000",bus,2004,manuell,158,andere,"150,000km",3,lpg,peugeot,nein,79588
1,"$8,500",limousine,1997,automatik,286,7er,"150,000km",6,benzin,bmw,nein,71034
2,"$8,990",limousine,2009,manuell,102,golf,"70,000km",7,benzin,volkswagen,nein,35394
3,"$4,350",kleinwagen,2007,automatik,71,fortwo,"70,000km",6,benzin,smart,nein,33729
4,"$1,350",kombi,2003,manuell,0,focus,"150,000km",7,benzin,ford,nein,39218


In [5]:
categorical_columns = ['vehicleType', 'gearbox', 'model', 'odometer', 'fuelType', 'brand', 'notRepairedDamage']

# Perform label encoding for categorical columns
label_encoder = LabelEncoder()
for col in categorical_columns:
    autoData[col] = label_encoder.fit_transform(autoData[col].astype(str))

In [6]:
# Convert 'price' column to numeric (remove '$' and ',' symbols)
autoData['price'] = autoData['price'].str.replace('$', '', regex=False).str.replace(',', '').str.strip().astype(float)

## Step 4: Split the Dataset

In [7]:
X = autoData.drop('price', axis=1)
y = autoData['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Step 5: Train the Gradient Boosting Regression Tree model

In [8]:
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

GradientBoostingRegressor(random_state=42)

## Step 6: Evaluate the Model

In [11]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    return rmse

gb_rmse = evaluate_model(model, X_test, y_test)
print(f"Gradient Boosting RMSE: {gb_rmse}")

# Make predictions on the test set
y_pred = model.predict(X_test)

# Print the first few predictions
print("Predictions:")
print(y_pred[:10])

# Evaluate the model
accuracy_training = model.score(X_train, y_train)
accuracy_testing = model.score(X_test, y_test)

print()

# Print evaluation metrics
print("Final test set predictions:", y_pred)
print("Final train set accuracy:", accuracy_training)
print("Final test set accuracy:", accuracy_testing)

Gradient Boosting RMSE: 163188.77039217635
Predictions:
[ 152.64283699 3098.78740763 1502.34593431 2442.682692   6795.1980928
 4649.86873089 3890.76479551 -988.37245628 1223.02076114 8965.3853633 ]

Final test set predictions: [  152.64283699  3098.78740763  1502.34593431 ... -1070.3050503
  5878.6311393   1202.2659709 ]
Final train set accuracy: 0.29364122119005953
Final test set accuracy: -0.8815132495404077
