In [1]:
#Importing dependancies
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [None]:
#Import data stored 
train_data = pd.read_csv("train-data.csv")
test_data = pd.read_csv("test-data.csv")

train_data.head()
test_data.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price
0,0,Maruti Alto K10 LXI CNG,Delhi,2014,40929,CNG,Manual,First,32.26 km/kg,998 CC,58.2 bhp,4.0,
1,1,Maruti Alto 800 2016-2019 LXI,Coimbatore,2013,54493,Petrol,Manual,Second,24.7 kmpl,796 CC,47.3 bhp,5.0,
2,2,Toyota Innova Crysta Touring Sport 2.4 MT,Mumbai,2017,34000,Diesel,Manual,First,13.68 kmpl,2393 CC,147.8 bhp,7.0,25.27 Lakh
3,3,Toyota Etios Liva GD,Hyderabad,2012,139000,Diesel,Manual,First,23.59 kmpl,1364 CC,null bhp,5.0,
4,4,Hyundai i20 Magna,Mumbai,2014,29000,Petrol,Manual,First,18.5 kmpl,1197 CC,82.85 bhp,5.0,


In [None]:
#preprocess the data and clean it
class Preprocess():
    def __init__(self, dataset):
        self.dataset = dataset
        return None

    #This function drops unnecessary columns
    def dropcolumns(self):
        to_drop = ["Unnamed: 0", "Name", "Location", "Kilometers_Driven", "Fuel_Type", "Transmission", "Owner_Type", "Power", "Seats", "New_Price"]
        return self.dataset.drop(
            to_drop, inplace=True, axis=1
        )

    #This function drops all rows with null values from the dataset
    def dropna(self):
        col_drop = self.dropcolumns()
        dropped = self.dataset.dropna()
        return dropped

    #This function cleans the units from the string columns
    #It also concerts the clean columns which are in string format to numeric
    def removeString(self):
        self.dataset['Engine'] = self.dataset['Engine'].str.replace(r'\D', '')
        self.dataset['Engine'] = pd.to_numeric(self.dataset['Engine'])
        self.dataset['Mileage'] = self.dataset['Mileage'].str.replace(r'\D', '')
        self.dataset['Mileage'] = pd.to_numeric(self.dataset['Mileage'])
        return self.dataset

    #This function calls the rest of the class
    def clean(self):
      self.removeString()
      return self.dropna()


In [None]:
#clean the training and test data
clean_train_data = Preprocess(train_data).clean()

clean_test_data = Preprocess(test_data).clean()

#show the clean data
clean_train_data.head()



Unnamed: 0,Year,Mileage,Engine,Price
0,2010,266.0,998.0,1.75
1,2015,1967.0,1582.0,12.5
2,2011,182.0,1199.0,4.5
3,2012,2077.0,1248.0,6.0
4,2013,152.0,1968.0,17.74


In [None]:
#split the data to features(X) and targets(labels)(Y)
class splitXY():
    def __init__(self, dataset, label):
        self.dataset = dataset
        self.label = label
        return None
    #this function creates the features and labels
    def splitlabel(self):
        X = self.dataset.drop(self.label, axis = 1).values
        y = self.dataset[self.label[0]].values

        return X, y 

    #splot the data to train and test data
    def splitdata(self):
        X, y = self.splitlabel()

        X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.2, random_state=25)

        # print(f"No. of training examples: {training_data.shape[0]}")
        # print(f"No. of testing examples: {testing_data.shape[0]}")
        return X_train, X_test, y_train, y_test

#First Approach: Using the Target Variable Price as a continous variable and thus regression by decision trees

In [None]:
X_train, X_test, y_train, y_test = splitXY(clean_train_data, ["Price"]).splitdata()
#Show the training data
X_train, y_train

(array([[2013.,  284., 1248.],
        [2015.,  182., 1248.],
        [2015., 1757., 1193.],
        ...,
        [2005.,  110., 2987.],
        [2018., 1602., 1373.],
        [2016., 2014., 1498.]]),
 array([ 4.95,  4.3 ,  4.52, ..., 10.  ,  8.25,  6.3 ]))

In [None]:
#Instanciate the decision tree regressors
fit_1 = DecisionTreeRegressor(max_depth=2)
fit_2 = DecisionTreeRegressor(max_depth=5)


#Fit the data to the instanciated model
fit_1.fit(X_train, y_train)
fit_2.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=5)

In [None]:
fit_2.score(X_train, y_train)

0.7678476226395207

In [None]:
fit_2.get_n_leaves()

32

In [None]:
cross_val_score(fit_2, X_train, y_train, cv=10)

array([0.81216411, 0.76445921, 0.70394034, 0.66493376, 0.64491868,
       0.71417788, 0.61902476, 0.7185033 , 0.72539609, 0.68439817])

In [None]:
from math import sqrt
#Make predictions of the model using the test dataset
#X_test = clean_test_data
y_1 = fit_1.predict(X_test)
y_2 = fit_2.predict(X_test)


#Calculate sum of squared errors
err = y_test - y_2
print((sum(err**2)))

30397.07422407224


#Second Approach: Make classes/bins using Target Variable Price and thus classifcation using decision trees

In [None]:
#create three classes of cheap, middle and expensive
clean_train_data['Label'] = pd.cut(x = clean_train_data['Price'], bins = [0, 4, 7, 15, 40, 200], labels=['Cheap', 'Low-Mid', 'Mid-High','Expensive', "Super-Expensive"])
clean_train_data['Label'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Cheap              1951
Low-Mid            1793
Mid-High           1258
Expensive           809
Super-Expensive     170
Name: Label, dtype: int64

In [None]:
X_train, X_test, y_train, y_test = splitXY(clean_train_data, ["Label","Price"]).splitdata()
#Show the training data
X_train, y_train

(array([[2013.,  284., 1248.],
        [2015.,  182., 1248.],
        [2015., 1757., 1193.],
        ...,
        [2005.,  110., 2987.],
        [2018., 1602., 1373.],
        [2016., 2014., 1498.]]),
 ['Low-Mid', 'Low-Mid', 'Low-Mid', 'Low-Mid', 'Low-Mid', ..., 'Cheap', 'Low-Mid', 'Mid-High', 'Mid-High', 'Low-Mid']
 Length: 4784
 Categories (5, object): ['Cheap' < 'Low-Mid' < 'Mid-High' < 'Expensive' < 'Super-Expensive'])

In [None]:
clf = DecisionTreeClassifier(random_state = 34)# max_depth = 5)
clf.fit(X_train, y_train)

DecisionTreeClassifier(random_state=34)

In [None]:
cross_val_score(clf, X_train, y_train , cv=10)

array([0.77244259, 0.76617954, 0.76617954, 0.76200418, 0.80125523,
       0.75313808, 0.76569038, 0.78870293, 0.77824268, 0.76987448])

In [None]:
clf.score(X_test, y_test)

0.772765246449457

In [None]:
clf.predict(clean_test_data)

  f"X has feature names, but {self.__class__.__name__} was fitted without"


array(['Cheap', 'Cheap', 'Expensive', ..., 'Cheap', 'Cheap', 'Expensive'],
      dtype=object)