In [1]:
import numpy as np
import pandas as pd
from collections import OrderedDict

In [2]:
class DataMunger():
    def __init__(self, bTrainingData):
        self.__bTrainingData = bTrainingData

    def _StandardizeMakeModelVariant(self, strNameRaw):
        strNameLower = strNameRaw.strip().lower()
        strMake  =""
        strModel = ""
        strVariant = ""

        strNameLower = strNameLower.replace("-","")
        strNameLower = strNameLower.replace("land rover", "landrover")
        strNameLower = strNameLower.replace("range rover", "rangerover")
        strNameLower = strNameLower.replace("s cross", "scross")
        strNameLower = strNameLower.replace("wagon r", "wagonr")
        strNameLower = strNameLower.replace("vitara brezza", "vitarabrezza")
        strNameLower = strNameLower.replace("redi go", "redigo")
        strNameLower = strNameLower.replace("grande punto", "grandepunto")
        strNameLower = strNameLower.replace("punto evo", "puntoevo")
        strNameLower = strNameLower.replace("new cclass", "newclass")
        strNameLower = strNameLower.replace("mahindra renault", "mahindrarenault")
        strNameLower = strNameLower.replace("innova crysta", "innovacrysta")

        strMake = strNameLower.split(" ")[0]
        strModel = strNameLower.split(" ")[1]
        strVariant = " ".join(strNameLower.split(" ")[2:])
        return strMake,strModel,strVariant

    def _GetAge(self, strMfgYear):
        return 2020 - int(strMfgYear)

    def _GetMileage(self, strMileage):
        try:
            if strMileage is not None and strMileage != 'null' and pd.notna(strMileage):
                return float(strMileage.split(" ")[0])
            else:
                return -1
        except:
            print("Exception processing {} in _GetMileage".format(strMileage))

    def _GetEngineCC(self, strEngineCC):
        try:
            if strEngineCC is not None and strEngineCC != 'null' and pd.notna(strEngineCC):
                return float(strEngineCC.split(" ")[0])
            else:
                return -1
        except:
            print("Exception processing {} in _GetEngineCC".format(strEngineCC))



    def _GetPowerBhp(self, strPowerBhp):
        try:
            if strPowerBhp is not None and strPowerBhp != 'null bhp' and pd.notna(strPowerBhp):
                return float(strPowerBhp.split(" ")[0])
            else:
                return -1
        except:
            print("Exception processing {} in _GetPowerBhp".format(strPowerBhp))

    def _SqueezeMakeModelVariant(self, strName):
        return strName.replace(" ","").lower()

    def _GetSeats(self, seats):
        try:
            if pd.isna(seats):
                return -1
            else:
                return int(seats)
        except:
            print("Exception processing {} in _GetSeats".format(seats))

    def GetGroomedData(self, rawdata):
        rawdata =rawdata.drop(columns=['New_Price'])

        Make = []
        Location = []
        Age = []
        KmDriven = []
        FuelType = []
        Transmission = []
        OwnerType = []
        Mileage = []
        EngineCC = []
        PowerBhp = []
        Seats = []
        Price = []

        for inx in range(len(rawdata)):
            make, model, _ = self._StandardizeMakeModelVariant(rawdata.iloc[inx]['Name'])
            Make.append(make+model)
            Location.append(rawdata.iloc[inx]['Location'])
            Age.append(self._GetAge(rawdata.iloc[inx]['Year']))
            KmDriven.append(rawdata.iloc[inx]['Kilometers_Driven'])
            FuelType.append(rawdata.iloc[inx]['Fuel_Type'])
            Transmission.append(rawdata.iloc[inx]['Transmission'])
            OwnerType.append(str(rawdata.iloc[inx]['Owner_Type']))
            Mileage.append(self._GetMileage(rawdata.iloc[inx]['Mileage']))
            EngineCC.append(self._GetEngineCC(rawdata.iloc[inx]['Engine']))
            PowerBhp.append(self._GetPowerBhp(rawdata.iloc[inx]['Power']))
            Seats.append(self._GetSeats(rawdata.iloc[inx]['Seats']))
            if self.__bTrainingData:
                Price.append(rawdata.iloc[inx]['Price'])
            else:
                Price.append(0)

        clean_data_items = [    ('Make', Make),
                                ('Location', Location),
                                ('Age', Age),
                                ('Odometer', KmDriven),
                                ('FuelType', FuelType),
                                ('Transmission', Transmission),
                                ('OwnerType', OwnerType),
                                ('Mileage', Mileage),
                                ('EngineCC', EngineCC),
                                ('PowerBhp', PowerBhp),
                                ('Seats', Seats),
                                ('Price', Price)
        ]

        CleanData = pd.DataFrame.from_dict(OrderedDict(clean_data_items))
        return CleanData



    def _Get1stLevelImputedData(self, StdData):
        # Impute Seats
        vehicle_makemodels = set(StdData[ StdData['Seats']<=0]['Make'])
        for vmm in vehicle_makemodels:
            observationsNeedingImpute = (StdData['Make'] == vmm) & (StdData['Seats'] <= 0)
            goodobservations = (StdData['Make'] == vmm) & (StdData['Seats']>0)
            if sum(goodobservations) > 0:
                average_good_obs = int(np.floor(np.average(StdData[goodobservations]['Seats'])))
            else:
                average_good_obs = -1
            StdData.loc[observationsNeedingImpute, "Seats"] = average_good_obs

        # Impute Mileage
        vehicle_makemodels = set(StdData[ StdData['Mileage']<=0]['Make'])
        for vmm in vehicle_makemodels:
            observationsNeedingImpute = (StdData['Make'] == vmm) & (StdData['Mileage'] <= 0)
            goodobservations = (StdData['Make'] == vmm) & (StdData['Mileage']>0)
            if sum(goodobservations) > 0:
                average_good_obs = np.round(np.average(StdData[goodobservations]['Mileage']),2)
            else:
                average_good_obs = -1
            StdData.loc[observationsNeedingImpute, "Mileage"] = average_good_obs

        # Impute EngineCC
        vehicle_makemodels = set(StdData[ StdData['EngineCC']<=0]['Make'])
        for vmm in vehicle_makemodels:
            observationsNeedingImpute = (StdData['Make'] == vmm) & (StdData['EngineCC'] <= 0)
            goodobservations = (StdData['Make'] == vmm) & (StdData['EngineCC']>0)
            if sum(goodobservations) > 0:
                average_good_obs = np.round(np.average(StdData[goodobservations]['EngineCC']), 2)
            else:
                average_good_obs = -1
            StdData.loc[observationsNeedingImpute, "EngineCC"] = average_good_obs

        # Impute PowerBhp
        vehicle_makemodels = set(StdData[ StdData['PowerBhp']<=0]['Make'])
        for vmm in vehicle_makemodels:
            observationsNeedingImpute = (StdData['Make'] == vmm) & (StdData['PowerBhp'] <= 0)
            goodobservations = (StdData['Make'] == vmm) & (StdData['PowerBhp']>0)
            if sum(goodobservations) > 0:
                average_good_obs = np.round(np.average(StdData[goodobservations]['PowerBhp']),2)
            else:
                average_good_obs = -1
            StdData.loc[observationsNeedingImpute, "PowerBhp"] = average_good_obs

        return StdData


    def _Get2ndLevelImputedData(self, Impute1Data):
        Imp2Data = Impute1Data.drop(columns = ['Seats'])
        return Imp2Data



    def GetImputedData(self, StdData):
        ImputedDataLevel1 = self._Get1stLevelImputedData(StdData)
        ImputedDataLevel2 = self._Get2ndLevelImputedData(ImputedDataLevel1)
        return ImputedDataLevel2



In [5]:
OriginalData = pd.read_excel("UsedCarPrices.xlsx",sheet_name = 0, header=0)
dm = DataMunger(True)

# First groom the data to cleanup the incoming data. After grooming, all datapoints of any field
# must be in the same format.
GroomedData = dm.GetGroomedData(OriginalData)

# Impute missing values. I am using mean value substitution where ever applicable.
ImputedData = dm.GetImputedData(GroomedData)

In [6]:
ImputedData.head()

Unnamed: 0,Make,Location,Age,Odometer,FuelType,Transmission,OwnerType,Mileage,EngineCC,PowerBhp,Price
0,marutiwagonr,Mumbai,10,72000,CNG,Manual,First,26.6,998.0,58.16,1.75
1,hyundaicreta,Pune,5,41000,Diesel,Manual,First,19.67,1582.0,126.2,12.5
2,hondajazz,Chennai,9,46000,Petrol,Manual,First,18.2,1199.0,88.7,4.5
3,marutiertiga,Chennai,8,87000,Diesel,Manual,First,20.77,1248.0,88.76,6.0
4,audia4,Coimbatore,7,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,17.74


In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


ImputedData_dv = pd.get_dummies(ImputedData,drop_first=True)
ImputedData_dv.head()

Unnamed: 0,Age,Odometer,Mileage,EngineCC,PowerBhp,Price,Make_audia3,Make_audia4,Make_audia6,Make_audia7,...,Location_Mumbai,Location_Pune,FuelType_Diesel,FuelType_Electric,FuelType_LPG,FuelType_Petrol,Transmission_Manual,OwnerType_Fourth & Above,OwnerType_Second,OwnerType_Third
0,10,72000,26.6,998.0,58.16,1.75,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
1,5,41000,19.67,1582.0,126.2,12.5,0,0,0,0,...,0,1,1,0,0,0,1,0,0,0
2,9,46000,18.2,1199.0,88.7,4.5,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
3,8,87000,20.77,1248.0,88.76,6.0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
4,7,40670,15.2,1968.0,140.8,17.74,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0


In [8]:
from   sklearn.model_selection import train_test_split
y = np.array(ImputedData_dv['Price'].values)
y = y.reshape(-1, 1)
X = np.array(ImputedData_dv.drop(columns = ['Price']).values)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [9]:
# Implement Standard Scaler
from sklearn.preprocessing import StandardScaler
scaler_X = StandardScaler()
scaler_X.fit(X_train)

X_train_std = scaler_X.transform(X_train)
X_test_std = scaler_X.transform(X_test)

scaler_y = StandardScaler()
scaler_y.fit(y_train)
y_train_std = scaler_y.transform(y_train)
y_test_std = scaler_y.transform(y_test)


<h1>Linear Regression</h1>

In [10]:
# Implement Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


regression_model = LinearRegression()
# Fit the data(train the model)
regression_model.fit(X_train_std, y_train_std)

# Predict
y_predicted = regression_model.predict(X_test_std)


# model evaluation
y_predicted_inverse = scaler_y.inverse_transform(y_predicted)
r2 = r2_score(y_test_std, y_predicted)
rmse = mean_squared_error(y_test, y_predicted_inverse)
print(r2, rmse)

-1.0299521683350536e+23 1.3471642924880145e+25


<h1>Decision Tree</h1>

In [11]:
# Implement Decision Regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


regression_model = RandomForestRegressor(n_estimators = 10)

# Fit the data(train the model)
regression_model.fit(X_train_std, y_train_std)

# Predict
y_predicted = regression_model.predict(X_test_std)



# model evaluation
y_predicted_inverse = scaler_y.inverse_transform(y_predicted)
r2 = r2_score(y_test_std, y_predicted)
rmse = mean_squared_error(y_test, y_predicted_inverse)
print(r2, rmse)

  if __name__ == '__main__':


0.8940197980005633 13.862075175306845


<ul>
    <li>Hyper parameter tuning is possible by modifying the constructor parameters of RandomForestRegressor() class.</li>
</ul>