(Click the links below to navigate to the different sections of the notebook.)
# **Overview**  
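- <a href="#1">1. Importing Data</a>
- <a href="#2">2. Preprocessing the Dataset</a>
  - <a href="#2-1">2.1 Check for NULLS</a>
  - <a href="#2-2">2.2 Encoding the categorical data (One-Hot Encoding)</a>
- <a href="#3">3. Exploratory Data Analysis</a>
- <a href="#4">4. Divide the data into training, cross-validation and testing data</a>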

In [26]:
import numpy as np # linear algebra
import pandas as pd # data processing
from sklearn.model_selection import train_test_split # for splitting the dataset in train,test and validation
from sklearn.linear_model import LinearRegression # for linear regression


from sklearn.metrics import mean_squared_error # for calcualting mse

import matplotlib.pyplot as plt #for plotting


# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

## <a id="1"> 1. Importing Data </a>

In [35]:
# Load the raw listings; the CSV is read with the latin1 encoding.
df1 = pd.read_csv('OLX_Car_Data_CSV.csv', encoding='latin1')

# Shuffle all rows and renumber the index from 0. The shuffle is seeded so that
# a "Restart & Run All" reproduces the same row order (an unseeded sample()
# yields a different permutation on every run).
df2 = df1.sample(frac=1, random_state=3).reset_index(drop=True)

# Report the size of the raw table.
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 24973 rows and 9 columns


In [36]:
# Preview the first five shuffled rows (head() defaults to 5), then list the column labels.
display(df2.head())
df2.columns

Unnamed: 0,Brand,Condition,Fuel,KMs Driven,Model,Price,Registered City,Transaction Type,Year
0,Toyota,New,Hybrid,100.0,Prius,1345000,Lahore,Cash,2015.0
1,Nissan,Used,Petrol,150000.0,Sunny,850000,Lahore,Cash,2006.0
2,Nissan,Used,Petrol,88000.0,Sunny,920000,Karachi,Cash,2005.0
3,Toyota,Used,Petrol,79500.0,Vitz,1050000,Karachi,Cash,2009.0
4,Toyota,New,Petrol,10.0,Corolla Fielder,540000,Karachi,Installment/Leasing,2017.0


Index(['Brand', 'Condition', 'Fuel', 'KMs Driven', 'Model', 'Price',
       'Registered City', 'Transaction Type', 'Year'],
      dtype='object')

## <a id="2" > 2. Preprocessing the Dataset</a>

### <a id="2-1" > 2.1 Check for NULLS</a>

In [37]:
# Count the missing values in each column (isna is the same check as isnull;
# sum() reduces over rows by default).
df2.isna().sum()

Brand               2137
Condition           2136
Fuel                2445
KMs Driven          2286
Model               2448
Price                  0
Registered City     4636
Transaction Type    2445
Year                2284
dtype: int64

In [38]:
# Show the dtype pandas assigned to each column of the shuffled frame.
df2.dtypes

Brand                object
Condition            object
Fuel                 object
KMs Driven          float64
Model                object
Price                 int64
Registered City      object
Transaction Type     object
Year                float64
dtype: object

In [39]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df2.describe()

Unnamed: 0,KMs Driven,Price,Year
count,22687.0,24973.0,22689.0
mean,127811.2,912895.3,2005.901626
std,599672.6,1537134.0,9.563804
min,1.0,50000.0,1915.0
25%,16000.0,380000.0,2002.0
50%,66510.0,650000.0,2008.0
75%,100000.0,1145000.0,2013.0
max,10000000.0,87654320.0,2020.0


In [40]:
# Keep only the rows that have a value in every column.
df3 = df2.dropna(axis=0, how='any')

# Confirm that no missing values remain in any column.
df3.isna().sum()

Brand               0
Condition           0
Fuel                0
KMs Driven          0
Model               0
Price               0
Registered City     0
Transaction Type    0
Year                0
dtype: int64

### <a id="2-2" > 2.2 Encoding the categorical data (One-Hot Encoding)</a>

In [41]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column from the generated indicators.
df3 = pd.get_dummies(data=df3, drop_first=True)
df3.head(5)

Unnamed: 0,KMs Driven,Price,Year,Brand_BMW,Brand_Changan,Brand_Chevrolet,Brand_Classic & Antiques,Brand_Daewoo,Brand_Daihatsu,Brand_FAW,...,Registered City_Sheikhüpura,Registered City_Sialkot,Registered City_Sukkar,Registered City_Sukkur,Registered City_Swabi,Registered City_Swat,Registered City_Tank,Registered City_Vehari,Registered City_Wah,Transaction Type_Installment/Leasing
0,100.0,1345000,2015.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,150000.0,850000,2006.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,88000.0,920000,2005.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,79500.0,1050000,2009.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,10.0,540000,2017.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## <a id="3">3. Exploratory Data Analysis </a>

## <a id="4"> 4.  Divide the data into training, cross-validation and testing data </a>

In [42]:
# Features: every encoded column except the target.
df_X = df3.drop(columns=['Price'])

# Target: the listing price as a NumPy array.
df_y = df3['Price'].values


In [43]:
test_size = 0.30  # fraction of the rows held out for validation

# Shuffled split into training and validation sets; the fixed seed keeps the
# partition identical between runs.
X_train, X_val, Y_train, Y_val = train_test_split(
    df_X,
    df_y,
    test_size=test_size,
    random_state=3,
    shuffle=True,
)


In [44]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Seed the forest so the cross-validated score is reproducible between runs
# (GridSearchCV clones this estimator, seed included, for every fit).
rf = RandomForestRegressor(random_state=3)

# Single-candidate grid. The criterion is read from the estimator's own default
# (mean squared error) rather than hard-coded as "mse": that spelling was renamed
# to "squared_error" in scikit-learn 1.0 and later removed, so the literal is
# rejected by current releases, whereas rf.criterion is valid on every version.
# The "criterion" key is kept so that gs.best_params_ still exposes it.
param_grid = {
    "criterion": [rf.criterion],
    "min_samples_leaf": [3],
    "min_samples_split": [3],
    "max_depth": [10],
    "n_estimators": [500],
}

# 2-fold cross-validated search using all available cores.
gs = GridSearchCV(estimator=rf, param_grid=param_grid, cv=2, n_jobs=-1, verbose=1)
gs = gs.fit(X_train, Y_train)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   24.3s finished


In [45]:
# Report the best candidate's mean cross-validated score, then its parameters,
# one per line.
print(gs.best_score_, gs.best_params_, sep='\n')
 

0.16226289478364928
{'criterion': 'mse', 'max_depth': 10, 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 500}


In [46]:
bp = gs.best_params_

# Rebuild the forest from the winning grid parameters. Unpacking the dict keeps
# this cell in sync with whatever keys the grid defines; the seed (overridable by
# a grid-supplied random_state) makes the fit and the reported score repeatable.
forest_params = {"random_state": 3, **bp}
forest = RandomForestRegressor(**forest_params)
forest.fit(X_train, Y_train)

# score() returns R^2, the coefficient of determination (not the explained
# variance score): 1.0 is a perfect prediction.
print('Score: %.2f' % forest.score(X_val, Y_val))

Score: 0.22
