In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns 


In [2]:
df = pd.read_csv('./clean_dataset.csv', index_col=0)
# df = df.reset_index(drop=True)
df.head()

Unnamed: 0,year,selling_price,km_driven,fuel,transmission,owner
1,2007,9180.0,50000,Petrol,Manual,First Owner
2,2012,40800.0,100000,Diesel,Manual,First Owner
4,2014,30600.0,141000,Diesel,Manual,Second Owner
5,2007,9520.0,125000,Petrol,Manual,First Owner
6,2016,37400.0,25000,Petrol,Manual,First Owner


In [3]:
import random

def multiply_by_n(x, random_factors):
    return x * random.choice(random_factors)

percentages = [0.1]


df['selling_price'] = df['selling_price'].apply(multiply_by_n, args=(percentages,))

df.head()
df.min()
df.max()

year                            2020
selling_price                33660.0
km_driven                     170000
fuel                          Petrol
transmission                  Manual
owner            Third Owner & Above
dtype: object

In [4]:
df.head()
df.min()
df.max()
df['selling_price'].mean()

4554.021965943944

In [5]:
transmission = pd.get_dummies(df['transmission'])
fuel = pd.get_dummies(df['fuel'])
owner = df['owner'].map(lambda x: 1 if x=='First Owner' else 2 if x=='Second Owner' else 3)

fuel = pd.get_dummies(df['fuel'])
owner = df['owner'].map(lambda x: 1 if x=='First Owner' else 2 if x=='Second Owner' else 3)
X = pd.concat([transmission,fuel,owner,df.drop(['transmission','fuel','owner','selling_price'],axis=1)],axis=1)
y = df['selling_price']

In [6]:
X.head(5)

Unnamed: 0,Automatic,Manual,Diesel,Other,Petrol,owner,year,km_driven
1,0,1,0,0,1,1,2007,50000
2,0,1,1,0,0,1,2012,100000
4,0,1,1,0,0,2,2014,141000
5,0,1,0,0,1,1,2007,125000
6,0,1,0,0,1,1,2016,25000


In [7]:
y.head(5)

1     918.0
2    4080.0
4    3060.0
5     952.0
6    3740.0
Name: selling_price, dtype: float64

In [8]:
from sklearn.model_selection import train_test_split 

#Splitting the Data to train/test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor 


np.random.seed(45)
#Creating an instance of the model
model = LinearRegression()

model.fit(X_train, y_train)
model.score(X_test,y_test)

0.6042560826860334

In [10]:
from sklearn.model_selection import ShuffleSplit 
from sklearn.model_selection import cross_val_score


'''
Testing model accuracy using K Fold Cross Validation
'''

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
cross_val_score(LinearRegression(), X, y, cv=cv)

array([0.65663353, 0.60782169, 0.47293879, 0.57811769, 0.60524399])

### Using GridSearchCV to Find The Best Model





In [11]:
from sklearn.linear_model import Lasso 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor 

from sklearn.model_selection import GridSearchCV 

def find_best_model(X,y):
    algos = {
      'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'normalize': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        },
        'random_forest': {
            'model': RandomForestRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'n_estimators': range(1,50,5)
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model(X,y)

ValueError: ignored

In [12]:

def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                # 'normalise': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        },
        'random_forest': {
            'model': RandomForestRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'n_estimators': range(1,50,5)
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])
np.random.seed(45)
find_best_model_using_gridsearchcv(X,y)

10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.9/dist-packages/sklearn/tree/_classes.py", line 1247, in fit
    super().fit(
  File "/usr/local/lib/python3.9/dist-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/usr/local/lib/python3.9/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.9/dist-packages/sklearn/utils/

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.584151,{}
1,lasso,0.584713,"{'alpha': 2, 'selection': 'random'}"
2,decision_tree,0.826194,"{'criterion': 'friedman_mse', 'splitter': 'best'}"
3,random_forest,0.852458,"{'criterion': 'friedman_mse', 'n_estimators': 21}"


Linear Regression got an accuracy of 0.58 due to a lack of parameters. Even with parameters, the accuracy would still be low


Best Model is Decision Tree and Random Forest for this particular case

In [13]:
model2 = RandomForestRegressor(criterion='friedman_mse',n_estimators=13)
model2.fit(X_train,y_train)
model2.score(X_test, y_test)

0.8486863026588783

In [14]:
def predict_price(transmission,fuel,previous_owner,year,km_driven):
    x = []
    x[:8] = np.zeros(8,dtype='int32')
    x[5] = previous_owner
    x[6] = year
    x[7] = km_driven
    
    transmission_index = np.where(X.columns==transmission)[0][0]
    fuel_index = np.where(X.columns==fuel)[0][0]
    
    if transmission_index>=0:
        x[transmission_index] = 1
    if fuel_index>=2:
        x[fuel_index] = 1
        
    return float(format(model.predict([x])[0],'.2f'))

In [15]:
predict_price('Automatic','Petrol',1,2021,1000)



15269.15

### 24 April - Saving the model for API Use

In [None]:
import pickle

pickle.dump(model, open('linear_regression_model.pkl', 'wb'))
pickle.dump(model2, open('robert_randomforest_final.pkl', 'wb'))

# The Linear Regression Model failed to Exceed 75, the minimum acceptence for such models
# Random Forest Regression did the best
# Decision Trees would have done well too but Random Forest Regression would be better to document

In [16]:
model2 = RandomForestRegressor(criterion='friedman_mse',n_estimators=13)
model2.fit(X_train,y_train)

zitf_day_score = model2.score(X_test, y_test)
print(f'Lapha ushaye {round(zitf_day_score*100)}%')

Lapha ushaye 84%


In [17]:
predict_price('Manual','Petrol', 1, 2014, 10000)



3437.64