<a href="https://colab.research.google.com/github/mkri/master/blob/master/modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building a fastai Model

In [5]:
# Capture the current working directory; later cells use it as the base path
# for reading the dataset and for saving/exporting the model.
import os
wd = os.getcwd()

In [6]:
# Display the working directory captured by os.getcwd().
wd

'/content'

In [11]:
# Setting the home directory for the project: the parent of the working
# directory. os.path.dirname is used instead of slicing off a fixed number of
# characters, so this works whatever the working directory happens to be called
# (the slice wd[:-7] was only correct for a 7-letter folder name such as
# '/content').
home_dir = os.path.dirname(wd)

In [12]:
# Display the home directory derived from the working directory.
home_dir

'/'

In [13]:
# List the contents of the home directory, then of the working directory,
# each preceded by a heading line.
for heading, directory in (('HOME DIRECTORY', home_dir),
                           ('WORKING DIRECTORY', wd)):
  print(heading)
  print(os.listdir(directory))

HOME DIRECTORY
['sys', 'root', 'media', 'boot', 'sbin', 'mnt', 'proc', 'etc', 'home', 'srv', 'tmp', 'dev', 'opt', 'run', 'bin', 'lib', 'var', 'lib64', 'usr', 'content', '.dockerenv', 'datalab', 'tools', 'swift', 'tensorflow-1.15.2', 'lib32']
WORKING DIRECTORY
['.config', 'Data_Train_Cars.xlsx', 'sample_data']


## Loading The Datasets

In [14]:
import pandas as pd
import numpy as np
# Read the training spreadsheet that sits in the working directory.
training_set = pd.read_excel(f'{wd}/Data_Train_Cars.xlsx')

In [15]:
# Preview the first five rows of the raw training data.
training_set.head(5)

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


## Structuring & Formatting The Datasets

In [16]:
def _leading_number(value, cast=float):
  """Convert the first space-separated token of ``value`` using ``cast``.

  Returns ``np.nan`` when ``value`` is not a string (for example a missing
  cell) or when the leading token is not numeric (for example ``'null bhp'``).
  """
  try:
    return cast(value.split(" ")[0].strip())
  except (AttributeError, TypeError, ValueError):
    return np.nan


def _new_price_to_number(value):
  """Normalise a single ``New_Price`` cell.

  * missing cells and ``0`` become ``0.0``;
  * strings containing ``'Cr'`` become their leading number times 100, which
    puts them on the same scale as the ``'Lakh'`` values;
  * strings containing ``'Lakh'`` become their leading number;
  * anything else is passed through unchanged.
  """
  if not isinstance(value, str):
    # Non-string cells are either missing/zero (-> 0.0) or already numeric.
    if pd.isna(value) or value == 0:
      return 0.0
    return value
  if 'Cr' in value:
    return float(value.split()[0].strip()) * 100
  if 'Lakh' in value:
    return float(value.split()[0].strip())
  return value


def restructure(data):
  """Build a cleaned, re-ordered copy of the raw cars DataFrame.

  Parameters
  ----------
  data : pandas.DataFrame
    Must contain the columns ``Name``, ``Location``, ``Year``,
    ``Kilometers_Driven``, ``Fuel_Type``, ``Transmission``, ``Owner_Type``,
    ``Mileage``, ``Engine``, ``Power``, ``Seats`` and ``New_Price``.
    ``Price`` is optional. The frame is NOT modified.

  Returns
  -------
  pandas.DataFrame
    Columns ``Brand`` (first word of ``Name``), ``Model`` (rest of ``Name``),
    the pass-through columns, numeric ``Mileage``/``Engine``/``Power`` (unit
    suffix removed, ``NaN`` when unparsable), numeric ``New_Price`` and, when
    present in the input, ``Price``.
  """
  brand = []
  model = []
  for i, name in enumerate(data['Name']):
    if isinstance(name, str):
      tokens = name.split(" ")
      brand.append(tokens[0])
      model.append(" ".join(tokens[1:]).strip())
    else:
      # Report the bad row but keep both lists aligned with the frame;
      # skipping the append would make the DataFrame construction below fail.
      print("ERR ! - ", name, "@", i)
      brand.append(np.nan)
      model.append(np.nan)

  mileage = [_leading_number(value) for value in data['Mileage']]
  engine = [_leading_number(value, int) for value in data['Engine']]
  power = [_leading_number(value) for value in data['Power']]

  # Missing prices are handled per cell instead of via an in-place fillna on
  # data['New_Price']: that chained call mutated the caller's frame on older
  # pandas and has no effect at all under Copy-on-Write.
  newp = [_new_price_to_number(value) for value in data['New_Price']]

  # Re-ordering the columns
  restructured = pd.DataFrame({'Brand': brand,
                               'Model': model,
                               'Location': data['Location'],
                               'Year': data['Year'],
                               'Kilometers_Driven': data['Kilometers_Driven'],
                               'Fuel_Type': data['Fuel_Type'],
                               'Transmission': data['Transmission'],
                               'Owner_Type': data['Owner_Type'],
                               'Mileage': mileage,
                               'Engine': engine,
                               'Power': power,
                               'Seats': data['Seats'],
                               'New_Price': newp
                              })

  # The target column only exists in the training data.
  if 'Price' in data.columns:
    restructured['Price'] = data['Price']

  return restructured

In [17]:
# Apply the restructure() clean-up to the raw training data.
train_d = restructure(training_set)


### Selecting a Few Features

In [18]:
# Columns kept for modeling: eight input features plus the target, 'Price'.
cols = ['Brand', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type',
       'Transmission', 'Owner_Type', 'Mileage', 'Price']

In [19]:
# Keep only the selected columns. NOTE(review): this rebinds train_d, so the
# full restructured frame is no longer reachable under that name afterwards.
train_d = train_d[cols]

In [20]:
# Summarise dtypes and non-null counts of the selected columns.
train_d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Brand              6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6017 non-null   float64
 8   Price              6019 non-null   float64
dtypes: float64(2), int64(2), object(5)
memory usage: 423.3+ KB


## Modeling With Fast.ai


In [21]:
# NOTE(review): wildcard import; the fastai names used in later cells
# (TabularList, FillMissing, Categorify, Normalize, tabular_learner, rmse,
# DatasetType, load_learner) are presumably provided by it — confirm.
from fastai.tabular import *
# This path will be used for saving and exporting the model
path = wd

In [22]:
# The target variable that we are trying to predict
dep_var = 'Price'

# The categorical variables: every object-dtype column of train_d
cat_names = list(train_d.select_dtypes('object').columns)

# The continuous variables
cont_names =['Year', 'Kilometers_Driven', 'Mileage'] # The dependent variable is deliberately left out

# Preprocessing steps for the fastai learner
procs = [FillMissing, Categorify, Normalize]

In [23]:
# Build a TabularList from rows 800-999 of train_d. Despite the name, the next
# cell attaches it with .add_test(), so it acts as the test set rather than the
# validation set.
# NOTE(review): these rows are not in the held-out last 20% used for
# validation, so they are also part of the training portion.
val = TabularList.from_df(train_d.iloc[800:1000].copy(), path=path, cat_names=cat_names, cont_names=cont_names)

In [24]:
# Creating the DataBunch: the last 20% of the rows (by position) are held out
# as the validation split, 'Price' is the label, and `val` is attached as the
# test set.
data = (TabularList.from_df(train_d, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)
                           .split_by_idx(list(range(len(train_d) - int(len(train_d) * 0.2),len(train_d))))
                           .label_from_df(cols=dep_var)
                           .add_test(val)
                           .databunch())

In [25]:
# Show five rows of a batch after preprocessing.
data.show_batch(5)

Brand,Location,Fuel_Type,Transmission,Owner_Type,Mileage_na,Year,Kilometers_Driven,Mileage,target
Maruti,Delhi,Petrol,Manual,First,False,0.2025,-0.0417,0.1142,4.2
Hyundai,Coimbatore,Petrol,Manual,First,False,0.8188,-0.311,0.6629,3.55
Maruti,Kochi,Petrol,Manual,First,False,1.435,-0.3526,0.7287,6.57
Honda,Kolkata,Petrol,Automatic,First,False,-0.1056,-0.2007,-0.5443,4.55
Toyota,Kolkata,Petrol,Automatic,First,False,1.435,-0.5386,-0.834,16.95


### Initializing Neural Network

In [26]:
# Tabular learner with four hidden layers (300, 100, 100, 50 units) that
# reports RMSE as its metric.
learn = tabular_learner(data, layers=[300,100, 100, 50], metrics= rmse)

### Training The Model

In [27]:
# Train for 25 epochs with a learning rate of 1e-2.
learn.fit(25, 1e-2)

epoch,train_loss,valid_loss,root_mean_squared_error,time
0,56.722385,34.669807,5.180325,00:01
1,37.283188,45.852707,6.377563,00:01
2,31.620218,37.376774,5.416864,00:01
3,26.220377,32.587776,4.953351,00:01
4,25.89547,30.719851,4.850688,00:01
5,31.47164,33.470623,5.136633,00:01
6,27.325243,34.25177,5.17717,00:01
7,24.815571,36.427143,5.289566,00:01
8,26.620882,30.493164,4.807407,00:01
9,25.453169,32.986179,5.029118,00:01


In [28]:
# Compare targets with predictions on a few training rows.
learn.show_results(ds_type=DatasetType.Train)

Brand,Location,Fuel_Type,Transmission,Owner_Type,Mileage_na,Year,Kilometers_Driven,Mileage,target,prediction
Toyota,Mumbai,Diesel,Manual,First,False,-0.4138,0.7213,-1.1149,13.5,[9.60764]
Hyundai,Kolkata,Diesel,Manual,First,False,0.5106,-0.1411,0.8384,3.45,[4.952608]
Volkswagen,Pune,Petrol,Manual,First,False,1.435,-0.3895,-0.237,5.5,[5.862298]
Renault,Delhi,Diesel,Manual,First,False,-0.4138,0.7037,0.3424,3.75,[4.091384]
Volkswagen,Bangalore,Diesel,Automatic,First,False,-1.3381,0.1472,-0.5662,5.25,[5.013774]


In [29]:
# Compare targets with predictions on a few validation rows.
learn.show_results(ds_type=DatasetType.Valid)

Brand,Location,Fuel_Type,Transmission,Owner_Type,Mileage_na,Year,Kilometers_Driven,Mileage,target,prediction
BMW,Delhi,Petrol,Automatic,Second,False,-2.2625,-0.1112,-1.4178,6.99,[7.959209]
Hyundai,Coimbatore,Diesel,Manual,First,False,1.7431,-0.1722,1.014,15.57,[10.255191]
Tata,Coimbatore,Diesel,Manual,First,False,0.5106,0.2558,-0.6101,5.29,[7.115001]
Datsun,Kolkata,Petrol,Manual,First,False,0.8188,-0.4392,1.014,2.25,[1.866148]
BMW,Chennai,Diesel,Automatic,First,False,-1.3381,0.5049,-1.4002,20.0,[12.028448]


## Saving & Exporting The Model

In [30]:
# Save the model under the name 'model' and return the path of the saved file.
learn.save('model',return_path=True)

PosixPath('/content/models/model.pth')

In [31]:
# Export the learner to 'model.pkl' so it can be reloaded with load_learner.
learn.export('model.pkl')

In [33]:
# Reload the exported learner from `path`.
model = load_learner(path, 'model.pkl')