In [5]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [29]:
# Load the car listings from the CSV beside the notebook, then preview the first four rows.
df = pd.read_csv("./cars.csv")
df.head(n=4)

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,13500,23,46986,Diesel,90,1,0,2000,3,1165
1,13750,23,72937,Diesel,90,1,0,2000,3,1165
2,13950,24,41711,Diesel,90,1,0,2000,3,1165
3,14950,26,48000,Diesel,90,0,0,2000,3,1165


In [30]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Price,Age,KM,HP,MetColor,Automatic,CC,Doors,Weight
count,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0
mean,10730.824513,55.947075,68533.259749,101.502089,0.674791,0.05571,1566.827994,4.033426,1072.45961
std,3626.964585,18.599988,37506.448872,14.98108,0.468616,0.229441,187.182436,0.952677,52.64112
min,4350.0,1.0,1.0,69.0,0.0,0.0,1300.0,2.0,1000.0
25%,8450.0,44.0,43000.0,90.0,0.0,0.0,1400.0,3.0,1040.0
50%,9900.0,61.0,63389.5,110.0,1.0,0.0,1600.0,4.0,1070.0
75%,11950.0,70.0,87020.75,110.0,1.0,0.0,1600.0,5.0,1085.0
max,32500.0,80.0,243000.0,192.0,1.0,1.0,2000.0,5.0,1615.0


In [31]:
# One-hot encode the categorical FuelType column.
# drop_first=True omits the first category so the remaining indicator columns
# are not perfectly collinear in the regression.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise produce booleans).
# The guard makes the cell safe to re-run: once FuelType has been replaced,
# `df` is left as-is instead of raising a KeyError.
if 'FuelType' in df.columns:
    fuel = pd.get_dummies(df['FuelType'], drop_first=True, dtype=int)
    # MetColor is dropped here as well, so it is not used as a model feature.
    # The remaining columns and the new indicator columns are joined side by side.
    df = pd.concat([df.drop(columns=['FuelType', 'MetColor']), fuel], axis=1)

In [32]:
# Preview the first five rows after encoding (five is head()'s default).
df.head()

Unnamed: 0,Price,Age,KM,HP,Automatic,CC,Doors,Weight,Diesel,Petrol
0,13500,23,46986,90,0,2000,3,1165,1,0
1,13750,23,72937,90,0,2000,3,1165,1,0
2,13950,24,41711,90,0,2000,3,1165,1,0
3,14950,26,48000,90,0,2000,3,1165,1,0
4,13750,30,38500,90,0,2000,3,1170,1,0


In [33]:
# List the column labels; the slicing in later cells relies on 'Price' being first.
df.columns

Index(['Price', 'Age', 'KM', 'HP', 'Automatic', 'CC', 'Doors', 'Weight',
       'Diesel', 'Petrol'],
      dtype='object')

In [34]:
# The target column, selected by label as a Series.
df.loc[:, 'Price']

0       13500
1       13750
2       13950
3       14950
4       13750
        ...  
1431     7500
1432    10845
1433     8500
1434     7250
1435     6950
Name: Price, Length: 1436, dtype: int64

In [35]:
# The same target column selected by position; slicing keeps it a one-column DataFrame.
df.iloc[:, :1]

Unnamed: 0,Price
0,13500
1,13750
2,13950
3,14950
4,13750
...,...
1431,7500
1432,10845
1433,8500
1434,7250


In [36]:
# Every column after the first, i.e. the predictor columns used for training.
df.iloc[:,1:]

Unnamed: 0,Age,KM,HP,Automatic,CC,Doors,Weight,Diesel,Petrol
0,23,46986,90,0,2000,3,1165,1,0
1,23,72937,90,0,2000,3,1165,1,0
2,24,41711,90,0,2000,3,1165,1,0
3,26,48000,90,0,2000,3,1165,1,0
4,30,38500,90,0,2000,3,1170,1,0
...,...,...,...,...,...,...,...,...,...
1431,69,20544,86,0,1300,3,1025,0,1
1432,72,19000,86,0,1300,3,1015,0,1
1433,71,17016,86,0,1300,3,1015,0,1
1434,70,16916,86,0,1300,3,1015,0,1


In [37]:
# Hold out 30% of the rows as the test (validation) set; the fixed seed keeps the split repeatable.
# Column 0 is the target (Price); all remaining columns are the predictors.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    df.iloc[:, 1:],
    df.iloc[:, :1],
    test_size=0.30,
    random_state=44,
)

print(f"Shape of Xtrain: {Xtrain.shape}\n\
Shape of Xtest: {Xtest.shape}\n\
Shape of ytrain: {ytrain.shape}\n\
Shape of ytest: {ytest.shape}")

Shape of Xtrain: (1005, 9)
Shape of Xtest: (431, 9)
Shape of ytrain: (1005, 1)
Shape of ytest: (431, 1)


In [38]:
# Build a linear regression estimator, then fit it on the training split.
# fit() returns the estimator itself, so `reg` ends up bound to the fitted model.
reg = LinearRegression()
reg = reg.fit(Xtrain, ytrain)

In [39]:
# Persist the fitted model to ./model.pkl in pickle format.
import pickle

# The context manager closes the file even if pickle.dump raises, so a failed
# save cannot leave an open handle behind.
with open("./model.pkl", "wb") as pickle_out:
    pickle.dump(reg, pickle_out)