In [3]:
import pandas as pd
# Load the car listings; the relative path is resolved against the kernel's working directory.
df = pd.read_csv("carprices.csv")
# Columns per the displayed output: Car Model, Mileage, Sell Price($), Age(yrs).
df

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


In [10]:
# One-hot encode the text column 'Car Model' with pandas get_dummies:
# one 0/1 integer indicator column is produced per distinct model name.
dummies = pd.get_dummies(df['Car Model'], dtype=int)
dummies

Unnamed: 0,Audi A5,BMW X5,Mercedez Benz C class
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
5,1,0,0
6,1,0,0
7,1,0,0
8,1,0,0
9,0,0,1


In [11]:
# Place the indicator columns next to the original columns (rows are aligned on the index).
merged_dataset = pd.concat([df, dummies], axis=1)
merged_dataset

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs),Audi A5,BMW X5,Mercedez Benz C class
0,BMW X5,69000,18000,6,0,1,0
1,BMW X5,35000,34000,3,0,1,0
2,BMW X5,57000,26100,5,0,1,0
3,BMW X5,22500,40000,2,0,1,0
4,BMW X5,46000,31500,4,0,1,0
5,Audi A5,59000,29400,5,1,0,0
6,Audi A5,52000,32000,5,1,0,0
7,Audi A5,72000,19300,6,1,0,0
8,Audi A5,91000,12000,8,1,0,0
9,Mercedez Benz C class,67000,22000,6,0,0,1


In [13]:
# Drop the original text column now that it has been encoded.
# Also drop one of the dummy columns ('Audi A5') to avoid the dummy variable trap:
# once the values of 'BMW X5' and 'Mercedez Benz C class' are known, the value of
# 'Audi A5' can be derived from them, so keeping all three is redundant.
# Why is that a problem?
# In linear regression, perfect multicollinearity makes it impossible to estimate the coefficients uniquely.
# Mathematically, X^T X becomes singular (non-invertible), so the regression can fail.

final_data_frame = merged_dataset.drop(columns=['Car Model', 'Audi A5'])
final_data_frame

Unnamed: 0,Mileage,Sell Price($),Age(yrs),BMW X5,Mercedez Benz C class
0,69000,18000,6,1,0
1,35000,34000,3,1,0
2,57000,26100,5,1,0
3,22500,40000,2,1,0
4,46000,31500,4,1,0
5,59000,29400,5,0,0
6,52000,32000,5,0,0
7,72000,19300,6,0,0
8,91000,12000,8,0,0
9,67000,22000,6,0,1


In [22]:
# Instantiate an unfitted linear regression estimator
from sklearn.linear_model import LinearRegression as lr
model = lr()

# Independent variables (features): every column except the target price
X = final_data_frame.drop(columns='Sell Price($)')
X

Unnamed: 0,Mileage,Age(yrs),BMW X5,Mercedez Benz C class
0,69000,6,1,0
1,35000,3,1,0
2,57000,5,1,0
3,22500,2,1,0
4,46000,4,1,0
5,59000,5,0,0
6,52000,5,0,0
7,72000,6,0,0
8,91000,8,0,0
9,67000,6,0,1


In [23]:
# Dependent variable (target): the selling price column
Y = final_data_frame.loc[:, 'Sell Price($)']
Y

0     18000
1     34000
2     26100
3     40000
4     31500
5     29400
6     32000
7     19300
8     12000
9     22000
10    20000
11    21000
12    33000
Name: Sell Price($), dtype: int64

In [24]:
# Fit the regression on all rows of X and Y (no train/test split at this point)
model.fit(X, y=Y)

In [27]:
# Predict the price of a single car.
# The row is wrapped in a DataFrame that reuses X's column labels, so the input carries
# the same feature names (in the same order) the model was fitted with. A bare nested
# list has no names, which makes scikit-learn warn that X lacks valid feature names.
# Values follow X's column order: Mileage, Age(yrs), BMW X5, Mercedez Benz C class.
model.predict(pd.DataFrame([[69000, 6, 1, 0]], columns=X.columns))



array([18705.2723644])

In [28]:
# Same mileage and age as the previous prediction, but with both indicator columns set to 1.
# NOTE(review): 'BMW X5' = 1 together with 'Mercedez Benz C class' = 1 does not describe
# any single car model under one-hot encoding; if a Mercedez Benz C class was intended,
# the last two values should be 0, 1. Confirm the intent before relying on this number.
# The row is passed as a DataFrame with X's column labels so the feature names match
# the ones seen during fit (a bare nested list triggers a feature-name warning).
model.predict(pd.DataFrame([[69000, 6, 1, 1]], columns=X.columns))



array([21158.81310167])

In [29]:
# Goodness of fit: for a regressor, score() returns R² (coefficient of determination),
# computed here on the same X and Y the model was fitted on.
model.score(X, y=Y)

0.9417050937281082

In [31]:
# Label encoding with scikit-learn's LabelEncoder (one integer code per category, not one-hot columns)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [33]:
# Fresh dataset for the LabelEncoder section.
# Take an independent copy: a plain `ledf = df` only binds a second name to the same
# DataFrame, so the in-place 'Car Model' encoding applied to `ledf` further down would
# also overwrite `df` and break the earlier cells when they are re-run.
ledf = df.copy()
ledf

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


In [35]:
# Replace the 'Car Model' strings with the integer codes learned by the LabelEncoder
car_model_codes = le.fit_transform(ledf['Car Model'])
ledf['Car Model'] = car_model_codes
ledf

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,1,69000,18000,6
1,1,35000,34000,3
2,1,57000,26100,5
3,1,22500,40000,2
4,1,46000,31500,4
5,0,59000,29400,5
6,0,52000,32000,5
7,0,72000,19300,6
8,0,91000,12000,8
9,2,67000,22000,6


In [36]:
# Features for the label-encoded variant: every column except the target price.
# This rebinds X, replacing the one-hot feature matrix used earlier.
X = ledf.drop(columns='Sell Price($)')
X

Unnamed: 0,Car Model,Mileage,Age(yrs)
0,1,69000,6
1,1,35000,3
2,1,57000,5
3,1,22500,2
4,1,46000,4
5,0,59000,5
6,0,52000,5
7,0,72000,6
8,0,91000,8
9,2,67000,6


In [37]:
# Target for the label-encoded variant: the selling price column
Y = ledf.loc[:, 'Sell Price($)']
Y

0     18000
1     34000
2     26100
3     40000
4     31500
5     29400
6     32000
7     19300
8     12000
9     22000
10    20000
11    21000
12    33000
Name: Sell Price($), dtype: int64

In [38]:
# Refit the same estimator object on the label-encoded features (the earlier fit is overwritten)
model.fit(X, y=Y)

In [39]:
# Predict for one car using the label-encoded features.
# The row is wrapped in a DataFrame with X's column labels so it carries the same
# feature names (and order) the model was fitted with; a bare nested list has no
# names and makes scikit-learn warn that X lacks valid feature names.
# Values follow X's column order: Car Model (encoded), Mileage, Age(yrs).
model.predict(pd.DataFrame([[1, 69000, 6]], columns=X.columns))



array([22855.47176747])

In [40]:
# Same car model code and mileage as the previous prediction, one year older.
# Passed as a DataFrame with X's column labels so the feature names match the ones
# seen during fit (a bare nested list triggers a feature-name warning).
# Values follow X's column order: Car Model (encoded), Mileage, Age(yrs).
model.predict(pd.DataFrame([[1, 69000, 7]], columns=X.columns))



array([23107.05761445])

In [41]:
# R² of the label-encoded model, computed on the same X and Y it was fitted on
model.score(X, y=Y)

0.8719970367825953

In [58]:
# Training & test data: hold out 20% of the rows for testing
from sklearn.model_selection import train_test_split
splits = train_test_split(X, Y, test_size=0.2)
X_train, X_test, Y_train, Y_test = splits

In [59]:
# The split is random, so the training set differs from run to run. To get a fixed, repeatable split, pass random_state as below.
# X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2, random_state=10)
X_train

Unnamed: 0,Car Model,Mileage,Age(yrs)
2,1,57000,5
7,0,72000,6
3,1,22500,2
0,1,69000,6
12,2,59000,5
1,1,35000,3
8,0,91000,8
9,2,67000,6
10,2,83000,7
4,1,46000,4


In [60]:
# Training targets (selling prices) produced by the split
Y_train

2     26100
7     19300
3     40000
0     18000
12    33000
1     34000
8     12000
9     22000
10    20000
4     31500
Name: Sell Price($), dtype: int64

In [61]:
# Train a separate estimator on the training split only
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression()
lr_model.fit(X_train, y=Y_train)

In [62]:
# Predict with the actual held-out test set
lr_model.predict(X_test)

array([21130.55617816, 24093.74979988, 19053.6582242 ])

In [63]:
# Actual selling prices of the test rows, for comparison with the predictions
Y_test

6     32000
5     29400
11    21000
Name: Sell Price($), dtype: int64

In [64]:
# R² on the training split only, i.e. the rows the model was fitted on.
# NOTE(review): this does not measure performance on unseen data; the held-out
# score would be lr_model.score(X_test, Y_test).
lr_model.score(X_train, Y_train)

0.9444843318598716