In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import pickle
import joblib

In [3]:
df = pd.read_csv('/content/carprices.csv')

In [4]:
df.head()

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4


**Data Preprocessing using dummy data**


1.   List item
2.   List item





In [5]:
# Replacing car model data with dummy data
dummy_data = pd.get_dummies(df['Car Model'])

In [13]:
# Concatinating the new column to the dataframe
merged_df = pd.concat([df, dummy_data], axis='columns') # The axis needs to be specified.

In pandas, when creating dummy variables using get_dummies function, we usually drop one of the columns to avoid the "dummy variable trap".

The dummy variable trap is a scenario in which two or more dummy variables are highly correlated, meaning that one variable can be predicted from the others. This can lead to problems in statistical analyses, such as multicollinearity, which can produce unstable estimates of the regression coefficients.  Dropping one of the columns also helps to reduce the dimensionality of the dataset, which can improve the performance of some machine learning algorithms

In [14]:
final_df = merged_df.drop(['Car Model', 'Audi A5' ], axis='columns')   #The axis needs to be specified.

In [16]:
new_cols = ['mileage', 'price', 'age', 'BMW', 'Mercedes']
final_df.columns = new_cols

In [17]:
final_df

Unnamed: 0,mileage,price,age,BMW,Mercedes
0,69000,18000,6,1,0
1,35000,34000,3,1,0
2,57000,26100,5,1,0
3,22500,40000,2,1,0
4,46000,31500,4,1,0
5,59000,29400,5,0,0
6,52000,32000,5,0,0
7,72000,19300,6,0,0
8,91000,12000,8,0,0
9,67000,22000,6,0,1


In [19]:
x = final_df.drop(['price'], axis='columns')

In [20]:
x

Unnamed: 0,mileage,age,BMW,Mercedes
0,69000,6,1,0
1,35000,3,1,0
2,57000,5,1,0
3,22500,2,1,0
4,46000,4,1,0
5,59000,5,0,0
6,52000,5,0,0
7,72000,6,0,0
8,91000,8,0,0
9,67000,6,0,1


In [21]:
y = final_df.price

In [22]:
y

0     18000
1     34000
2     26100
3     40000
4     31500
5     29400
6     32000
7     19300
8     12000
9     22000
10    20000
11    21000
12    33000
Name: price, dtype: int64

In [23]:
model = LinearRegression()
model.fit(x,y)

In [25]:
model.predict([[69000,6,1, 0]])



array([18705.2723644])

In [35]:
#Serializing the model using pickle
with open('pickle_model', 'wb' ) as file:
  pickle.dump(model, file)

In [36]:
with open('pickle_model', 'rb') as file:
  pm = pickle.load(file)

In [37]:
pm.predict([[35000,3,1,0]])



array([35286.78445645])

In [38]:
model.score(x,y)

0.9417050937281083

**Data Preprocessing using One Hot Encoder**

In [39]:
df

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


In [43]:
# Using LabelEncoder to convert the categorical variables into numerical values
le = LabelEncoder()

In [44]:
encoded_df = df
encoded_df['Car Model'] = le.fit_transform(encoded_df['Car Model'])

In [48]:
new_cols = ['model', 'mileage', 'price', 'age']

In [49]:
encoded_df.columns = new_cols

In [50]:
encoded_df

Unnamed: 0,model,mileage,price,age
0,1,69000,18000,6
1,1,35000,34000,3
2,1,57000,26100,5
3,1,22500,40000,2
4,1,46000,31500,4
5,0,59000,29400,5
6,0,52000,32000,5
7,0,72000,19300,6
8,0,91000,12000,8
9,2,67000,22000,6


In [51]:
x = encoded_df.drop(['price'], axis='columns')

In [52]:
y = encoded_df.price

In [53]:
ct = ColumnTransformer([('one_hot_encoder', OneHotEncoder(categories='auto'), [0])],remainder='passthrough')
x = ct.fit_transform(x)
x


array([[0.00e+00, 1.00e+00, 0.00e+00, 6.90e+04, 6.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 3.50e+04, 3.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 5.70e+04, 5.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 2.25e+04, 2.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 4.60e+04, 4.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 5.90e+04, 5.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 5.20e+04, 5.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 7.20e+04, 6.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 9.10e+04, 8.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 6.70e+04, 6.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 8.30e+04, 7.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 7.90e+04, 7.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 5.90e+04, 5.00e+00]])

In [55]:
x = x[:,1:]
x

array([[0.00e+00, 6.90e+04, 6.00e+00],
       [0.00e+00, 3.50e+04, 3.00e+00],
       [0.00e+00, 5.70e+04, 5.00e+00],
       [0.00e+00, 2.25e+04, 2.00e+00],
       [0.00e+00, 4.60e+04, 4.00e+00],
       [0.00e+00, 5.90e+04, 5.00e+00],
       [0.00e+00, 5.20e+04, 5.00e+00],
       [0.00e+00, 7.20e+04, 6.00e+00],
       [0.00e+00, 9.10e+04, 8.00e+00],
       [1.00e+00, 6.70e+04, 6.00e+00],
       [1.00e+00, 8.30e+04, 7.00e+00],
       [1.00e+00, 7.90e+04, 7.00e+00],
       [1.00e+00, 5.90e+04, 5.00e+00]])

In [58]:
model.fit(x,y)

In [61]:
model.predict([[1, 86000,7]])

array([14081.23606258])

In [62]:
joblib.dump(model,'joblib_model')

['joblib_model']

In [65]:
jm = joblib.load('joblib_model')
jm.predict([[1,44000,6]])

array([36074.52283542])

In [66]:
model.score(x,y)

0.9059850708891809