In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
import pickle

In [2]:
mpl.style.use('ggplot')

In [3]:
car = pd.read_csv('quikr_car.csv')

In [4]:
car = car[car['year'].str.isnumeric()]
car['year'] = car['year'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  car['year'] = car['year'].astype(int)


In [5]:
car = car[car['Price'] != 'Ask For Price']
car['Price'] = car['Price'].str.replace(',', '').astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  car['Price'] = car['Price'].str.replace(',', '').astype(int)


In [6]:
car['kms_driven'] = car['kms_driven'].str.split().str.get(0).str.replace(',', '')
car = car[car['kms_driven'].str.isnumeric()]
car['kms_driven'] = car['kms_driven'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  car['kms_driven'] = car['kms_driven'].astype(int)


In [7]:
car = car[~car['fuel_type'].isna()]

In [8]:
car['name'] = car['name'].str.split().str.slice(start=0, stop=3).str.join(' ')

In [9]:
car = car.reset_index(drop=True)


In [10]:
car = car[car['Price'] < 6000000]

In [11]:
X = car[['name', 'company', 'year', 'kms_driven', 'fuel_type']]
y = car['Price']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
ohe = OneHotEncoder()
column_trans = make_column_transformer(
    (OneHotEncoder(), ['name', 'company', 'fuel_type']),
    remainder='passthrough'
)


In [18]:
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ['name', 'company', 'fuel_type']), # Handle unknown categories
    remainder='passthrough'
)

In [19]:
lr = LinearRegression()

In [20]:
pipe = make_pipeline(column_trans, lr)

In [21]:
pipe.fit(X_train, y_train)

In [22]:
y_pred = pipe.predict(X_test)
print(f"R2 Score: {r2_score(y_test, y_pred)}")

R2 Score: 0.5731311949541246


In [23]:
with open('LinearRegressionModel.pkl', 'wb') as file:
    pickle.dump(pipe, file)

In [24]:
sample_input = pd.DataFrame(columns=['name', 'company', 'year', 'kms_driven', 'fuel_type'],
                            data=np.array(['Maruti Suzuki Swift', 'Maruti', 2019, 100, 'Petrol']).reshape(1, 5))
print(pipe.predict(sample_input))

[418107.53771862]
