In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the raw car listings CSV.
# NOTE(review): absolute path under /content/drive — presumably a mounted Google Drive; confirm it is mounted before running.
CAR_DATA_CSV = '/content/drive/MyDrive/data/car_data.csv'

car = pd.read_csv(CAR_DATA_CSV)
# Preview the first rows of the raw frame.
car.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [None]:
# Exploratory checks used while inspecting the raw data. They are kept
# commented out so they neither run nor print during a full notebook run.
#car.info()  -- column dtypes and non-null counts
#car['Price'].unique()  -- distinct raw values in the Price column
#car[~car['year'].str.isnumeric()]  -- rows whose year is not purely numeric

**Data Quality Issues:**

  1. names are pretty inconsistent
  2. names have company names attached to them
  3. year has many non-year values
  4. year is stored as object dtype; it should be converted to integer
  5. Price contains the placeholder text 'Ask For Price' instead of a number in some rows
  6. Price values contain commas and the column is stored as object dtype
  7. kms_driven holds text values with a trailing 'kms' suffix (e.g. '45,000 kms')
  8. kms_driven also has NaN values, and two rows contain 'Petrol' instead of a distance
  9. fuel_type has nan values

In [None]:
### Clean kms_driven column
# Raw values look like "45,000 kms". Rows with a missing value are dropped first;
# then the leading token is kept, thousands separators are removed, and only rows
# whose remaining text is purely numeric survive (this discards stray text values).
car = car.dropna(subset=['kms_driven'])
# astype(str) makes the cell safe to re-run after the column is already int:
# without it the .str accessor would fail on a numeric column.
kms_text = (
    car['kms_driven']
    .astype(str)
    .str.split(' ')
    .str.get(0)
    .str.replace(',', '', regex=False)
)
kms_is_number = kms_text.str.isnumeric()
# Build a new frame with .loc + assign rather than assigning a column into a
# filtered frame, which can trigger pandas' SettingWithCopyWarning.
car = car.loc[kms_is_number].assign(kms_driven=kms_text[kms_is_number].astype(int))


In [None]:
# Clean year column
# Keep only rows whose year is purely numeric, then store the column as int.
car = car.dropna(subset=['year'])
# astype(str) makes the cell safe to re-run after the column is already int:
# without it the .str accessor would fail on a numeric column.
year_text = car['year'].astype(str)
year_is_number = year_text.str.isnumeric()
# Build a new frame with .loc + assign rather than assigning a column into a
# filtered frame, which can trigger pandas' SettingWithCopyWarning.
car = car.loc[year_is_number].assign(year=year_text[year_is_number].astype(int))


In [None]:
# Clean Price column
# Blank out the 'Ask For Price' placeholder and strip thousands separators; rows
# whose remaining text is not purely numeric (including the now-empty
# placeholders) are dropped, and the column is stored as int.
car = car.dropna(subset=['Price'])
# astype(str) makes the cell safe to re-run after the column is already int:
# without it the .str accessor would fail on a numeric column.
price_text = (
    car['Price']
    .astype(str)
    .str.replace('Ask For Price', '', regex=False)
    .str.replace(',', '', regex=False)
)
price_is_number = price_text.str.isnumeric()
# Build a new frame with .loc + assign rather than assigning a column into a
# filtered frame, which can trigger pandas' SettingWithCopyWarning.
car = car.loc[price_is_number].assign(Price=price_text[price_is_number].astype(int))

In [None]:
## Clean fuel_type column
# Keep only the rows that actually have a fuel type recorded.
has_fuel_type = car['fuel_type'].notna()
car = car[has_fuel_type]


In [None]:
# Clean car name
# Keep only the first three space-separated words of each name, e.g.
# "Hyundai Santro Xing XO eRLX Euro III" -> "Hyundai Santro Xing".
# assign() returns a new frame, so no column is written into the filtered frame
# left by the earlier cleaning steps (which can trigger SettingWithCopyWarning).
car = car.assign(name=car['name'].str.split(' ').str.slice(0, 3).str.join(' '))

In [None]:
# Rows were filtered out during cleaning, so renumber the index from 0 and
# discard the old labels (drop=True) instead of keeping them as a column.
car = car.reset_index(drop=True)
# Show the resulting dtypes and non-null counts.
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        816 non-null    object
 1   company     816 non-null    object
 2   year        816 non-null    int64 
 3   Price       816 non-null    int64 
 4   kms_driven  816 non-null    int64 
 5   fuel_type   816 non-null    object
dtypes: int64(3), object(3)
memory usage: 38.4+ KB


# Build Model

In [None]:
# Split the cleaned frame into features (every column except Price) and the target (Price).
y = car['Price']
X = car.drop(columns=['Price'])


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=425,
)
# Display the shape of each of the four pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((652, 5), (164, 5), (652,), (164,))

In [None]:
#import linear regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [None]:
# Fit an encoder on the categorical columns of the full feature frame so that
# its categories_ attribute lists every category present in the data.
categorical_features = X[['name', 'company', 'fuel_type']]
ohe = OneHotEncoder()
ohe.fit(categorical_features)

In [None]:
# One-hot encode the three categorical columns using the categories collected
# by `ohe`; all remaining columns are passed through unchanged.
fixed_category_encoder = OneHotEncoder(categories=ohe.categories_)
column_trans = make_column_transformer(
    (fixed_category_encoder, ['name', 'company', 'fuel_type']),
    remainder='passthrough',
)

In [None]:
# Linear regression estimator; used as the final step of the pipeline built below.
lr = LinearRegression()

In [None]:
# Chain the column transformer and the regressor so they are fitted and applied together.
pipe = make_pipeline(column_trans, lr)

In [None]:
# Fit the whole pipeline on the training split.
pipe.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [None]:
# Evaluate on the held-out split; score() of a regressor pipeline is the R^2 coefficient.
pipe.score(X_test, y_test)

0.5226408734754824

In [None]:
# Refit the pipeline on 1000 different train/test splits (one per seed 0..999),
# record the test R^2 of each, and report the highest one.
# NOTE(review): the winning seed is chosen by its score on the test split, so the
# reported "best" R^2 is presumably optimistic — confirm before relying on it.
r2_scores = []
for seed in range(1000):
    # Each seed produces a different (but repeatable) split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    r2_scores.append(pipe.fit(X_train, y_train).score(X_test, y_test))

# Index of the best score; because seeds run 0..999 it is also the seed that produced it.
r2_ind = np.argmax(r2_scores)
print(f"Best R2 Score: {r2_scores[r2_ind]}")

Best R2 Score: 0.7873968963170417


In [None]:
# Recreate the split that gave the highest R^2 in the search above (the list
# index equals the seed) and train a fresh pipeline on it.
best_seed = np.argmax(r2_scores)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=best_seed
)
lr = LinearRegression()
pipe = make_pipeline(column_trans, lr).fit(X_train, y_train)
# Score the predictions explicitly with r2_score.
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

0.7873968963170417

In [None]:
import pickle

# Persist the fitted pipeline to disk.
MODEL_PATH = '/content/drive/MyDrive/data/pickle_dumps/CarData_LinearRegressionModel.pkl'
# The context manager flushes and closes the file even if pickling raises;
# passing a bare open() to pickle.dump would leave the handle open.
with open(MODEL_PATH, 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [None]:
# Smoke-test the saved pipeline: build one hand-written record, reload the
# pickled model from disk, and predict a price for the record.
# Column names and order match the feature frame used for training.
new_car_data = pd.DataFrame(
    [['Maruti Suzuki Swift', 'Maruti', 2019, 100000, 'Petrol']],
    columns=['name', 'company', 'year', 'kms_driven', 'fuel_type'],
)
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
# The context manager closes the file handle as soon as the model is loaded.
with open('/content/drive/MyDrive/data/pickle_dumps/CarData_LinearRegressionModel.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
loaded_model.predict(new_car_data)

array([451526.70297749])

In [1]:
import sklearn

# Record the scikit-learn version used to train and pickle the model.
print(f"Scikit-learn version: {sklearn.__version__}")



Scikit-learn version: 1.6.1
