# Used Cars Data Preprocessing

## Preprocessing for Test and Train Data

In [1653]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [1654]:
# Load the raw training data.
# NOTE: machine-specific absolute path; edit TRAIN_CSV_PATH to point at your
# local copy of train.csv before running this notebook elsewhere.
TRAIN_CSV_PATH = r"E:\AMIT - Assignments and Materials\AI ONL 104\ML\Linear Regression S2\train.csv"
df = pd.read_csv(TRAIN_CSV_PATH)


### First preprocessing on train data

In [1655]:
# Dimensions of the raw frame as (rows, columns)
df.shape

(6019, 13)

In [1656]:
# Column labels of the raw frame
df.columns

Index(['Name', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type',
       'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats',
       'New_Price', 'Price'],
      dtype='object')

In [1657]:
# Per-column non-null counts and dtypes of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6017 non-null   object 
 8   Engine             5983 non-null   object 
 9   Power              5983 non-null   object 
 10  Seats              5977 non-null   float64
 11  New_Price          824 non-null    object 
 12  Price              6019 non-null   float64
dtypes: float64(2), int64(2), object(9)
memory usage: 611.4+ KB


In [1658]:
# Count the missing values in each column
df.isna().sum()

Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 36
Power                  36
Seats                  42
New_Price            5195
Price                   0
dtype: int64

In [1659]:
# New_Price is null in 5195 of the 6019 rows, so the column is removed.
# Name is kept: it is label-encoded further down.
df = df.drop(columns=['New_Price'])

In [1660]:
# Re-check the missing-value counts after dropping New_Price
df.isnull().sum()

Name                  0
Location              0
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Mileage               2
Engine               36
Power                36
Seats                42
Price                 0
dtype: int64

In [1661]:
# Number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

np.int64(0)

In [1662]:
# Feature engineering: Mileage, Engine and Power are object (text) columns.
# Keep the first integer/decimal number found in each value and cast to float;
# values without any digits (and missing values) become NaN.
NUMERIC_PATTERN = r'(\d+\.?\d*)'
for col in ['Mileage', 'Engine', 'Power']:
    df[col] = df[col].str.extract(NUMERIC_PATTERN, expand=False).astype(float)

In [1663]:
# Fill the remaining missing values with each column's median.
# The result is assigned back rather than calling fillna(..., inplace=True) on
# the df[col] selection: that chained in-place form raises a FutureWarning and
# no longer modifies df once pandas copy-on-write is the default (pandas 3.0).
for col in ['Mileage', 'Engine', 'Power', 'Seats']:
    df[col] = df[col].fillna(df[col].median())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Mileage'].fillna(df['Mileage'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Engine'].fillna(df['Engine'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which

In [1664]:
# Re-inspect dtypes and non-null counts after extraction and median filling
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6019 non-null   float64
 8   Engine             6019 non-null   float64
 9   Power              6019 non-null   float64
 10  Seats              6019 non-null   float64
 11  Price              6019 non-null   float64
dtypes: float64(5), int64(2), object(5)
memory usage: 564.4+ KB


In [1665]:
# Display the cleaned frame (pandas truncates the rendering to the first and last rows)
df

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.60,998.0,58.16,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.20,5.0,12.50
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.20,1199.0,88.70,5.0,4.50
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.00
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.20,1968.0,140.80,5.0,17.74
...,...,...,...,...,...,...,...,...,...,...,...,...
6014,Maruti Swift VDI,Delhi,2014,27365,Diesel,Manual,First,28.40,1248.0,74.00,5.0,4.75
6015,Hyundai Xcent 1.1 CRDi S,Jaipur,2015,100000,Diesel,Manual,First,24.40,1120.0,71.00,5.0,4.00
6016,Mahindra Xylo D4 BSIV,Jaipur,2012,55000,Diesel,Manual,Second,14.00,2498.0,112.00,8.0,2.90
6017,Maruti Wagon R VXI,Kolkata,2013,46000,Petrol,Manual,First,18.90,998.0,67.10,5.0,2.65


In [1666]:
# Apply log(1 + x) to the skewed numeric features.
# Not idempotent: re-running this cell transforms the columns a second time.
skewed_cols = ['Kilometers_Driven', 'Engine', 'Power']
df[skewed_cols] = np.log1p(df[skewed_cols])

In [1667]:
# Label-encode the categorical columns and remember each column's categories.
# label_maps[col][code] is the original category behind integer code `code`.
cat_cols = ["Name", 'Location', 'Fuel_Type', 'Transmission', 'Owner_Type']

# Owner_Type gets explicit codes that follow the ownership order rather than
# the order of first appearance in the data.
mapping = {
    "First": 0,
    "Second": 1,
    "Third": 2,
    "Fourth & Above": 3
}

label_maps = {}

for col in cat_cols:
    if col == "Owner_Type":
        owner_codes = df[col].map(mapping)
        # Series.map yields NaN for anything absent from `mapping` (an
        # unexpected category, or the integer codes left behind by an earlier
        # run of this cell). Fail here instead of passing NaN downstream.
        unmapped = df.loc[owner_codes.isna(), col].unique()
        if len(unmapped) > 0:
            raise ValueError(f"Unmapped Owner_Type values: {list(unmapped)}")
        df[col] = owner_codes
        label_maps[col] = list(mapping.keys())
    else:
        # factorize assigns codes 0, 1, 2, ... in order of first appearance.
        encoded, uniques = pd.factorize(df[col])
        df[col] = encoded
        label_maps[col] = uniques


In [1668]:
# # One Hot Encoding categorical columns
# cat_cols = ['Location', 'Fuel_Type', 'Transmission', 'Owner_Type']

# df = pd.get_dummies(df, columns=cat_cols, drop_first=True)


In [1669]:
# Preview the first rows after the log transform and label encoding
df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,0,0,2010,11.184435,0,0,0,26.6,6.906755,4.080246,5.0,1.75
1,1,1,2015,10.621352,1,0,0,19.67,7.367077,4.845761,5.0,12.5
2,2,2,2011,10.736418,2,0,0,18.2,7.090077,4.496471,5.0,4.5
3,3,2,2012,11.373675,1,0,0,20.77,7.130099,4.497139,7.0,6.0
4,4,3,2013,10.613271,1,1,1,15.2,7.585281,4.954418,5.0,17.74


In [1670]:
# List the distinct Owner_Type codes present after encoding
df['Owner_Type'].unique()

array([0, 1, 3, 2])

In [1671]:
# Detecting outliers
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [1672]:
# Flag Mileage outliers manually with the 1.5 * IQR whisker rule
q1, q3 = df['Mileage'].quantile([0.25, 0.75])
IQR = q3 - q1
wisk_min, wisk_max = q1 - 1.5 * IQR, q3 + 1.5 * IQR

# Rows whose Mileage lies outside [wisk_min, wisk_max]
df.loc[df['Mileage'].lt(wisk_min) | df['Mileage'].gt(wisk_max)]

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
14,14,1,2012,11.350418,1,1,1,0.00,7.687080,4.753590,5.0,17.50
67,66,3,2019,9.640173,1,1,0,0.00,7.576097,5.273000,5.0,35.67
79,78,4,2005,11.380445,2,0,0,0.00,6.991177,4.592085,5.0,1.30
194,176,10,2007,11.002217,2,0,0,0.00,7.309212,4.592085,5.0,2.95
229,205,9,2015,11.162474,1,0,0,0.00,7.312553,4.605170,5.0,3.60
...,...,...,...,...,...,...,...,...,...,...,...,...
5875,66,10,2019,8.294300,1,1,0,0.00,7.576097,5.273000,5.0,35.00
5935,1858,1,2015,10.994168,0,0,1,32.26,6.906755,4.080922,4.0,3.10
5943,1861,2,2002,11.225257,1,0,0,0.00,7.655864,4.592085,6.0,1.70
5972,677,0,2008,11.082158,2,0,1,0.00,6.991177,4.143135,5.0,1.39


In [1673]:
# Treat a Mileage of 0 as a missing reading and impute it with the median
# Mileage of the row's Fuel_Type group.
df['Mileage'] = df['Mileage'].replace(0, np.nan)
fuel_type_medians = df.groupby('Fuel_Type')['Mileage'].transform('median')
# A fuel type with no valid reading has a NaN group median; fall back to the
# overall median so no NaN is left behind.
df['Mileage'] = df['Mileage'].fillna(fuel_type_medians).fillna(df['Mileage'].median())


In [1674]:
# Repeat the 1.5 * IQR whisker check on Mileage after the zero readings were imputed
q1, q3 = df['Mileage'].quantile([0.25, 0.75])
IQR = q3 - q1
wisk_min, wisk_max = q1 - 1.5 * IQR, q3 + 1.5 * IQR

# Rows whose Mileage lies outside [wisk_min, wisk_max]
df.loc[df['Mileage'].lt(wisk_min) | df['Mileage'].gt(wisk_max)]

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
1269,792,0,2018,9.268704,0,0,0,33.44,6.680855,3.720862,4.0,3.5
1332,816,1,2013,11.283449,0,0,0,33.54,6.906755,4.220096,5.0,2.96
2059,816,0,2013,10.896758,0,0,0,33.54,6.906755,4.220096,5.0,3.05
2371,816,1,2014,10.282027,0,0,0,33.54,6.906755,4.220096,5.0,3.9
2701,1258,6,2016,11.163893,0,0,0,31.79,6.906755,4.083115,5.0,4.58
3119,1364,6,2018,10.696548,0,0,0,32.26,6.906755,4.080922,4.0,4.0
3129,816,8,2014,11.220753,0,0,0,33.54,6.906755,4.220096,5.0,3.25
3553,792,1,2015,9.798183,0,0,0,33.44,6.680855,3.720862,4.0,3.0
3869,792,8,2012,11.090385,0,0,1,33.44,6.680855,3.720862,4.0,2.1
4141,816,0,2014,10.76217,0,0,0,33.54,6.906755,4.220096,5.0,2.85


In [1675]:
# Separate the target (Price) from the predictors (every other column)
y = df['Price']
X = df.drop(columns='Price')


In [1676]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model log(1 + Price) instead of Price for a better-behaved target distribution.
# Every metric computed from y_train / y_test afterwards is on this log scale.
y_train, y_test = np.log1p(y_train), np.log1p(y_test)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (4815, 11)
X_test shape: (1204, 11)


In [1677]:
# Standardise the features with StandardScaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Despite its name, this list holds every column of X_train, including the
# label-encoded categorical ones.
numerical_cols = list(X_train.columns)

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to the test split.
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

print("Scaled training data head:")
display(X_train.head())

Scaled training data head:


Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats
4248,-0.433542,1.22414,-0.736105,0.683467,1.051112,-0.628366,-0.442752,0.059146,-0.708174,-0.635895,-0.341862
4129,-0.189145,-0.449788,0.185622,0.452497,-0.87107,-0.628366,-0.442752,1.057224,-0.05549,-0.109919,-0.341862
2534,-0.289308,0.889355,1.414591,-3.919174,-0.87107,-0.628366,-0.442752,1.423426,-0.590762,-0.372414,-0.341862
2893,0.47393,1.22414,-0.736105,1.638239,-0.87107,-0.628366,-0.442752,-1.283592,1.439213,-0.060818,3.379458
2860,1.36137,-0.115002,1.107349,1.045853,-0.87107,-0.628366,-0.442752,1.100307,-0.590762,-0.806741,-0.341862


# Modeling

In [1678]:
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import mean_squared_error ,mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV ,cross_val_score
from sklearn.datasets import make_regression # for generating a sample regression dataset
from sklearn.model_selection import train_test_split
import numpy as np

In [1679]:
# Baseline: ordinary least squares fitted on the scaled training split,
# then used to predict the held-out test split.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [1680]:
# First evaluation: R^2 on the test split (targets are log1p(Price))
r2_score(y_test, y_pred)

0.8424807800579854

In [1681]:
# Second evaluation: RMSE on the test split, in log1p(Price) units
np.sqrt(mean_squared_error(y_test,y_pred))

np.float64(0.29881090341570066)

In [1682]:
# Third evaluation: MAE on the test split, in log1p(Price) units
mean_absolute_error(y_test, y_pred)

0.2236006667328171

###  1: k-fold cross-validation

In [1683]:
# 5-fold cross-validated R^2 of the linear model on the training split
lr_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')


In [1684]:
# R^2 obtained on each of the 5 folds
lr_scores

array([0.86536957, 0.84407511, 0.86416005, 0.87350635, 0.85048346])

In [1685]:
# Mean cross-validated R^2, printed as a percentage with two decimals
print(f"{np.mean(lr_scores)*100:.2f}")

85.95


#### Using ridge L2

In [1686]:
# Ridge (L2) estimator and its candidate alphas: the integers 0 through 49.
# alpha = 0 means no penalty at all.
ridge = Ridge()
param_grid = {'alpha': list(range(50))}

### 2: GridSearchCV

In [1687]:
# 5-fold grid search over the Ridge alphas, scored by R^2, on the training split
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train,y_train)


0,1,2
,estimator,Ridge()
,param_grid,"{'alpha': [0, 1, ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,8
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [1688]:
# Alpha with the highest mean cross-validated R^2 in the Ridge search
grid_search.best_params_['alpha']

8

In [1689]:
# Mean cross-validated R^2 achieved by the best Ridge alpha
grid_search.best_score_ 

np.float64(0.8595223753261607)

#### Using Lasso L1

In [1690]:
# Lasso (L1): 50 log-spaced alphas from 1e-3 to 1e1, searched with 5-fold CV on R^2.
# NOTE: this rebinds param_grid and grid_search, replacing the Ridge search objects.
lasso = Lasso(max_iter=10000)
param_grid = {'alpha': np.logspace(-3, 1, 50)}
grid_search = GridSearchCV(estimator=lasso, param_grid=param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

0,1,2
,estimator,Lasso(max_iter=10000)
,param_grid,{'alpha': array([1.0000...00000000e+01])}
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,np.float64(0.001)
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,10000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


### 2: GridSearchCV

In [1691]:
# Alpha with the highest mean cross-validated R^2 in the Lasso search
grid_search.best_params_['alpha']

np.float64(0.001)

In [1692]:
# Mean cross-validated R^2 achieved by the best Lasso alpha
grid_search.best_score_ 

np.float64(0.8595394593897634)

In [1693]:
# Conclusion: the model did not need regularization. It made no difference to the scores above, because there was no overfitting in the first place.