In [35]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [4]:
df = pd.read_csv("car_l3_dataset.csv")

In [5]:
#df.head()        # First 5 rows
#df.tail()        # Last 5 rows
df.info()       # Column types and non-null values
#df.describe()    # Summary stats
#df.columns       # Column names
#df.index         # Row indices
#df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Price        145 non-null    object 
 1   Odometer_km  138 non-null    float64
 2   Doors        138 non-null    float64
 3   Accidents    145 non-null    int64  
 4   Location     140 non-null    object 
 5   Year         145 non-null    int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 6.9+ KB


In [6]:
# Knowing The Missing values
df.isnull().sum()

Price          0
Odometer_km    7
Doors          7
Accidents      0
Location       5
Year           0
dtype: int64

In [7]:
#Accessing each column in my data to know the missing values
df["Location"].value_counts(dropna=False)  # missing vakues are shown
#df["Odometer_km"].value_counts(dropna=False)
#df["Doors"].value_counts(dropna=False)
#df["Accidents"].value_counts(dropna=False)
#df["Year"].value_counts(dropna=False)

Location
City      59
Suburb    45
Rural     21
Subrb      8
??         7
NaN        5
Name: count, dtype: int64

In [8]:
# Cleaning the price column
df["Price"] = df["Price"].replace(r"[\$,]", "", regex=True).astype(float)
df["Price"].head()

0    1500.0
1    4171.0
2    5331.0
3    1500.0
4    1500.0
Name: Price, dtype: float64

In [9]:
df.head()

Unnamed: 0,Price,Odometer_km,Doors,Accidents,Location,Year
0,1500.0,137830.0,4.0,1,City,1998
1,4171.0,,4.0,0,Rural,2016
2,5331.0,107302.0,4.0,0,Suburb,2014
3,1500.0,141838.0,4.0,1,Suburb,1999
4,1500.0,,3.0,0,City,2022


In [10]:
# Checking the outliers in the price column
df["Price"].skew()

np.float64(7.871113660850301)

In [11]:
#Cleaning the location column
df["Location"] = df["Location"].replace({"Subrb" : "Suburb", "??" : pd.NA})
df["Location"].value_counts(dropna=False)

Location
City      59
Suburb    53
Rural     21
<NA>       7
NaN        5
Name: count, dtype: int64

In [None]:
# i checked the column if it has outlier before use mean value b/c if it has outlier the mean will be effected
df["Odometer_km"].skew()

np.float64(0.9546906127332146)

In [13]:
# Filling missing values 
# i fillied (Odometer_km) with mean value b/c it's numerical column 
# Locattion and Doors with mode b/c they are categorical columns
df["Odometer_km"] =df["Odometer_km"].fillna(df["Odometer_km"].mean())
df["Location"]  = df["Location"].fillna(df["Location"].mode()[0])
df["Doors"] =df["Doors"].fillna(df["Doors"].mode()[0])



In [14]:
df.isnull().sum()

Price          0
Odometer_km    0
Doors          0
Accidents      0
Location       0
Year           0
dtype: int64

In [15]:
# Before Removing Duplicates, checking the shape of the dataset 
df.shape

(145, 6)

In [16]:
df = df.drop_duplicates()

# after removing duplicates 
df.shape

(140, 6)

In [17]:
# Dealing with outliers
def iqr_fun(series, k=1.5):
    q1, q3 = series.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower = q1 - k * iqr
    upper = q3 + k * iqr
    return lower, upper

lower_price, upper_price = iqr_fun(df["Price"])
lower_oddmeter, upper_oddmeter = iqr_fun(df["Odometer_km"])

In [18]:
print("Lower Price", lower_price, "Upper Price", upper_price)
print("Lower Oddmeter", lower_oddmeter, "upper_oddmeter", upper_oddmeter)

Lower Price -2984.625 Upper Price 8974.375
Lower Oddmeter -6642.25 upper_oddmeter 271987.75


In [19]:
# clipping the outliers
df["Price"] =df["Price"].clip(lower_price, upper_price)
df["Odometer_km"] = df["Odometer_km"].clip(lower_oddmeter, upper_oddmeter)

In [20]:
# one-hot encoding
df = pd.get_dummies(df, columns=["Location"], drop_first=False, dtype="int")
#[c for c in df.columns if c.startswith("Location")]

In [21]:
df.head()

Unnamed: 0,Price,Odometer_km,Doors,Accidents,Year,Location_City,Location_Rural,Location_Suburb
0,1500.0,137830.0,4.0,1,1998,1,0,0
1,4171.0,134823.615942,4.0,0,2016,0,1,0
2,5331.0,107302.0,4.0,0,2014,0,0,1
3,1500.0,141838.0,4.0,1,1999,0,0,1
4,1500.0,134823.615942,3.0,0,2022,1,0,0


In [22]:
#Feature Engineering 
CURRENT_YEAR =  2025
df["CarAge"] = CURRENT_YEAR - df["Year"]
df.head()

Unnamed: 0,Price,Odometer_km,Doors,Accidents,Year,Location_City,Location_Rural,Location_Suburb,CarAge
0,1500.0,137830.0,4.0,1,1998,1,0,0,27
1,4171.0,134823.615942,4.0,0,2016,0,1,0,9
2,5331.0,107302.0,4.0,0,2014,0,0,1,11
3,1500.0,141838.0,4.0,1,1999,0,0,1,26
4,1500.0,134823.615942,3.0,0,2022,1,0,0,3


In [25]:
# km_per_year with safe
df["km_per_year_with_safe"] = df["Odometer_km"] / df["CarAge"].replace(0, np.nan)

In [26]:
df.head()

Unnamed: 0,Price,Odometer_km,Doors,Accidents,Year,Location_City,Location_Rural,Location_Suburb,CarAge,km_per_year_with_safe
0,1500.0,137830.0,4.0,1,1998,1,0,0,27,5104.814815
1,4171.0,134823.615942,4.0,0,2016,0,1,0,9,14980.401771
2,5331.0,107302.0,4.0,0,2014,0,0,1,11,9754.727273
3,1500.0,141838.0,4.0,1,1999,0,0,1,26,5455.307692
4,1500.0,134823.615942,3.0,0,2022,1,0,0,3,44941.205314


In [29]:
# Is_Urban
df["Is_Urban"] = df["Location_City"].astype(int)

In [30]:
df.head()

Unnamed: 0,Price,Odometer_km,Doors,Accidents,Year,Location_City,Location_Rural,Location_Suburb,CarAge,km_per_year_with_safe,Is_Urban
0,1500.0,137830.0,4.0,1,1998,1,0,0,27,5104.814815,1
1,4171.0,134823.615942,4.0,0,2016,0,1,0,9,14980.401771,0
2,5331.0,107302.0,4.0,0,2014,0,0,1,11,9754.727273,0
3,1500.0,141838.0,4.0,1,1999,0,0,1,26,5455.307692,0
4,1500.0,134823.615942,3.0,0,2022,1,0,0,3,44941.205314,1


In [31]:
df["LogPrice"] = np.log1p(df["Price"])

In [32]:
df.head()

Unnamed: 0,Price,Odometer_km,Doors,Accidents,Year,Location_City,Location_Rural,Location_Suburb,CarAge,km_per_year_with_safe,Is_Urban,LogPrice
0,1500.0,137830.0,4.0,1,1998,1,0,0,27,5104.814815,1,7.313887
1,4171.0,134823.615942,4.0,0,2016,0,1,0,9,14980.401771,0,8.336151
2,5331.0,107302.0,4.0,0,2014,0,0,1,11,9754.727273,0,8.581482
3,1500.0,141838.0,4.0,1,1999,0,0,1,26,5455.307692,0,7.313887
4,1500.0,134823.615942,3.0,0,2022,1,0,0,3,44941.205314,1,7.313887


In [33]:
# After making feature engineering 
df.shape

(140, 12)

In [34]:
OUT_PATH = "car_13_clean_f1.csv"

df.to_csv(OUT_PATH)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 140 entries, 0 to 139
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Price                  140 non-null    float64
 1   Odometer_km            140 non-null    float64
 2   Doors                  140 non-null    float64
 3   Accidents              140 non-null    int64  
 4   Year                   140 non-null    int64  
 5   Location_City          140 non-null    int64  
 6   Location_Rural         140 non-null    int64  
 7   Location_Suburb        140 non-null    int64  
 8   CarAge                 140 non-null    int64  
 9   km_per_year_with_safe  140 non-null    float64
 10  Is_Urban               140 non-null    int64  
 11  LogPrice               140 non-null    float64
dtypes: float64(5), int64(7)
memory usage: 14.2 KB


In [38]:
# Scaling
dont_scale = {"Price", "LogPrice"}
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.to_list()
exclude = [c for c in df.columns if c.startswith("Location")] + ["Is_Urban"]
num_features_to_scale = [c for c in numeric_cols if c not in dont_scale and c not in exclude]
scaler = StandardScaler()
df[num_features_to_scale] = scaler.fit_transform(df[num_features_to_scale])

In [39]:
df.head()

Unnamed: 0,Price,Odometer_km,Doors,Accidents,Year,Location_City,Location_Rural,Location_Suburb,CarAge,km_per_year_with_safe,Is_Urban,LogPrice
0,1500.0,0.122531,0.254091,0.316968,-1.686714,1,0,0,1.686714,-0.554984,1,7.313887
1,4171.0,0.066468,0.254091,-0.820867,0.794617,0,1,0,-0.794617,0.03392,0,8.336151
2,5331.0,-0.446747,0.254091,-0.820867,0.518913,0,0,1,-0.518913,-0.277699,0,8.581482
3,1500.0,0.197271,0.254091,0.316968,-1.548862,0,0,1,1.548862,-0.534084,0,7.313887
4,1500.0,0.066468,-0.931668,-0.820867,1.621727,1,0,0,-1.621727,1.820554,1,7.313887


In [40]:
OUT_PATH = "car_13_clean_ready.csv"

df.to_csv(OUT_PATH)