In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
CURRENT_YEAR = 2025

In [3]:
# ================================
# Step 1: Load & Inspect
# ================================

In [4]:
CSV_PATH = "car_l3_dataset.csv"

In [5]:
df = pd.read_csv(CSV_PATH)

In [6]:
print("=== INITIAL HEAD (10) ===")
print(df.head(10))

=== INITIAL HEAD (10) ===
    Price  Odometer_km  Doors  Accidents Location  Year
0  $1,500     137830.0    4.0          1     City  1998
1  4171.0          NaN    4.0          0    Rural  2016
2  $5,331     107302.0    4.0          0   Suburb  2014
3  1500.0     141838.0    4.0          1   Suburb  1999
4  1500.0          NaN    3.0          0     City  2022
5  $1,500     211171.0    4.0          0       ??  2003
6  1500.0     222235.0    4.0          2    Rural  2004
7  $1,500     105068.0    5.0          1     City  2002
8  $2,291      90015.0    4.0          0    Rural  2007
9  1500.0     125976.0    2.0          0     City  2002


In [7]:
print("=== INITIAL SHAPE ===")
print(df.shape)

=== INITIAL SHAPE ===
(145, 6)


In [8]:
print("=== INITIAL INFO ===")
print(df.info())

=== INITIAL INFO ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Price        145 non-null    object 
 1   Odometer_km  138 non-null    float64
 2   Doors        138 non-null    float64
 3   Accidents    145 non-null    int64  
 4   Location     140 non-null    object 
 5   Year         145 non-null    int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 6.9+ KB
None


In [9]:
print("=== INITIAL MISSING VALUE ===")
print(df.isnull().sum())

=== INITIAL MISSING VALUE ===
Price          0
Odometer_km    7
Doors          7
Accidents      0
Location       5
Year           0
dtype: int64


In [10]:
print("=== INITIAL DESCRIPE ===")
print(df.describe())

=== INITIAL DESCRIPE ===
         Odometer_km       Doors   Accidents         Year
count     138.000000  138.000000  145.000000   145.000000
mean   134823.615942    3.775362    0.717241  2010.027586
std     61312.448209    0.854512    0.879579     7.252341
min      5000.000000    2.000000    0.000000  1998.000000
25%     97504.000000    3.000000    0.000000  2004.000000
50%    128548.000000    4.000000    0.000000  2009.000000
75%    170048.500000    4.000000    1.000000  2016.000000
max    395000.000000    5.000000    3.000000  2023.000000


In [11]:
 print("=== LOCATION VALUE COUNTS BEFORE FIX ===")
print(df["Location"].value_counts(dropna=False))

=== LOCATION VALUE COUNTS BEFORE FIX ===
Location
City      59
Suburb    45
Rural     21
Subrb      8
??         7
NaN        5
Name: count, dtype: int64


In [12]:
# ================================
# Step 2: Clean Price
# ================================

In [13]:
df["Price"] = df["Price"].replace(r"[\$,]", "",regex=True).astype(float)

In [14]:
print("Price skewness:",df["Price"].skew())

Price skewness: 7.871113660850301


In [15]:
# ================================
# Step 3: Fix Location Categories
# ================================

In [16]:
df['Location'] = df['Location'].str.strip().str.title()

In [17]:
df["Location"] = df["Location"].replace({"Subrb" : "Suburb","??" :pd.NA}) 

In [18]:
print("===  LOCATION  VALUE COUNTS AFTER FIX ===")
print(df["Location"].value_counts(dropna=False))

===  LOCATION  VALUE COUNTS AFTER FIX ===
Location
City      59
Suburb    53
Rural     21
<NA>       7
NaN        5
Name: count, dtype: int64


In [19]:
# ================================
# Step 4: Impute Missing
# ================================

In [20]:
df["Location"] = df["Location"].fillna(df["Location"].mode()[0])

In [21]:
df["Doors"] = df["Doors"].fillna(df["Doors"].mode()[0])

In [22]:
df["Odometer_km"] = df["Odometer_km"].fillna(df["Odometer_km"].median())

In [23]:
print("Missing after imputation:", df.isnull().sum())

Missing after imputation: Price          0
Odometer_km    0
Doors          0
Accidents      0
Location       0
Year           0
dtype: int64


In [24]:
# ================================
# Step 5: Remove Duplicates
# ================================

In [25]:
befre = df.shape
print("before removing duplicate : ",befre)

before removing duplicate :  (145, 6)


In [26]:
df = df.drop_duplicates()

In [27]:
after = df.shape
print("after removing duplicate : ",after)

after removing duplicate :  (140, 6)


In [28]:
#print(f"\nRemoved {befre - after} duplicate rows")

In [29]:
# ================================
# Step 6: Outlier Capping
# ================================

In [30]:
def iqr_fun(series, k=1.5):
    q1, q3 = series.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower = q1 - k * iqr
    upper = q3 + k * iqr
    return lower , upper

In [31]:
low_price, high_price = iqr_fun(df["Price"])

In [32]:
low_Odometer, high_Odometer = iqr_fun(df["Odometer_km"])

In [33]:
df["Price"]    = df["Price"].clip(lower=low_price, upper=high_price)

In [34]:
df["Odometer_km"] = df["Odometer_km"].clip(lower=low_Odometer, upper=high_Odometer )

In [35]:
print("price after IQR clipping")
print(df["Price"].describe())

price after IQR clipping
count     140.000000
mean     3175.456250
std      2601.848629
min      1500.000000
25%      1500.000000
50%      1500.000000
75%      4489.750000
max      8974.375000
Name: Price, dtype: float64


In [36]:
print("Odometer_km after IQR clipping")
print(df["Odometer_km"].describe())

Odometer_km after IQR clipping
count       140.000000
mean     130945.403571
std       53815.006935
min        5000.000000
25%       97844.000000
50%      128548.000000
75%      167501.500000
max      271987.750000
Name: Odometer_km, dtype: float64


In [37]:
#import seaborn as sns
#import matplotlib.pyplot as plt
# waxaad arkeysa in uu jirin outlier oo laga saaray 
#sns.boxplot(x=df["Price"])
#plt.show()

#sns.boxplot(x=df["Odometer_km"])
#plt.show()


In [38]:
# ================================
# Step 7: One-Hot Encode
# ================================

In [39]:
df= pd.get_dummies(df, columns=["Location"], drop_first=False)

In [40]:
print([c for c in df.columns if c.startswith("Location")])

['Location_City', 'Location_Rural', 'Location_Suburb']


In [41]:
# ================================
# Step 8: Feature Engineering
# ================================

In [42]:
df["CarAge"] = CURRENT_YEAR - df["Year"]

In [43]:
df["Km_per_year"] = df["Odometer_km"] / df["CarAge"].replace(0,np.nan)

In [44]:
df ["LogPrice"] = np. log1p(df["Price"])

In [45]:
df["Is_Urban"] = (df["Location_City"] + df["Location_Suburb"] > 0).astype(int)

In [46]:
df['Accident_flag'] = (df["Accidents"] > 0).astype(int)

In [47]:
print(df.head(3))

    Price  Odometer_km  Doors  Accidents  Year  Location_City  Location_Rural  \
0  1500.0     137830.0    4.0          1  1998           True           False   
1  4171.0     128548.0    4.0          0  2016          False            True   
2  5331.0     107302.0    4.0          0  2014          False           False   

   Location_Suburb  CarAge   Km_per_year  LogPrice  Is_Urban  Accident_flag  
0            False      27   5104.814815  7.313887         1              1  
1            False       9  14283.111111  8.336151         0              0  
2             True      11   9754.727273  8.581482         1              0  


In [48]:
# ================================
# Step 9: Feature Scalling
# ================================

In [49]:
dont_scale = ["LogPrice","Price"]

In [50]:
numeric_col = df.select_dtypes(include=["int64", "float64"]).columns.to_list()

In [51]:
exclude = [c for c in df.columns if c.startswith("Location")] + ["Is_Urban","Accident_flag"]

In [52]:
num_features_to_scale = [c for c in numeric_col if c not in dont_scale and c not in exclude]

In [53]:
scaler = StandardScaler()

In [54]:
df[num_features_to_scale] = scaler.fit_transform(df[num_features_to_scale])

In [55]:
print("after scalinggg")
print(df.head(3))

after scalinggg
    Price  Odometer_km     Doors  Accidents      Year  Location_City  \
0  1500.0     0.128390  0.254091   0.316968 -1.686714           True   
1  4171.0    -0.044709  0.254091  -0.820867  0.794617          False   
2  5331.0    -0.440923  0.254091  -0.820867  0.518913          False   

   Location_Rural  Location_Suburb    CarAge  Km_per_year  LogPrice  Is_Urban  \
0           False            False  1.686714    -0.553364  7.313887         1   
1            True            False -0.794617    -0.003812  8.336151         0   
2           False             True -0.518913    -0.274949  8.581482         1   

   Accident_flag  
0              1  
1              0  
2              0  


In [56]:
# ================================
# Step 10: Final Checks & Save
# ================================

In [57]:
print("\n=== FINAL HEAD ===")
print(df.head())


=== FINAL HEAD ===
    Price  Odometer_km     Doors  Accidents      Year  Location_City  \
0  1500.0     0.128390  0.254091   0.316968 -1.686714           True   
1  4171.0    -0.044709  0.254091  -0.820867  0.794617          False   
2  5331.0    -0.440923  0.254091  -0.820867  0.518913          False   
3  1500.0     0.203135  0.254091   0.316968 -1.548862          False   
4  1500.0    -0.044709 -0.931668  -0.820867  1.621727           True   

   Location_Rural  Location_Suburb    CarAge  Km_per_year  LogPrice  Is_Urban  \
0           False            False  1.686714    -0.553364  7.313887         1   
1            True            False -0.794617    -0.003812  8.336151         0   
2           False             True -0.518913    -0.274949  8.581482         1   
3           False             True  1.548862    -0.532378  7.313887         1   
4           False            False -1.621727     1.706594  7.313887         1   

   Accident_flag  
0              1  
1              0  
2  

In [58]:
print("\n=== FINAL INFO ===")
print(df.info())


=== FINAL INFO ===
<class 'pandas.core.frame.DataFrame'>
Index: 140 entries, 0 to 139
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Price            140 non-null    float64
 1   Odometer_km      140 non-null    float64
 2   Doors            140 non-null    float64
 3   Accidents        140 non-null    float64
 4   Year             140 non-null    float64
 5   Location_City    140 non-null    bool   
 6   Location_Rural   140 non-null    bool   
 7   Location_Suburb  140 non-null    bool   
 8   CarAge           140 non-null    float64
 9   Km_per_year      140 non-null    float64
 10  LogPrice         140 non-null    float64
 11  Is_Urban         140 non-null    int32  
 12  Accident_flag    140 non-null    int32  
dtypes: bool(3), float64(8), int32(2)
memory usage: 11.3 KB
None


In [59]:

print("\n=== FINAL MISSING VALUES ===")
print(df.isnull().sum())


=== FINAL MISSING VALUES ===
Price              0
Odometer_km        0
Doors              0
Accidents          0
Year               0
Location_City      0
Location_Rural     0
Location_Suburb    0
CarAge             0
Km_per_year        0
LogPrice           0
Is_Urban           0
Accident_flag      0
dtype: int64


In [60]:

print("\n=== FINAL DESCRIBE ===")
print(df.describe())


=== FINAL DESCRIBE ===
             Price   Odometer_km         Doors     Accidents          Year  \
count   140.000000  1.400000e+02  1.400000e+02  1.400000e+02  1.400000e+02   
mean   3175.456250  3.172066e-18  2.077703e-16  6.344132e-18  2.686740e-15   
std    2601.848629  1.003591e+00  1.003591e+00  1.003591e+00  1.003591e+00   
min    1500.000000 -2.348743e+00 -2.117428e+00 -8.208670e-01 -1.686714e+00   
25%    1500.000000 -6.173048e-01 -9.316683e-01 -8.208670e-01 -8.596039e-01   
50%    1500.000000 -4.470894e-02  2.540913e-01 -8.208670e-01 -3.249362e-02   
75%    4489.750000  6.817310e-01  2.540913e-01  3.169684e-01  8.290796e-01   
max    8974.375000  2.630285e+00  1.439851e+00  2.592639e+00  1.759579e+00   

             CarAge   Km_per_year    LogPrice    Is_Urban  Accident_flag  
count  1.400000e+02  1.400000e+02  140.000000  140.000000     140.000000  
mean  -9.516197e-18  2.537653e-17    7.797661    0.850000       0.478571  
std    1.003591e+00  1.003591e+00    0.684154   

In [61]:
print("\n=== FINAL SHAPE ===")
print(df.shape)


=== FINAL SHAPE ===
(140, 13)


In [62]:
print("\n=== FINAL CLEANED DATA ===")
OUT_PATH = "Car_l3_Clean.csv"
df.to_csv(OUT_PATH, index=False)


=== FINAL CLEANED DATA ===
