In [8]:
import numpy as np
import pandas as pd

def clean_data(df: pd.DataFrame, iqr_factor: float = 1.5) -> pd.DataFrame:
    """Return a cleaned copy of the raw car-listings frame.

    The input frame is never modified. Steps, in order:

    1. 'Date' gaps are filled forward from the previous valid value and the
       column is rendered as ``dd/mm/yyyy`` text.
    2. 'Km' has thousands separators and the ' Km' suffix stripped, is
       converted to whole numbers and renamed to 'Mileage_in_KM'.
    3. Rows whose mileage lies outside
       ``[Q1 - iqr_factor * IQR, Q3 + iqr_factor * IQR]`` are dropped.
    4. 'Price' has thousands separators and the ' EGP' suffix stripped and is
       converted to a number; unparseable, missing and zero prices become NaN.
    5. The 'Source.Name' column is dropped.

    Parameters
    ----------
    df : pd.DataFrame
        Raw listings. Must contain the columns 'Date' (datetime-like),
        'Km' (text), 'Price' (text) and 'Source.Name'.
    iqr_factor : float, default 1.5
        Multiplier applied to the inter-quartile range when computing the
        mileage fences.

    Returns
    -------
    pd.DataFrame
        A new frame with the original row labels of the surviving rows.
        'Mileage_in_KM' is an integer column; 'Price' is always float64 so
        the schema does not depend on whether any price happened to be
        missing.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    AttributeError
        If 'Date' is not datetime-like or 'Km'/'Price' do not support the
        ``.str`` accessor.
    """
    # Work on a private copy so the caller's frame is left exactly as passed in.
    cleaned = df.copy()

    # Fill gaps while the values are still datetimes, then format. A gap at
    # the very start has nothing to fill from and stays missing.
    cleaned['Date'] = cleaned['Date'].ffill().dt.strftime('%d/%m/%Y')

    # '92,000 Km' -> 92000. Literal (non-regex) replacement is requested
    # explicitly because the default of `regex` differs between pandas versions.
    km_text = (
        cleaned['Km']
        .str.replace(',', '', regex=False)
        .str.replace(' Km', '', regex=False)
    )
    # NOTE(review): unparseable/missing mileage becomes 0 and therefore takes
    # part in the quartile computation below - confirm this is intended.
    cleaned['Km'] = pd.to_numeric(km_text, errors='coerce').fillna(0).astype(int)
    cleaned = cleaned.rename(columns={'Km': 'Mileage_in_KM'})

    # Drop mileage outliers using inclusive IQR fences.
    mileage = cleaned['Mileage_in_KM']
    q1 = mileage.quantile(0.25)
    q3 = mileage.quantile(0.75)
    iqr = q3 - q1
    in_range = mileage.between(q1 - iqr_factor * iqr, q3 + iqr_factor * iqr)
    # .copy() detaches the filtered rows so the column assignments below do
    # not write into a slice of the previous frame (SettingWithCopyWarning).
    cleaned = cleaned.loc[in_range].copy()

    # '1,600,000 EGP' -> 1600000.0
    price_text = (
        cleaned['Price']
        .str.replace(',', '', regex=False)
        .str.replace(' EGP', '', regex=False)
    )
    # Whole-number prices; anything unparseable or missing collapses to 0 ...
    price_whole = pd.to_numeric(price_text, errors='coerce').fillna(0).astype(int)
    # ... and 0 means "no usable price", so it is reported as NaN. Converting
    # to float first avoids assigning NaN into an integer column.
    cleaned['Price'] = price_whole.astype(float).mask(price_whole == 0)

    return cleaned.drop(columns=['Source.Name'])

# Location of the raw listings workbook, kept in one named constant so it is
# easy to find and change.
RAW_DATA_PATH = r'd:\depi final project\cars1.xlsx'

# Load the raw listings from the workbook at RAW_DATA_PATH.
df = pd.read_excel(RAW_DATA_PATH)

# Clean a copy so that `df` keeps the data exactly as it was loaded.
df_clean = clean_data(df.copy())
df_clean.head()

Unnamed: 0,URL,Price,Date,Make,Model,Used since,Mileage_in_KM,Transmission,City,Color,Version,Fuel,Class,Body Style
0,https://eg.hatla2ee.com/en/car/mercedes/250/70...,1600000.0,21/10/2025,Mercedes,250,2010.0,190,automatic,Cairo,Black,E250,gas,,
1,https://eg.hatla2ee.com/en/car/skoda/kodiaq/70...,2100000.0,20/10/2025,Skoda,Kodiaq,2021.0,92000,automatic,6 October,Black,,gas,A/T / SPORT LINE,
2,https://eg.hatla2ee.com/en/car/mercedes/g-clas...,10800000.0,20/10/2025,Mercedes,G Class,2022.0,36000,automatic,Tagamo3 - New Cairo,Dark blue,,gas,,SUV
3,https://eg.hatla2ee.com/en/car/bmw/520/7079345,4350000.0,21/10/2025,BMW,520,2024.0,14000,automatic,Tagamo3 - New Cairo,Black,,gas,,Sedan
4,https://eg.hatla2ee.com/en/car/mercedes/e-200/...,4500000.0,21/10/2025,Mercedes,E 200,2022.0,12000,automatic,Tagamo3 - New Cairo,Gray,,gas,,Sedan
