In [1]:
import pandas as pd
import numpy as np
import os
from scipy import stats
from scipy.stats import zscore

### Read the data file

In [2]:
def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, using ``pd.read_csv`` defaults.
    """
    frame = pd.read_csv(file_path)
    return frame

# df = pd.read_csv('/Users/mohammed/Downloads/used-cars-sales-prediction/data/raw.csv')


### Find Missing Values

In [3]:
def identify_missing_data(df):
    """Print and return a per-column summary of missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by the columns of ``df`` with two columns:
        ``'Missing Count'`` (number of null cells) and
        ``'Missing Percentage'`` (that count as a percentage of ``len(df)``).
    """
    total_rows = len(df)
    null_counts = df.isnull().sum()

    report = pd.DataFrame(
        {
            'Missing Count': null_counts,
            'Missing Percentage': null_counts.div(total_rows).mul(100),
        }
    )

    # The report is echoed to stdout as well as returned to the caller.
    print("Missing Data Report:")
    print(report)
    return report

## Find Duplicates

In [4]:
def find_duplicate_rows(df):
    """Return every row of ``df`` that has at least one exact copy.

    ``keep=False`` marks all members of a duplicate group, so the first
    occurrence is included alongside its repeats.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan for fully identical rows.

    Returns
    -------
    pandas.DataFrame
        The duplicated rows with their original index labels; empty when
        no row is repeated.
    """
    return df[df.duplicated(keep=False)]


In [5]:
def drop_duplicates(df):
    """Remove repeated rows, reporting how many were found.

    The count covers only the repeats (the first occurrence of each row is
    not counted) and is printed before the rows are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to de-duplicate; it is not modified.

    Returns
    -------
    tuple
        ``(deduplicated_frame, duplicate_count)``.
    """
    n_repeats = df.duplicated().sum()
    print(f"Number of duplicates before dropping: {n_repeats}")
    return df.drop_duplicates(), n_repeats

## Remove Outliers

In [6]:
def remove_outliers(df, column_name, threshold=3):
    """Drop rows whose ``column_name`` value is a z-score outlier.

    A row is removed when the absolute z-score of its value exceeds
    ``threshold``. The input frame is left untouched: no helper column is
    written to it, so an existing ``'z_score'`` column (if any) survives.

    Missing values are ignored when computing the mean and standard
    deviation. Rows whose z-score cannot be computed (a missing value, or a
    column with zero variance) are kept, because they cannot be classified
    as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; not modified.
    column_name : str
        Numeric column to screen.
    threshold : float, default 3
        Maximum absolute z-score a row may have and still be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the retained rows, with the same
        columns as ``df``.
    """
    values = df[column_name].astype(float).to_numpy()

    # With the default nan_policy='propagate' a single NaN turns every
    # z-score into NaN, which would silently discard the whole frame.
    z_scores = pd.Series(np.asarray(zscore(values, nan_policy='omit')))

    # NaN z-scores mean "cannot judge", not "outlier", so those rows stay.
    keep_mask = (z_scores.isna() | (z_scores.abs() <= threshold)).to_numpy()

    # .copy() hands the caller an independent frame, as the original did.
    df_cleaned = df[keep_mask].copy()

    rows_affected = len(df) - len(df_cleaned)
    print(f"Number of rows affected by removing outliers: {rows_affected}")
    return df_cleaned

## Unique Brands

In [9]:
# Looking for misspellings in brand names

def get_unique_brands(df, column='brand'):
    """Return the distinct values of ``column`` in order of first appearance.

    Listing each distinct value once makes misspelled or inconsistently
    cased brand names easy to spot by eye.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the brand column.
    column : str, default 'brand'
        Name of the column to inspect.

    Returns
    -------
    numpy.ndarray
        The result of ``df[column].unique()``.
    """
    return df[column].unique()


# NOTE(review): absolute, machine-specific path; this cell only runs where
# that file exists.
df = pd.read_csv('/Users/mohammed/Downloads/used-cars-sales-prediction/data/raw1.csv')
get_unique_brands(df)

array(['Ford', 'Hyundai', 'Lexus', 'INFINITI', 'Audi', 'Acura', 'BMW',
       'Tesla', 'Land', 'Aston', 'Toyota', 'Lincoln', 'Jaguar',
       'Mercedes-Benz', 'Dodge', 'Nissan', 'Genesis', 'Chevrolet', 'Kia',
       'Jeep', 'Bentley', 'Honda', 'Lucid', 'MINI', 'Porsche', 'Hummer',
       'Chrysler', 'Volvo', 'Cadillac', 'Lamborghini', 'Maserati',
       'Volkswagen', 'Subaru', 'Rivian', 'GMC', 'RAM', 'Alfa', 'Ferrari',
       'Scion', 'Mitsubishi', 'Mazda', 'Saturn', 'Bugatti', 'Polestar',
       'Rolls-Royce', 'McLaren', 'Buick', 'Lotus', 'Pontiac', 'FIAT',
       'Karma', 'Saab', 'Mercury', 'Plymouth', 'smart', 'Maybach',
       'Suzuki'], dtype=object)

In [7]:
# Load the raw CSV through the load_data helper.
# NOTE(review): absolute, machine-specific path; this cell only runs where
# that file exists.
raw_data = load_data('/Users/mohammed/Downloads/used-cars-sales-prediction/data/raw1.csv')

# Prints the per-column missing-value report and keeps it for later use.
missing_report = identify_missing_data(raw_data)

# Show every row that belongs to a duplicate group before anything is dropped.
duplicate_rows = find_duplicate_rows(raw_data)
print("Duplicate rows found:")
print(duplicate_rows)

# raw_data is rebound to the de-duplicated frame from this point on.
raw_data, duplicate_count = drop_duplicates(raw_data)
print(f"Number of duplicates removed: {duplicate_count}")



Missing Data Report:
              Missing Count  Missing Percentage
brand                     0            0.000000
model                     0            0.000000
model_year                0            0.000000
milage                    0            0.000000
fuel_type               170            4.240459
engine                    0            0.000000
transmission              0            0.000000
ext_col                   0            0.000000
int_col                   0            0.000000
accident                113            2.818658
clean_title             596           14.866550
price                     0            0.000000
Duplicate rows found:
Empty DataFrame
Columns: [brand, model, model_year, milage, fuel_type, engine, transmission, ext_col, int_col, accident, clean_title, price]
Index: []
Number of duplicates before dropping: 0
Number of duplicates removed: 0
