In [None]:
import pandas as pd
import numpy as np
import os
from scipy import stats
from scipy.stats import zscore

### Read the data file

In [None]:
def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str or path-like or file-like
        Location of the CSV; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, using ``pd.read_csv`` defaults.
    """
    loaded_frame = pd.read_csv(file_path)
    return loaded_frame

# df = pd.read_csv('/Users/mohammed/Downloads/used-cars-sales-prediction/data/raw.csv')


### Find Missing Values

In [None]:
def identify_missing_data(df):
    """Summarise missing values per column, print the summary, and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by the columns of ``df`` with two columns:
        'Missing Count' (number of null cells) and 'Missing Percentage'
        (that count divided by ``len(df)``, times 100).
    """
    null_counts = df.isnull().sum()
    total_rows = len(df)

    # Start from the counts, then attach the percentage alongside them.
    report = null_counts.to_frame(name='Missing Count')
    report['Missing Percentage'] = (null_counts / total_rows) * 100

    print("Missing Data Report:")
    print(report)
    return report

## Find Duplicates

In [None]:
def find_duplicate_rows(df):
    """Return every row of ``df`` that has at least one exact duplicate.

    ``keep=False`` flags all members of a duplicate group, so the first
    occurrence is included in the result alongside its repeats.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to search.

    Returns
    -------
    pandas.DataFrame
        The duplicated rows, with their original index labels.
    """
    return df[df.duplicated(keep=False)]


In [None]:
def drop_duplicates(df):
    """Remove repeated rows and report how many there were.

    Prints the number of rows flagged by ``df.duplicated()`` (repeats after
    the first occurrence) before dropping them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to de-duplicate; it is not modified.

    Returns
    -------
    tuple
        ``(deduplicated_frame, duplicate_count)``.
    """
    repeated_flags = df.duplicated()
    n_repeated = repeated_flags.sum()
    print(f"Number of duplicates before dropping: {n_repeated}")

    deduplicated = df.drop_duplicates()
    return deduplicated, n_repeated

## Remove `$` and thousands separators from the price column to convert it to a numeric data type

In [None]:
def clean_price_column(df, column_name="price"):
    """Convert a currency-formatted text column to float.

    Dollar signs and commas are stripped from the values, then the column
    is cast to float. The column is replaced in place on ``df`` and the same
    frame is returned.

    A column that is already numeric is only cast to float. This makes the
    function safe to call again on an already-cleaned frame (for example when
    re-running a notebook cell); the ``.str`` accessor would otherwise raise
    on non-string data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column; modified in place.
    column_name : str, default "price"
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``column_name`` as float.
    """
    prices = df[column_name]

    # Already numeric: nothing to strip, just normalise the dtype.
    if pd.api.types.is_numeric_dtype(prices):
        df[column_name] = prices.astype(float)
        return df

    df[column_name] = (
        prices
        .str.replace('$', '', regex=False)  # Remove dollar sign
        .str.replace(',', '', regex=False)  # Remove commas
        .astype(float)  # Convert to float
    )
    return df

## Remove Outliers

In [None]:
def remove_outliers(df, column_name, threshold=3):
    """Drop rows whose value in ``column_name`` is a z-score outlier.

    A row is removed when the absolute z-score of its value exceeds
    ``threshold``. The number of removed rows is printed.

    The z-scores are kept in a local array rather than written to a
    temporary column, so the input frame is left untouched and an existing
    column named 'z_score' is preserved. Missing values are ignored when
    computing the mean and standard deviation (``nan_policy='omit'``). Rows
    whose z-score is NaN (a missing value, or a column with zero variance)
    are not treated as outliers and are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; not modified.
    column_name : str
        Numeric column on which z-scores are computed.
    threshold : float, default 3
        Maximum absolute z-score a row may have and still be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame without the outlier rows, with the same columns as ``df``.
    """
    z_scores = np.asarray(zscore(df[column_name], nan_policy='omit'), dtype=float)

    # NaN compares False here, so rows with undefined z-scores are retained.
    is_outlier = np.abs(z_scores) > threshold

    original_row_count = len(df)
    df_cleaned = df[~is_outlier]
    rows_affected = original_row_count - len(df_cleaned)
    print(f"Number of rows affected by removing outliers: {rows_affected}")
    return df_cleaned

In [None]:
# End-to-end cleaning run: load, report missing values, de-duplicate,
# convert price to a number, then remove price outliers.
# NOTE: the path below is absolute and machine-specific; adjust it when
# running this notebook elsewhere.
raw_data = load_data('/Users/mohammed/Downloads/used-cars-sales-prediction/data/raw1.csv')

missing_report = identify_missing_data(raw_data)

duplicate_rows = find_duplicate_rows(raw_data)
print("Duplicate rows found:")
print(duplicate_rows)

raw_data, duplicate_count = drop_duplicates(raw_data)
print(f"Number of duplicates removed: {duplicate_count}")

# The price column must be converted to float before outlier removal:
# z-scores cannot be computed on the raw currency-formatted strings.
raw_data = clean_price_column(raw_data)

raw_data = remove_outliers(raw_data, 'price')
