In [11]:
# Document key findings for the team
findings = {
    'total_rows': len(df),
    'total_columns': len(df.columns),
    'missing_columns': len(missing_data),
    'cleaning_needed': len(cleaning_needed),
    'duplicates': int(duplicate_count)
}

print("\n‚úÖ EDA Complete! Ready for data preprocessing pipeline.")
print(f"Findings: {findings}")


‚úÖ EDA Complete! Ready for data preprocessing pipeline.
Findings: {'total_rows': 2840, 'total_columns': 6, 'missing_columns': 0, 'cleaning_needed': 3, 'duplicates': 528}


In [10]:
print("="*80)
print("üìã INITIAL EDA SUMMARY")
print("="*80)
print(f"Dataset Shape: {df.shape[0]:,} rows √ó {df.shape[1]} columns")
print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"Duplicate Rows: {duplicate_count}")
print(f"Columns with Missing Values: {len(missing_data)}")
print(f"Columns Needing Cleaning: {len(cleaning_needed)}")
print(f"Numerical Columns: {len(df.select_dtypes(include=[np.number]).columns)}")
print(f"Categorical Columns: {len(categorical_cols)}")

print("\nüéØ NEXT STEPS:")
print("1. Clean columns with currency symbols and unit strings (km, cc)")
print("2. Handle missing values based on domain knowledge")
print("3. Convert object columns to appropriate numeric types")
print("4. Standardize categorical variables")
print("5. Remove or investigate duplicate records")
print("6. Perform deeper statistical analysis and feature engineering")

üìã INITIAL EDA SUMMARY
Dataset Shape: 2,840 rows √ó 6 columns
Memory Usage: 0.94 MB
Duplicate Rows: 528
Columns with Missing Values: 0
Columns Needing Cleaning: 3
Numerical Columns: 1
Categorical Columns: 5

üéØ NEXT STEPS:
1. Clean columns with currency symbols and unit strings (km, cc)
2. Handle missing values based on domain knowledge
3. Convert object columns to appropriate numeric types
4. Standardize categorical variables
5. Remove or investigate duplicate records
6. Perform deeper statistical analysis and feature engineering


## 8. Summary of Findings & Next Steps

Consolidate all findings and define preprocessing requirements

In [7]:
duplicate_count = df.duplicated().sum()
print("="*80)
print("DUPLICATE RECORDS CHECK")
print("="*80)
print(f"Total duplicate rows: {duplicate_count}")

if duplicate_count > 0:
    print(f"‚ö†Ô∏è  {duplicate_count} duplicate records found ({duplicate_count/len(df)*100:.2f}%)")
else:
    print("‚úÖ No duplicate records found!")

DUPLICATE RECORDS CHECK
Total duplicate rows: 528
‚ö†Ô∏è  528 duplicate records found (18.59%)


## 7. Duplicate Records Check

Identify and report duplicate entries in the dataset

In [9]:
print("="*80)
print("CATEGORICAL COLUMNS SUMMARY")
print("="*80)
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols[:5]:  # Show first 5 categorical columns
    print(f"\n{col}:")
    print(f"  Unique values: {df[col].nunique()}")
    print(f"  Top 5 values:\n{df[col].value_counts().head()}")

CATEGORICAL COLUMNS SUMMARY

name:
  Unique values: 309
  Top 5 values:
name
Toyota Corolla           114
Honda Civic               97
Mercedes-Benz C-Class     76
Toyota Camry              69
Ford F-150                65
Name: count, dtype: int64

miles:
  Unique values: 2280
  Top 5 values:
miles
25 miles         5
55 miles         4
107,676 miles    3
135,320 miles    3
18,666 miles     3
Name: count, dtype: int64

color:
  Unique values: 64
  Top 5 values:
color
Black exterior, Black interior     576
White exterior, Black interior     522
Gray exterior, Black interior      350
Silver exterior, Black interior    200
White exterior, Gray interior      154
Name: count, dtype: int64

condition:
  Unique values: 25
  Top 5 values:
condition
No accidents reported, 1 Owner     1575
No accidents reported, 2 Owners     426
1 accident reported, 1 Owner        236
1 accident reported, 2 Owners       157
No accidents reported, 3 Owners     152
Name: count, dtype: int64

price:
  Unique values:

In [None]:
print("="*80)
print("NUMERICAL COLUMNS SUMMARY")
print("="*80)
df.describe()

## 6. Basic Statistical Summary

Descriptive statistics for numerical and categorical columns

In [5]:
print("="*80)
print("DATA QUALITY ISSUES - COLUMNS REQUIRING CLEANING")
print("="*80)

cleaning_needed = []

# Check each column for common data quality issues
for col in df.columns:
    issues = []
    
    # Skip if all null
    if df[col].isnull().all():
        issues.append("All values are NULL")
    
    # Check for string contamination in numeric-looking columns
    if df[col].dtype == 'object':
        sample_values = df[col].dropna().head(10).tolist()
        
        # Check for currency symbols
        if any(pd.notna(val) and any(sym in str(val) for sym in ['$', '‚Ç¨', '¬£', '‚Çπ', 'Rs', 'USD']) for val in sample_values):
            issues.append("Contains currency symbols")
        
        # Check for 'km' string (mileage)
        if any(pd.notna(val) and 'km' in str(val).lower() for val in sample_values):
            issues.append("Contains 'km' string")
        
        # Check for 'cc' string (engine size)
        if any(pd.notna(val) and 'cc' in str(val).lower() for val in sample_values):
            issues.append("Contains 'cc' string")
        
        # Check for percentage signs
        if any(pd.notna(val) and '%' in str(val) for val in sample_values):
            issues.append("Contains percentage signs")
        
        # Check for comma separators in numbers
        if any(pd.notna(val) and ',' in str(val) and str(val).replace(',', '').replace('.', '').isdigit() for val in sample_values):
            issues.append("Contains comma separators")
        
        # Check for mixed data types
        if df[col].dropna().apply(type).nunique() > 1:
            issues.append("Mixed data types")
        
        # Check for whitespace issues
        if any(pd.notna(val) and (str(val).startswith(' ') or str(val).endswith(' ')) for val in sample_values):
            issues.append("Leading/trailing whitespace")
    
    # Store findings
    if issues:
        cleaning_needed.append({
            'Column': col,
            'Current_Type': str(df[col].dtype),
            'Issues': ', '.join(issues),
            'Sample_Values': str(sample_values[:3])
        })

# Display cleaning requirements
if cleaning_needed:
    cleaning_df = pd.DataFrame(cleaning_needed)
    print(cleaning_df.to_string(index=False))
    print(f"\n‚ö†Ô∏è  Total columns needing cleaning: {len(cleaning_needed)}/{len(df.columns)}")
else:
    print("‚úÖ All columns appear clean!")

DATA QUALITY ISSUES - COLUMNS REQUIRING CLEANING
   Column Current_Type                    Issues                                                                                        Sample_Values
     name       object      Contains 'cc' string                                             ['Kia Forte', 'Chevrolet Silverado 1500', 'Toyota RAV4']
condition       object      Contains 'cc' string ['No accidents reported, 1 Owner', '1 accident reported, 1 Owner', 'No accidents reported, 1 Owner']
    price       object Contains currency symbols                                                                    ['$15,988', '$38,008', '$24,988']

‚ö†Ô∏è  Total columns needing cleaning: 3/6


## 5. Data Quality Assessment

Identify columns requiring cleaning (currency symbols, unit strings, whitespace, etc.)

In [None]:
# Visualize missing values
if len(missing_data) > 0:
    plt.figure(figsize=(10, 6))
    plt.barh(missing_data['Column'], missing_data['Missing_Percentage'], color='coral')
    plt.xlabel('Missing Percentage (%)', fontsize=12)
    plt.ylabel('Column Name', fontsize=12)
    plt.title('Missing Values by Column', fontsize=14, fontweight='bold')
    plt.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.show()

In [4]:
# Calculate missing values
missing_data = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': df.isnull().sum(),
    'Missing_Percentage': (df.isnull().sum() / len(df) * 100).round(2)
})

# Filter columns with missing values
missing_data = missing_data[missing_data['Missing_Count'] > 0].sort_values(
    'Missing_Percentage', ascending=False
).reset_index(drop=True)

print("="*80)
print("MISSING VALUES SUMMARY")
print("="*80)
if len(missing_data) > 0:
    print(missing_data.to_string(index=False))
    print(f"\n‚ö†Ô∏è  Total columns with missing values: {len(missing_data)}/{len(df.columns)}")
else:
    print("‚úÖ No missing values found!")

MISSING VALUES SUMMARY
‚úÖ No missing values found!


## 4. Missing Values Analysis

Identify and visualize missing data patterns

In [None]:
# Display data types
print("="*80)
print("DATA TYPES SUMMARY")
print("="*80)
print(df.dtypes.value_counts())

In [None]:
# Display data info
print("="*80)
print("DATASET INFO")
print("="*80)
df.info()

In [3]:
# Display first 5 rows
print("="*80)
print("FIRST 5 ROWS")
print("="*80)
df.head()

FIRST 5 ROWS


Unnamed: 0,name,year,miles,color,condition,price
0,Kia Forte,2022,"41,406 miles","Gray exterior, Black interior","No accidents reported, 1 Owner","$15,988"
1,Chevrolet Silverado 1500,2021,"15,138 miles","White exterior, Black interior","1 accident reported, 1 Owner","$38,008"
2,Toyota RAV4,2022,"32,879 miles","Silver exterior, Unknown interior","No accidents reported, 1 Owner","$24,988"
3,Honda Civic,2020,"37,190 miles","Blue exterior, Black interior","No accidents reported, 1 Owner","$18,998"
4,Honda Civic,2020,"27,496 miles","Black exterior, Black interior","No accidents reported, 1 Owner","$19,498"


## 3. Initial Data Inspection

Display dataset structure, first rows, and column information

In [2]:
# Define data path
DATA_PATH = Path("../data/raw/car_web_scraped_dataset.csv")

# Check if file exists
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}")

# Load data
df = pd.read_csv(DATA_PATH)

print(f"‚úÖ Data loaded successfully")
print(f"üìä Dataset shape: {df.shape[0]:,} rows √ó {df.shape[1]} columns")

‚úÖ Data loaded successfully
üìä Dataset shape: 2,840 rows √ó 6 columns


## 2. Load Raw Data (DVC-Tracked)

Load the dataset from the DVC-tracked data directory. Verify file existence and display shape.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings

warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

# Plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("‚úÖ Libraries imported successfully")

‚úÖ Libraries imported successfully


## 1. Setup & Imports

Import required libraries and configure display options for analysis

# Used Car Dynamic Pricing - Initial EDA

**Objective:** Understand the raw scraped dataset structure, identify data quality issues, and plan preprocessing steps.

**Dataset:** `data/raw/car_web_scraped_dataset.csv` (tracked with DVC)