In [1]:
%pip install pandas 

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import pandas as pd


In [3]:
class DatasetLoader:
    def load_data(self, filepath):
        return pd.read_csv(filepath, encoding = "ISO-8859-1", engine='python')

In [20]:


class DataCleaner:
    def handle_missing_values(self, data):
        print(data.isnull().sum())  # Count missing values per column
        data = data.dropna()  # Drop rows with any null values
        return data

    def check_header_data_types(self, data, data_types):
        for column, dtype in data_types.items():
            if column in data.columns:
                try:
                    data[column] = data[column].astype(dtype)
                except ValueError as e:
                    print(f"Error converting {column} to {dtype}: {e}")
        return data

    def validate_values(self, data):
        data_types = {
            "InvoiceNo": "object",
            "StockCode": "object",
            "Description": "object",
            "Quantity": "float64",
            "InvoiceDate": "datetime64[ns]",
            "UnitPrice": "float64",
            "CustomerID": "float64",
            "Country": "object"
        }
        # Step 1: Fix data types first (may introduce new NaNs)
        data = self.check_header_data_types(data, data_types)
        
        # Step 2: Remove duplicates based on key columns
        data = self.handle_duplicates(data, subset=["InvoiceNo", "StockCode"])
        return data
    
    

    def handle_duplicates(self, data, subset=None, keep='first'):
        """
        Remove duplicate rows and print a detailed report.
        """
        # Track initial state
        initial_rows = len(data)
        duplicate_mask = data.duplicated(subset=subset, keep=keep)
        duplicate_count = duplicate_mask.sum()
    
        # Print duplicate report
        print("\n=== Duplicate Report ===")
        print(f"Initial rows: {initial_rows}")
        print(f"Duplicate rows found: {duplicate_count}")
    
        if duplicate_count > 0:
            # Show example duplicates (first 2 rows)
            print("\nExample duplicate rows:")
            duplicates = data[duplicate_mask].head(2)
            print(duplicates.to_string(index=False))
    
        # Remove duplicates
        data_cleaned = data.drop_duplicates(subset=subset, keep=keep)
        final_rows = len(data_cleaned)
        print(f"\nRows after removal: {final_rows}")
        print(f"Total duplicates removed: {initial_rows - final_rows}")
    
        return data_cleaned

    def unique_check(self, data):
            for column in data.columns:
                if data[column].duplicated().any():
                    print(f"Warning: Duplicate values found in column {column}")
            return data

    def save_chanegs(self, data):
        data.to_csv('../dataset/cleaned_data.csv', index=False)
        print("Cleaned data saved successfully!")
        return data

In [22]:
filepath = "../dataset/data.csv"
loader = DatasetLoader()
# Load data
data = loader.load_data(filepath)
print(f"Initial shape: {data.shape}")





Initial shape: (541909, 8)


In [23]:
cleaner = DataCleaner()

# Handle missing values
data = cleaner.handle_missing_values(data)
print(f"Shape after handling missing values: {data.shape}")


InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64
Shape after handling missing values: (406829, 8)


In [24]:
# Validate data types and remove duplicates
data = cleaner.validate_values(data)
print(f"Final shape after all cleaning: {data.shape}")



=== Duplicate Report ===
Initial rows: 406829
Duplicate rows found: 10148

Example duplicate rows:
InvoiceNo StockCode                     Description  Quantity         InvoiceDate  UnitPrice  CustomerID        Country
   536381     71270                 PHOTO CLIP LINE       3.0 2010-12-01 09:41:00       1.25     15311.0 United Kingdom
   536409    90199C 5 STRAND GLASS NECKLACE CRYSTAL       1.0 2010-12-01 11:45:00       6.35     17908.0 United Kingdom

Rows after removal: 396681
Total duplicates removed: 10148
Final shape after all cleaning: (396681, 8)


In [25]:
# Save cleaned data
print(cleaner.save_chanegs(data))



Cleaned data saved successfully!
       InvoiceNo StockCode                          Description  Quantity  \
0         536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER       6.0   
1         536365     71053                  WHITE METAL LANTERN       6.0   
2         536365    84406B       CREAM CUPID HEARTS COAT HANGER       8.0   
3         536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE       6.0   
4         536365    84029E       RED WOOLLY HOTTIE WHITE HEART.       6.0   
...          ...       ...                                  ...       ...   
541904    581587     22613          PACK OF 20 SPACEBOY NAPKINS      12.0   
541905    581587     22899         CHILDREN'S APRON DOLLY GIRL        6.0   
541906    581587     23254        CHILDRENS CUTLERY DOLLY GIRL        4.0   
541907    581587     23255      CHILDRENS CUTLERY CIRCUS PARADE       4.0   
541908    581587     22138        BAKING SET 9 PIECE RETROSPOT        3.0   

               InvoiceDate  UnitPrice  Cus