In [29]:
import pandas as pd
import logging

In [30]:
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

In [31]:
class DataCleaning:
    def __init__(self, raw_dataset):
        self.raw_dataset = raw_dataset
        self.df = None


    """
    reading and encoding the csv file and converting to a Pandas dataframe
    """
    def read_raw_data(self):
        try:
            self.df = pd.read_csv(self.raw_dataset, encoding="ISO-8859-1", engine='python')
            logging.info("Read the dataset successfully.")
            return self.df
        except FileNotFoundError:
            logging.error("Error: The file was not found.")
        except pd.errors.EmptyDataError:
            logging.error("Error: The file is empty.")
        except pd.errors.ParserError:
            logging.error("Error: The file could not be parsed.")


    
    """
    converting column headers to lower cases due to the naming convention of a lot of databases (namely PostgreSQL which we'll be using
    in this project) and SQL prior to replacing the blank space inbetween with underscores
    """
    def convert_columns_to_lowercase(self):
        try:
            self.df.columns = self.df.columns.str.lower()
            logging.info("Columns names converted to lowercase")
        except Exception as e:
            logging.error(f"Failed to convert columns to lowercase, {e}")

        return self.df


    """
    some column headers contain blank space like <invoice id>
    - adding undercore to standardise the namings
    """
    def add_underline_to_columns(self):
        try:
            self.df.columns = self.df.columns.str.replace(" " , "_")
            logging.info("Added underscore to columns")
        except Exception as e:
            logging.error(f"Failed to add underscore to columns, {e}")

        return self.df




    """
    checking if the tax 5% column in the raw dataset conatins the correct values
    """

    def verify_tax_on_price(self):
        try:
            self.df["calculated_tax"] = (self.df['unit_price'] * self.df['quantity']) * 0.05
        except Exception as e:
            logging.error(f"Failed to calculate the 5% tax, {e}")




    """
    USING IQR METHOD TO DETECT OUTLIERS IN COLUMNS
    """
    def outlier_detection(self, column):
        if not pd.api.types.is_numeric_dtype(self.df[column]):
            logging.warning(f"Column '{column}' is not numeric. Skipping outlier detection.")
            return pd.DataFrame()

        Q1 = self.df[column].quantile(0.25)
        Q3 = self.df[column].quantile(0.75)
        IQR = Q3 - Q1

        outliers = self.df[
            (self.df[column] < Q1 - 1.5 * IQR) |
            (self.df[column] > Q3 + 1.5 * IQR)
        ]
        percentage = (len(outliers) / len(self.df)) * 100
        logging.info(f"{percentage:.2f}% of the values in column '{column}' are outliers.")
        
        return outliers



    """
    dropping duplicate rows
    """
    def drop_duplicate_rows(self):
        total_rows = self.df.shape[0]
        duplicate_rows = self.df.duplicated().sum()
        
        try:
            if duplicate_rows == 0:
                logging.info("No duplicate rows were found")
            else:
                self.df = self.df.drop_duplicates()
                removed_rows = total_rows - self.df.shape[0]
                logging.info(f"{removed_rows} rows were removed")
        except Exception as e:
            logging.error(f"Failed to drop duplicate rows, {e}")
            
            

In [32]:
raw_dataset = "../dataset/raw/market_sales.csv"
data = pd.read_csv(raw_dataset)

In [33]:
cleaner = DataCleaning(raw_dataset=raw_dataset)

In [34]:
cleaner.read_raw_data()

2025-06-27 13:29:53,619 - INFO - Read the dataset successfully.


Unnamed: 0,Gender,Invoice ID,Branch,City,Customer type,Product line,Unit price,Quantity,Tax 5%
0,Female,750-67-8428,A,Yangon,Member,Health and beauty,74.69,7,261.4150
1,Female,226-31-3081,C,Naypyitaw,Normal,Electronic accessories,15.28,5,3.8200
2,Female,355-53-5943,A,Yangon,Member,Electronic accessories,68.84,6,20.6520
3,Female,315-22-5665,C,Naypyitaw,Normal,Home and lifestyle,73.56,10,36.7800
4,Female,665-32-9167,A,Yangon,Member,Health and beauty,36.26,2,3.6260
...,...,...,...,...,...,...,...,...,...
995,Male,745-74-0715,A,Yangon,Normal,Electronic accessories,58.03,2,5.8030
996,Male,690-01-6631,B,Mandalay,Normal,Fashion accessories,17.49,10,8.7450
997,Male,233-67-5758,C,Naypyitaw,Normal,Health and beauty,40.35,1,2.0175
998,Male,727-02-1313,A,Yangon,Member,Food and beverages,31.84,1,1.5920


In [35]:
cleaner.convert_columns_to_lowercase()

2025-06-27 13:29:53,631 - INFO - Columns names converted to lowercase


Unnamed: 0,gender,invoice id,branch,city,customer type,product line,unit price,quantity,tax 5%
0,Female,750-67-8428,A,Yangon,Member,Health and beauty,74.69,7,261.4150
1,Female,226-31-3081,C,Naypyitaw,Normal,Electronic accessories,15.28,5,3.8200
2,Female,355-53-5943,A,Yangon,Member,Electronic accessories,68.84,6,20.6520
3,Female,315-22-5665,C,Naypyitaw,Normal,Home and lifestyle,73.56,10,36.7800
4,Female,665-32-9167,A,Yangon,Member,Health and beauty,36.26,2,3.6260
...,...,...,...,...,...,...,...,...,...
995,Male,745-74-0715,A,Yangon,Normal,Electronic accessories,58.03,2,5.8030
996,Male,690-01-6631,B,Mandalay,Normal,Fashion accessories,17.49,10,8.7450
997,Male,233-67-5758,C,Naypyitaw,Normal,Health and beauty,40.35,1,2.0175
998,Male,727-02-1313,A,Yangon,Member,Food and beverages,31.84,1,1.5920


In [36]:
cleaner.add_underline_to_columns()

2025-06-27 13:29:53,642 - INFO - Added underscore to columns


Unnamed: 0,gender,invoice_id,branch,city,customer_type,product_line,unit_price,quantity,tax_5%
0,Female,750-67-8428,A,Yangon,Member,Health and beauty,74.69,7,261.4150
1,Female,226-31-3081,C,Naypyitaw,Normal,Electronic accessories,15.28,5,3.8200
2,Female,355-53-5943,A,Yangon,Member,Electronic accessories,68.84,6,20.6520
3,Female,315-22-5665,C,Naypyitaw,Normal,Home and lifestyle,73.56,10,36.7800
4,Female,665-32-9167,A,Yangon,Member,Health and beauty,36.26,2,3.6260
...,...,...,...,...,...,...,...,...,...
995,Male,745-74-0715,A,Yangon,Normal,Electronic accessories,58.03,2,5.8030
996,Male,690-01-6631,B,Mandalay,Normal,Fashion accessories,17.49,10,8.7450
997,Male,233-67-5758,C,Naypyitaw,Normal,Health and beauty,40.35,1,2.0175
998,Male,727-02-1313,A,Yangon,Member,Food and beverages,31.84,1,1.5920


In [37]:
data.columns

Index(['Gender', 'Invoice ID', 'Branch', 'City', 'Customer type',
       'Product line', 'Unit price', 'Quantity', 'Tax 5%'],
      dtype='object')

In [61]:
cleaner.outlier_detection(column="unit_price")

2025-06-27 13:30:50,370 - INFO - 0.00% of the values in column 'unit_price' are outliers.


Unnamed: 0,gender,invoice_id,branch,city,customer_type,product_line,unit_price,quantity,tax_5%,calculated_tax


# ERRORS IN THE TAX 5% Column
My findings show an inconsistency in data entery for the tax 5% column. Some rows display correct calucated taxes but most others fail to do so. There are cases of the decimal place being worngly used and there are cases of tax 5% columns' values unequal to the actual 5 percent of the total sum (unit price * qunatity). The main issue seems to have to do with incompetency at data entry rather than fraud. At this point of analysis on, I will be using <calculated_5%_tax> column

In [57]:
cleaner.verify_tax_on_price()

In [41]:
cleaner.drop_duplicate_rows()

2025-06-27 13:29:53,683 - INFO - No duplicate rows were found


In [42]:
data["Product line"].unique()

array(['Health and beauty', 'Electronic accessories',
       'Home and lifestyle', 'Food and beverages', 'Fashion accessories',
       'Sports and travel'], dtype=object)

In [63]:
data.isna().sum()

TypeError: unsupported operand type(s) for +: 'int' and 'str'