#  DataPrepKit Class 


In [1]:
import pandas as pd
import numpy as np

class DataPrepKit:
    """
    A small toolkit for loading, summarizing and cleaning a pandas DataFrame.

    The instance keeps one DataFrame internally. Most methods accept an
    optional DataFrame argument; when it is omitted they operate on the
    stored one. Cleaning and encoding methods store their result back on
    the instance.
    """

    def __init__(self, df=None):
        """
        Initializes the DataPrepKit.

        Parameters:
            df (pandas.DataFrame, optional): DataFrame to store. Defaults to None.
        """
        # Stored under the same attribute every other method reads, so a
        # DataFrame passed to the constructor is actually used.
        self.__data = df

    def _resolve(self, frame):
        """
        Return *frame*, or the stored DataFrame when *frame* is None.

        Raises:
            ValueError: If no DataFrame was passed and none is stored.
        """
        if frame is None:
            frame = self.__data
        if frame is None:
            raise ValueError(
                "No data available: call read_file() or set_data(), "
                "or pass a DataFrame."
            )
        return frame

    # ------------------------------ read file ------------------------------

    def read_file(self, file_path=None):
        """
        Read data from a file into a DataFrame and store it internally.

        Supported formats are CSV, Excel (.xls, .xlsx, .xlsm, .xlsb) and
        JSON. The format is taken from the file name's extension(s), looking
        from the last extension backwards, so "data.csv.gz" is read as CSV.

        Any failure (unsupported format, missing file, parse error) is
        reported with a printed message and the stored data is reset to
        None; no exception propagates to the caller.

        Parameters:
            file_path (str, optional): The path to the input file. When
                omitted, the path is requested interactively via input().
        """
        if file_path is None:
            file_path = input("Enter the file path without quotes: ")
        file_path = file_path.strip()

        readers = {
            "csv": pd.read_csv,
            "xls": pd.read_excel,
            "xlsx": pd.read_excel,
            "xlsm": pd.read_excel,
            "xlsb": pd.read_excel,
            "json": pd.read_json,
        }

        # Look only at the file name (not its directories) and accept both
        # "/" and "\" as separators so Windows paths work everywhere.
        base_name = file_path.replace("\\", "/").rsplit("/", 1)[-1].lower()
        extensions = base_name.split(".")[1:]

        try:
            reader = None
            for extension in reversed(extensions):
                if extension in readers:
                    reader = readers[extension]
                    break
            if reader is None:
                raise ValueError("Unsupported file format.")
            self.__data = reader(file_path)
        except Exception as e:
            # Interactive boundary: report the problem instead of crashing
            # the session, and leave no stale data behind.
            print(f"An error occurred while reading the data: {str(e)}")
            self.__data = None

    # --------------------------- setter and getter --------------------------

    def get_data(self):
        """
        Get the stored DataFrame containing the data.

        Returns:
            pandas.DataFrame or None: The stored DataFrame, or None when
            nothing has been loaded or set.
        """
        return self.__data

    def set_data(self, data):
        """
        Set the DataFrame containing the data.

        Parameters:
            data (pandas.DataFrame): The DataFrame to store.
        """
        self.__data = data

    # ------------------------ Data Summary functions ------------------------

    def summary(self, data=None):
        """
        Generate summary statistics of the DataFrame.

        Parameters:
            data (pandas.DataFrame, optional): The DataFrame to summarize.
                Defaults to the internally stored data.

        Returns:
            pandas.DataFrame: The result of DataFrame.describe().

        Raises:
            ValueError: If no DataFrame was passed and none is stored.
        """
        return self._resolve(data).describe()

    def mode(self, dataframe=None):
        """
        Get the mode(s) of each column in the DataFrame.

        Parameters:
            dataframe (pandas.DataFrame, optional): The DataFrame to find
                the mode(s) of. Defaults to the internally stored data.

        Returns:
            pandas.DataFrame: A DataFrame containing the mode(s) for each column.

        Raises:
            ValueError: If no DataFrame was passed and none is stored.
        """
        # axis=0 computes the mode column by column.
        return self._resolve(dataframe).mode(axis=0)

    def data_info(self, dataframe=None):
        """
        Print a concise summary of a DataFrame (columns, non-null counts, dtypes).

        Parameters:
            dataframe (pandas.DataFrame, optional): The DataFrame to describe.
                Defaults to the internally stored data.

        Returns:
            None: DataFrame.info() writes its report to standard output and
            returns nothing.

        Raises:
            ValueError: If no DataFrame was passed and none is stored.
        """
        return self._resolve(dataframe).info()

    # ------------------------ Handling Missing Values -----------------------

    def count_NA(self, dataFrame=None):
        """
        Count the number of missing values in a DataFrame.

        Parameters:
            dataFrame (pandas.DataFrame, optional): The DataFrame to analyze.
                Defaults to the internally stored data.

        Returns:
            int: The total number of missing values.

        Raises:
            ValueError: If no DataFrame was passed and none is stored.
        """
        # First sum() gives per-column counts, the second totals them.
        return int(self._resolve(dataFrame).isnull().sum().sum())

    def drop_NA(self, dataFrame=None):
        """
        Remove rows with missing values and store the result internally.

        Parameters:
            dataFrame (pandas.DataFrame, optional): Data to be cleaned.
                Defaults to the internally stored data.

        Returns:
            pandas.DataFrame: Cleaned data without any missing values.

        Raises:
            ValueError: If no DataFrame was passed and none is stored.
        """
        self.__data = self._resolve(dataFrame).dropna()
        return self.__data

    def replace_na_with_mean(self, column_name, df=None):
        """
        Replace missing values in a column with the mean of that column.

        The DataFrame is modified in place and stored internally.

        Parameters:
            column_name (str): The name of the column to process.
            df (pandas.DataFrame, optional): The DataFrame containing the
                column. Defaults to the internally stored data.

        Returns:
            pandas.DataFrame: The modified DataFrame.

        Raises:
            ValueError: If no DataFrame was passed and none is stored.
        """
        df = self._resolve(df)
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on df[column_name]: the chained in-place form acts on a temporary
        # and does not update df under pandas Copy-on-Write.
        df[column_name] = df[column_name].fillna(df[column_name].mean())
        self.__data = df
        return df

    def replace_na_with_median(self, column_name, df=None):
        """
        Replace missing values in a column with the median of that column.

        The DataFrame is modified in place and stored internally.

        Parameters:
            column_name (str): The name of the column to process.
            df (pandas.DataFrame, optional): The DataFrame containing the
                column. Defaults to the internally stored data.

        Returns:
            pandas.DataFrame: DataFrame with missing values replaced by column median.

        Raises:
            ValueError: If no DataFrame was passed and none is stored.
        """
        df = self._resolve(df)
        df[column_name] = df[column_name].fillna(df[column_name].median())
        self.__data = df
        return df

    def replace_na_with_mode(self, column_name, df=None):
        """
        Replace missing values in a column with the mode of that column.

        When several values tie for the mode, the first one returned by
        Series.mode() is used. A column with no non-missing values has no
        mode and is left unchanged. The DataFrame is modified in place and
        stored internally.

        Parameters:
            column_name (str): The name of the column to process.
            df (pandas.DataFrame, optional): The DataFrame containing the
                column. Defaults to the internally stored data.

        Returns:
            pandas.DataFrame: DataFrame with missing values replaced by column mode.

        Raises:
            ValueError: If no DataFrame was passed and none is stored.
        """
        df = self._resolve(df)
        modes = df[column_name].mode()
        if not modes.empty:
            # Assignment form for the same Copy-on-Write reason as the mean.
            df[column_name] = df[column_name].fillna(modes.iloc[0])
        self.__data = df
        return df

    # ----------------------- Categorical Data Encoding ----------------------

    def encode_categorical(self, df=None):
        """
        One-hot encode the categorical columns of a DataFrame.

        Columns of dtype ``object`` or ``category`` are replaced by indicator
        columns via pandas.get_dummies; other columns are kept as they are.
        The encoded DataFrame is stored internally.

        Parameters:
            df (pandas.DataFrame, optional): Input DataFrame with categorical
                columns to be encoded. Defaults to the internally stored data.

        Returns:
            pandas.DataFrame: DataFrame with categorical columns encoded.

        Raises:
            ValueError: If no DataFrame was passed and none is stored.
        """
        df = self._resolve(df)
        categorical_columns = df.select_dtypes(include=['object', 'category']).columns
        encoded_df = pd.get_dummies(df, columns=categorical_columns)
        self.__data = encoded_df
        return encoded_df


# Examples 

In [2]:
dpk = DataPrepKit()  # start with no DataFrame
dpk.read_file()  # data reading: prompts for a file path and loads that file
dpk.get_data()  # show the DataFrame after reading

Enter the file path without quotes: E:\iris.json


Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [3]:
dpk.summary()  # key statistical summaries (DataFrame.describe) of the stored data

Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [4]:
dpk.mode()  # most frequent value(s) of every column

Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth,species
0,5.0,3.0,1.4,0.2,setosa
1,,,1.5,,versicolor
2,,,,,virginica


In [5]:
dpk.data_info()  # prints the column names, non-null counts, dtypes and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sepalLength  150 non-null    float64
 1   sepalWidth   150 non-null    float64
 2   petalLength  150 non-null    float64
 3   petalWidth   150 non-null    float64
 4   species      150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


# Note: the DataPrepKit class offers several ways to handle NA values (e.g. replace_na_with_mean, replace_na_with_mode and replace_na_with_median), but only one of them (drop_NA) is demonstrated here.

#dpk.replace_na_with_mode('price') # example usage, where 'price' is the column whose NA values we want to replace with the mode.

In [7]:
dpk.drop_NA()  # drop every row that contains a missing value
dpk.count_NA()  # total number of missing values left in the DataFrame

0

In [187]:
dpk.encode_categorical()  # one-hot encode the object-dtype columns
dpk.get_data()  # show the stored DataFrame after encoding

Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth,species_setosa,species_versicolor,species_virginica
0,5.1,3.5,1.4,0.2,1,0,0
1,4.9,3.0,1.4,0.2,1,0,0
2,4.7,3.2,1.3,0.2,1,0,0
3,4.6,3.1,1.5,0.2,1,0,0
4,5.0,3.6,1.4,0.2,1,0,0
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,0,0,1
146,6.3,2.5,5.0,1.9,0,0,1
147,6.5,3.0,5.2,2.0,0,0,1
148,6.2,3.4,5.4,2.3,0,0,1
