In [174]:
import numpy as np
import pandas as pd

In [None]:
# NOTE: these are shell commands, not Python. Run them in a terminal (or behind
# a shell cell magic); executed as-is in a Python cell they are a syntax error.
# Append a title line to README.md (the file is created if it does not exist).
echo "# DataPrepKit" >> README.md
# Create a Git repository in the current directory.
git init
# Stage README.md for the first commit.
git add README.md
# Record the staged file as a commit with the message "first commit".
git commit -m "first commit"
# Rename the current branch to "main" (-M forces the rename).
git branch -M main
# Register the GitHub repository as the remote named "origin".
git remote add origin https://github.com/mona0101/DataPrepKit.git
# Push "main" to "origin" and set it as the upstream branch (-u).
git push -u origin main

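# Testing

Run the cells in the "functions" section further down before running this section: the cells here call helpers (`read_csv`, `metrics`, `count_NA`, ...) that are defined there.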

In [None]:
# Load the user-selected file into `dataFrame`.
# The reader helpers (read_csv / read_excel / read_json) are defined in later
# cells of this notebook and must have been executed before this cell runs.
file_path = input("Enter the file path without quotes: ").strip()

# Compare against the real file extension. A plain substring test would also
# match directory names, e.g. "/csv_exports/data.json" would be read as CSV.
lowered_path = file_path.lower()

if lowered_path.endswith('.csv'):
    dataFrame = read_csv(file_path)
elif lowered_path.endswith(('.xlsx', '.xls')):
    dataFrame = read_excel(file_path)
elif lowered_path.endswith('.json'):
    dataFrame = read_json(file_path)
else:
    # Reset the variable so a frame left over from an earlier run of this cell
    # is not silently analysed by the cells below.
    dataFrame = None
    print("Unsupported file type.")

# The readers return None after printing their own error message, so only
# display when something was actually loaded.
if dataFrame is not None:
    display(dataFrame)


In [None]:
# Key statistics (mean, median, mode, standard deviation) for each numeric column.
display(metrics(dataFrame))

In [None]:
# Alternative overview: summary() returns the output of DataFrame.describe().
display(summary(dataFrame))

In [None]:
# Show only the mode(s) of each column.
display(mode(dataFrame))

In [None]:
# Show the structure of the data frame and the dtype of every column.
# info() prints its report itself and returns None, so it is called directly:
# wrapping it in display() would render an extra "None" under the report.
info(dataFrame)


In [None]:
# Total number of missing values across the whole data frame.
print(count_NA(dataFrame))

In [None]:
# Fill the missing values of a user-chosen column with that column's mean,
# then show the resulting data frame.
column_name = input("Enter column name that you want to fill: ").strip()
display(replace_na_with_mean(dataFrame, column_name))

In [None]:
# Fill the missing values of a user-chosen column with that column's mode,
# then show the resulting data frame.
column_name = input("Enter column name that you want to fill: ").strip()
display(replace_na_with_mode(dataFrame, column_name))

In [None]:
# Alternative to filling: show the data with every row that contains a missing
# value removed. drop_NA() returns the result of DataFrame.dropna(), which is a
# new frame, so `dataFrame` itself is not changed by this cell.
display(drop_NA(dataFrame))

In [None]:
# Categorical data encoding: one-hot encode the object-dtype columns
# (via pd.get_dummies) and show the encoded frame.
display(encode_categorical(dataFrame))


# functions 

In [None]:
def read_csv(file_path_name):
    """
    Load a CSV file into a DataFrame.

    Parameters:
    - file_path_name (str): Location of the CSV file.

    Returns:
    - pandas.DataFrame: The parsed CSV contents, or None when reading failed.

    Failures are not raised: a message is printed (a dedicated one for a
    missing file, a generic one for any other exception) and None is returned.
    """
    frame = None
    try:
        frame = pd.read_csv(file_path_name)
    except FileNotFoundError:
        print(f"Error: File '{file_path_name}' not found.")
    except Exception as e:
        # Best-effort loader: report the problem instead of propagating it.
        print(f"Error reading CSV file '{file_path_name}': {e}")
    return frame


In [None]:
def read_excel(file_path_name, sheet_name=0):
    """
    Load one sheet of an Excel workbook into a DataFrame.

    Parameters:
    - file_path_name (str): Location of the Excel file.
    - sheet_name (str or int, optional): Sheet name or index passed through to
      pandas.read_excel. Defaults to 0.

    Returns:
    - pandas.DataFrame: The sheet contents, or None when reading failed.

    Failures are not raised: a message is printed (a dedicated one for a
    missing file, a generic one for any other exception) and None is returned.
    """
    frame = None
    try:
        frame = pd.read_excel(file_path_name, sheet_name=sheet_name)
    except FileNotFoundError:
        print(f"Error: File '{file_path_name}' not found.")
    except Exception as e:
        # Best-effort loader: report the problem instead of propagating it.
        print(f"Error reading Excel file '{file_path_name}': {e}")
    return frame


In [None]:
def read_json(file_path_name):
    """
    Load a JSON file into a DataFrame.

    Parameters:
    - file_path_name (str): Location of the JSON file.

    Returns:
    - pandas.DataFrame: The parsed JSON contents, or None when reading failed.

    Failures are not raised: a message is printed (a dedicated one for a
    missing file, a generic one for any other exception) and None is returned.
    """
    frame = None
    try:
        frame = pd.read_json(file_path_name)
    except FileNotFoundError:
        print(f"Error: File '{file_path_name}' not found.")
    except Exception as e:
        # Best-effort loader: report the problem instead of propagating it.
        print(f"Error reading JSON file '{file_path_name}': {e}")
    return frame

# Data Summary functions

In [None]:
def summary(dataframe):
    """
    Describe a DataFrame with pandas' built-in descriptive statistics.

    Parameters:
    - dataframe (pandas.DataFrame): The DataFrame to describe.

    Returns:
    - pandas.DataFrame: The result of ``dataframe.describe()`` called with its
      default arguments.
    """
    described = dataframe.describe()
    return described


In [None]:
def metrics(df):
    """
    Returns basic summary statistics for the numeric columns of a DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame to summarize.

    Returns:
        pandas.DataFrame: One row per numeric column with the columns
        "Column", "Mean", "Median", "Mode" and "Standard Deviation".
        When several values tie for the mode, the first one returned by
        ``Series.mode()`` is reported. A column without any non-missing value
        gets NaN as its mode. If ``df`` has no numeric columns the result is
        empty but still carries the five column headers.
    """
    stat_columns = ["Column", "Mean", "Median", "Mode", "Standard Deviation"]
    summary_stats = []

    for col in df.select_dtypes(include=[np.number]):
        values = df[col]
        modes = values.mode()
        summary_stats.append({
            "Column": col,
            "Mean": values.mean(),
            "Median": values.median(),
            # mode() is empty for an all-missing column; indexing it blindly
            # with iloc[0] would raise IndexError.
            "Mode": modes.iloc[0] if not modes.empty else np.nan,
            "Standard Deviation": values.std(),
        })

    # Passing the column list keeps the schema stable even with zero rows.
    return pd.DataFrame(summary_stats, columns=stat_columns)

In [None]:
def mode(dataframe):
    """
    Return the most frequent value(s) of the given data.

    Parameters:
    - dataframe (pandas.DataFrame): The DataFrame whose mode(s) are wanted.

    Returns:
    - The result of ``dataframe.mode()``; for a pandas.DataFrame this is a
      DataFrame holding the mode(s) of each column.
    """
    most_frequent = dataframe.mode()
    return most_frequent

In [None]:
def info(dataframe):
    """
    Print a concise summary of a DataFrame.

    This is a thin wrapper around ``DataFrame.info()``, which writes its
    report to standard output instead of returning it.

    Parameters:
        dataframe (pandas.DataFrame): The DataFrame to get information about.

    Returns:
        None: The report is printed, not returned, so there is nothing useful
        to pass on to ``print()`` or ``display()``.
    """
    dataframe.info()
    # Explicit for readers: DataFrame.info() itself returns None.
    return None

# Handling Missing Values:

In [None]:
def count_NA(dataFrame):
    """
    Count every missing value in a DataFrame.

    Parameters:
    dataFrame (pandas.DataFrame): The DataFrame to analyze.

    Returns:
    The total number of missing cells, summed over all columns.
    """
    missing_per_column = dataFrame.isna().sum()
    return missing_per_column.sum()

In [None]:
def drop_NA(dataFrame):
    """
    Drop every row that contains at least one missing value.

    Parameters:
    - dataFrame (pandas.DataFrame): Data to be cleaned.

    Returns:
    - pandas.DataFrame: The result of ``dataFrame.dropna()``; the input frame
      is not modified.
    """
    complete_rows = dataFrame.dropna()
    return complete_rows

In [None]:
def replace_na_with_mean(df, column_name):
    """
    Fill the missing values of one column with that column's mean.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the column.
    column_name (str): The name of the column to process.

    Returns:
    pandas.DataFrame: The same ``df`` object that was passed in; the filled
    column is assigned back into it, so the caller's frame is updated too.
    """
    column = df[column_name]
    column_mean = column.mean()
    df[column_name] = column.fillna(column_mean)
    return df

In [None]:
def replace_na_with_median(df, column_name):
    """
    Fill the missing values of one column with that column's median.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the column.
    column_name (str): The name of the column to process.

    Returns:
    pandas.DataFrame: The same ``df`` object that was passed in; the filled
    column is assigned back into it, so the caller's frame is updated too.
    """
    column = df[column_name]
    column_median = column.median()
    df[column_name] = column.fillna(column_median)
    return df


In [None]:
def replace_na_with_mode(df, column_name):
    """
    Replace missing values in a specified column with the mode of that column.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the column.
    column_name (str): The name of the column to process.

    Returns:
    pandas.DataFrame: The same ``df`` object that was passed in, with the
    filled column assigned back into it. When several values tie for the mode,
    the first one returned by ``Series.mode()`` is used. If the column has no
    non-missing value there is no mode and ``df`` is returned unchanged.
    """
    modes = df[column_name].mode()
    if modes.empty:
        # All values are missing: nothing to fill with, and indexing the empty
        # result would raise.
        return df

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selected column: the in-place call acts on a temporary object, which
    # triggers a chained-assignment warning and leaves `df` untouched when
    # pandas Copy-on-Write is active. This also mirrors the mean/median helpers.
    df[column_name] = df[column_name].fillna(modes.iloc[0])
    return df

# Categorical Data Encoding:

In [None]:
def encode_categorical(df):
    """
    One-hot encode the object-dtype columns of a DataFrame.

    Parameters:
    df : DataFrame
        Input DataFrame; its columns of dtype ``object`` are the ones encoded.

    Returns:
     DataFrame
        New DataFrame produced by ``pd.get_dummies`` in which each selected
        column is replaced by its indicator columns; other columns are kept.
    """
    object_frame = df.select_dtypes(include=['object'])
    return pd.get_dummies(df, columns=object_frame.columns)