<a href="https://colab.research.google.com/github/majidiali1/machine-learning/blob/main/PreProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import pandas as pd
import numpy as np

def fill_missing_values(df: pd.DataFrame, method: str = 'drop', value=None) -> pd.DataFrame:
    """Return a new DataFrame with the missing values of *df* handled.

    A one-line summary of how many rows contain at least one NA value is
    printed before the frame is processed. The input frame is never modified.

    Args:
        df: Input DataFrame, possibly containing NA values.
        method: Strategy used to handle missing values. One of:
            'drop'     - remove every row that contains an NA value;
            'constant' - replace NA values with ``value``;
            'mean'     - replace NA values with the column mean
                         (numeric columns only);
            'median'   - replace NA values with the column median
                         (numeric columns only);
            'mode'     - replace NA values with the first mode of each column;
            'ffill'    - propagate the last valid observation forward;
            'bfill'    - use the next valid observation to fill backward.
        value: Replacement used when ``method='constant'``; ignored otherwise.

    Returns:
        A new DataFrame with missing values handled according to ``method``.

    Raises:
        ValueError: If ``method`` is not supported, or if ``method='constant'``
            is requested without a ``value``.
    """
    n_rows = len(df.index)
    n_na_rows = int(df.isnull().any(axis=1).sum())
    # An empty frame has no rows to divide by; report 0% instead of crashing.
    pct_na_rows = int(n_na_rows / n_rows * 100) if n_rows else 0
    print(f'{n_na_rows} samples ({pct_na_rows})% include NA values.')

    if method == 'drop':
        return df.dropna()

    if method == 'constant':
        if value is None:
            raise ValueError("For method='constant', a value must be provided.")
        return df.fillna(value)

    if method == 'mean':
        # numeric_only avoids a TypeError on frames that also hold
        # non-numeric columns; those columns are simply left untouched.
        return df.fillna(df.mean(numeric_only=True))

    if method == 'median':
        return df.fillna(df.median(numeric_only=True))

    if method == 'mode':
        df_filled = df.copy()
        for column in df_filled.columns:
            modes = df_filled[column].mode()
            # mode() is empty for an all-NA column: nothing to fill with.
            if modes.empty:
                continue
            # Mode can return multiple values per column, use the first one.
            # Assign back instead of a chained inplace fillna, which does not
            # update the parent frame under pandas Copy-on-Write.
            df_filled[column] = df_filled[column].fillna(modes.iloc[0])
        return df_filled

    if method == 'ffill':
        # fillna(method=...) is deprecated; use the dedicated method.
        return df.ffill()

    if method == 'bfill':
        return df.bfill()

    raise ValueError("Unsupported method provided.")

# Example usage:
# Build a small demo frame in which every column has at least one gap.
data = dict(
    A=[1, np.nan, 3, 4, 5],
    B=[np.nan, 2, 3, np.nan, 5],
    C=[1, 2, np.nan, 4, 5],
)
df = pd.DataFrame(data)

# Impute the gaps with one of the supported strategies (here: column means)
# and show the resulting frame.
df_filled = fill_missing_values(df, method='mean')
print(df_filled)


4 samples (80)% include NA values.
      A         B    C
0  1.00  3.333333  1.0
1  3.25  2.000000  2.0
2  3.00  3.000000  3.0
3  4.00  3.333333  4.0
4  5.00  5.000000  5.0
