<a href="https://colab.research.google.com/github/majidiali1/machine-learning/blob/main/PreProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Cleaning**

In [None]:
import pandas as pd
import numpy as np

def fill_missing_values(df, method='drop', value=None):
    """ methods: Method to handle missing values ('drop', 'constant', 'mean', 'median', 'mode', 'ffill', 'bfill') """

    nNAs = df.isnull().any(axis=1).sum()
    nT = len(df.index)
    pNAs = int(nNAs/nT*100)
    print(f'{nNAs} samples ({pNAs})% include NA values.')

    df_filled = df.copy()

    if method == 'drop':
        df_filled.dropna(inplace=True)
    elif method == 'constant':
        if value is None:
            raise ValueError("For method='constant', a value must be provided.")
        df_filled.fillna(value, inplace=True)
    elif method == 'mean':
        df_filled.fillna(df.mean(), inplace=True)
    elif method == 'median':
        df_filled.fillna(df.median(), inplace=True)
    elif method == 'mode':
        # Mode can return multiple values per column, use the first one
        for column in df_filled.columns:
            df_filled[column].fillna(df_filled[column].mode()[0], inplace=True)
    elif method == 'ffill':
        df_filled.fillna(method='ffill', inplace=True)
    elif method == 'bfill':
        df_filled.fillna(method='bfill', inplace=True)
    else:
        raise ValueError("Unsupported method provided.")

    return df_filled

# Example usage:
# Small frame in which every column has at least one missing entry
data = dict(
    A=[1, np.nan, 3, 4, 5],
    B=[np.nan, 2, 3, np.nan, 5],
    C=[1, 2, np.nan, 4, 5],
)
df = pd.DataFrame(data)

# Impute the gaps with one of the supported strategies (here: 'mean')
df_filled = fill_missing_values(df, 'mean')
print(df_filled)



# **Smooth Noisy Data**

In [34]:
import pandas as pd
import numpy as np

def smooth_data(df, column_name, method='moving_average', window_size=3, alpha=0.3):
    """Return a smoothed version of one DataFrame column as a Series.

    Parameters:
    - df: pandas DataFrame holding the column to smooth.
    - column_name: Name of the column to smooth.
    - method: 'moving_average' (centered rolling mean over ``window_size``
      points; the window may shrink to a single point near the edges) or
      'exponential_moving_average' (recursive exponentially weighted mean
      with smoothing factor ``alpha``).
    - window_size: Window length for 'moving_average'.
    - alpha: Smoothing factor for 'exponential_moving_average'.

    Raises:
    - ValueError: if ``method`` is not one of the two supported names.
    """
    # Reject unknown methods before touching the data.
    if method not in ('moving_average', 'exponential_moving_average'):
        raise ValueError("Unsupported smoothing method provided.")

    series = df[column_name]

    if method == 'moving_average':
        windowed = series.rolling(window=window_size, min_periods=1, center=True)
        return windowed.mean()

    weighted = series.ewm(alpha=alpha, adjust=False)
    return weighted.mean()

# Example usage
# Short series with two spikes (15 and 67) to make the smoothing visible
data = dict(
    time=range(1, 11),
    value=[2, 3, 4, 15, 6, 7, 67, 5, 4, 5],
)
df = pd.DataFrame(data)

# Smooth the same column with both supported methods
smoothed_series_moving_average = smooth_data(df, 'value', 'moving_average', 3)
smoothed_series_exponential = smooth_data(
    df, 'value', 'exponential_moving_average', alpha=0.3
)

# Show both results for comparison
print("Moving Average:\n", smoothed_series_moving_average)
print("\nExponential Moving Average:\n", smoothed_series_exponential)


Moving Average:
 0     2.500000
1     3.000000
2     7.333333
3     8.333333
4     9.333333
5    26.666667
6    26.333333
7    25.333333
8     4.666667
9     4.500000
Name: value, dtype: float64

Exponential Moving Average:
 0     2.000000
1     2.300000
2     2.810000
3     6.467000
4     6.326900
5     6.528830
6    24.670181
7    18.769127
8    14.338389
9    11.536872
Name: value, dtype: float64


# **Detect Outliers**

In [44]:
import pandas as pd
import numpy as np
from scipy import stats

def detect_and_remove_outliers(df, column_name, method='iqr', z_threshold=3.0, iqr_multiplier=1.5):
    """
    Detects and removes outliers in a specified column of a pandas DataFrame using the specified method.
    Prints the percentage of outlier samples before removing them.

    Parameters:
    - df: pandas DataFrame to filter. It is not modified.
    - column_name: Name of the numeric column in which outliers are searched.
    - method: The method for detecting outliers:
        - 'iqr': Uses the Interquartile Range (IQR) for outlier detection.
            IQR is calculated as Q3 - Q1, where Q1 and Q3 are the 25th and 75th percentiles, respectively.
            Outliers are defined as observations that fall below Q1 - iqr_multiplier*IQR
            or above Q3 + iqr_multiplier*IQR.
        - 'z_score': Uses Z-scores for outlier detection.
            Z-score of an observation is calculated as (X - μ) / σ, where X is the observation, μ is the mean,
            and σ is the population standard deviation (ddof=0) of the column.
            An observation is considered an outlier if the absolute value of its Z-score is greater than
            z_threshold.
    - z_threshold: Absolute Z-score above which an observation is an outlier (default 3).
    - iqr_multiplier: Width of the IQR fences, in multiples of the IQR (default 1.5).

    Missing values in the column are ignored when computing the statistics and are never
    flagged as outliers, so their rows are kept.

    Returns:
    - A new DataFrame without the outlier rows; the original index labels are preserved.

    Raises:
    - ValueError: if ``method`` is not 'iqr' or 'z_score'.
    """
    if method not in ('iqr', 'z_score'):
        raise ValueError("Unsupported method provided.")

    values = df[column_name]

    if method == 'iqr':
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - iqr_multiplier * iqr
        upper_bound = q3 + iqr_multiplier * iqr

        outliers = (values < lower_bound) | (values > upper_bound)
    else:
        # pandas skips NaN when computing mean/std, so one missing value
        # does not turn every Z-score into NaN. ddof=0 gives the population
        # standard deviation.
        center = values.mean()
        spread = values.std(ddof=0)
        # A constant column yields 0/0 = NaN scores, which never exceed the
        # threshold, so nothing is flagged.
        z_scores = (values - center).abs() / spread
        outliers = z_scores > z_threshold

    # Calculate and print the percentage of outliers (0% for an empty frame)
    n_rows = len(df)
    outlier_percentage = 100 * outliers.sum() / n_rows if n_rows else 0.0
    print(f"Percentage of outlier samples: {outlier_percentage:.2f}%")

    # Remove outliers; copy so the result is independent of the input frame
    df_cleaned = df.loc[~outliers].copy()

    return df_cleaned

# Example usage
# Sample series with two extreme readings (1000 and 60) among small values
data = dict(
    time=range(1, 11),
    value=[2, 3, 4, 5, 6, 1000, 60, 5, 4, 5],
)
df = pd.DataFrame(data)

# Filter the 'value' column with the IQR rule and show the remaining rows
df_cleaned_iqr = detect_and_remove_outliers(df, 'value', 'iqr')
print("\nDataFrame after removing outliers using IQR:\n", df_cleaned_iqr)

Percentage of outlier samples: 20.00%

DataFrame after removing outliers using IQR:
    time  value
0     1      2
1     2      3
2     3      4
3     4      5
4     5      6
7     8      5
8     9      4
9    10      5
