<a href="https://colab.research.google.com/github/majidiali1/machine-learning/blob/main/PreProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Cleaning**

In [None]:
import pandas as pd
import numpy as np

def fill_missing_values(df, method='drop', value=None):
    """Return a copy of ``df`` with missing values dropped or imputed.

    Prints how many rows contain at least one missing value before handling
    them. The input frame is never modified.

    Parameters:
    - df: Pandas DataFrame to clean.
    - method: How to handle missing values:
        - 'drop': remove every row that contains a missing value.
        - 'constant': replace missing values with ``value``.
        - 'mean' / 'median': replace with the column mean / median
          (numeric columns only; other columns are left untouched).
        - 'mode': replace with the column's most frequent value (the first
          one when several values tie).
        - 'ffill' / 'bfill': propagate the previous / next valid value.
    - value: Fill value, required when ``method='constant'``.

    Returns:
    - A new DataFrame with missing values handled.

    Raises:
    - ValueError: if ``method`` is unsupported, or ``method='constant'`` is
      used without a ``value``.
    """
    n_rows = len(df.index)
    n_rows_with_na = int(df.isnull().any(axis=1).sum())
    # Guard the percentage so an empty frame does not divide by zero.
    pct_rows_with_na = int(n_rows_with_na / n_rows * 100) if n_rows else 0
    print(f'{n_rows_with_na} samples ({pct_rows_with_na}%) include NA values.')

    if method == 'drop':
        df_filled = df.dropna()
    elif method == 'constant':
        if value is None:
            raise ValueError("For method='constant', a value must be provided.")
        df_filled = df.fillna(value)
    elif method == 'mean':
        # numeric_only avoids a TypeError when the frame has text columns.
        df_filled = df.fillna(df.mean(numeric_only=True))
    elif method == 'median':
        df_filled = df.fillna(df.median(numeric_only=True))
    elif method == 'mode':
        # Collect one fill value per column, then fill the frame in one call.
        # Filling a column Series in place does not reliably write back to
        # the parent frame, and an all-NaN column has no mode at all.
        fill_values = {}
        for column in df.columns:
            modes = df[column].mode()
            if not modes.empty:
                fill_values[column] = modes.iloc[0]
        df_filled = df.fillna(fill_values) if fill_values else df.copy()
    elif method == 'ffill':
        # DataFrame.ffill/bfill replace the deprecated fillna(method=...).
        df_filled = df.ffill()
    elif method == 'bfill':
        df_filled = df.bfill()
    else:
        raise ValueError("Unsupported method provided.")

    return df_filled

# Example usage:
# Build a small frame in which every column has at least one gap
data = dict(
    A=[1, np.nan, 3, 4, 5],
    B=[np.nan, 2, 3, np.nan, 5],
    C=[1, 2, np.nan, 4, 5],
)
df = pd.DataFrame(data)

# Impute each gap using the 'mean' method, then show the result
df_filled = fill_missing_values(df, method='mean')
print(df_filled)



# **Smooth Noisy Data**

In [34]:
import pandas as pd
import numpy as np

def smooth_data(df, column_name, method='moving_average', window_size=3, alpha=0.3):
    """Return a smoothed copy of one column as a pandas Series.

    Parameters:
    - df: Pandas DataFrame holding the column.
    - column_name: Name of the column to smooth.
    - method: 'moving_average' (centered rolling mean over ``window_size``
      points, shrinking the window at the edges) or
      'exponential_moving_average' (recursive EWM with factor ``alpha``).
    - window_size: Window length, used by 'moving_average' only.
    - alpha: Smoothing factor, used by 'exponential_moving_average' only.

    Raises:
    - ValueError: if ``method`` is not one of the two supported names.
    """
    if method not in ('moving_average', 'exponential_moving_average'):
        raise ValueError("Unsupported smoothing method provided.")

    series = df[column_name]

    if method == 'moving_average':
        # min_periods=1 keeps the edge points instead of turning them into NaN.
        windowed = series.rolling(window=window_size, min_periods=1, center=True)
        return windowed.mean()

    # adjust=False selects the recursive form of the exponential average.
    weighted = series.ewm(alpha=alpha, adjust=False)
    return weighted.mean()

# Example usage
# Ten readings that contain two spikes (15 and 67)
data = dict(
    time=range(1, 11),
    value=[2, 3, 4, 15, 6, 7, 67, 5, 4, 5],
)
df = pd.DataFrame(data)

# Smooth the 'value' column once with each supported method
smoothed_series_moving_average = smooth_data(
    df, 'value', method='moving_average', window_size=3
)
smoothed_series_exponential = smooth_data(
    df, 'value', method='exponential_moving_average', alpha=0.3
)

# Show both smoothed series
print("Moving Average:\n", smoothed_series_moving_average)
print("\nExponential Moving Average:\n", smoothed_series_exponential)


Moving Average:
 0     2.500000
1     3.000000
2     7.333333
3     8.333333
4     9.333333
5    26.666667
6    26.333333
7    25.333333
8     4.666667
9     4.500000
Name: value, dtype: float64

Exponential Moving Average:
 0     2.000000
1     2.300000
2     2.810000
3     6.467000
4     6.326900
5     6.528830
6    24.670181
7    18.769127
8    14.338389
9    11.536872
Name: value, dtype: float64


# **Detect Outliers**

In [44]:
import pandas as pd
import numpy as np
from scipy import stats

def detect_and_remove_outliers(df, column_name, method='iqr', z_threshold=3.0):
    """
    Detects and removes outliers in a specified column of a pandas DataFrame using the specified method.
    Prints the percentage of outlier samples before removing them.

    Parameters:
    - df: Pandas DataFrame containing the data.
    - column_name: Name of the numeric column to inspect.
    - method: The method for detecting outliers:
        - 'iqr': Uses the Interquartile Range (IQR) for outlier detection.
            IQR is calculated as Q3 - Q1, where Q1 and Q3 are the 25th and 75th percentiles, respectively.
            Outliers are defined as observations that fall below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
        - 'z_score': Uses Z-scores for outlier detection.
            Z-score of an observation is calculated as (X - μ) / σ, where X is the observation, μ is the mean,
            and σ is the (population) standard deviation of the column.
            An observation is considered an outlier if the absolute value of its Z-score exceeds
            ``z_threshold``.
    - z_threshold: Absolute Z-score above which a value counts as an outlier (default 3).
        Only used by the 'z_score' method. Note that in a sample of n rows no Z-score can
        exceed sqrt(n - 1), so very small samples may need a lower threshold.

    Returns:
    - A new DataFrame without the outlier rows (original index labels are kept).

    Raises:
    - ValueError: if ``method`` is not 'iqr' or 'z_score'.
    """
    if method not in ('iqr', 'z_score'):
        raise ValueError("Unsupported method provided.")

    values = df[column_name]

    if method == 'iqr':
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        outliers = (values < lower_bound) | (values > upper_bound)
    else:
        # Computed with pandas so that missing values are skipped instead of
        # turning every Z-score into NaN; ddof=0 is the population deviation.
        sigma = values.std(ddof=0)
        if sigma > 0:
            outliers = ((values - values.mean()) / sigma).abs() > z_threshold
        else:
            # Constant (or empty / all-NaN) column: nothing can be an outlier.
            outliers = pd.Series(False, index=df.index)

    # Calculate and print the percentage of outliers
    n_rows = len(df)
    outlier_percentage = 100 * outliers.sum() / n_rows if n_rows else 0.0
    print(f"Percentage of outlier samples: {outlier_percentage:.2f}%")

    # Remove outliers; copy so callers can modify the result freely
    df_cleaned = df[~outliers].copy()

    return df_cleaned

# Example usage
data = dict(
    time=range(1, 11),
    value=[2, 3, 4, 5, 6, 1000, 60, 5, 4, 5],  # 1000 and 60 sit far above the other readings
)
df = pd.DataFrame(data)

# Drop the rows whose 'value' is flagged by the IQR method
df_cleaned_iqr = detect_and_remove_outliers(df, 'value', method='iqr')
print("\nDataFrame after removing outliers using IQR:\n", df_cleaned_iqr)

Percentage of outlier samples: 20.00%

DataFrame after removing outliers using IQR:
    time  value
0     1      2
1     2      3
2     3      4
3     4      5
4     5      6
7     8      5
8     9      4
9    10      5


# **Data Reduction: PCA**

In [22]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import make_blobs

# Generate a synthetic dataset: 400 samples, 15 features, 5 blob centers
# (random_state is fixed so the data is the same on every run)
X, y = make_blobs(n_samples=400, centers=5, n_features=15, random_state=42)

# Fit PCA and project the data onto the principal components
pca = PCA(n_components=5)  # Keep 5 components (down from 15 features)
X_pca = pca.fit_transform(X)

def pca_inverse_transform(new_sample_pca):
  """Map PCA-space samples back to the original feature space.

  Relies on the notebook-level ``pca`` object, which must already be fitted.
  """
  reconstructed = pca.inverse_transform(new_sample_pca)
  return reconstructed

def plot_pca_results(pca, X_pca, method = 'scatter-plot', labels=None, colors=None):
  """Plot the outcome of a fitted PCA in one of three ways.

  Parameters:
  - pca: Fitted PCA object; ``n_components_``, ``explained_variance_ratio_``
    and ``components_`` are read from it.
  - X_pca: 2-D array of samples projected onto the principal components;
    only the first two columns are plotted.
  - method: Which figure to draw:
      - 'scatter-plot': samples on the first two components.
      - 'explained-variance': bar chart of the variance ratio per component.
      - 'bi-plot': rescaled samples plus one arrow per original feature
        showing its loading on the first two components.
  - labels: Optional feature names for the 'bi-plot' arrows; when omitted
    the arrows are named Var1, Var2, ...
  - colors: Optional per-sample values used to color the 'scatter-plot'.
    When omitted, the notebook-level ``y`` is used if it exists (this is what
    the function always did before the parameter was added); if neither is
    available the points are drawn uncolored and without a colorbar.

  Raises:
  - ValueError: if ``method`` is not one of the three supported names.
  """
  if method not in ('scatter-plot', 'explained-variance', 'bi-plot'):
    raise ValueError(
        "Unsupported method provided. Choose 'scatter-plot', 'explained-variance' or 'bi-plot'.")

  if method == 'scatter-plot':
    if colors is None:
      # Backward-compatible fallback to the notebook-level cluster labels.
      colors = globals().get('y')
    plt.figure(figsize=(8, 6))
    if colors is None:
      plt.scatter(X_pca[:, 0], X_pca[:, 1], edgecolor='k')
    else:
      plt.scatter(X_pca[:, 0], X_pca[:, 1], c=colors, edgecolor='k', cmap='viridis')
      plt.colorbar(label='Cluster label')
    plt.xlabel('First principal component')
    plt.ylabel('Second principal component')
    plt.title('PCA - First two principal components')
    plt.show()
  elif method == 'explained-variance':
    plt.figure(figsize=(8, 6))
    plt.bar(range(1, pca.n_components_ + 1), pca.explained_variance_ratio_)
    plt.xlabel('Principal Component')
    plt.ylabel('Variance Explained')
    plt.title('PCA Results: Explained Variance')
    plt.show()
  else:
    # One row per original feature: its loadings on PC1 and PC2.
    coeff = np.transpose(pca.components_[0:2, :])
    xs = X_pca[:, 0]
    ys = X_pca[:, 1]
    n_features = coeff.shape[0]
    # Rescale the samples to unit range so they share the plot with the
    # loading arrows; a constant component is left unscaled rather than
    # dividing by zero.
    x_span = xs.max() - xs.min()
    y_span = ys.max() - ys.min()
    scalex = 1.0 / x_span if x_span else 1.0
    scaley = 1.0 / y_span if y_span else 1.0

    # Own figure + show(), like the other two plot types, so the bi-plot
    # does not draw onto whatever figure happens to be active.
    plt.figure(figsize=(8, 6))
    plt.scatter(xs * scalex, ys * scaley, c='r')
    for i in range(n_features):
      plt.arrow(0, 0, coeff[i, 0], coeff[i, 1], color='b', alpha=0.5)
      arrow_label = "Var" + str(i + 1) if labels is None else labels[i]
      plt.text(coeff[i, 0] * 1.15, coeff[i, 1] * 1.15, arrow_label, color='g', ha='center', va='center')
    plt.xlabel("PC{}".format(1))
    plt.ylabel("PC{}".format(2))
    plt.grid()
    plt.show()



# **Data Reduction: Feature Subset Selection**

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

def feature_subset_selection(X, y, method='forward'):
    """
    Performs feature subset selection using forward selection or backward elimination.

    A logistic-regression model is wrapped in a sequential feature selector that is
    scored by 5-fold cross-validated accuracy on a 70% training split, and the score
    obtained for every subset size is plotted.

    Parameters:
    - X: Feature matrix.
    - y: Target vector.
    - method: 'forward' for forward selection, 'backward' for backward elimination.

    Returns:
    - A tuple ``(selected_feature_indices, scores)`` where ``selected_feature_indices``
      holds the column indices of the best subset and ``scores`` is the list of average
      cross-validation scores ordered by ascending number of features.

    Raises:
    - ValueError: if ``method`` is neither 'forward' nor 'backward'.
    """
    # Reject typos up front; previously any other string silently ran
    # backward elimination.
    if method not in ('forward', 'backward'):
        raise ValueError("Unsupported method provided. Choose 'forward' or 'backward'.")

    # Select features on the training portion only. The held-out 30% is not
    # used by this function, so it is discarded here.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3, random_state=42)

    # Define the model
    model = LogisticRegression(max_iter=1000)

    # Define the Sequential Feature Selector
    sfs = SFS(model,
              k_features='best',
              forward=(method == 'forward'),
              scoring='accuracy',
              cv=5)

    # Fit SFS
    sfs.fit(X_train, y_train)

    # subsets_ is keyed by subset size. Backward elimination fills it from the
    # largest size down, so sort the keys to pair every score with its real
    # feature count instead of relying on insertion order.
    feature_counts = sorted(sfs.subsets_)
    metric_per_feature_count = [sfs.subsets_[k]['avg_score'] for k in feature_counts]

    # Plot the performance metric as a function of the number of features
    plt.figure(figsize=(10, 6))
    plt.plot(feature_counts, metric_per_feature_count, marker='o')
    plt.title(f'Feature Selection using {method.capitalize()} Selection')
    plt.xlabel('Number of Features')
    plt.ylabel('Cross-Validation Accuracy')
    plt.grid(True)
    plt.show()

    # Return the selected feature indices and the performance metric
    return sfs.k_feature_idx_, metric_per_feature_count


# **Data Standardization**

In [27]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd

def standardize_data(df, columns, method='z_score'):
    """
    Rescale selected columns of a DataFrame and return the result as a new frame.

    Parameters:
    - df: Pandas DataFrame containing the data (left unmodified).
    - columns: List of column names to rescale; all other columns pass through as-is.
    - method: 'z_score' (StandardScaler) or 'min_max' (MinMaxScaler).

    Returns:
    - A copy of ``df`` in which the listed columns hold the rescaled values.

    Raises:
    - ValueError: if ``method`` is neither 'z_score' nor 'min_max'.
    """
    result = df.copy()

    # Guard clause first, then pick the scaler that matches the method name.
    if method != 'z_score' and method != 'min_max':
        raise ValueError("Unsupported method provided. Choose 'z_score' or 'min_max'.")
    scaler = StandardScaler() if method == 'z_score' else MinMaxScaler()

    scaled_values = scaler.fit_transform(result[columns])
    result[columns] = scaled_values
    return result

# Example usage:
data = dict(
    Feature1=[1, 2, 3, 4, 5],
    Feature2=[10, 20, 30, 40, 50],
    NonStandardFeature=[100, 200, 300, 400, 500],  # deliberately left unscaled
)
df = pd.DataFrame(data)

# Z-score normalization of the two feature columns
df_z_score = standardize_data(
    df, ['Feature1', 'Feature2'], method='z_score'
)
print("Z-Score Standardization:\n", df_z_score)

# Min-Max scaling of the same two columns
df_min_max = standardize_data(
    df, ['Feature1', 'Feature2'], method='min_max'
)
print("\nMin-Max Scaling:\n", df_min_max)


Z-Score Standardization:
    Feature1  Feature2  NonStandardFeature
0 -1.414214 -1.414214                 100
1 -0.707107 -0.707107                 200
2  0.000000  0.000000                 300
3  0.707107  0.707107                 400
4  1.414214  1.414214                 500

Min-Max Scaling:
    Feature1  Feature2  NonStandardFeature
0      0.00      0.00                 100
1      0.25      0.25                 200
2      0.50      0.50                 300
3      0.75      0.75                 400
4      1.00      1.00                 500


In [30]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def preprocess_dataframe(df):
    """Expand datetime columns, one-hot encode categorical columns and
    standardize numeric columns, returning a new DataFrame.

    Per-column handling, based on the dtype in the input frame:
    - datetime: ``<col>_Year``, ``<col>_Month`` and ``<col>_Day`` columns are
      added; the original datetime column is kept.
    - categorical / object / pandas string: replaced by one-hot dummy columns,
      with the first level dropped.
    - numeric (excluding boolean): standardized with ``StandardScaler``.

    Only numeric columns present in the input are standardized; the derived
    Year/Month/Day and dummy columns are left as they are.

    Parameters:
    - df: Pandas DataFrame to preprocess (left unmodified).

    Returns:
    - The preprocessed copy of ``df``.
    """
    # Make a copy of the DataFrame to avoid changing the original data
    processed_df = df.copy()

    # Decide what to standardize from the ORIGINAL columns, before any derived
    # column exists. Selecting by dtype afterwards would make the outcome
    # depend on which integer width pandas happens to use for .dt.year etc.,
    # and would skip int32/float32 inputs.
    num_cols = [
        column for column in df.columns
        if pd.api.types.is_numeric_dtype(df[column])
        and not pd.api.types.is_bool_dtype(df[column])
    ]

    # Process each original column based on its dtype
    for column in df.columns:
        dtype = processed_df[column].dtype

        # Detect datetime columns and extract components
        if pd.api.types.is_datetime64_any_dtype(dtype):
            processed_df[f'{column}_Year'] = processed_df[column].dt.year
            processed_df[f'{column}_Month'] = processed_df[column].dt.month
            processed_df[f'{column}_Day'] = processed_df[column].dt.day

        # Detect categorical / text columns and apply one-hot encoding.
        # isinstance checks replace the deprecated is_categorical_dtype.
        elif (pd.api.types.is_object_dtype(dtype)
              or isinstance(dtype, (pd.CategoricalDtype, pd.StringDtype))):
            processed_df = pd.get_dummies(processed_df, columns=[column], drop_first=True)

    # Standardize numerical columns; skip when there is nothing to fit on,
    # because the scaler rejects empty input.
    if num_cols and len(processed_df) > 0:
        scaler = StandardScaler()
        processed_df[num_cols] = scaler.fit_transform(processed_df[num_cols])

    return processed_df

# Example usage
data = dict(
    Date=pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04', '2020-01-05']),
    Category=['A', 'B', 'A', 'C', 'B'],
    Value=[100, 150, 120, 130, 110],
)
# Build the frame with 'Category' stored as a categorical dtype for demonstration purposes
df = pd.DataFrame(data).astype({'Category': 'category'})

# Run the preprocessing on the sample frame
processed_df = preprocess_dataframe(df)

