In [None]:
import pandas as pd
import numpy as np

# Creating sample data
data = {
    'ID': [1, 2, 3, 4, 5, 1, 2],
    'Name': ['John', 'Alice', 'Bob', 'Mary', 'Jane', 'John', 'Alice'],
    'Age': [25, 30, np.nan, 40, 35, 25, 30],
    'Gender': ['M', 'F', 'M', 'F', 'F', 'M', 'F'],
    'Income': [50000, 60000, 45000, 70000, 80000, 50000, 60000],
    'Education': ['Bachelor', 'Master', 'High School', 'PhD', 'Bachelor', 'Bachelor', 'Master'],
    'Score': [85, 90, 75, 95, 80, 85, 90],
    'Label': ['Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No'],
    'Has_Car': [True, False, True, True, True, True, False],
    'Location': ['City', 'Suburb', 'Rural', 'City', 'Suburb', 'City', 'Suburb'],
    'High_Income': [False, False, True, True, True, False, False]  # Target feature
}

# Adding missing values and outliers
data['Age'][2] = np.nan
data['Income'][2] = 200000
data['Score'][3] = 50
data['Has_Car'][1] = np.nan

# Creating DataFrame
df = pd.DataFrame(data)

print(df)

from sklearn.preprocessing import LabelEncoder

# One-hot encoding
df_one_hot = pd.get_dummies(df, columns=['Gender', 'Education', 'Location'])
# One-hot encoding creates binary columns for each category in the specified columns.
# It's useful when the categories have no ordinal relationship and the algorithm might interpret them as such.
print("One-Hot Encoded DataFrame:")
print(df_one_hot)

# Label encoding
label_encoder = LabelEncoder()
df_label_encoded = df.copy()
# Applying label encoding to 'Gender' column
df_label_encoded['Gender'] = label_encoder.fit_transform(df_label_encoded['Gender'])
# Applying label encoding to 'Education' column
df_label_encoded['Education'] = label_encoder.fit_transform(df_label_encoded['Education'])
# Applying label encoding to 'Location' column
df_label_encoded['Location'] = label_encoder.fit_transform(df_label_encoded['Location'])
# Label encoding assigns a unique integer to each category, making it easier for algorithms to process.
print("\nLabel Encoded DataFrame:")
print(df_label_encoded)


"""
One-hot encoding: Convert categorical variables into binary vectors.

When to Use:
------------
- Use one-hot encoding when dealing with categorical variables that have no ordinal relationship and the algorithm might interpret them as such.
- Use it when you want to ensure that categorical variables are treated as distinct entities without any assumed order or hierarchy.
- Use it for machine learning algorithms that require numeric input features and may incorrectly interpret categorical variables with numeric labels as having ordinal relationships.
- Use it when you want to prevent biases introduced by assumed ordinal relationships and ensure accurate model training and prediction.

When to Avoid:
--------------
- Avoid one-hot encoding for categorical variables with a high number of unique categories (high cardinality), as it can lead to a significant increase in dataset dimensionality and computational inefficiencies.
- Avoid it when working with tree-based models like random forests on datasets with extremely high cardinality categorical variables, as it may become computationally expensive or prone to overfitting.
- Avoid one-hot encoding for linear models, especially when applying regularization techniques like Lasso, as it can lead to multicollinearity and less interpretable models.
- Avoid it if memory or storage constraints are a concern, as one-hot encoding can significantly increase memory and storage requirements, especially for large datasets.
- Avoid it when interpretability is crucial, as one-hot encoding can make the resulting model less interpretable due to the binary nature of the encoded features.

"""

   ID   Name   Age Gender  Income    Education  Score Label Has_Car Location  \
0   1   John  25.0      M   50000     Bachelor     85   Yes    True     City   
1   2  Alice  30.0      F   60000       Master     90    No     NaN   Suburb   
2   3    Bob   NaN      M  200000  High School     75   Yes    True    Rural   
3   4   Mary  40.0      F   70000          PhD     50   Yes    True     City   
4   5   Jane  35.0      F   80000     Bachelor     80    No    True   Suburb   
5   1   John  25.0      M   50000     Bachelor     85   Yes    True     City   
6   2  Alice  30.0      F   60000       Master     90    No   False   Suburb   

   High_Income  
0        False  
1        False  
2         True  
3         True  
4         True  
5        False  
6        False  
One-Hot Encoded DataFrame:
   ID   Name   Age  Income  Score Label Has_Car  High_Income  Gender_F  \
0   1   John  25.0   50000     85   Yes    True        False     False   
1   2  Alice  30.0   60000     90    No     NaN 

'\nOne-hot encoding: Convert categorical variables into binary vectors.\n\nWhen to Use:\n------------\n- Use one-hot encoding when dealing with categorical variables that have no ordinal relationship and the algorithm might interpret them as such.\n- Use it when you want to ensure that categorical variables are treated as distinct entities without any assumed order or hierarchy.\n- Use it for machine learning algorithms that require numeric input features and may incorrectly interpret categorical variables with numeric labels as having ordinal relationships.\n- Use it when you want to prevent biases introduced by assumed ordinal relationships and ensure accurate model training and prediction.\n\nWhen to Avoid:\n--------------\n- Avoid one-hot encoding for categorical variables with a high number of unique categories (high cardinality), as it can lead to a significant increase in dataset dimensionality and computational inefficiencies.\n- Avoid it when working with tree-based models lik

In [None]:

from scipy.stats import zscore

# Method 1: Z-Score
# Z-score method identifies outliers based on their deviation from the mean in terms of standard deviation.
# Outliers are typically defined as observations with a z-score greater than a certain threshold (e.g., 3).
threshold = 3
z_scores = zscore(df_label_encoded['Income'])
outlier_indices_zscore = np.where(np.abs(z_scores) > threshold)[0]
df_without_outliers_zscore = df_label_encoded.drop(outlier_indices_zscore)
print("\nDataFrame without outliers using Z-Score method:")
print(df_without_outliers_zscore)

# Method 2: Interquartile Range (IQR)
# IQR method identifies outliers based on the difference between the third quartile (Q3) and the first quartile (Q1).
# Outliers are typically defined as observations that fall below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
Q1 = df_label_encoded['Income'].quantile(0.25)
Q3 = df_label_encoded['Income'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outlier_indices_iqr = np.where((df_label_encoded['Income'] < lower_bound) | (df_label_encoded['Income'] > upper_bound))[0]
df_without_outliers_iqr = df_label_encoded.drop(outlier_indices_iqr)
print("\nDataFrame without outliers using IQR method:")
print(df_without_outliers_iqr)

"""
Explanation and Comments:

- Z-Score Method:
  - Benefits:
    - Easily identifies outliers based on their deviation from the mean in terms of standard deviation.
    - Provides a standardized measure of how far an observation deviates from the mean, regardless of the original scale of the data.
  - Use Cases:
    - Useful when the distribution of the data is approximately normal.
    - Suitable for datasets where the mean and standard deviation are meaningful measures of central tendency and dispersion.

- Interquartile Range (IQR) Method:
  - Benefits:
    - Robust to outliers and non-normal distributions.
    - Provides a measure of the spread of the middle 50% of the data, making it less sensitive to extreme values compared to the range.
  - Use Cases:
    - Effective for skewed distributions or datasets with a large number of outliers.
    - Commonly used in exploratory data analysis and data cleaning processes.

Both methods are effective in identifying and handling outliers, but the choice between them depends on the characteristics of the data and the assumptions of the analysis.
"""


DataFrame without outliers using Z-Score method:
   ID   Name   Age  Gender  Income  Education  Score Label Has_Car  Location  \
0   1   John  25.0       1   50000          0     85   Yes    True         0   
1   2  Alice  30.0       0   60000          2     90    No     NaN         2   
2   3    Bob   NaN       1  200000          1     75   Yes    True         1   
3   4   Mary  40.0       0   70000          3     50   Yes    True         0   
4   5   Jane  35.0       0   80000          0     80    No    True         2   
5   1   John  25.0       1   50000          0     85   Yes    True         0   
6   2  Alice  30.0       0   60000          2     90    No   False         2   

   High_Income  
0        False  
1        False  
2         True  
3         True  
4         True  
5        False  
6        False  

DataFrame without outliers using IQR method:
   ID   Name   Age  Gender  Income  Education  Score Label Has_Car  Location  \
0   1   John  25.0       1   50000          0  

'\nExplanation and Comments:\n\n- Z-Score Method:\n  - Benefits:\n    - Easily identifies outliers based on their deviation from the mean in terms of standard deviation.\n    - Provides a standardized measure of how far an observation deviates from the mean, regardless of the original scale of the data.\n  - Use Cases:\n    - Useful when the distribution of the data is approximately normal.\n    - Suitable for datasets where the mean and standard deviation are meaningful measures of central tendency and dispersion.\n\n- Interquartile Range (IQR) Method:\n  - Benefits:\n    - Robust to outliers and non-normal distributions.\n    - Provides a measure of the spread of the middle 50% of the data, making it less sensitive to extreme values compared to the range.\n  - Use Cases:\n    - Effective for skewed distributions or datasets with a large number of outliers.\n    - Commonly used in exploratory data analysis and data cleaning processes.\n\nBoth methods are effective in identifying and h

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-Max scaling
scaler = MinMaxScaler()
numerical_columns = ['Age', 'Income', 'Score']  # Numerical columns to scale

# Apply Min-Max scaling to selected numerical columns in the z-scored DataFrame
df_withoutiqroutliers_scaled = df_without_outliers_iqr.copy()

# Apply Min-Max scaling to selected numerical columns
df_withoutiqroutliers_scaled[numerical_columns] = scaler.fit_transform(df_without_outliers_iqr[numerical_columns])

# Print the scaled DataFrame
print("\nScaled DataFrame:")
print(df_withoutiqroutliers_scaled)


"""
Explanation:

- Min-Max Scaling: Min-Max scaling is a technique used to scale numerical features to a specific range, typically between 0 and 1.
  It linearly transforms the values so that the minimum value becomes 0 and the maximum value becomes 1, while other values are scaled accordingly.

- Benefits:
  - Normalization: Min-Max scaling normalizes the features, ensuring that they are within the same scale, which can improve the performance
    of certain machine learning algorithms, such as gradient descent-based algorithms.
  - Preservation of Relationship: Min-Max scaling preserves the relationship between the original values, as it scales them linearly within
    a fixed range. This ensures that the relative differences between values are maintained.

- Use Cases:
  - Neural Networks: Min-Max scaling is commonly used in neural networks where input features need to be within a consistent range to ensure
    stable training.
  - Distance-Based Algorithms: It's beneficial for distance-based algorithms such as k-nearest neighbors (KNN) and support vector machines
    (SVM), as it ensures that features with larger scales do not dominate the distance calculations.
  - Visualization: Min-Max scaling can be useful when visualizing data, as it brings all features within the same scale, making it easier
    to compare and interpret the data.

- Considerations:
  - Sensitivity to Outliers: Min-Max scaling is sensitive to outliers, as it scales the data based on the minimum and maximum values. Therefore,
    it's important to handle outliers appropriately before applying Min-Max scaling.
  - Suitability for Non-Normal Distributions: It may not be suitable for features with non-normal distributions or features with a large number
    of outliers, as it can distort the data distribution. In such cases, other scaling methods like StandardScaler may be more appropriate.
"""


"""
Outlier Handling:

When to Use:
------------
- Use outlier handling techniques when dealing with datasets containing observations that deviate significantly from the rest of the data.
- Use it to improve the accuracy and reliability of statistical analyses and machine learning models by reducing the impact of outliers on model performance.
- Use outlier handling techniques to ensure that statistical assumptions are met and to produce more robust and reliable results in data analysis tasks.

When to Avoid:
--------------
- Avoid outlier handling when outliers are valid and meaningful data points that represent rare but genuine occurrences in the data.
- Avoid it when removing outliers would result in the loss of important information or trends in the dataset, leading to biased results or inaccurate conclusions.
- Avoid outlier handling if the dataset is small and removing outliers would significantly reduce the sample size, potentially affecting the statistical power of subsequent analyses.
- Avoid outlier handling techniques that are overly aggressive or subjective, as they may introduce bias or distort the underlying distribution of the data.

Common Outlier Handling Techniques:
------------------------------------
1. Z-Score Method:
   - Benefits: Easily identifies outliers based on their deviation from the mean in terms of standard deviation. Provides a standardized measure of how far an observation deviates from the mean.
   - Use Cases: Suitable for datasets with approximately normal distributions and when the mean and standard deviation are meaningful measures of central tendency and dispersion.

2. Interquartile Range (IQR) Method:
   - Benefits: Robust to outliers and non-normal distributions. Provides a measure of the spread of the middle 50% of the data, making it less sensitive to extreme values.
   - Use Cases: Effective for skewed distributions or datasets with a large number of outliers. Commonly used in exploratory data analysis and data cleaning processes.

3. Tukey's Fences:
   - Benefits: Provides a simple and intuitive method for identifying outliers based on the interquartile range.
   - Use Cases: Suitable for identifying outliers in datasets with a relatively large number of observations and when a clear threshold for outlier detection is needed.

4. Robust Z-Score:
   - Benefits: Similar to the Z-Score method but more robust to outliers and non-normal distributions.
   - Use Cases: Useful when the dataset contains extreme outliers that may skew the distribution and when a more robust measure of deviation from the mean is needed.

5. Isolation Forest:
   - Benefits: Anomaly detection technique that isolates outliers by randomly partitioning the data into subsets. Efficient for high-dimensional datasets and capable of handling mixed data types.
   - Use Cases: Suitable for detecting outliers in large datasets with complex and high-dimensional features, such as cybersecurity or fraud detection applications.

Considerations:
---------------
- Consider the characteristics of the dataset, such as distributional properties, sample size, and the presence of influential outliers, when selecting an outlier handling technique.
- Evaluate the impact of outlier handling on the validity and reliability of subsequent analyses or model performance.
- Document the outlier handling process and rationale to ensure transparency and reproducibility in data analysis workflows.
"""

"""
Normalization:

When to Use:
------------
- Use normalization to scale numerical features to a common range, typically between 0 and 1, to ensure consistency in feature scales.
- Use it when working with machine learning algorithms that are sensitive to feature scales, such as gradient descent-based algorithms.
- Use normalization to improve the convergence speed and stability of optimization algorithms by preventing large-scale features from dominating the optimization process.
- Use it to enhance the performance of distance-based algorithms like KNN and SVM, where feature scales can affect the calculation of distances.

When to Avoid:
--------------
- Avoid normalization when the distribution of features is already within a similar scale or when the algorithm is not sensitive to feature scales.
- Avoid it if the normalization process would result in the loss of important information or if the original scale of features is meaningful for interpretation.
- Avoid normalization if the dataset contains categorical variables or ordinal features that should not be scaled, as it may distort the relationships between variables.

Common Normalization Techniques:
--------------------------------
1. Min-Max Scaling:
   - Benefits: Linearly transforms features to a specific range, typically between 0 and 1, preserving the relationship between values.
   - Use Cases: Suitable for algorithms that require features to be within a consistent range, such as neural networks and distance-based algorithms.

2. Standardization (Z-Score Normalization):
   - Benefits: Centers features around the mean and scales them to have a standard deviation of 1, making them more interpretable and robust to outliers.
   - Use Cases: Effective for algorithms that assume features are normally distributed or when the scale of features is meaningful for interpretation.

3. Robust Scaling:
   - Benefits: Scales features based on percentiles, making it robust to outliers and non-normal distributions.
   - Use Cases: Suitable for datasets with outliers or skewed distributions, where standardization may be affected by extreme values.

Considerations:
---------------
- Consider the characteristics of the dataset, including the distributional properties of features and the requirements of the algorithm, when selecting a normalization technique.
- Evaluate the impact of normalization on the performance and interpretability of machine learning models, and consider alternative scaling methods if necessary.
- Document the normalization process and any assumptions made to ensure transparency and reproducibility in data analysis workflows.
"""


"""
Scaling:

When to Use:
------------
- Use scaling to transform numerical features into a consistent range to improve the performance and convergence of machine learning algorithms.
- Use it when working with algorithms that are sensitive to the scale of features, such as gradient descent-based algorithms and distance-based algorithms.
- Use scaling to ensure that all features contribute equally to the model fitting process and to prevent features with larger scales from dominating the optimization process.
- Use it to enhance the interpretability and stability of machine learning models by making feature coefficients or importance scores comparable across different features.

When to Avoid:
--------------
- Avoid scaling when the scale of features is not relevant for the problem at hand or when the algorithm is not sensitive to feature scales.
- Avoid it if the scaling process would distort the relationships between features or if the original scale of features is meaningful for interpretation.
- Avoid scaling categorical variables or ordinal features that should not be transformed, as it may introduce bias or alter the interpretation of these variables.

Common Scaling Techniques:
---------------------------
1. Min-Max Scaling:
   - Benefits: Linearly transforms features to a specific range, typically between 0 and 1, preserving the relationship between values.
   - Use Cases: Suitable for algorithms that require features to be within a consistent range, such as neural networks and distance-based algorithms.

2. Standardization (Z-Score Scaling):
   - Benefits: Centers features around the mean and scales them to have a standard deviation of 1, making them more interpretable and robust to outliers.
   - Use Cases: Effective for algorithms that assume features are normally distributed or when the scale of features is meaningful for interpretation.

3. Robust Scaling:
   - Benefits: Scales features based on percentiles, making it robust to outliers and non-normal distributions.
   - Use Cases: Suitable for datasets with outliers or skewed distributions, where standardization may be affected by extreme values.

Considerations:
---------------
- Consider the characteristics of the dataset, including the distributional properties of features and the requirements of the algorithm, when selecting a scaling technique.
- Evaluate the impact of scaling on the performance and interpretability of machine learning models, and consider alternative scaling methods if necessary.
- Document the scaling process and any assumptions made to ensure transparency and reproducibility in data analysis workflows.
"""


Scaled DataFrame:
   ID   Name       Age  Gender    Income  Education  Score Label Has_Car  \
0   1   John  0.000000       1  0.000000          0  0.875   Yes    True   
1   2  Alice  0.333333       0  0.333333          2  1.000    No     NaN   
3   4   Mary  1.000000       0  0.666667          3  0.000   Yes    True   
4   5   Jane  0.666667       0  1.000000          0  0.750    No    True   
5   1   John  0.000000       1  0.000000          0  0.875   Yes    True   
6   2  Alice  0.333333       0  0.333333          2  1.000    No   False   

   Location  High_Income  
0         0        False  
1         2        False  
3         0         True  
4         2         True  
5         0        False  
6         2        False  


'\nScaling:\n\nWhen to Use:\n------------\n- Use scaling to transform numerical features into a consistent range to improve the performance and convergence of machine learning algorithms.\n- Use it when working with algorithms that are sensitive to the scale of features, such as gradient descent-based algorithms and distance-based algorithms.\n- Use scaling to ensure that all features contribute equally to the model fitting process and to prevent features with larger scales from dominating the optimization process.\n- Use it to enhance the interpretability and stability of machine learning models by making feature coefficients or importance scores comparable across different features.\n\nWhen to Avoid:\n--------------\n- Avoid scaling when the scale of features is not relevant for the problem at hand or when the algorithm is not sensitive to feature scales.\n- Avoid it if the scaling process would distort the relationships between features or if the original scale of features is meanin

In [None]:
from sklearn.impute import SimpleImputer
# Identify numerical and categorical columns from the DataFrame

numerical_columns = df_withoutiqroutliers_scaled.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = df_withoutiqroutliers_scaled.select_dtypes(include=['object']).columns

# Method 1: Mean Imputation (for numerical columns)
imputer_mean = SimpleImputer(strategy='mean')
df_scaled_mean_imputed = df_withoutiqroutliers_scaled.copy()
df_scaled_mean_imputed[numerical_columns] = imputer_mean.fit_transform(df_withoutiqroutliers_scaled[numerical_columns])

# Method 2: Median Imputation (for numerical columns)
imputer_median = SimpleImputer(strategy='median')
df_scaled_median_imputed = df_withoutiqroutliers_scaled.copy()
df_scaled_median_imputed[numerical_columns] = imputer_median.fit_transform(df_withoutiqroutliers_scaled[numerical_columns])

# Method 3: Most Frequent Imputation (for non-numerical columns)
imputer_mode = SimpleImputer(strategy='most_frequent')
df_scaled_mode_imputed = df_withoutiqroutliers_scaled.copy()
df_scaled_mode_imputed[categorical_columns] = imputer_mode.fit_transform(df_withoutiqroutliers_scaled[categorical_columns])

# Remove rows containing NaN values for the entire DataFrame
df_scaled_mean_imputed_dropna = df_scaled_mean_imputed.dropna()
df_scaled_median_imputed_dropna = df_scaled_median_imputed.dropna()
df_scaled_mode_imputed_dropna = df_scaled_mode_imputed.dropna()

# Remove rows containing NaN values for a particular column (example: 'Income')
column_to_dropna = 'Income'
df_scaled_mean_imputed_dropna_column = df_scaled_mean_imputed.dropna(subset=[column_to_dropna])
df_scaled_median_imputed_dropna_column = df_scaled_median_imputed.dropna(subset=[column_to_dropna])
df_scaled_mode_imputed_dropna_column = df_scaled_mode_imputed.dropna(subset=[column_to_dropna])

print("Mean Imputation (Numerical) with NaN Removal:")
print(df_scaled_mean_imputed_dropna)

print("\nMedian Imputation (Numerical) with NaN Removal:")
print(df_scaled_median_imputed_dropna)

print("\nMost Frequent Imputation (Categorical) with NaN Removal:")
print(df_scaled_mode_imputed_dropna)

print("\nMean Imputation (Numerical) with NaN Removal for Column '{}':".format(column_to_dropna))
print(df_scaled_mean_imputed_dropna_column)

print("\nMedian Imputation (Numerical) with NaN Removal for Column '{}':".format(column_to_dropna))
print(df_scaled_median_imputed_dropna_column)

print("\nMost Frequent Imputation (Categorical) with NaN Removal for Column '{}':".format(column_to_dropna))
print(df_scaled_mode_imputed_dropna_column)


Mean Imputation (Numerical) with NaN Removal:
    ID   Name       Age  Gender    Income  Education  Score Label Has_Car  \
0  1.0   John  0.000000     1.0  0.000000        0.0  0.875   Yes    True   
3  4.0   Mary  1.000000     0.0  0.666667        3.0  0.000   Yes    True   
4  5.0   Jane  0.666667     0.0  1.000000        0.0  0.750    No    True   
5  1.0   John  0.000000     1.0  0.000000        0.0  0.875   Yes    True   
6  2.0  Alice  0.333333     0.0  0.333333        2.0  1.000    No   False   

   Location  High_Income  
0       0.0        False  
3       0.0         True  
4       2.0         True  
5       0.0        False  
6       2.0        False  

Median Imputation (Numerical) with NaN Removal:
    ID   Name       Age  Gender    Income  Education  Score Label Has_Car  \
0  1.0   John  0.000000     1.0  0.000000        0.0  0.875   Yes    True   
3  4.0   Mary  1.000000     0.0  0.666667        3.0  0.000   Yes    True   
4  5.0   Jane  0.666667     0.0  1.000000        

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression, RFE, SelectFromModel
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Define numerical columns
numerical_columns = df_scaled_mean_imputed_dropna.select_dtypes(include=['float64', 'int64']).columns

# Define categorical columns
categorical_columns = df_scaled_mean_imputed_dropna.select_dtypes(include=['object']).columns

# Define the target variable
target_variable = 'High_Income'

# Method 1: SelectKBest with f_regression (for numerical columns)
selector_kbest = SelectKBest(score_func=f_regression, k=2)
X_kbest = selector_kbest.fit_transform(df_scaled_mean_imputed_dropna[numerical_columns], df_scaled_mean_imputed_dropna[target_variable])

# Method 2: Recursive Feature Elimination (RFE) with Linear Regression (for numerical columns)
estimator = LinearRegression()
selector_rfe = RFE(estimator, n_features_to_select=2, step=1)
X_rfe = selector_rfe.fit_transform(df_scaled_mean_imputed_dropna[numerical_columns], df_scaled_mean_imputed_dropna[target_variable])

# Print selected features for SelectKBest
selected_indices_kbest = selector_kbest.get_support(indices=True)
selected_features_kbest = df_scaled_mean_imputed_dropna.columns[selected_indices_kbest]
print("Selected features for SelectKBest:", selected_features_kbest)



"""
Feature Selection Methods:

1. SelectKBest (SelectKBest):
   - Summary: Selects the top k features based on their scores using a specified scoring function.
   - Benefits:
     - Directly specifies the desired number of features to select.
     - Computes feature scores independently, making it efficient for large datasets.
   - Use Cases:
     - Suitable for datasets with many features where a predefined number of features is desired.
     - Useful for feature engineering and dimensionality reduction in machine learning pipelines.

2. Recursive Feature Elimination (RFE):
   - Summary: Recursively removes features based on their importance until the specified number of features is reached.
   - Benefits:
     - Considers feature interactions and their collective contribution to the model's performance.
     - Provides more robust feature selection by iteratively evaluating feature importance.
   - Use Cases:
     - Effective for datasets with a moderate number of features and complex relationships between features.
     - Suitable for improving model interpretability and reducing overfitting by selecting relevant features.

3. SelectFromModel (SelectFromModel):
   - Summary: Selects features based on their importance provided by a specified model.
   - Benefits:
     - Utilizes the feature importances provided by the model to select relevant features.
     - Can handle both numerical and categorical features, providing flexibility in feature selection.
   - Use Cases:
     - Useful for datasets with mixed data types or when feature importance needs to be assessed comprehensively.
     - Effective for identifying important features in ensemble models such as random forests or gradient boosting machines.
"""

Selected features for SelectKBest: Index(['ID', 'Name'], dtype='object')


"\nFeature Selection Methods:\n\n1. SelectKBest (SelectKBest):\n   - Summary: Selects the top k features based on their scores using a specified scoring function.\n   - Benefits:\n     - Directly specifies the desired number of features to select.\n     - Computes feature scores independently, making it efficient for large datasets.\n   - Use Cases:\n     - Suitable for datasets with many features where a predefined number of features is desired.\n     - Useful for feature engineering and dimensionality reduction in machine learning pipelines.\n\n2. Recursive Feature Elimination (RFE):\n   - Summary: Recursively removes features based on their importance until the specified number of features is reached.\n   - Benefits:\n     - Considers feature interactions and their collective contribution to the model's performance.\n     - Provides more robust feature selection by iteratively evaluating feature importance.\n   - Use Cases:\n     - Effective for datasets with a moderate number of fe

In [None]:
# Convert categorical variables to numerical using one-hot encoding
df_encoded = pd.get_dummies(df_scaled_mean_imputed_dropna)

# Apply PCA to the encoded DataFrame
pca = PCA(n_components=2)  # Specify the number of principal components
X_pca = pca.fit_transform(df_encoded)

# Print explained variance ratio
print("Explained variance ratio:", pca.explained_variance_ratio_)


"""
Principal Component Analysis (PCA):

- Summary:
  PCA is a dimensionality reduction technique that transforms high-dimensional data into a lower-dimensional space while preserving the maximum variance in the data. It identifies the principal components, which are orthogonal vectors that represent the directions of maximum variance in the original feature space.

- What is Calculated:
  PCA calculates the eigenvectors and eigenvalues of the covariance matrix of the input data. The eigenvectors (principal components) represent the directions of maximum variance, while the eigenvalues represent the amount of variance explained by each principal component.

- How Can it be Used:
  1. Dimensionality Reduction: PCA is used to reduce the dimensionality of data by projecting it onto a lower-dimensional subspace while retaining most of the important information.
  2. Visualization: PCA is often used for data visualization by transforming high-dimensional data into a 2D or 3D space, making it easier to visualize and interpret.
  3. Noise Reduction: PCA can help in reducing noise and removing redundant features from the data, leading to improved model performance.

- Alternatives:
  1. Linear Discriminant Analysis (LDA): LDA is another dimensionality reduction technique that takes into account class labels and aims to maximize the separation between classes.
  2. t-distributed Stochastic Neighbor Embedding (t-SNE): t-SNE is a nonlinear dimensionality reduction technique that is particularly effective for visualizing high-dimensional data in low-dimensional space while preserving local structures.

- Benefits:
  1. Reduces Dimensionality: PCA reduces the number of features in the data while retaining most of the variance, making it computationally efficient and improving model performance.
  2. Removes Redundancy: PCA identifies and removes redundant features, leading to simpler and more interpretable models.
  3. Visualization: PCA helps in visualizing high-dimensional data in lower-dimensional space, facilitating data exploration and interpretation.

- Use Cases:
  1. Image Compression: PCA is used in image compression to reduce the size of images while preserving important features.
  2. Gene Expression Analysis: PCA is used in genomics to identify patterns and reduce noise in gene expression data.
  3. Financial Analysis: PCA is used in finance for portfolio optimization and risk management by identifying the principal components of asset returns.

"""


"""
Principal Component Analysis (PCA):

PCA is a dimensionality reduction technique that transforms high-dimensional data into a lower-dimensional space while preserving the maximum variance in the data. It identifies the principal components, which are orthogonal vectors that represent the directions of maximum variance in the original feature space.

Parameters:
    - n_components (int or None, default=None): Number of components to keep. If None, all components are kept.

Attributes:
    - components_ (ndarray of shape (n_components, n_features)): Principal axes in feature space, representing the directions of maximum variance.
    - explained_variance_ratio_ (ndarray of shape (n_components,)): Percentage of variance explained by each of the selected components.

Methods:
    - fit(X, y=None): Fit the PCA model to the data.
    - transform(X): Apply dimensionality reduction to X.
    - fit_transform(X, y=None): Fit the PCA model to the data and transform X.

Use Case:
PCA is commonly used for dimensionality reduction, visualization, and noise reduction in high-dimensional datasets. It finds application in various domains such as image processing, genetics, finance, and more.

Benefits:
1. Dimensionality Reduction: PCA reduces the number of features in the data while retaining most of the variance, making it computationally efficient and improving model performance.
2. Removes Redundancy: PCA identifies and removes redundant features, leading to simpler and more interpretable models.
3. Visualization: PCA helps in visualizing high-dimensional data in lower-dimensional space, facilitating data exploration and interpretation.
4. Noise Reduction: PCA can help in reducing noise and removing redundant features from the data, leading to improved model performance.

Walkthrough:
1. Import the PCA class from the sklearn.decomposition module.
2. Create a PCA object with the desired number of components.
3. Fit the PCA object to the input data using the fit method.
4. Transform the input data into the lower-dimensional space using the transform method.
5. Optionally, compute and analyze the explained variance ratio to understand the contribution of each principal component to the total variance.

Example:
# Apply PCA to the input DataFrame
pca = PCA(n_components=2)  # Specify the number of principal components
X_pca = pca.fit_transform(df_scaled_mean_imputed_dropna)

# Print explained variance ratio
print("Explained variance ratio:", pca.explained_variance_ratio_)

"""



from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Determine the minimum between the number of features and the number of classes minus one
min_components = min(X.shape[1], len(np.unique(y)) - 1)

# Apply Linear Discriminant Analysis (LDA) with the appropriate number of components
lda = LinearDiscriminantAnalysis(n_components=min_components)
X_lda = lda.fit_transform(X, y)

# Print explained variance ratio
print("Explained variance ratio:", lda.explained_variance_ratio_)


"""
Linear Discriminant Analysis (LDA) Alternative:

Apply Linear Discriminant Analysis (LDA) as an alternative dimensionality reduction technique to Principal Component Analysis (PCA) on the provided input DataFrame.

Parameters:
-----------
df_scaled_mean_imputed_dropna : pandas DataFrame
    Input DataFrame after mean imputation and removal of rows with NaN values.

Returns:
--------
X_lda : array-like, shape (n_samples, n_components)
    Reduced-dimensional data after applying Linear Discriminant Analysis (LDA).

Explanation:
------------
Linear Discriminant Analysis (LDA) is a dimensionality reduction technique that takes into account class labels to find the linear combinations of features that best separate different classes in the data.

Use Case:
---------
LDA is commonly used in classification tasks where maximizing class separability is important. It's particularly useful when there are multiple classes in the dataset and the goal is to reduce the dimensionality while preserving class-related information.

Benefits:
---------
- LDA takes into account class labels, making it effective for classification tasks.
- It maximizes class separability in the reduced-dimensional space, leading to better classification performance.
- LDA provides insights into the discriminatory power of different features, helping in feature selection and model interpretation.

Walkthrough:
------------
1. Load the data and separate features and the target variable.
2. Apply LDA to the features with the target variable.
3. Use the reduced-dimensional data for classification or visualization tasks.

By applying LDA, you can reduce the dimensionality of your data while preserving class-related information, which can be beneficial for various classificat

"""


Explained variance ratio: [0.56009292 0.24328888]
Explained variance ratio: [1.]


"\nLinear Discriminant Analysis (LDA) Alternative:\n\nApply Linear Discriminant Analysis (LDA) as an alternative dimensionality reduction technique to Principal Component Analysis (PCA) on the provided input DataFrame.\n\nParameters:\n-----------\ndf_scaled_mean_imputed_dropna : pandas DataFrame\n    Input DataFrame after mean imputation and removal of rows with NaN values.\n\nReturns:\n--------\nX_lda : array-like, shape (n_samples, n_components)\n    Reduced-dimensional data after applying Linear Discriminant Analysis (LDA).\n\nExplanation:\n------------\nLinear Discriminant Analysis (LDA) is a dimensionality reduction technique that takes into account class labels to find the linear combinations of features that best separate different classes in the data.\n\nUse Case:\n---------\nLDA is commonly used in classification tasks where maximizing class separability is important. It's particularly useful when there are multiple classes in the dataset and the goal is to reduce the dimensio

In [None]:
# Load your dataframe (df_scaled_mean_imputed_dropna) here

# Method 1: Random Noise Addition
def add_noise(dataframe, noise_level=0.1):
    """
    Randomly adds noise to numerical features in the DataFrame.

    Parameters:
    - dataframe: Original DataFrame
    - noise_level: Level of noise to be added (default: 0.1)

    Returns:
    - Augmented DataFrame with added noise

    Explanation:
    Random noise addition is a simple data augmentation technique used to introduce variability into the dataset by adding random noise to numerical features. This can help in making the model more robust and generalize better to unseen data.

    Benefits:
    - Increases Robustness: Adding noise helps in making the model more robust by reducing overfitting and capturing underlying patterns in the data.
    - Improves Generalization: Introducing variability through noise encourages the model to learn more generalized patterns, leading to better performance on unseen data.

    Alternatives:
    - Gaussian Noise Injection: Similar to random noise addition, Gaussian noise injection adds noise to the dataset using a Gaussian distribution with specified mean and standard deviation.
    - Uniform Noise Injection: Uniform noise injection adds noise using a uniform distribution within a specified range.

    Use Cases:
    - Classification and Regression Tasks: Random noise addition is commonly used in classification and regression tasks where the goal is to improve model performance and generalization.
    - Image Processing: In image processing tasks, adding random noise can help in augmenting the dataset for tasks like denoising and image reconstruction.

    Walkthrough:
    1. Select Numerical Features: Identify numerical features in the dataset.
    2. Add Noise: Generate random noise with a specified level and add it to the numerical features.
    3. Return Augmented DataFrame: Return the augmented DataFrame with added noise.

    Example:
    >>> # Apply Random Noise Addition
    >>> df_noisy = add_noise(df_scaled_mean_imputed_dropna)
    >>> # Print the first few rows of the augmented DataFrame
    >>> print("Augmented DataFrame with Random Noise:")
    >>> print(df_noisy.head())
    """
    noisy_dataframe = dataframe.copy()
    numerical_columns = noisy_dataframe.select_dtypes(include=['float64', 'int64']).columns
    for column in numerical_columns:
        noise = np.random.normal(0, noise_level, len(noisy_dataframe))
        noisy_dataframe[column] += noise
    return noisy_dataframe

# Data Augmentation Techniques:

# Method 2: Shuffle Data
def shuffle_data(dataframe):
    """
    Shuffles the rows of the DataFrame to introduce randomness.

    Parameters:
    - dataframe: Original DataFrame

    Returns:
    - Augmented DataFrame with shuffled rows

    Explanation:
    Shuffling the data involves randomly reordering the rows of the DataFrame. This technique helps in introducing randomness into the dataset, which can prevent the model from memorizing the order of the data and improve its generalization ability.

    Benefits:
    - Prevents Overfitting: Shuffling the data prevents the model from memorizing patterns based on the order of the data, thus reducing overfitting.
    - Increases Robustness: Introducing randomness through shuffling makes the model more robust by exposing it to a variety of data configurations.

    Alternatives:
    - Random Sampling: Randomly selecting a subset of data points from the dataset can also introduce randomness, but it may not shuffle the entire dataset.
    - Time-Based Shuffling: Shuffling data based on timestamps or time intervals can be useful for time-series data.

    Use Cases:
    - Time-Series Forecasting: Shuffling data can be beneficial in time-series forecasting tasks to prevent the model from learning patterns based on the sequence of timestamps.
    - Text Classification: In natural language processing tasks, shuffling data can help in text classification tasks by preventing the model from learning patterns based on the order of documents.

    Walkthrough:
    1. Shuffle Rows: Randomly shuffle the rows of the DataFrame to introduce randomness.
    2. Return Augmented DataFrame: Return the DataFrame with shuffled rows.

    Example:
    >>> # Apply Shuffle Data
    >>> df_shuffled = shuffle_data(df_scaled_mean_imputed_dropna)
    >>> # Print the first few rows of the augmented DataFrame
    >>> print("Augmented DataFrame with Shuffled Data:")
    >>> print(df_shuffled.head())
    """
    shuffled_dataframe = shuffle(dataframe)
    return shuffled_dataframe

def create_duplicates(dataframe, num_duplicates=1):
    """
    Creates duplicates of the original DataFrame.

    Parameters:
    - dataframe: Original DataFrame
    - num_duplicates: Number of duplicates to create (default: 1)

    Returns:
    - Augmented DataFrame with duplicates

    Explanation:
    Duplicate creation is a simple data augmentation technique that involves replicating existing samples in the dataset. By creating duplicates, we can increase the size of the dataset, which can be beneficial for training robust models, especially when the original dataset is small.

    Benefits:
    - Increased Dataset Size: Duplicate creation expands the dataset size, providing more data for training and improving the model's ability to generalize.
    - Preserve Original Distribution: Duplicates maintain the distribution of the original dataset, ensuring that the characteristics of the data are preserved.

    Alternatives:
    - Bootstrapping: Bootstrapping is a resampling technique that involves generating new samples by randomly sampling with replacement from the original dataset.
    - Synthetic Minority Over-sampling Technique (SMOTE): SMOTE is a method used for imbalanced datasets that generates synthetic samples for minority classes by interpolating between existing samples.

    Use Cases:
    - Imbalanced Datasets: Duplicate creation can be useful for balancing class distributions in imbalanced datasets, where one class is underrepresented.
    - Small Datasets: Duplicate creation can help in augmenting small datasets to provide more training data for machine learning models.

    Walkthrough:
    1. Specify Number of Duplicates: Determine the number of duplicates to create.
    2. Create Duplicates: Replicate existing samples in the dataset based on the specified number of duplicates.
    3. Return Augmented DataFrame: Return the augmented DataFrame with duplicates.

    Example:
    >>> # Apply Duplicate Creation
    >>> df_duplicates = create_duplicates(df_scaled_mean_imputed_dropna, num_duplicates=3)
    >>> # Print the number of samples in the augmented DataFrame
    >>> print("Number of Samples after Duplicate Creation:", len(df_duplicates))
    """
    duplicated_dataframe = pd.concat([dataframe] * num_duplicates, ignore_index=True)
    return duplicated_dataframe


# Data Augmentation Techniques:

# Method 2: Scale Features
def scale_features(dataframe, scale_factor=0.1):
    """
    Scales numerical features in the DataFrame by a specified factor.

    Parameters:
    - dataframe: Original DataFrame
    - scale_factor: Factor by which to scale the features (default: 0.1)

    Returns:
    - Augmented DataFrame with scaled features

    Explanation:
    Scaling features involves multiplying numerical features by a specified scale factor, which can help in augmenting the dataset by adjusting the magnitude of the features. This can be useful for tasks where feature magnitudes play a significant role in model performance.

    Benefits:
    - Adjusts Feature Magnitudes: Scaling features allows adjusting the magnitudes of numerical features, which can improve model convergence and stability.
    - Encourages Feature Importance: By scaling features, the importance of different features can be emphasized or de-emphasized based on their magnitudes.

    Alternatives:
    - Min-Max Scaling: Min-Max scaling scales features to a specified range (e.g., [0, 1]) by subtracting the minimum value and dividing by the range of values.
    - Standardization: Standardization scales features to have a mean of 0 and a standard deviation of 1 by subtracting the mean and dividing by the standard deviation.

    Use Cases:
    - Gradient-Based Optimization: Scaling features is commonly used in gradient-based optimization algorithms such as gradient descent to ensure uniform updates across features.
    - Neural Network Training: In neural network training, scaling features can help in improving the convergence of the training process and avoiding vanishing or exploding gradients.

    Walkthrough:
    1. Select Numerical Features: Identify numerical features in the dataset.
    2. Scale Features: Multiply the numerical features by the specified scale factor.
    3. Return Augmented DataFrame: Return the augmented DataFrame with scaled features.

    Example:
    >>> # Apply Scale Features
    >>> df_scaled = scale_features(df_scaled_mean_imputed_dropna)
    >>> # Print the first few rows of the augmented DataFrame
    >>> print("Augmented DataFrame with Scaled Features:")
    >>> print(df_scaled.head())
    """
    scaled_dataframe = dataframe.copy()
    numerical_columns = scaled_dataframe.select_dtypes(include=['float64', 'int64']).columns
    scaled_dataframe[numerical_columns] *= scale_factor
    return scaled_dataframe


# Data Augmentation Techniques:

# Data Augmentation Techniques:

# Method 2: Synthetic Minority Over-sampling Technique (SMOTE)
def apply_smote(dataframe, target_column, k_neighbors=1):
    """
    Applies Synthetic Minority Over-sampling Technique (SMOTE) to balance classes in the DataFrame.

    Parameters:
    - dataframe: Original DataFrame
    - target_column: Name of the target column containing class labels

    Returns:
    - Augmented DataFrame with balanced classes using SMOTE

    Explanation:
    Synthetic Minority Over-sampling Technique (SMOTE) is a data augmentation technique used to address class imbalance by generating synthetic samples for the minority class. It works by interpolating between existing minority class samples to create new synthetic samples.

    Benefits:
    - Addresses Class Imbalance: SMOTE helps in balancing class distribution by generating synthetic samples for the minority class, which can improve model performance and reduce bias.
    - Preserves Information: SMOTE generates synthetic samples by considering the local structure of the data, preserving important information and reducing the risk of overfitting.

    Alternatives:
    - ADASYN (Adaptive Synthetic Sampling): ADASYN is an extension of SMOTE that adaptively generates synthetic samples based on the density distribution of the minority class, focusing on areas of higher density.
    - Borderline-SMOTE: Borderline-SMOTE is a variant of SMOTE that focuses on generating synthetic samples near the decision boundary between classes to improve classification performance.

    Use Cases:
    - Classification Tasks with Imbalanced Classes: SMOTE is commonly used in classification tasks where class imbalance is prevalent, such as fraud detection, medical diagnosis, and anomaly detection.
    - Machine Learning Pipelines: SMOTE can be integrated into machine learning pipelines to preprocess data before model training, improving model performance and generalization.

    Walkthrough:
    1. Preprocess Data: Encode categorical variables and handle missing values.
    2. Identify Minority Class: Determine the minority class based on the target column.
    3. Apply SMOTE: Apply SMOTE to generate synthetic samples for the minority class, balancing class distribution.
    4. Return Augmented DataFrame: Return the augmented DataFrame with balanced classes using SMOTE.

    Example:
    >>> # Apply SMOTE
    >>> augmented_df_smote = apply_smote(df_scaled_mean_imputed_dropna, 'High_Income')
    >>> # Print the augmented DataFrame with SMOTE
    >>> print("Augmented DataFrame with SMOTE:")
    >>> print(augmented_df_smote)
    """
    # Preprocess data: Encode categorical variables and handle missing values
    dataframe_encoded = pd.get_dummies(dataframe, columns=dataframe.select_dtypes(include=['object']).columns)
    dataframe_encoded.dropna(inplace=True)

    # Apply SMOTE to balance classes
    smote = SMOTE(k_neighbors=k_neighbors)
    X_resampled, y_resampled = smote.fit_resample(dataframe_encoded.drop(columns=[target_column]), dataframe_encoded[target_column])

    # Combine resampled features and target variable into a DataFrame
    augmented_df = pd.concat([pd.DataFrame(X_resampled, columns=dataframe_encoded.drop(columns=[target_column]).columns), pd.Series(y_resampled, name=target_column)], axis=1)

    return augmented_df



# Apply Random Noise Addition
df_noisy = add_noise(df_scaled_mean_imputed_dropna)

# Print the first few rows of the augmented DataFrame
print("Augmented DataFrame with Random Noise:")
print(df_noisy.head())

# Apply Shuffle Data
df_shuffled = shuffle_data(df_scaled_mean_imputed_dropna)

# Print the first few rows of the augmented DataFrame
print("Augmented DataFrame with Shuffled Data:")
print(df_shuffled.head())


# Apply Duplicate Creation
df_duplicates = create_duplicates(df_scaled_mean_imputed_dropna, num_duplicates=3)

# Print the number of samples in the augmented DataFrame
print("Number of Samples after Duplicate Creation:", len(df_duplicates))



# Apply Scale Features
df_scaled = scale_features(df_scaled_mean_imputed_dropna)

# Print the first few rows of the augmented DataFrame
print("Augmented DataFrame with Scaled Features:")
print(df_scaled.head())


# Example of Data Augmentation with Concatenation:

# Augment the data using different techniques
augmented_df_noise = add_noise(df_scaled_mean_imputed_dropna)
augmented_df_shuffle = shuffle_data(df_scaled_mean_imputed_dropna)
augmented_df_duplicate = create_duplicates(df_scaled_mean_imputed_dropna)
augmented_df_scale = scale_features(df_scaled_mean_imputed_dropna)

# Concatenate the augmented data with the original data
concatenated_df = pd.concat([df_scaled_mean_imputed_dropna, augmented_df_noise, augmented_df_shuffle, augmented_df_duplicate, augmented_df_scale], ignore_index=True)

# Print the concatenated DataFrame
print("\nConcatenated DataFrame:")
print(concatenated_df.head())

# Apply SMOTE
augmented_df_smote = apply_smote(df_scaled_mean_imputed_dropna, 'High_Income')

# Print the augmented DataFrame with SMOTE
print("Augmented DataFrame with SMOTE:")
print(augmented_df_smote.head())


Augmented DataFrame with Random Noise:
         ID   Name       Age    Gender    Income  Education     Score Label  \
0  1.002483   John -0.036056  1.043741 -0.003362  -0.233972  0.944378   Yes   
3  4.032752   Mary  0.796761  0.086634  0.602848   2.936484 -0.020696   Yes   
4  4.950992   Jane  0.678570  0.026424  1.060908  -0.192964  0.806892    No   
5  0.793612   John -0.029316  1.055142  0.160090  -0.035306  0.804280   Yes   
6  1.968148  Alice  0.383258  0.059591  0.496383   1.814357  1.083006    No   

  Has_Car  Location  High_Income  
0    True  0.012616        False  
3    True -0.013245         True  
4    True  1.868057         True  
5    True -0.070378        False  
6   False  1.985968        False  
Augmented DataFrame with Shuffled Data:
    ID   Name       Age  Gender    Income  Education  Score Label Has_Car  \
0  1.0   John  0.000000     1.0  0.000000        0.0  0.875   Yes    True   
3  4.0   Mary  1.000000     0.0  0.666667        3.0  0.000   Yes    True   
5  1.

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Original data
data = {
    'ID': [1, 2, 3, 4, 5],
    'Name': ['John', 'Alice', 'Bob', 'Mary', 'Jane'],
    'Age': [25, 30, np.nan, 40, 35],
    'Gender': ['M', 'F', 'M', 'F', 'F'],
    'Income': [50000, 60000, 45000, 70000, 80000],
    'Education': ['Bachelor', 'Master', 'High School', 'PhD', 'Bachelor'],
    'Score': [85, 90, 75, 95, 80],
    'Label': ['Yes', 'No', 'Yes', 'Yes', 'No'],
    'Has_Car': [True, False, True, True, True],
    'Location': ['City', 'Suburb', 'Rural', 'City', 'Suburb'],
    'High_Income': [False, False, True, True, True]  # Target feature
}

# Convert data to DataFrame
df = pd.DataFrame(data)

# Define numerical and categorical columns
numerical_cols = ['Age', 'Income', 'Score']
categorical_cols = ['Gender', 'Education', 'Label', 'Has_Car', 'Location']

# Custom transformer to remove outliers using IQR method
class RemoveOutliers(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        Q1 = np.percentile(X, 25, axis=0)
        Q3 = np.percentile(X, 75, axis=0)
        IQR = Q3 - Q1
        return X[~((X < (Q1 - 1.5 * IQR)) | (X > (Q3 + 1.5 * IQR))).any(axis=1)]

# Define preprocessing steps for numerical and categorical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('scaler', MinMaxScaler())  # Scale features to a range between 0 and 1
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with mode
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

# Apply transformations to numerical and categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Pipeline for preprocessing and removing outliers
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('outlier_remover', RemoveOutliers())
])

# Apply preprocessing pipeline to the dataframe
df_preprocessed = pipeline.fit_transform(df)

# Convert preprocessed data back to DataFrame
df_preprocessed = pd.DataFrame(df_preprocessed)

print("Preprocessed DataFrame:")
print(df_preprocessed)


Preprocessed DataFrame:
         0         1     2    3    4    5    6    7    8    9    10   11   12  \
0  0.000000  0.142857  0.50  0.0  1.0  1.0  0.0  0.0  0.0  0.0  1.0  0.0  1.0   
1  0.666667  1.000000  0.25  1.0  0.0  1.0  0.0  0.0  0.0  1.0  0.0  0.0  1.0   

    13   14   15  
0  1.0  0.0  0.0  
1  0.0  0.0  1.0  
