In [3]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer  # noqa: F401  (must be imported before IterativeImputer)
from sklearn.feature_selection import SelectKBest, f_classif, chi2, RFE
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, TimeSeriesSplit, GroupKFold, ShuffleSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder

# Create a sample dataset.
# Feature3 is drawn from a seeded generator so that "Restart Kernel -> Run All"
# rebuilds exactly the same frame, and therefore the same downstream results.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

data = pd.DataFrame({
    'Feature1': [10, 20, 30, 40, np.nan, 60, 70, 80, 90, 100],  # Contains one missing value
    'Feature2': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],  # Continuous data
    'Feature3': rng.integers(1, 100, size=10),  # Seeded random integers in [1, 99]; extra feature for selection
    'Category': ['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'],  # Categorical
    'Target': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]  # Binary target
})

# ======================================
# 2.1. Data Splitting
# ======================================
# Separate the predictors from the binary target column.
X = data.drop(columns=['Target'])
y = data['Target']

# Train-Test Split: hold out 20% of the rows; seeded so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Cross-Validation splitters. They are only constructed here for illustration;
# none of them is iterated in the code shown in this cell.
kf = KFold(n_splits=5)
skf = StratifiedKFold(n_splits=5)
tss = TimeSeriesSplit(n_splits=3)
gkf = GroupKFold(n_splits=2)  # needs a `groups` array when .split() is called
# ShuffleSplit draws random permutations, so it is seeded like the split above.
ss = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# ======================================
# 2.2. Feature Scaling
# ======================================
# Registry of the scalers being compared, keyed by class name.
scalers = dict(
    StandardScaler=StandardScaler(),  # Z-score normalization
    MinMaxScaler=MinMaxScaler(),  # Scales between 0-1
    RobustScaler=RobustScaler(),  # Handles outliers
)

# Every scaler is fitted on (and applied to) the training rows of Feature2 only.
feature2_train = X_train[['Feature2']]
scaled_features = {
    scaler_name: scaler_obj.fit_transform(feature2_train)
    for scaler_name, scaler_obj in scalers.items()
}

# ======================================
# 2.3. Handling Missing Values
# ======================================
imputers = {
    'SimpleImputer': SimpleImputer(strategy='mean'),  # column mean
    'KNNImputer': KNNImputer(n_neighbors=2),  # average of the 2 nearest rows
    'IterativeImputer': IterativeImputer(random_state=42)  # models each feature from the others
}

# KNNImputer and IterativeImputer are multivariate: they estimate a missing
# value from the *other* features of the row. Given Feature1 alone they have
# nothing to work with and effectively reduce to mean imputation, so they are
# fitted on Feature1 together with the fully observed Feature2.
impute_columns = ['Feature1', 'Feature2']
feature1_position = impute_columns.index('Feature1')

imputed_data = {}
for name, imputer in imputers.items():
    filled = imputer.fit_transform(X_train[impute_columns])
    # Keep only the imputed Feature1 as an (n_rows, 1) array, the same shape
    # the single-column version produced.
    imputed_data[name] = filled[:, [feature1_position]]

# ======================================
# 2.4. Categorical Encoding
# ======================================
# All feature encoders are fitted on the training rows of the Category column.
category_train = X_train[['Category']]

# One-hot: one indicator column per category; densified for display.
one_hot_encoder = OneHotEncoder()
one_hot_encoded = one_hot_encoder.fit_transform(category_train).toarray()

# Ordinal: a single column of category codes.
ordinal_encoder = OrdinalEncoder()
ordinal_encoded = ordinal_encoder.fit_transform(category_train)

# Label encoding is applied to the target labels rather than to a feature.
label_encoder = LabelEncoder()
label_encoded = label_encoder.fit_transform(y_train)  # Used for target labels

# ======================================
# 2.5. Feature Selection
# ======================================
# Filter Method: keep the single feature with the highest ANOVA F-score.
select_k_best = SelectKBest(score_func=f_classif, k=1)
X_new = select_k_best.fit_transform(X_train[['Feature2', 'Feature3']], y_train)

# Wrapper Method: Recursive Feature Elimination (RFE) keeping 1 of the 2 candidate features.
# The forest is seeded so that rfe.support_ is reproducible across runs.
rfe = RFE(estimator=RandomForestClassifier(random_state=42), n_features_to_select=1)
rfe.fit(X_train[['Feature2', 'Feature3']], y_train)

# Embedded Method: Lasso Regression
# Lasso is a regressor fitted here on the 0/1 target; its L1-shrunk coefficients
# are only read as a rough importance signal (a coefficient of 0 means "dropped").
lasso = Lasso(alpha=0.1)
lasso.fit(X_train[['Feature2', 'Feature3']], y_train)
feature_importance_lasso = lasso.coef_

# ======================================
# 2.6. Dimensionality Reduction
# ======================================
# All three reducers project the same two training features onto 1 component.
reduction_features = X_train[['Feature2', 'Feature3']]

# PCA: unsupervised projection onto the direction of maximum variance.
pca = PCA(n_components=1)
X_pca = pca.fit_transform(reduction_features)

# TruncatedSVD uses a randomized solver by default, so it is seeded to keep
# the result reproducible. Unlike PCA it does not center the data first.
svd = TruncatedSVD(n_components=1, random_state=42)
X_svd = svd.fit_transform(reduction_features)

# LDA is supervised: it uses y_train to find the axis that best separates the classes.
lda = LDA(n_components=1)
X_lda = lda.fit_transform(reduction_features, y_train)

# ======================================
# Output & Conclusion
# ======================================
# (label, values) pairs, printed in order; values are unpacked into print()
# so each line matches a direct print(label, *values) call.
summary_rows = [
    ("Train-Test Split Shapes:", (X_train.shape, X_test.shape)),
    ("Standard Scaler Example:", (scaled_features['StandardScaler'][:5],)),
    ("Missing Values Imputation Example (SimpleImputer):", (imputed_data['SimpleImputer'][:5],)),
    ("One Hot Encoding Example:", (one_hot_encoded[:5],)),
    ("Feature Selection (SelectKBest) Example:", (X_new[:5],)),
    ("RFE Selected Features:", (rfe.support_,)),
    ("PCA Transformed Data:", (X_pca[:5],)),
]
for label, values in summary_rows:
    print(label, *values)

Train-Test Split Shapes: (8, 4) (2, 4)
Standard Scaler Example: [[ 0.18569534]
 [-1.67125804]
 [ 0.92847669]
 [-0.92847669]
 [ 1.67125804]]
Missing Values Imputation Example (SimpleImputer): [[ 60.]
 [ 10.]
 [ 80.]
 [ 30.]
 [100.]]
One Hot Encoding Example: [[0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]]
Feature Selection (SelectKBest) Example: [[36]
 [38]
 [28]
 [91]
 [42]]
RFE Selected Features: [False  True]
PCA Transformed Data: [[-15.13011608]
 [-13.06700516]
 [-23.15478396]
 [ 39.90344283]
 [ -9.18121333]]
