In [None]:
import numpy as np


import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

# Create a sample dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=10, n_classes=2, random_state=42)
df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
df['target'] = y

print("Dataset Head:")
print(df.head())

Dataset Head:
   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0   0.484179   1.112149  -3.700860 -11.437337  -1.117586  -3.649178   
1   0.193246  -0.314601  -2.160414  -1.436861  -0.638985  -0.031291   
2  -0.930375   0.879307  -0.600587  -0.512992  -0.291332   3.757383   
3   0.396046   0.320058  -3.263099   0.571208  -0.864430  -0.165850   
4  -1.552072  -0.235980  -0.711470  -3.847557   1.819077  -3.276689   

   feature_6  feature_7  feature_8  feature_9  ...  feature_11  feature_12  \
0   0.856905   3.169390  -0.246858   1.547641  ...   -1.787139   -0.204962   
1   1.530186  -0.546543  -0.770618  -1.080267  ...    0.380067    0.764223   
2   0.039309   2.128034   0.654688   0.470393  ...    1.190361    0.772833   
3  -1.315438   2.841837  -0.813665   0.692776  ...   -0.204707   -3.177934   
4  -0.489944   0.639253  -0.787551   0.467575  ...    1.138911    0.054534   

   feature_13  feature_14  feature_15  feature_16  feature_17  feature_18  \
0   -1.560433

In [2]:
# Missing Values (Simulated - no missing values in this case)
# df.isnull().sum()  # Check for missing values

# Duplicates (Simulated - no duplicates in this case)
df.drop_duplicates(inplace=True)

# Outliers (Simulated - demonstration of outlier removal)
# Example: Remove outliers based on Z-score for a specific column
from scipy import stats
z = np.abs(stats.zscore(df['feature_0']))
df = df[(z < 3)]  # Keep rows with Z-score < 3

print("\nData Preprocessing Done.")


Data Preprocessing Done.


In [3]:
# Numerical Features - Standardization
numerical_features = df.select_dtypes(include=np.number).columns.tolist()
numerical_features.remove('target')  # Remove target from numerical features

scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Categorical Features - Label Encoding (Simulated - no categorical features in this case)
# le = LabelEncoder()
# df['categorical_feature'] = le.fit_transform(df['categorical_feature'])

print("\nData Transformation Done.")


Data Transformation Done.


In [4]:
df[numerical_features]

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19
0,0.230910,1.059226,-1.513279,-2.788369,-1.075629,-1.672603,0.809895,1.159377,-0.229730,1.522854,2.153627,-1.782221,-0.344284,-1.604701,-1.472435,-0.678362,0.227680,-1.088718,-1.822249,-2.071022
1,0.084160,-0.327613,-0.780971,-0.020328,-0.603826,-0.005998,1.465727,-0.544445,-0.757661,-1.117954,-0.022044,0.309386,0.159716,1.603062,-0.122773,0.174903,0.471635,0.552743,0.962646,0.990674
2,-0.482605,0.832898,-0.039449,0.235391,-0.261111,1.739282,0.013486,0.681897,0.678996,0.440318,0.099024,1.091414,0.164193,0.608336,1.002335,-0.717736,1.626186,-0.334575,-0.113615,-1.466061
3,0.186455,0.289293,-1.305173,0.535487,-0.826069,-0.067983,-1.306153,1.009188,-0.801050,0.663793,0.312492,-0.254990,-1.890304,-0.781418,0.611936,-0.523869,-0.720757,-0.442336,-0.894334,2.304142
4,-0.796195,-0.251191,-0.092161,-0.687587,1.819321,-1.501013,-0.502051,-0.000736,-0.774729,0.437486,0.774230,1.041758,-0.209340,-0.469386,-0.115850,0.627086,-1.575291,-2.353773,-0.724019,-0.491238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.976006,0.408097,1.800377,0.602532,0.709927,0.160553,-0.867679,0.299024,0.180935,0.695600,-0.292650,1.254451,0.865536,0.606895,0.803236,0.050780,-0.295447,-1.100604,-0.218712,-0.278354
996,0.338750,1.396531,3.293715,3.171790,0.734367,0.646027,-0.226787,-1.685221,-0.916008,0.259957,-0.089236,0.585868,1.298551,0.220622,1.365561,-1.924688,-1.321914,0.659427,1.125104,-0.437767
997,0.472305,-0.180648,-0.689950,-1.623170,-0.081124,-1.405783,0.151418,0.886329,-0.367907,-1.992052,0.777507,0.538652,-0.435871,-0.120791,-0.444447,0.375013,0.906695,1.229475,-0.747622,-0.130907
998,1.010683,1.235750,0.456755,0.429955,-1.124166,-0.486990,-1.553240,-0.699996,2.665306,-2.057284,-0.656136,-0.060237,0.073846,-0.155816,0.622720,0.284799,0.042883,-0.869762,0.347698,0.913035


In [5]:
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nData Splitting Done.")


Data Splitting Done.


In [6]:
# Interaction Features
X_train['interaction_1_2'] = X_train['feature_1'] * X_train['feature_2']
X_test['interaction_1_2'] = X_test['feature_1'] * X_test['feature_2']

# Polynomial Features
X_train['feature_5_squared'] = X_train['feature_5'] ** 2
X_test['feature_5_squared'] = X_test['feature_5'] ** 2

print("\nFeature Engineering Done.")


Feature Engineering Done.


In [7]:
X_train.columns

Index(['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4',
       'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9',
       'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14',
       'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19',
       'interaction_1_2', 'feature_5_squared'],
      dtype='object')

In [8]:
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print("\nDimensionality Reduction (PCA) Done.")


Dimensionality Reduction (PCA) Done.


In [10]:
X_train_pca

array([[ 1.13182114, -0.1331581 ,  0.6948728 , ..., -0.52535025,
         0.52557708, -0.41570069],
       [ 0.99650857,  0.3414514 ,  0.65537647, ..., -0.43792931,
        -0.28411202, -0.94671837],
       [ 1.93451781,  0.26560432, -1.11414013, ..., -0.71272689,
         0.76825387, -1.2886528 ],
       ...,
       [ 2.57002063,  0.2894296 ,  1.29194139, ...,  0.19538941,
        -0.71349985, -0.30345874],
       [ 1.9500036 ,  0.16611142,  0.06262753, ..., -1.48418042,
        -1.41000993,  1.48677885],
       [-1.10860446, -0.41858289, -1.32665748, ..., -0.91694615,
         0.67209879,  1.56991504]], shape=(797, 10))

In [11]:
estimator = LogisticRegression(random_state=42)
rfe = RFE(estimator, n_features_to_select=8)
rfe.fit(X_train_pca, y_train)

selected_features = X_train.columns[rfe.support_]

X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

print("\nFeature Selection (RFE) Done.")

IndexError: boolean index did not match indexed array along axis 0; size of axis is 22 but size of corresponding boolean axis is 10