# In [None]:
# 1. Basic setup
import pandas as pd
import numpy as np

# 2. Train-test split and validation
from sklearn.model_selection import train_test_split, GridSearchCV

# 3. Outlier/novelty detection
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.base import BaseEstimator, TransformerMixin

# 4. Class imbalance
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# 5. Binning/discretization
from sklearn.preprocessing import KBinsDiscretizer

# 6. Encoding categorical features + target
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from category_encoders.target_encoder import TargetEncoder

# 7. Polynomial/nonlinear features
from sklearn.preprocessing import PolynomialFeatures

# 8. Scaling features
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# 9. Missing value imputers
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# 10. Feature selection
from sklearn.feature_selection import SelectKBest, mutual_info_classif, chi2

# 11. Dimensionality reduction
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.kernel_approximation import RBFSampler

# 12. Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# 13. Evaluation
from sklearn.metrics import classification_report, accuracy_score

# 14. Persistence
import joblib

# 15. Column-wise preprocessing
from sklearn.compose import ColumnTransformer

# 16. Pipeline
from sklearn.pipeline import Pipeline

# In [None]:
# 1. Load data
df = pd.read_csv("your_data.csv")  # replace with actual file
y = df["target"]               # label column
X = df.drop(columns="target")  # every remaining column is a feature

# 2. Train-test-validation split
# First hold out 20% of the rows as the test set, then take 25% of the
# remainder as validation (0.25 * 0.8 = 0.2 of all rows), i.e. a 60/20/20
# split. Both splits are stratified on the label and use a fixed seed.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=0.25,
    stratify=y_trainval,
    random_state=42,
)

# 3. Outlier removal (custom sampler using LocalOutlierFactor)
class OutlierRemover(BaseEstimator):
    """Drop rows that LocalOutlierFactor flags as outliers, keeping X and y aligned.

    Removing rows is a resampling operation rather than a feature
    transformation: the target has to lose exactly the same rows as the
    features, and no row may be dropped when predicting. The class therefore
    implements ``fit_resample`` (the imbalanced-learn sampler protocol).
    Inside an imblearn ``Pipeline`` the step is applied while fitting and is
    skipped by ``predict``/``score``.

    ``transform`` is intentionally not provided: imblearn's pipeline rejects
    intermediate steps that implement both ``transform`` and ``fit_resample``.

    Parameters
    ----------
    contamination : float, default=0.05
        Expected proportion of outliers, forwarded to LocalOutlierFactor.
    n_neighbors : int, default=20
        Neighbourhood size, forwarded to LocalOutlierFactor.

    Attributes
    ----------
    lof_ : LocalOutlierFactor or None
        Detector fitted by the last ``fit`` call; ``None`` when X had no
        numeric columns.
    mask_ : ndarray of bool
        ``True`` for every row of the last fitted X that was kept.
    """

    def __init__(self, contamination=0.05, n_neighbors=20):
        self.contamination = contamination
        self.n_neighbors = n_neighbors

    def fit(self, X, y=None):
        """Fit the detector on the numeric columns of X and store the inlier mask.

        ``y`` is accepted for API compatibility and is not used.

        NOTE(review): LocalOutlierFactor does not accept missing values; if
        the numeric columns can contain NaN they must be imputed before this
        step -- confirm against the actual data.
        """
        # DataFrames may carry non-numeric columns that have no distance
        # semantics; plain arrays are assumed to be 2-D and fully numeric.
        if hasattr(X, 'select_dtypes'):
            numeric_X = X.select_dtypes(include='number')
        else:
            numeric_X = np.asarray(X)

        if numeric_X.shape[1] == 0:
            # Nothing to compute neighbourhoods on: keep every row.
            self.lof_ = None
            self.mask_ = np.ones(len(X), dtype=bool)
            return self

        self.lof_ = LocalOutlierFactor(n_neighbors=self.n_neighbors, contamination=self.contamination)
        # fit_predict labels outliers with -1 and inliers with 1.
        self.mask_ = self.lof_.fit_predict(numeric_X) != -1
        return self

    def fit_resample(self, X, y):
        """Fit on X and return ``(X, y)`` restricted to the inlier rows.

        The same positional mask is applied to both objects so that features
        and labels stay aligned for the following pipeline steps.
        """
        self.fit(X, y)
        return self._keep_rows(X), self._keep_rows(y)

    def _keep_rows(self, data):
        """Return the rows of ``data`` selected by ``mask_`` (positional)."""
        if hasattr(data, 'iloc'):
            # Positional selection: the pandas index is arbitrary after
            # train_test_split, so label-based alignment must be avoided.
            return data.iloc[self.mask_].reset_index(drop=True)
        return np.asarray(data)[self.mask_]

# 4. Preprocessing
# Columns are routed by dtype. Columns that are neither numeric nor object
# are not listed in either group and are therefore not passed to the model
# (ColumnTransformer drops unlisted columns by default).
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(include='object').columns.tolist()

# Numeric features: fill gaps with the column mean, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Try the current spelling
# first and fall back so the notebook runs on either side of the rename.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:  # scikit-learn < 1.2
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical features: fill gaps with the most frequent level, then
# one-hot encode; levels unseen during fit are encoded as all zeros.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', onehot_encoder)
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# 5. Full Pipeline
# An imblearn pipeline is used (instead of sklearn's) because it supports
# resampling steps such as SMOTE between the transformers.
full_pipeline = ImbPipeline(steps=[
    # Outlier handling with a 5% contamination rate.
    ('outlier', OutlierRemover(contamination=0.05)),
    # Impute / scale numeric columns and one-hot encode categorical ones.
    ('preprocess', preprocessor),
    # Oversample minority classes on the encoded feature matrix.
    ('smote', SMOTE(random_state=42)),
    # Keep the 20 features with the highest mutual information score.
    ('select', SelectKBest(score_func=mutual_info_classif, k=20)),
    # Final classifier.
    ('clf', RandomForestClassifier(random_state=42)),
])

# 6. Bias-Variance check (train vs val)
# Fit on the training split only, then compare accuracy on the data the
# model has seen (train) with data it has not (val); a large gap between
# the two numbers points to overfitting.
full_pipeline.fit(X_train, y_train)
train_preds, val_preds = (
    full_pipeline.predict(split) for split in (X_train, X_val)
)

print("Train Accuracy:", accuracy_score(y_true=y_train, y_pred=train_preds))
print("Val Accuracy  :", accuracy_score(y_true=y_val, y_pred=val_preds))

# 7. Retrain on train+val for full data usage
# The validation split is merged back in; model selection from here on
# relies on 5-fold cross-validation instead of the fixed validation set.
X_final = pd.concat([X_train, X_val], axis=0)
y_final = pd.concat([y_train, y_val], axis=0)

# Keys follow the "<step name>__<parameter>" convention of the pipeline.
param_grid = dict(
    clf__n_estimators=[100, 200],
    clf__max_depth=[None, 10],
    select__k=[10, 20],
)

# Exhaustive search over the grid, scored by macro-averaged F1 and run on
# all available cores.
grid = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
)
grid.fit(X_final, y_final)

# 8. Final evaluation on test
# The held-out test split is scored once, using the fitted grid search.
y_test_preds = grid.predict(X_test)
print(
    "Test Classification Report:",
    classification_report(y_test, y_test_preds),
    sep="\n",
)

# 9. Save model
# Persist the best pipeline found by the grid search, with compression
# level 3 and pickle protocol 5.
joblib.dump(
    value=grid.best_estimator_,
    filename="final_pipeline_model.pkl",
    compress=3,
    protocol=5,
)
print("Model saved as final_pipeline_model.pkl")
