In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
import joblib
import os


In [None]:

def load_and_split_data(df, target_col='target', test_size=0.2, random_state=42):
    """Separate the label column from ``df`` and split both into train/test parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns plus the label column.
    target_col : str
        Name of the label column; every other column is treated as a feature.
    test_size : float or int
        Forwarded unchanged to ``train_test_split``.
    random_state : int
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    The value returned by ``train_test_split`` for (features, labels); the
    caller in this notebook unpacks it as X_train, X_test, y_train, y_test.
    """
    features = df.drop(columns=[target_col])
    labels = df[target_col]

    # The labels double as the stratification key for the split.
    split_options = {
        'test_size': test_size,
        'stratify': labels,
        'random_state': random_state,
    }
    return train_test_split(features, labels, **split_options)


In [None]:

def build_pipeline_and_grid(random_state=42):
    """Create the scaler + random-forest pipeline and its hyper-parameter grid.

    Parameters
    ----------
    random_state : int, optional
        Seed handed to ``RandomForestClassifier``. Defaults to 42, the value
        that was previously hard-coded, so existing zero-argument calls build
        the same estimator as before.

    Returns
    -------
    tuple
        ``(pipeline, param_grid)`` where ``pipeline`` has the steps
        ``'scaler'`` (StandardScaler) and ``'clf'`` (RandomForestClassifier),
        and ``param_grid`` is a fresh dict of candidate values for the
        ``'clf'`` step.
    """
    # NOTE(review): the scaler is kept so the fitted pipeline keeps its current
    # step layout; whether a random forest benefits from scaling here is worth
    # confirming separately.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', RandomForestClassifier(random_state=random_state)),
    ])

    # Keys follow the "<step name>__<parameter>" form and all target 'clf'.
    param_grid = {
        'clf__n_estimators': [100, 200],
        'clf__max_depth': [5, 10, None],
        'clf__min_samples_split': [2, 5],
    }

    return pipeline, param_grid


In [None]:

def train_model_with_gridsearch(X_train, y_train, pipeline, param_grid, cv=5):
    """Fit a ``GridSearchCV`` over ``param_grid`` for ``pipeline`` and return it.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    pipeline
        Estimator to tune.
    param_grid : dict
        Candidate parameter values to search.
    cv : int
        Cross-validation setting forwarded to ``GridSearchCV``.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    # Fixed search settings: ROC-AUC scoring, progress output, n_jobs=-1.
    search_options = dict(cv=cv, scoring='roc_auc', verbose=1, n_jobs=-1)

    search = GridSearchCV(pipeline, param_grid, **search_options)
    search.fit(X_train, y_train)
    return search


In [None]:

def predict_and_save(grid, X_test, y_test, output_path='model_predictions.csv'):
    """Score the best estimator of a fitted search and write its predictions to CSV.

    Parameters
    ----------
    grid
        Fitted search object exposing ``best_estimator_``; that estimator must
        provide ``predict`` and ``predict_proba``.
    X_test
        Features to score.
    y_test : array-like
        True labels. May be a pandas Series, a NumPy array or a plain
        sequence; only the values are used, any index is discarded.
    output_path : str, path-like or file-like
        Destination for the CSV. When it is a path, missing parent
        directories are created first. Anything else is handed to
        ``DataFrame.to_csv`` untouched.

    Returns
    -------
    tuple
        ``(best_model, report)`` where ``report`` is the ``output_dict=True``
        form of ``classification_report``.

    Side effects
    ------------
    Prints the text classification report and writes a CSV with the columns
    ``Actual``, ``Predicted`` and ``Probability`` (no index column).
    """
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    # Only the second column of predict_proba is kept as the reported score.
    y_proba = best_model.predict_proba(X_test)[:, 1]

    # Two calls on purpose: the dict form is returned, the text form is printed.
    report = classification_report(y_test, y_pred, output_dict=True)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    predictions_df = pd.DataFrame({
        # np.asarray accepts Series, ndarray and list alike; `.values` exists
        # only on pandas objects and failed for the other two.
        'Actual': np.asarray(y_test),
        'Predicted': y_pred,
        'Probability': y_proba
    })

    # Create the target directory for path-like destinations so to_csv does not
    # fail on a missing folder; file-like destinations are left alone.
    if isinstance(output_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
    predictions_df.to_csv(output_path, index=False)

    return best_model, report


In [None]:

def save_model(model, path='best_model.pkl'):
    """Serialize ``model`` to ``path`` with ``joblib.dump`` and print a confirmation.

    Parameters
    ----------
    model
        Object to persist.
    path : str
        Destination file for the dump.
    """
    joblib.dump(model, path)

    confirmation = f"Model saved to {path}"
    print(confirmation)


In [None]:

def full_model_refit_pipeline(df, target_col='target', test_size=0.2, random_state=42,
                              cv=5, predictions_path='model_predictions.csv',
                              model_path='best_model.pkl'):
    """Run the whole workflow: split, grid-search, evaluate, and persist.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the features and the label column.
    target_col : str
        Name of the label column.
    test_size : float or int, optional
        Passed to ``load_and_split_data``.
    random_state : int, optional
        Seed passed to ``load_and_split_data`` for the train/test split.
    cv : int, optional
        Cross-validation setting passed to ``train_model_with_gridsearch``.
    predictions_path : str, optional
        Where ``predict_and_save`` writes the prediction CSV.
    model_path : str, optional
        Where ``save_model`` writes the fitted estimator.

    The optional parameters default to the same values the individual stage
    functions use, so a call with only ``df`` / ``target_col`` behaves as it
    did when these settings were not exposed.

    Returns
    -------
    tuple
        ``(best_model, report)`` as returned by ``predict_and_save``.
    """
    X_train, X_test, y_train, y_test = load_and_split_data(
        df, target_col, test_size=test_size, random_state=random_state
    )
    pipeline, param_grid = build_pipeline_and_grid()
    grid = train_model_with_gridsearch(X_train, y_train, pipeline, param_grid, cv=cv)
    best_model, report = predict_and_save(grid, X_test, y_test, output_path=predictions_path)
    save_model(best_model, path=model_path)
    return best_model, report
