In [1]:
import os
import pandas as pd
import numpy as np
import pickle
import logging
import warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    FunctionTransformer,
    KBinsDiscretizer,
    StandardScaler,
    OneHotEncoder,
    OrdinalEncoder,
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.base import BaseEstimator, TransformerMixin

warnings.filterwarnings("ignore")

# ========== Logging Setup ==========
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")



In [2]:
# ========== Configuration ==========
CONFIG = {
    "data_path": "/content/insuranceFraud_Dataset.csv",
    "target_column": "fraud_reported",
    "columns_to_drop": [
        "months_as_customer", "policy_number", "insured_zip", "insured_hobbies",
        "incident_date", "incident_location", "total_claim_amount",
        "auto_model", "incident_city", "umbrella_limit"
    ],
    "ordinal_mappings": {
        "policy_csl": ["500/1000", "250/500", "100/300"],
        "policy_deductable": [2000, 1000, 500],
        "insured_education_level": [
            "PhD", "MD", "JD", "Masters", "College", "Associate", "High School"
        ],
        "incident_severity": [
            "Total Loss", "Major Damage", "Minor Damage", "Trivial Damage"
        ],
        "number_of_vehicles_involved": [4, 3, 2, 1],
        "bodily_injuries": [2, 1, 0],
        "witnesses": [3, 2, 1, 0],
        "auto_year": list(range(2015, 1994, -1))
    },
    "preprocessor_output_path": "/content/preprocessor.pkl"
}



In [3]:
# ========== Utility Functions ==========

def load_data(path: str) -> pd.DataFrame:
    if not os.path.exists(path):
        raise FileNotFoundError(f"File not found: {path}")
    logging.info("Loading dataset...")
    return pd.read_csv(path, na_values=["?"])

def split_data(df: pd.DataFrame, target: str, test_size=0.25, random_state=42):
    X = df.drop(target, axis=1)
    y = df[target]
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

def save_pkl(file_path: str, obj: object) -> None:
    try:
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, "wb") as f:
            pickle.dump(obj, f)
        logging.info(f"Saved pickle file to: {file_path}")
    except Exception as e:
        logging.error(f"Error saving {file_path}: {e}")

def extract_year(X):
    X = pd.to_datetime(X.squeeze())
    return X.dt.year.to_frame()

def hour_to_period(X):
    hour = X.squeeze()
    if hasattr(hour, "values"):
        hour = hour.values
    bins = [0, 6, 12, 18, 24]
    labels = ["Night", "Morning", "Afternoon", "Evening"]
    return pd.DataFrame(pd.cut(hour, bins=bins, labels=labels, right=False, include_lowest=True))



In [4]:
# ========== Custom Transformers ==========

class KBinsDiscretizerPlusOne(BaseEstimator, TransformerMixin):
    def __init__(self, n_bins=5, encode="ordinal", strategy="uniform"):
        self.n_bins = n_bins
        self.encode = encode
        self.strategy = strategy
        self.kbd = None

    def fit(self, X, y=None):
        self.kbd = KBinsDiscretizer(n_bins=self.n_bins, encode=self.encode, strategy=self.strategy)
        self.kbd.fit(X)
        return self

    def transform(self, X):
        return (self.kbd.transform(X) + 1).astype(int)

class RandomSampleImputer(BaseEstimator, TransformerMixin):
    def __init__(self, random_state=None):
        self.random_state = random_state
        self.feature_values_ = {}

    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        for col in X.columns:
            self.feature_values_[col] = X[col].dropna().values
        return self

    def transform(self, X):
        X = pd.DataFrame(X).copy()
        rng = np.random.default_rng(self.random_state)
        for col in X.columns:
            mask = X[col].isna()
            if mask.any() and col in self.feature_values_:
                X.loc[mask, col] = rng.choice(self.feature_values_[col], size=mask.sum())
        return X.values



In [5]:
# ========== Main Pipeline ==========

def build_pipeline(X: pd.DataFrame) -> Pipeline:
    categorical_columns = X.select_dtypes(include=["object"]).columns
    numerical_columns = X.select_dtypes(exclude=["object"]).columns

    ordinal_columns = list(CONFIG["ordinal_mappings"].keys())
    ordinal_categories = [CONFIG["ordinal_mappings"][col] for col in ordinal_columns]

    nominal_columns = [col for col in categorical_columns if col not in ordinal_columns and col != "policy_bind_date"]

    date_pipeline = Pipeline([
        ("extract_year", FunctionTransformer(extract_year)),
        ("bin_year", KBinsDiscretizerPlusOne()),
        ("scaler", StandardScaler())
    ])

    num_pipeline = Pipeline([
        ("imputer", IterativeImputer(max_iter=100, random_state=42)),
        ("scaler", StandardScaler())
    ])

    ordinal_pipeline = Pipeline([
        ("imputer", RandomSampleImputer()),
        ("ordinal_encoder", OrdinalEncoder(categories=ordinal_categories)),
        ("scaler", StandardScaler())
    ])

    nominal_pipeline = Pipeline([
        ("imputer", RandomSampleImputer()),
        ("one_hot", OneHotEncoder(handle_unknown="ignore", drop="first", sparse_output=False))
    ])

    hour_pipeline = Pipeline([
        ("hour_to_period", FunctionTransformer(hour_to_period, validate=False)),
        ("one_hot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ])

    preprocessor = ColumnTransformer([
        ("date", date_pipeline, ["policy_bind_date"]),
        ("drop_columns", "drop", CONFIG["columns_to_drop"]),
        ("hour", hour_pipeline, ["incident_hour_of_the_day"]),
        ("numerical", num_pipeline, numerical_columns),
        ("ordinal", ordinal_pipeline, ordinal_columns),
        ("nominal", nominal_pipeline, nominal_columns)
    ], remainder="passthrough")

    return Pipeline([("preprocessor", preprocessor)])



In [6]:
# ========== Main Execution ==========

def main():
    try:
        df = load_data(CONFIG["data_path"])
        X_train, X_test, y_train, y_test = split_data(df, CONFIG["target_column"])

        pipeline = build_pipeline(X_train)
        logging.info("Fitting preprocessing pipeline...")
        X_train_arr = pipeline.fit_transform(X_train)
        X_test_arr = pipeline.transform(X_test)

        train_arr = np.c_[X_train_arr, y_train.values.reshape(-1, 1)]
        test_arr = np.c_[X_test_arr, y_test.values.reshape(-1, 1)]

        return train_arr, test_arr, pipeline
        save_pkl(CONFIG["preprocessor_output_path"], pipeline.named_steps["preprocessor"])
        logging.info("Preprocessing complete.")

    except Exception as e:
        logging.error(f"Pipeline failed: {e}")




In [7]:
a, b, _= main()

In [8]:
from dataclasses import dataclass
import os
import sys
import logging
import numpy as np
import pickle

from sklearn.linear_model import (
    LogisticRegression, SGDClassifier
)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    BaggingClassifier
)
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder


def save_pkl(file_path, obj):
    """Save an object to a pickle file"""
    with open(file_path, "wb") as f:
        pickle.dump(obj, f)




In [9]:
def evaluate_models(X_train, y_train, X_test, y_test, models, param_grids):
    """
    Uses GridSearchCV to perform hyperparameter tuning and evaluation.
    Returns dictionary of test accuracies.
    """
    try:
        report = {}
        for model_name, model_class in models.items():
            model = model_class()
            params = param_grids.get(model_name, {})
            gs = GridSearchCV(model, param_grid=params, cv=3)
            gs.fit(X_train, y_train)

            best_model = gs.best_estimator_
            y_test_pred = best_model.predict(X_test)
            test_score = accuracy_score(y_test, y_test_pred)
            report[model_name] = test_score

        return report

    except Exception as e:
        logging.error("Error in evaluating models", exc_info=True)
        raise e




In [10]:
def initiate_model_trainer(train_arr, test_arr):
    try:
        # Split features and labels
        X_train, y_train = train_arr[:, :-1], train_arr[:, -1]
        X_test, y_test = test_arr[:, :-1], test_arr[:, -1]

        # Label encoding for classification targets
        le = LabelEncoder()
        y_train = le.fit_transform(y_train)
        y_test = le.transform(y_test)

        # Define models and hyperparameter grids
        models_dict = {
            "LogisticRegression": LogisticRegression,
            "SVC": SVC,
            "KNeighborsClassifier": KNeighborsClassifier,
            "DecisionTreeClassifier": DecisionTreeClassifier,
            "RandomForestClassifier": RandomForestClassifier,
            "GradientBoostingClassifier": GradientBoostingClassifier,
            "XGBClassifier": xgb.XGBClassifier,
            "GaussianNB": GaussianNB,
            "AdaBoostClassifier": AdaBoostClassifier,
            "BaggingClassifier": BaggingClassifier,
            "SGDClassifier": SGDClassifier
        }

        # Parameter grids (only active parameters shown; you can uncomment others as needed)
        param_grids = {
            "LogisticRegression": {
                "penalty": ["l1", "l2", "elasticnet", None],
                # "C": [0.01, 0.1, 1, 10, 100],
                # "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
                # "max_iter": [100, 200, 500]
            },
            "SVC": {
                "C": [0.1, 1, 10, 100],
                # "kernel": ["linear", "poly", "rbf", "sigmoid"],
                # "gamma": ["scale", "auto"] + [0.001, 0.01, 0.1, 1],
                # "degree": [2, 3, 4]
            },
            "KNeighborsClassifier": {
                "n_neighbors": [3, 5, 7, 9, 11],
                # "weights": ["uniform", "distance"],
                # "p": [1, 2]
            },
            "DecisionTreeClassifier": {
                "max_depth": [None, 5, 10, 20, 30],
                # "min_samples_split": [2, 5, 10],
                # "min_samples_leaf": [1, 2, 4],
                # "criterion": ["gini", "entropy"]
            },
            "RandomForestClassifier": {
                "n_estimators": [50, 100, 200],
                # "max_depth": [None, 10, 20, 30],
                # "min_samples_split": [2, 5, 10],
                # "min_samples_leaf": [1, 2, 4],
                # "max_features": ["sqrt", "log2"]
            },
            "GradientBoostingClassifier": {
                "n_estimators": [50, 100, 200],
                # "learning_rate": [0.01, 0.1, 0.2],
                # "max_depth": [3, 5, 7],
                # "min_samples_split": [2, 5, 10],
                # "min_samples_leaf": [1, 2, 4],
                # "max_features": ["sqrt", "log2"],
                # "subsample": [0.8, 0.9, 1.0]
            },
            "XGBClassifier": {
                "n_estimators": [50, 100, 200],
                # "learning_rate": [0.01, 0.1, 0.2],
                # "max_depth": [3, 5, 7],
                # "min_child_weight": [1, 3, 5],
                # "subsample": [0.8, 0.9, 1.0],
                # "colsample_bytree": [0.8, 0.9, 1.0],
                # "gamma": [0, 0.1, 0.2],
                # "reg_alpha": [0, 0.1, 0.5],
                # "reg_lambda": [0, 0.1, 0.5]
            },
            "GaussianNB": {
                "var_smoothing": [1e-9, 1e-8, 1e-7]
            },

            "AdaBoostClassifier": {
                "n_estimators": [50, 100, 200],
                # "learning_rate": [0.01, 0.1, 0.5],
                # "base_estimator": [None, DecisionTreeClassifier(max_depth=1)]
            },
            "BaggingClassifier": {
                "n_estimators": [10, 50, 100],
                # "max_samples": [0.5, 0.7, 1.0],
                # "max_features": [0.5, 0.7, 1.0],
                # "base_estimator": [None, DecisionTreeClassifier(max_depth=2)]
            },
            "SGDClassifier": {
                "loss": ["hinge", "log_loss", "modified_huber"],
                # "penalty": ["l2", "l1", "elasticnet"],
                # "alpha": [0.0001, 0.001, 0.01],
                # "l1_ratio": [0, 0.15, 0.5, 1.0],
                # "max_iter": [1000, 2000],
                # "learning_rate": ["constant", "optimal", "invscaling"],
                # "eta0": [0.01, 0.1]
            }
        }


        # Evaluate and select best model
        model_report = evaluate_models(X_train, y_train, X_test, y_test, models_dict, param_grids)
        best_score = max(model_report.values())
        best_model_name = max(model_report, key=model_report.get)
        best_model = models_dict[best_model_name]()

        if best_score < 0.6:
            raise Exception("No suitable model found with accuracy >= 0.6")

        logging.info(f"Best Model: {best_model_name} with Accuracy: {best_score}")
        save_pkl("/content/best_model.pkl", best_model)

        print("Best Model Name:", best_model_name)
        print("Accuracy Score:", best_score)
        print("All Model Scores:", model_report)

        return best_model_name, best_score, model_report

    except Exception as e:
        logging.error("Error in model training pipeline", exc_info=True)
        raise e





In [12]:
# Example usage:
# Replace train_arr and test_arr with your numpy arrays
abc = initiate_model_trainer(a, b)
print(abc)

Best Model Name: BaggingClassifier
Accuracy Score: 0.808
All Model Scores: {'LogisticRegression': 0.716, 'SVC': 0.732, 'KNeighborsClassifier': 0.716, 'DecisionTreeClassifier': 0.788, 'RandomForestClassifier': 0.732, 'GradientBoostingClassifier': 0.792, 'XGBClassifier': 0.776, 'GaussianNB': 0.268, 'AdaBoostClassifier': 0.772, 'BaggingClassifier': 0.808, 'SGDClassifier': 0.696}
('BaggingClassifier', 0.808, {'LogisticRegression': 0.716, 'SVC': 0.732, 'KNeighborsClassifier': 0.716, 'DecisionTreeClassifier': 0.788, 'RandomForestClassifier': 0.732, 'GradientBoostingClassifier': 0.792, 'XGBClassifier': 0.776, 'GaussianNB': 0.268, 'AdaBoostClassifier': 0.772, 'BaggingClassifier': 0.808, 'SGDClassifier': 0.696})
