In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging
import gc
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import optuna
from catboost import CatBoostClassifier, Pool
from datetime import datetime
import warnings
import joblib

# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# One timestamp, taken at start-up, is embedded in the log file name so each run
# writes to its own file.
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
log_filename = f'kaggle_submission_{current_time}.log'

# INFO-and-above records go both to the timestamped file and to the console.
_log_handlers = [logging.FileHandler(log_filename), logging.StreamHandler()]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=_log_handlers,
)

def reduce_mem_usage(df, verbose=True):
    """Shrink the columns of ``df`` to smaller dtypes, in place, and return ``df``.

    * Signed / unsigned integer columns are cast to the narrowest integer type of
      the same signedness whose range (bounds inclusive) contains the column's
      min and max.
    * Float columns wider than 32 bits are cast to ``float32`` when their finite
      range fits. This trades precision for memory: float32 keeps roughly 7
      significant decimal digits.
    * Object / string columns become ``category``.
    * Every other dtype (bool, datetime, timedelta, category, nullable extension
      types, ...) is left untouched, because min/max range checks either do not
      apply to it or would raise.

    Args:
        df: DataFrame to optimise. It is modified in place.
        verbose: When true, log the memory footprint before and after.

    Returns:
        The same ``df`` object, after conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        logging.info(f'Start memory usage of dataframe: {start_mem:.2f} MB')

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float32_limits = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float dtypes can be range-checked safely.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            # NaN bounds (all-missing column) and infinities fail these
            # comparisons, so such columns keep their current dtype.
            fits_float32 = float32_limits.min < c_min and c_max < float32_limits.max
            if col_type.itemsize > 4 and fits_float32:
                df[col] = df[col].astype(np.float32)
            continue

        candidates = signed_types if col_type.kind == 'i' else unsigned_types
        for candidate in candidates:
            limits = np.iinfo(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                # Never widen: only cast when the candidate is actually smaller.
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting footprint.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        logging.info(f'End memory usage of dataframe: {end_mem:.2f} MB')
        logging.info(f'Decreased by {saved_pct:.1f}%')

    return df

def safe_map(df, column, mapping):
    """Replace the values of ``df[column]`` using ``mapping`` and return ``df``.

    Values that have no entry in ``mapping`` are reported once through a
    logging warning and end up as missing values after the map. Missing values
    already present in the column are not reported as unknown categories.

    Args:
        df: DataFrame to modify in place.
        column: Name of the column to re-encode.
        mapping: Dict from original value to replacement value.

    Returns:
        The same ``df`` object with ``column`` re-encoded.
    """
    # Inspect only the distinct non-null values instead of turning every row
    # into a Python object.
    observed_values = df[column].dropna().unique()
    unknown_categories = set(observed_values) - set(mapping)
    if unknown_categories:
        logging.warning(f"Unknown categories in column {column}: {unknown_categories}")
    df[column] = df[column].map(mapping)
    return df

def import_data(file, **kwargs):
    """Read a CSV file into a DataFrame and shrink its memory footprint.

    Args:
        file: Path or buffer accepted by ``pandas.read_csv``.
        **kwargs: Extra keyword arguments forwarded to ``pandas.read_csv``.
            ``parse_dates`` defaults to ``True`` but may be overridden.

    Returns:
        The loaded DataFrame after ``reduce_mem_usage`` has been applied.
    """
    # `keep_date_col` is no longer passed: pandas deprecated it in 2.2, and it only
    # affects multi-column date combining, which `parse_dates=True` never requests.
    kwargs.setdefault('parse_dates', True)
    df = pd.read_csv(file, **kwargs)
    return reduce_mem_usage(df)

def preprocess_data(df):
    """Encode the text-valued columns as integers and drop ``Driving_License``.

    ``Gender``, ``Vehicle_Damage`` and ``Vehicle_Age`` are re-encoded through
    ``safe_map``; ``Driving_License`` is then removed in place.

    Returns:
        The processed DataFrame.
    """
    encodings = {
        'Gender': {'Male': 1, 'Female': 0},
        'Vehicle_Damage': {'Yes': 1, 'No': 0},
        'Vehicle_Age': {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2},
    }
    for column_name, value_map in encodings.items():
        df = safe_map(df, column_name, value_map)

    df.drop(columns=['Driving_License'], inplace=True)
    return df

def feature_engineering(df):
    """Add integer-coded interaction columns between ``Previously_Insured`` and four other columns.

    For each partner column, the string form of ``Previously_Insured`` is
    concatenated with the string form of the partner and the result is
    integer-coded with ``pd.factorize``. Codes are therefore numbered in order
    of first appearance within ``df`` itself, so they are only comparable
    between rows that were encoded in the same call.

    Returns:
        ``df`` with the four ``Previously_Insured_<partner>`` columns added in place.
    """
    insured_text = df['Previously_Insured'].astype(str)
    for partner in ('Annual_Premium', 'Vehicle_Age', 'Vehicle_Damage', 'Vintage'):
        joined = insured_text + df[partner].astype(str)
        codes, _ = pd.factorize(joined)
        df[f'Previously_Insured_{partner}'] = codes
    return df

# Paths to datasets
train_path = r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets\train.csv"
test_path = r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets\test.csv"

# Load and optimize data
train_df = import_data(train_path, index_col='id')
test_df = import_data(test_path, index_col='id')

gc.collect()
print(f"DataFrame after import: {type(train_df)}")
logging.info("Data loaded successfully.")

# Apply preprocessing
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)
print(f"DataFrame after preprocessing: {type(train_df)}")
logging.info("Data preprocessed successfully.")

# Apply feature engineering.
# feature_engineering() integer-codes value combinations with pd.factorize, which
# numbers values in order of first appearance. Encoding train and test in separate
# calls would give the same combination different codes in each frame, so both
# frames are encoded in a single pass and the new columns are copied back.
# `fe_inputs` must list every column feature_engineering() reads.
fe_inputs = ['Previously_Insured', 'Annual_Premium', 'Vehicle_Age', 'Vehicle_Damage', 'Vintage']
n_train = len(train_df)
combined = feature_engineering(
    pd.concat([train_df[fe_inputs], test_df[fe_inputs]], ignore_index=True)
)
engineered_cols = [c for c in combined.columns if c not in fe_inputs]
for col in engineered_cols:
    codes = combined[col].to_numpy()
    train_df[col] = codes[:n_train]
    test_df[col] = codes[n_train:]
del combined

gc.collect()
print(f"DataFrame after feature engineering: {type(train_df)}")
logging.info("Feature engineering completed successfully.")

# Normalize numeric columns (statistics are fitted on the training data only)
num_cols = ['Age', 'Region_Code', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']
scaler = StandardScaler()
train_df[num_cols] = scaler.fit_transform(train_df[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])

# Separate the features from the target
X = train_df.drop('Response', axis=1)
y = train_df['Response']

# Ensure that categorical columns are in string format
categorical_cols = X.select_dtypes(include=['category']).columns.tolist()
X[categorical_cols] = X[categorical_cols].astype(str)
test_df[categorical_cols] = test_df[categorical_cols].astype(str)

def objective(trial):
    """Optuna objective: train one CatBoost model and return its validation AUC.

    Reads the module-level ``X``, ``y`` and ``categorical_cols``. The data is
    split 80/20 with a fixed seed, stratified on the target so both parts keep
    the same class balance. When a trial beats every previously completed trial
    its model is written to ``catboost_model_trial_<trial number>.pkl`` so the
    best model can be reloaded after the study.

    Args:
        trial: The ``optuna`` trial used to sample hyper-parameters.

    Returns:
        ROC AUC of the trained model on the validation split.
    """
    params = {
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.02, 0.2, log=True),
        'iterations': trial.suggest_int('iterations', 1500, 3000),
        'depth': trial.suggest_int('depth', 7, 12),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.01, 10, log=True),
        'random_strength': trial.suggest_float('random_strength', 0.1, 2.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.2, 2.0, log=True),
        'task_type': 'GPU',  # Ensure your environment supports GPU
        'eval_metric': 'AUC',
        'loss_function': 'Logloss',
        'random_seed': 42,
        'allow_writing_files': False,
        'verbose': 100  # Display log every 100 iterations
    }

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # CatBoost needs categorical features as strings; astype returns new frames,
    # so nothing is assigned onto the split slices.
    as_text = {col: str for col in categorical_cols}
    X_train = X_train.astype(as_text)
    X_valid = X_valid.astype(as_text)

    train_pool = Pool(X_train, y_train, cat_features=categorical_cols)
    valid_pool = Pool(X_valid, y_valid, cat_features=categorical_cols)

    model = CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=50, verbose=100)

    valid_preds = model.predict_proba(valid_pool)[:, 1]
    auc = roc_auc_score(y_valid, valid_preds)

    # Persist only models that improve on the best completed trial; on ties the
    # study's best trial stays the earlier one, whose file already exists.
    try:
        best_so_far = trial.study.best_value
    except ValueError:  # no completed trial yet
        best_so_far = None
    if best_so_far is None or auc > best_so_far:
        joblib.dump(model, f'catboost_model_trial_{trial.number}.pkl')

    return auc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print('Best parameters:', study.best_params)
print('Best AUC score:', study.best_value)


# Load the best model. If the objective did not leave a model file for the best
# trial, refit one from the best hyper-parameters on the full training data
# instead of failing. The fixed settings mirror the ones used inside objective().
best_model_path = f'catboost_model_trial_{study.best_trial.number}.pkl'
try:
    best_model = joblib.load(best_model_path)
except FileNotFoundError:
    logging.warning(f"{best_model_path} not found; refitting with the best parameters.")
    best_model = CatBoostClassifier(
        **study.best_params,
        task_type='GPU',
        eval_metric='AUC',
        loss_function='Logloss',
        random_seed=42,
        allow_writing_files=False,
        verbose=100,
    )
    best_model.fit(Pool(X, y, cat_features=categorical_cols))

# Ensure test_df categorical columns are in string format
test_df[categorical_cols] = test_df[categorical_cols].astype(str)

# Predict on the test set, with the columns in the same order as the training features
test_pool = Pool(test_df[X.columns], cat_features=categorical_cols)
test_pred = best_model.predict_proba(test_pool)[:, 1]
joblib.dump(test_pred, 'test_pred_cat.pkl')

logging.info("Final CatBoost model and predictions saved.")

# Create a submission dataframe
submission = pd.DataFrame({
    'id': test_df.index,
    'Response': test_pred
})

# Save the submission file
submission_filename = f'submission_{current_time}.csv'
submission.to_csv(submission_filename, index=False)

logging.info(f"Submission file {submission_filename} created successfully.")
print(f"Submission file {submission_filename} created successfully.")

# Plot Optuna visualizations. Each call only builds a figure, so it is shown
# explicitly; otherwise all but the last expression of the cell are discarded.
# plot_intermediate_values is left out: the study reports no intermediate values
# (no pruning is configured), so Optuna only emits a warning for it.
for make_plot in (
    optuna.visualization.plot_optimization_history,
    optuna.visualization.plot_parallel_coordinate,
    optuna.visualization.plot_contour,
    optuna.visualization.plot_param_importances,
    optuna.visualization.plot_slice,
    optuna.visualization.plot_edf,
):
    make_plot(study).show()


In [None]:
# Per the warning captured below this cell, Optuna needs the pruning feature
# set up before this plot can be used for the study.
optuna.visualization.plot_intermediate_values(study)


[W 2024-07-25 23:33:49,632] You need to set up the pruning feature to utilize `plot_intermediate_values()`


In [None]:
# Parallel-coordinate view of the study, in its own cell.
optuna.visualization.plot_parallel_coordinate(study)


In [None]:
# Contour view of the study, in its own cell.
optuna.visualization.plot_contour(study)


In [None]:
# Hyper-parameter importance view of the study, in its own cell.
optuna.visualization.plot_param_importances(study)


In [None]:
# Slice view of the study, in its own cell.
optuna.visualization.plot_slice(study)


In [None]:
# EDF view of the study, in its own cell.
optuna.visualization.plot_edf(study)
