In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging
import gc
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool
from scipy.optimize import minimize
from datetime import datetime
import warnings
import joblib

# Suppress every Python warning for the rest of the session (applies to all
# libraries imported above, so deprecation notices will not be shown either).
warnings.filterwarnings("ignore")

In [2]:
# Timestamp taken once per run; it names the log file here and is reused for
# the submission file name at the end of the notebook.
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
log_filename = f'kaggle_submission_{current_time}.log'

# Send log records both to the timestamped file and to the console.
# force=True removes handlers already attached to the root logger first.
# Without it basicConfig() does nothing once the root logger has a handler, so
# re-running this cell would leave records going to the previous log file
# while `log_filename` already names a new one.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler()  # This ensures logs are also output to the console
    ],
    force=True,
)

In [3]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    Rules applied per column:

    * ``object`` columns become ``category``.
    * NumPy signed / unsigned integer columns are cast to the narrowest
      integer type of the same signedness whose range contains the column's
      min and max (bounds inclusive, so -128..127 fits ``int8``).
    * NumPy float columns wider than 32 bits are cast to ``float32`` when
      their min and max lie inside the float32 range. This trades precision
      for memory; floats that are already 32 bits or narrower are not widened.
    * Any other dtype (bool, datetime, category, pandas extension dtypes, ...)
      is left unchanged, which also makes the function safe to call on its
      own output.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, log the memory footprint before and after.

    Returns:
        The same DataFrame object that was passed in.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    start_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        logging.info(f'Start memory usage of dataframe: {start_mem:.2f} MB')

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (category, nullable ints, tz-aware datetimes, ...)
        # are not NumPy dtypes; min/max range checks do not apply to them.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in ('i', 'u'):
            c_min = df[col].min()
            c_max = df[col].max()
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            # Candidates are ordered narrowest first, so the first fit wins.
            # For an empty column min/max are NaN, every comparison is False
            # and the column keeps its dtype.
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f' and col_type.itemsize > 4:
            c_min = df[col].min()
            c_max = df[col].max()
            limits = np.finfo(np.float32)
            # NaNs are skipped by min()/max(); an all-NaN or infinite column
            # fails the comparison and stays as it is.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        logging.info(f'End memory usage of dataframe: {end_mem:.2f} MB')
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        logging.info(f'Decreased by {saved_pct:.1f}%')

    return df

def safe_map(df, column, mapping):
    """Replace the values of ``df[column]`` through ``mapping`` and return ``df``.

    Before mapping, the distinct non-missing values that have no key in
    ``mapping`` are reported in a single warning. Missing values (NaN/None)
    are not reported as unknown categories.

    Args:
        df: DataFrame holding the column. It is modified in place.
        column: Name of the column to translate.
        mapping: Dict from original value to replacement value.

    Returns:
        The same DataFrame, with ``df[column]`` replaced by the mapped values.
    """
    # unique() is vectorised and collapses missing values. set(df[column])
    # would walk every row in Python and, for float columns, keep a separate
    # entry per NaN, flooding the warning below.
    observed = df[column].dropna().unique()
    unknown_categories = set(observed) - set(mapping.keys())
    if unknown_categories:
        logging.warning(f"Unknown categories in column {column}: {unknown_categories}")
    df[column] = df[column].map(mapping)
    return df

def import_data(file, **kwargs):
    """Read a CSV file into a DataFrame and shrink its memory footprint.

    Args:
        file: Path or buffer handed to ``pd.read_csv``.
        **kwargs: Extra ``pd.read_csv`` options (for example ``index_col``).
            ``parse_dates`` defaults to True and may be overridden here.

    Returns:
        The loaded DataFrame after it has been passed through
        ``reduce_mem_usage``.
    """
    # setdefault rather than a hard-coded keyword: a caller-supplied
    # parse_dates no longer raises "got multiple values for keyword argument".
    kwargs.setdefault('parse_dates', True)
    # keep_date_col is no longer passed. It only has an effect when
    # parse_dates combines several columns (never the case for
    # parse_dates=True) and the keyword is deprecated as of pandas 2.2.
    df = pd.read_csv(file, **kwargs)
    return reduce_mem_usage(df)

def preprocess_data(df):
    """Translate the text-valued columns to integer codes and drop ``Driving_License``.

    ``Gender``, ``Vehicle_Damage`` and ``Vehicle_Age`` are each passed through
    ``safe_map`` with a fixed lookup table. The frame is modified in place and
    also returned.
    """
    # column -> {original label: integer code}; processed in this order
    encodings = {
        'Gender': {'Male': 1, 'Female': 0},
        'Vehicle_Damage': {'Yes': 1, 'No': 0},
        'Vehicle_Age': {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2},
    }
    for column_name, value_codes in encodings.items():
        df = safe_map(df, column_name, value_codes)

    df.drop(columns=['Driving_License'], inplace=True)
    return df

def feature_engineering(df):
    """Add four interaction features that pair ``Previously_Insured`` with another column.

    For each of ``Annual_Premium``, ``Vehicle_Age``, ``Vehicle_Damage`` and
    ``Vintage`` the two values are joined as text and the distinct
    combinations are numbered with ``pd.factorize``. The result is stored in
    ``Previously_Insured_<column>``.

    The codes follow the order in which combinations first appear in ``df``,
    so they are only comparable between rows of the same call. Frames that
    must share an encoding (e.g. train and test) have to be passed in together.

    Args:
        df: DataFrame containing the five source columns. It is modified in place.

    Returns:
        The same DataFrame with the four new integer-coded columns appended.
    """
    partner_columns = ('Annual_Premium', 'Vehicle_Age', 'Vehicle_Damage', 'Vintage')

    # Converted once and reused for all four features.
    insured_text = df['Previously_Insured'].astype(str)

    for partner in partner_columns:
        # The separator keeps the pair unambiguous: without it '1' + '11' and
        # '11' + '1' would both collapse to '111'.
        pair_text = insured_text + '_' + df[partner].astype(str)
        df[f'Previously_Insured_{partner}'] = pd.factorize(pair_text)[0]
    return df



In [4]:
# Paths to datasets
# NOTE(review): machine-specific absolute paths; adjust when running elsewhere.
train_path = r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets\train.csv"
test_path = r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets\test.csv"

# Load and optimize data
train_df = import_data(train_path, index_col='id')
test_df = import_data(test_path, index_col='id')

gc.collect()
print(f"DataFrame after import: {type(train_df)}")
logging.info("Data loaded successfully.")

# Apply preprocessing
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)
print(f"DataFrame after preprocessing: {type(train_df)}")
logging.info("Data preprocessed successfully.")

# Apply feature engineering to train and test TOGETHER.
# feature_engineering() numbers value combinations with pd.factorize, i.e. in
# order of first appearance within the frame it receives. Calling it once per
# frame would give the same combination different codes in train and test.
# The model would then see unrelated categories at prediction time.
n_train = len(train_df)
train_target = train_df.pop('Response')  # test has no target column
combined_df = feature_engineering(pd.concat([train_df, test_df], axis=0))

# Split back by position (train rows were stacked first) and restore the target.
train_df = combined_df.iloc[:n_train].copy()
test_df = combined_df.iloc[n_train:].copy()
train_df['Response'] = train_target.values

del combined_df, train_target
gc.collect()
print(f"DataFrame after feature engineering: {type(train_df)}")
logging.info("Feature engineering completed successfully.")

2024-07-24 19:15:28,630 - INFO - Start memory usage of dataframe: 1053.30 MB
2024-07-24 19:15:30,211 - INFO - End memory usage of dataframe: 318.18 MB
2024-07-24 19:15:30,211 - INFO - Decreased by 69.8%
2024-07-24 19:15:39,654 - INFO - Start memory usage of dataframe: 643.68 MB
2024-07-24 19:15:40,731 - INFO - End memory usage of dataframe: 204.81 MB
2024-07-24 19:15:40,732 - INFO - Decreased by 68.2%
2024-07-24 19:15:40,792 - INFO - Data loaded successfully.


DataFrame after import: <class 'pandas.core.frame.DataFrame'>


2024-07-24 19:15:42,612 - INFO - Data preprocessed successfully.


DataFrame after preprocessing: <class 'pandas.core.frame.DataFrame'>


2024-07-24 19:16:15,100 - INFO - Feature engineering completed successfully.


DataFrame after feature engineering: <class 'pandas.core.frame.DataFrame'>


In [5]:
# Standardise the numeric columns. The scaling statistics are learned from the
# training frame only and then applied unchanged to the test frame.
num_cols = ['Age', 'Region_Code', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']
scaler = StandardScaler()
scaler.fit(train_df[num_cols])
train_df[num_cols] = scaler.transform(train_df[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])

# Split the training frame into the target column and the feature matrix
y = train_df['Response']
X = train_df.drop(columns='Response')

In [6]:
# Define CatBoost parameters
cat_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'class_names': [0, 1],
    'learning_rate': 0.075,
    'iterations': 3000,
    'depth': 9,
    'random_strength': 0,
    'l2_leaf_reg': 0.5,
    'max_leaves': 512,
    'fold_permutation_block': 64,
    # 'task_type': 'GPU',  # Ensure your environment supports GPU
    'random_seed': 42,
    'allow_writing_files': False,
    'verbose': 100,  # Display log every 100 iterations
    # 'thread_count': -1
}

# Every feature is converted to text and declared as a categorical feature.
feature_names = list(X.columns)
train_pool = Pool(X.astype(str), y, cat_features=feature_names)
# Select the test columns by the training column names so both pools share one
# column order. A missing column fails here, before the long fit, not after it.
test_pool = Pool(test_df[feature_names].astype(str), cat_features=feature_names)

model = CatBoostClassifier(**cat_params)
model.fit(train_pool, verbose=100)

# Persist the fitted model before predicting, so a failure in the prediction
# step cannot discard the (hours-long) training run.
joblib.dump(model, 'catboost_model.pkl')

# Probability of the positive class (column 1) for every test row
test_pred = model.predict_proba(test_pool)[:, 1]

0:	total: 12.4s	remaining: 10h 20m 18s
100:	total: 24m 17s	remaining: 11h 37m 25s


KeyboardInterrupt: 

In [None]:
# Assemble the submission table: one row per test id with its predicted score
submission_filename = f'submission_{current_time}.csv'
submission = pd.DataFrame({'id': test_df.index, 'Response': test_pred})

# Write it next to the notebook; the id is a regular column, so no index column
submission.to_csv(submission_filename, index=False)

# Report success through the logger and on standard output
success_message = f"Submission file {submission_filename} created successfully."
logging.info(success_message)
print(success_message)