In [1]:
import pandas as pd
import numpy as np
import joblib
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
import xgboost as xgb
from datetime import datetime
import gc
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging
import gc
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool
from scipy.optimize import minimize
from datetime import datetime
import warnings
import joblib
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")


In [2]:
# Build a timestamped log filename: fixed 'kaggle_submission_' prefix + current datetime
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
log_filename = f'kaggle_submission_{current_time}.log'

# Route INFO-and-above records to both the log file and the console
log_handlers = [
    logging.FileHandler(log_filename),
    logging.StreamHandler(),
]
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=log_handlers,
)

In [3]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    Rules applied per column:

    * ``object`` columns are converted to ``category``.
    * Signed-integer columns are cast to the narrowest of int8/int16/int32/int64
      whose range contains the column's min and max (bounds are inclusive).
    * Float columns wider than 32 bits are cast to float32 when their min and
      max lie inside the float32 range. This trades precision for memory.
    * Every other dtype (bool, unsigned int, datetime, existing category,
      pandas extension dtypes) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, log the memory usage before and after via ``logging.info``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        logging.info(f'Start memory usage of dataframe: {start_mem:.2f} MB')

    signed_int_types = (np.int8, np.int16, np.int32, np.int64)
    float32_limits = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (category, nullable ints, ...) are not numpy dtypes;
        # taking min/max or comparing them to numeric limits can raise.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'i':
            c_min = df[col].min()
            c_max = df[col].max()
            # Inclusive bounds: a column spanning exactly [-128, 127] fits int8.
            # An empty column yields NaN min/max, fails every comparison and is
            # therefore left as is.
            for candidate in signed_int_types:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f' and col_type.itemsize > 4:
            c_min = df[col].min()
            c_max = df[col].max()
            # Only ever narrow; columns that are already float32/float16 are skipped
            # by the itemsize check so they are never widened.
            if c_min > float32_limits.min and c_max < float32_limits.max:
                df[col] = df[col].astype(np.float32)
        # bool / unsigned / datetime / timedelta / complex: deliberately untouched.

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        logging.info(f'End memory usage of dataframe: {end_mem:.2f} MB')
        # Guard the percentage against a zero-byte starting size.
        decrease_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        logging.info(f'Decreased by {decrease_pct:.1f}%')

    return df

def safe_map(df, column, mapping):
    """Replace the values of ``df[column]`` through ``mapping`` and return ``df``.

    Before mapping, any value found in the column that is not a key of
    ``mapping`` is reported with a warning. The column is overwritten in place
    on the frame that was passed in.
    """
    unmapped = set(df[column]).difference(mapping.keys())
    if len(unmapped) > 0:
        logging.warning(f"Unknown categories in column {column}: {unmapped}")

    mapped_values = df[column].map(mapping)
    df[column] = mapped_values
    return df

def import_data(file, **kwargs):
    """Read a CSV into a dataframe and shrink its memory footprint.

    Parameters
    ----------
    file : path or file-like
        Passed straight to ``pandas.read_csv``.
    **kwargs
        Extra keyword arguments forwarded to ``pandas.read_csv``.
        ``parse_dates`` defaults to ``True`` but may be overridden by the caller.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``reduce_mem_usage``.
    """
    # Use setdefault so a caller-supplied parse_dates no longer collides with a
    # hard-coded keyword ("got multiple values for keyword argument").
    kwargs.setdefault('parse_dates', True)
    # keep_date_col is intentionally not forced any more: it only matters when
    # parse_dates combines several columns, and it is deprecated in recent
    # pandas releases. Callers who need it can still pass it through kwargs.
    df = pd.read_csv(file, **kwargs)
    return reduce_mem_usage(df)

def preprocess_data(df):
    """Encode the categorical text columns as integers and drop 'Driving_License'.

    'Gender', 'Vehicle_Damage' and 'Vehicle_Age' are each run through
    ``safe_map`` with a fixed lookup table. 'Driving_License' is then removed
    in place when present; otherwise a warning is logged. Returns the frame.
    """
    encodings = {
        'Gender': {'Male': 1, 'Female': 0},
        'Vehicle_Damage': {'Yes': 1, 'No': 0},
        'Vehicle_Age': {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2},
    }
    for column_name, lookup in encodings.items():
        df = safe_map(df, column_name, lookup)

    # Guard clause: nothing to drop, just report it
    if 'Driving_License' not in df.columns:
        logging.warning("'Driving_License' column not found in the dataset.")
        return df

    df.drop(['Driving_License'], axis=1, inplace=True)
    return df

def feature_engineering(df):
    """Add four interaction features built from 'Previously_Insured'.

    For each of 'Annual_Premium', 'Vehicle_Age', 'Vehicle_Damage' and 'Vintage',
    the string form of 'Previously_Insured' is concatenated with the string form
    of that column, and the resulting strings are integer-coded with
    ``pd.factorize``. The codes are stored in 'Previously_Insured_<column>'.
    The frame is modified in place and returned.
    """
    insured_text = df['Previously_Insured'].astype(str)
    partner_columns = ('Annual_Premium', 'Vehicle_Age', 'Vehicle_Damage', 'Vintage')

    for partner in partner_columns:
        combined = insured_text + df[partner].astype(str)
        codes, _ = pd.factorize(combined)
        df[f'Previously_Insured_{partner}'] = codes

    return df


In [4]:

# Location of the competition test set.
# NOTE(review): machine-specific absolute path; the notebook will not run elsewhere as is.
test_path = r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets\test.csv"

# Load the CSV with 'id' as the index and shrink its dtypes
test_df = import_data(test_path, index_col='id')
gc.collect()

# Encode the categorical columns, then add the interaction features
test_df = feature_engineering(preprocess_data(test_df))
gc.collect()

2024-07-25 16:13:53,320 - INFO - Start memory usage of dataframe: 643.68 MB
2024-07-25 16:13:54,393 - INFO - End memory usage of dataframe: 204.81 MB
2024-07-25 16:13:54,394 - INFO - Decreased by 68.2%


0

In [5]:
# Standardize the numeric columns with a StandardScaler.
# NOTE(review): the scaler is fitted on the test set itself. If the saved models
# were trained on features scaled with training-set statistics, these inputs
# will not match; confirm against the training notebook.
num_cols = ['Age', 'Region_Code', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']
scaler = StandardScaler()
numeric_block = test_df[num_cols]
test_df[num_cols] = scaler.fit_transform(numeric_block)

gc.collect()

0

In [6]:
# One accumulator per model family; each will collect one prediction array per fold
cat_preds, lgb_preds, xgb_preds = [], [], []


In [7]:

# Wrap the test set for CatBoost: every column is cast to string and declared categorical
test_pool = Pool(test_df.astype(str), cat_features=test_df.columns.values)

# Score the pool with each of the five saved fold models, keeping column 1 of
# the predicted probabilities
for fold_no in range(1, 6):
    model = joblib.load(f'catboost_model_fold_{fold_no}.pkl')
    cat_preds.append(model.predict_proba(test_pool)[:, 1])
    # Drop the fold model before loading the next one
    del model
    gc.collect()

# Element-wise mean over the per-fold prediction arrays
test_pred_cat = np.mean(cat_preds, axis=0)


In [8]:

# Score the test frame with each of the five saved LightGBM fold models,
# predicting at each model's recorded best iteration
for fold_no in range(1, 6):
    model = joblib.load(f'lgb_model_fold_{fold_no}.pkl')
    lgb_preds.append(model.predict(test_df, num_iteration=model.best_iteration))
    # Drop the fold model before loading the next one
    del model
    gc.collect()

# Element-wise mean over the per-fold prediction arrays
test_pred_lgb = np.mean(lgb_preds, axis=0)


In [9]:

# Score the test set with each of the five saved XGBoost fold boosters.
# The DMatrix is built once and reused for every fold.
dtest = xgb.DMatrix(test_df, enable_categorical=True)
for i in range(5):
    model = xgb.Booster()
    model.load_model(f'xgb_model_fold_{i+1}.json')
    # iteration_range is half-open [start, end), so the end must be
    # best_iteration + 1 for the best boosting round itself to be included.
    test_pred = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))
    xgb_preds.append(test_pred)
    # Drop the fold booster before loading the next one
    del model
    gc.collect()

# Average the predictions from each fold for XGBoost
test_pred_xgb = np.mean(xgb_preds, axis=0)

In [10]:
# Timestamp shared by every submission filename written below
current_time = datetime.now().strftime('%Y%m%d_%H%M%S')


def _save_submission(tag, preds):
    """Build an id/Response frame for ``preds``, write it to CSV and return it."""
    frame = pd.DataFrame({'id': test_df.index, 'Response': preds})
    frame.to_csv(f'submission_{tag}_{current_time}.csv', index=False)
    return frame


# One submission file per model family: CatBoost, LightGBM, XGBoost
submission_cat = _save_submission('cat', test_pred_cat)
submission_lgb = _save_submission('lgb', test_pred_lgb)
submission_xgb = _save_submission('xgb', test_pred_xgb)

print("Submission files created successfully!")

Submission files created successfully!


In [11]:
# Weighted average of the LightGBM and XGBoost fold-averaged predictions.
# blend_weight is the LightGBM share; XGBoost receives the remainder.
blend_weight = 0.5  # You can adjust this weight based on validation performance
lgb_part = blend_weight * test_pred_lgb
xgb_part = (1 - blend_weight) * test_pred_xgb
test_pred_blend = lgb_part + xgb_part

# Write the blended predictions; the filename reuses the current_time set earlier
blend_columns = {'id': test_df.index, 'Response': test_pred_blend}
submission_blend = pd.DataFrame(blend_columns)
submission_blend.to_csv(f'submission_blend_lgb_xgb_{current_time}.csv', index=False)

print("Blended submission file (LightGBM + XGBoost) created successfully!")


Blended submission file (LightGBM + XGBoost) created successfully!
