In [None]:
import gc
import logging
import os
import warnings
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.preprocessing import StandardScaler

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/CatBoost;
# narrow the filter if those become relevant.
warnings.filterwarnings("ignore")

# Timestamp captured once per run; it is embedded in the log filename below
# and reused later for the submission filename.
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
log_filename = f'kaggle_submission_{current_time}.log'

# Send log records both to the timestamped file and to the console.
# force=True replaces any handlers already attached to the root logger.
# Without it basicConfig() is a silent no-op when the cell is re-run (or when
# the kernel pre-configured logging), and records keep going to the old file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler(),  # mirror log records to the console
    ],
    force=True,
)

def get_column_stats(df):
    """Summarise every column of ``df`` as a small dict of statistics.

    Numeric columns get ``min``, ``max``, ``mean`` and ``unique`` (number of
    distinct values); all other columns get ``unique`` only.

    Returns:
        dict mapping column name -> dict of statistics.
    """
    is_numeric = pd.api.types.is_numeric_dtype
    summary = {}
    for name in df.columns:
        series = df[name]
        entry = {}
        if is_numeric(series):
            # Key order matters for the logged repr: min, max, mean, unique.
            entry['min'] = series.min()
            entry['max'] = series.max()
            entry['mean'] = series.mean()
        entry['unique'] = series.nunique()
        summary[name] = entry
    return summary

def compare_stats(stats_before, stats_after):
    """Log a warning for each column whose statistics differ between the two snapshots.

    Both arguments are dicts keyed by column name; every key of
    ``stats_before`` must also be present in ``stats_after``.
    """
    for col, previous in stats_before.items():
        current = stats_after[col]
        if previous != current:
            logging.warning(f"Column {col} has changed: {previous} != {current}")

def calculate_precision_loss(stats_before, stats_after):
    """Log the relative change of each numeric column's mean, in percent.

    Only columns whose ``stats_before`` entry has a ``'mean'`` key are
    considered. The loss is ``|before - after| / |before| * 100``.

    A mean of exactly zero has no defined relative change. In that case the
    loss is reported as 0% when the mean is unchanged, otherwise the absolute
    shift is logged instead of dividing by zero.

    Args:
        stats_before: dict of column name -> stats dict, before conversion.
        stats_after: dict of column name -> stats dict, after conversion.
    """
    for col, before in stats_before.items():
        if 'mean' not in before:
            continue

        mean_before = before['mean']
        mean_after = stats_after[col]['mean']
        abs_diff = abs(mean_before - mean_after)

        if mean_before == 0:
            if abs_diff != 0:
                logging.info(
                    f"Column {col} mean moved from 0 by {abs_diff:.6g} "
                    f"(relative precision loss undefined)"
                )
                continue
            precision_loss = 0.0
        else:
            precision_loss = abs_diff / abs(mean_before) * 100

        logging.info(f"Column {col} precision loss: {precision_loss:.6f}%")

def _smallest_int_dtype(c_min, c_max, candidates):
    """Return the first dtype in ``candidates`` whose range holds [c_min, c_max], else None."""
    for candidate in candidates:
        limits = np.iinfo(candidate)
        # Inclusive bounds: a column whose max is exactly 127 still fits int8.
        if limits.min <= c_min and c_max <= limits.max:
            return candidate
    return None


def reduce_mem_usage(df, verbose=True):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified in place and also returned.

    Conversion rules:
      * object columns become ``category``;
      * signed / unsigned integer columns become the narrowest signed /
        unsigned integer type that holds the observed min and max;
      * float columns become ``float32`` when the observed range fits,
        otherwise ``float64`` (the float32 cast can lose precision; the
        change in column means is logged via ``calculate_precision_loss``);
      * every other dtype (bool, datetime, timedelta, category and other
        extension dtypes) is left untouched.

    Args:
        df: DataFrame to shrink.
        verbose: when true, log memory usage before and after.

    Returns:
        The same DataFrame object, with converted columns.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    start_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        logging.info(f'Start memory usage of dataframe: {start_mem:.2f} MB')

    stats_before = get_column_stats(df)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes are not numpy dtypes and have no usable `.kind`;
        # min()/max() can also raise on them (e.g. unordered categoricals).
        if not isinstance(col_type, np.dtype):
            continue

        kind = col_type.kind  # 'i' signed int, 'u' unsigned int, 'f' float
        if kind not in ('i', 'u', 'f'):
            # bool / datetime / timedelta / complex: nothing sensible to downcast.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind == 'f':
            if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        else:
            candidates = signed_types if kind == 'i' else unsigned_types
            target = _smallest_int_dtype(c_min, c_max, candidates)
            # target is None for an empty column (min/max are NaN): keep dtype.
            if target is not None:
                df[col] = df[col].astype(target)

    stats_after = get_column_stats(df)
    compare_stats(stats_before, stats_after)
    calculate_precision_loss(stats_before, stats_after)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        logging.info(f'End memory usage of dataframe: {end_mem:.2f} MB')
        if start_mem:
            logging.info(f'Decreased by {(100 * (start_mem - end_mem) / start_mem):.1f}%')

    return df

def safe_map(df, column, mapping):
    """Replace the values of ``df[column]`` via ``mapping`` and warn about unmapped values.

    Values that are not keys of ``mapping`` (including missing values) are
    reported once in a warning; ``Series.map`` then turns them into NaN.
    The frame is modified in place and also returned.

    Args:
        df: DataFrame containing ``column``.
        column: name of the column to re-encode.
        mapping: dict of original value -> replacement value.

    Returns:
        The same DataFrame object, with ``column`` replaced.
    """
    # De-duplicate inside pandas first. Building a set from the raw column
    # iterates every row in Python, and on float columns each NaN becomes a
    # distinct set element, so the warning would list NaN once per row.
    observed = set(df[column].drop_duplicates())
    unknown_categories = observed - set(mapping.keys())
    if unknown_categories:
        logging.warning(f"Unknown categories in column {column}: {unknown_categories}")
    df[column] = df[column].map(mapping)
    return df

def import_data(file, **kwargs):
    """Read a CSV file into a DataFrame and shrink its dtypes.

    Args:
        file: path or buffer accepted by ``pd.read_csv``.
        **kwargs: forwarded to ``pd.read_csv``. ``parse_dates`` defaults to
            ``True`` but can be overridden by the caller.

    Returns:
        The loaded DataFrame after ``reduce_mem_usage``.
    """
    # setdefault lets callers pass their own parse_dates without a
    # "multiple values for keyword argument" TypeError.
    kwargs.setdefault('parse_dates', True)
    # keep_date_col is no longer passed: it only applies when parse_dates
    # combines several columns, and it is deprecated in pandas 2.x.
    df = pd.read_csv(file, **kwargs)
    return reduce_mem_usage(df)

def preprocess_data(df):
    """Encode the categorical text columns as integers and drop ``Driving_License``.

    ``Gender``, ``Vehicle_Damage`` and ``Vehicle_Age`` are re-encoded through
    ``safe_map`` (in that order). The frame is modified in place and returned.
    """
    encodings = (
        ('Gender', {'Male': 1, 'Female': 0}),
        ('Vehicle_Damage', {'Yes': 1, 'No': 0}),
        ('Vehicle_Age', {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}),
    )
    for column_name, lookup in encodings:
        df = safe_map(df, column_name, lookup)

    df.drop(columns=['Driving_License'], inplace=True)
    return df

def feature_engineering(df):
    """Add four interaction features pairing ``Previously_Insured`` with another column.

    For each partner column the two values are concatenated as strings and
    integer-encoded with ``pd.factorize``, so codes are assigned in order of
    first appearance within ``df``. The frame is modified in place and returned.
    """
    insured_text = df['Previously_Insured'].astype(str)
    for partner in ('Annual_Premium', 'Vehicle_Age', 'Vehicle_Damage', 'Vintage'):
        pair_text = insured_text + df[partner].astype(str)
        codes, _ = pd.factorize(pair_text)
        df[f'Previously_Insured_{partner}'] = codes
    return df

def scale_dataset(df):
    """Return a copy of ``df`` with every column transformed by a freshly fitted StandardScaler.

    The scaler is fitted on ``df`` itself (all columns, all rows); the result
    keeps the original index and column labels.
    """
    standardized = StandardScaler().fit_transform(df)
    return pd.DataFrame(standardized, index=df.index, columns=df.columns)


In [None]:

# Paths to datasets.
# The directory can be overridden with the KAGGLE_DATA_DIR environment
# variable; the default is the original machine-specific location.
DATA_DIR = Path(os.environ.get(
    "KAGGLE_DATA_DIR",
    r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets",
))
train_path = DATA_DIR / "train.csv"
test_path = DATA_DIR / "test.csv"

# Load and optimize data
train_df = import_data(train_path, index_col='id')
test_df = import_data(test_path, index_col='id')

gc.collect()
logging.info("Data loaded successfully.")

# Apply preprocessing
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)
logging.info("Data preprocessed successfully.")

# Apply feature engineering on train and test TOGETHER.
# feature_engineering() integer-encodes value combinations with pd.factorize,
# which numbers them in order of first appearance. Running it separately on
# each frame would give the same combination different codes in train and
# test, so the model would see unrelated categories at prediction time.
n_train = len(train_df)
target = train_df['Response'].to_numpy()  # positional copy; dtype preserved
combined_features = pd.concat(
    [train_df.drop(columns='Response'), test_df], axis=0
)
combined_features = feature_engineering(combined_features)

# Split back by position (rows were stacked train first, then test).
train_df = combined_features.iloc[:n_train].copy()
train_df['Response'] = target
test_df = combined_features.iloc[n_train:].copy()
del combined_features

gc.collect()
logging.info("Feature engineering completed successfully.")


In [None]:

# Separate features and target variable
X = train_df.drop('Response', axis=1)
y = train_df['Response']

# No standardization here, on purpose. Every feature is handed to CatBoost
# as a string-valued categorical (see the Pools below), so train and test
# must use the exact same value representation. Previously only the test
# frame was replaced by standardized values (with a scaler fitted on test
# alone, and the train scaler fitted on the target too), so no test category
# ever matched a training category.
feature_columns = X.columns
X_test = test_df[feature_columns]  # same columns, same order as training

gc.collect()
logging.info("Features and target variable separated.")

# Define CatBoost parameters
cat_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'class_names': [0, 1],
    'learning_rate': 0.075,
    'iterations': 5500,
    'depth': 9,
    'random_strength': 0,
    'l2_leaf_reg': 0.5,
    'max_leaves': 512,
    'fold_permutation_block': 64,
    'random_seed': 42,
    'allow_writing_files': False,
    # NOTE(review): fit() below also passes verbose=500; confirm which
    # logging period is intended.
    'verbose': 100
}

# All features are treated as categorical, so both frames are converted to
# strings before building the Pools.
cat_feature_names = feature_columns.values
test_pool = Pool(X_test.astype(str), cat_features=cat_feature_names)

# Train CatBoost model on the entire training set
train_pool = Pool(X.astype(str), y, cat_features=cat_feature_names)
model = CatBoostClassifier(**cat_params)
model.fit(train_pool, verbose=500)

# Predict the probability of the positive class on the test set
test_preds = model.predict_proba(test_pool)[:, 1]

# Create a submission dataframe
submission = pd.DataFrame({
    'id': test_df.index,
    'Response': test_preds
})

# Save the submission file
submission_filename = f'submission_{current_time}.csv'
submission.to_csv(submission_filename, index=False)

logging.info(f"Submission file {submission_filename} created successfully.")
print(f"Submission file {submission_filename} created successfully.")
