In [4]:
# Cell 1: Library Imports
import io
import logging
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.manifold import TSNE
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


In [5]:

# Cell 2: Setup Logging
class Logger:
    """Small wrapper around a named ``logging.Logger`` with console and file output.

    Only ``info`` and ``error`` are exposed; both forward to the wrapped
    logger stored in ``self.logger``.
    """

    def __init__(self):
        self.logger = self.setup_logging()

    def setup_logging(self):
        """Configure and return the logger named after this module.

        A console handler and a file handler writing to
        ``exploration_<YYYYmmdd_HHMMSS>.log`` are attached, both using the
        same formatter.

        Returns:
            logging.Logger: the configured logger, set to INFO level.
        """
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        log_file_name = f'exploration_{timestamp}.log'

        # Cleanup of an un-timestamped log file, kept from the original code.
        # The handlers below never write to this name.
        if os.path.exists('exploration.log'):
            os.remove('exploration.log')

        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)

        # logging.getLogger returns the same object on every call, so
        # re-running this cell would stack a second pair of handlers and emit
        # every message twice. Drop and close whatever is already attached.
        for stale_handler in list(logger.handlers):
            logger.removeHandler(stale_handler)
            stale_handler.close()

        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        for handler in (logging.StreamHandler(), logging.FileHandler(log_file_name)):
            handler.setFormatter(formatter)
            logger.addHandler(handler)
        return logger

    def info(self, message):
        """Log ``message`` at INFO level."""
        self.logger.info(message)

    def error(self, message):
        """Log ``message`` at ERROR level."""
        self.logger.error(message)

# Notebook-wide logger used by the cells below; constructing it attaches
# the console and file handlers.
logger = Logger()
logger.info("Setup and Imports complete.")


2024-07-14 20:26:34,965 - __main__ - INFO - Setup and Imports complete.
2024-07-14 20:26:34,965 - __main__ - INFO - Setup and Imports complete.


In [6]:

# Cell 3: Data Loading
def load_data(train_path, test_path):
    """Read the train and test CSV files into DataFrames.

    Args:
        train_path: path of the training CSV.
        test_path: path of the test CSV.

    Returns:
        tuple: ``(train_df, test_df)`` on success, or ``(None, None)`` if
        anything raised while loading (the error is logged, not re-raised).
    """
    loaded = (None, None)
    try:
        frames = [pd.read_csv(csv_path) for csv_path in (train_path, test_path)]
        logger.info("Datasets loaded successfully.")
        loaded = (frames[0], frames[1])
    except Exception as exc:
        logger.error(f"Error loading datasets: {exc}")
    return loaded

# Directory holding the feature-engineered CSVs. Set the INSURANCE_DATA_DIR
# environment variable to run on another machine; the default is the original
# hard-coded location, so behaviour is unchanged when the variable is unset.
DATA_DIR = os.environ.get(
    "INSURANCE_DATA_DIR",
    r"C:\Users\paulo\OneDrive\Documents\Binary-Classification-of-Insurance-Cross-Selling\model testing\xgboost\featured engineered",
)
TRAIN_PATH = os.path.join(DATA_DIR, "klib_full_trainset.csv")
TEST_PATH = os.path.join(DATA_DIR, "klib_full_testset.csv")

train_df, test_df = load_data(TRAIN_PATH, TEST_PATH)

# load_data reports failure by returning (None, None); stop here instead of
# letting later cells fail on a missing frame.
if train_df is None or test_df is None:
    raise ValueError("Failed to load datasets. Check the file paths and try again.")

logger.info(f"Train dataset shape: {train_df.shape}")
logger.info(f"Test dataset shape: {test_df.shape}")


2024-07-14 20:26:52,780 - __main__ - INFO - Datasets loaded successfully.
2024-07-14 20:26:52,780 - __main__ - INFO - Datasets loaded successfully.
2024-07-14 20:26:52,783 - __main__ - INFO - Train dataset shape: (11465233, 10)
2024-07-14 20:26:52,783 - __main__ - INFO - Train dataset shape: (11465233, 10)
2024-07-14 20:26:52,785 - __main__ - INFO - Test dataset shape: (7669866, 10)
2024-07-14 20:26:52,785 - __main__ - INFO - Test dataset shape: (7669866, 10)


In [7]:

# Cell 4: Basic Data Analysis
def basic_analysis(df):
    """Log a quick structural summary of ``df``.

    Four INFO messages are emitted: the first rows, the ``DataFrame.info``
    report, the ``describe`` statistics and the per-column null counts.

    Args:
        df: the pandas DataFrame to summarise.
    """
    # DataFrame.info() writes to stdout and returns None, so interpolating its
    # return value would log "None". Capture the report in a buffer instead.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)

    logger.info(f"Dataset Head:\n{df.head()}")
    logger.info(f"Dataset Info:\n{info_buffer.getvalue()}")
    logger.info(f"Dataset Description:\n{df.describe()}")
    logger.info(f"Missing Values:\n{df.isnull().sum()}")

# Log the structural summary (head, info, describe, null counts) of the training frame.
basic_analysis(train_df)


2024-07-14 20:26:52,805 - __main__ - INFO - Dataset Head:
   gender       age  region_code  previously_insured  vehicle_age  \
0       1 -1.158738         35.0                   0            0   
1       1  0.312817         28.0                   0            2   
2       0 -0.891183         14.0                   1            1   
3       0 -0.222294          1.0                   0            0   
4       0 -0.155405         15.0                   1            0   

   vehicle_damage  annual_premium  policy_sales_channel   vintage  response  
0               1        2.182685                 124.0  0.288840         0  
1               1        1.793991                  26.0  1.551661         1  
2               0        0.483608                 152.0  1.126553         0  
3               1       -1.740113                 156.0 -1.099013         0  
4               0        0.101067                 152.0  1.626680         0  
2024-07-14 20:26:52,805 - __main__ - INFO - Dataset Head:
 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11465233 entries, 0 to 11465232
Data columns (total 10 columns):
 #   Column                Dtype  
---  ------                -----  
 0   gender                int64  
 1   age                   float64
 2   region_code           float64
 3   previously_insured    int64  
 4   vehicle_age           int64  
 5   vehicle_damage        int64  
 6   annual_premium        float64
 7   policy_sales_channel  float64
 8   vintage               float64
 9   response              int64  
dtypes: float64(5), int64(5)
memory usage: 874.7 MB


2024-07-14 20:26:56,092 - __main__ - INFO - Dataset Description:
             gender           age   region_code  previously_insured  \
count  1.146523e+07  1.146523e+07  1.146523e+07        1.146523e+07   
mean   5.409327e-01 -1.803875e-09  2.642147e+01        4.632843e-01   
std    4.983217e-01  1.000000e+00  1.299778e+01        4.986502e-01   
min    0.000000e+00 -1.225627e+00  0.000000e+00        0.000000e+00   
25%    0.000000e+00 -9.580718e-01  1.500000e+01        0.000000e+00   
50%    1.000000e+00 -1.554052e-01  2.800000e+01        0.000000e+00   
75%    1.000000e+00  7.141503e-01  3.500000e+01        1.000000e+00   
max    1.000000e+00  2.988372e+00  5.200000e+01        1.000000e+00   

        vehicle_age  vehicle_damage  annual_premium  policy_sales_channel  \
count  1.146523e+07    1.146523e+07    1.146523e+07          1.146523e+07   
mean   5.222163e-01    5.023184e-01    3.289200e-09          1.125585e+02   
std    5.764531e-01    4.999946e-01    1.000000e+00          5.3

In [8]:

# Cell 5: Correlation Analysis
def correlation_analysis(df, output_dir='plots'):
    """Save a correlation heatmap of the numeric columns and log the matrix.

    Only numeric columns are correlated. Non-numeric columns (for example
    the categorical ``*_Binned`` columns the binning step adds to the same
    frame) are skipped, so the function still works when it is re-run after
    such columns exist.

    Args:
        df: the pandas DataFrame to analyse.
        output_dir: directory for ``correlation_matrix.png``; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)
    corr_matrix = df.select_dtypes(include='number').corr()

    plt.figure(figsize=(12, 8))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
    plt.title('Correlation Matrix')
    plt.savefig(os.path.join(output_dir, 'correlation_matrix.png'))
    plt.close()
    logger.info(f"Correlation Matrix:\n{corr_matrix}")

# Save the correlation heatmap for the training frame and log the matrix.
correlation_analysis(train_df)


2024-07-14 20:27:01,041 - __main__ - INFO - Correlation Matrix:
                        gender       age  region_code  previously_insured  \
gender                1.000000  0.157092    -0.000074           -0.087611   
age                   0.157092  1.000000     0.037084           -0.276337   
region_code          -0.000074  0.037084     1.000000           -0.022260   
previously_insured   -0.087611 -0.276337    -0.022260            1.000000   
vehicle_age          -0.117469 -0.538492    -0.023741            0.191670   
vehicle_damage        0.096969  0.287935     0.026364           -0.836272   
annual_premium        0.010863  0.055432    -0.000915            0.008541   
policy_sales_channel -0.115575 -0.590963    -0.037758            0.236781   
vintage              -0.009562 -0.013404    -0.005542            0.019455   
response              0.055475  0.123421     0.012782           -0.346290   

                      vehicle_age  vehicle_damage  annual_premium  \
gender             

In [9]:

# Cell 6: Skewness and Distribution Analysis
def skewness_analysis(df, output_dir='plots'):
    """Log per-column skewness and save one histogram per numeric column.

    Skewness is computed on the numeric columns only and logged sorted from
    most positive to most negative. Each numeric column then gets a
    histogram with a KDE overlay saved as ``distribution_<column>.png``.

    Args:
        df: the pandas DataFrame to analyse.
        output_dir: directory for the PNG files; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)
    numeric_frame = df.select_dtypes(include=[np.number])
    ranked_skew = numeric_frame.skew().sort_values(ascending=False)
    logger.info(f"Feature Skewness:\n{ranked_skew}")

    for column in numeric_frame.columns:
        target_path = os.path.join(output_dir, f'distribution_{column}.png')
        plt.figure(figsize=(10, 6))
        sns.histplot(df[column], kde=True)
        plt.title(f'Distribution of {column}')
        plt.savefig(target_path)
        plt.close()

# Log skewness and save a histogram for each numeric column of the training frame.
skewness_analysis(train_df)


2024-07-14 20:27:03,468 - __main__ - INFO - Feature Skewness:
response                2.294351
age                     0.639029
vehicle_age             0.561634
previously_insured      0.147260
vehicle_damage         -0.009274
vintage                -0.108188
region_code            -0.130700
gender                 -0.164282
annual_premium         -0.346541
policy_sales_channel   -0.920312
dtype: float64
2024-07-14 20:27:03,468 - __main__ - INFO - Feature Skewness:
response                2.294351
age                     0.639029
vehicle_age             0.561634
previously_insured      0.147260
vehicle_damage         -0.009274
vintage                -0.108188
region_code            -0.130700
gender                 -0.164282
annual_premium         -0.346541
policy_sales_channel   -0.920312
dtype: float64


In [10]:

# Cell 7: Binning
def binning(df, output_dir='plots', features=('age', 'annual_premium'), bins=5):
    """Cut selected columns into equal-width bins, plot and log the bin counts.

    NOTE: for every feature a ``<feature>_Binned`` column is added to ``df``
    in place, so the caller's frame is modified. Any later step that runs
    ``pd.get_dummies`` on the same frame will also encode these columns.

    Args:
        df: the pandas DataFrame to bin; modified in place.
        output_dir: directory for ``binning_<feature>.png``; created if missing.
        features: names of the columns to bin. Defaults to the two columns
            that were previously hard-coded.
        bins: forwarded to ``pd.cut`` (default 5).
    """
    os.makedirs(output_dir, exist_ok=True)
    for feature in features:
        binned_column = f'{feature}_Binned'
        df[binned_column] = pd.cut(df[feature], bins=bins)

        # Count once; the plot shows bins in interval order, while the log
        # keeps value_counts' own ordering.
        bin_counts = df[binned_column].value_counts()

        plt.figure(figsize=(10, 6))
        bin_counts.sort_index().plot(kind='bar')
        plt.title(f'Binning of {feature}')
        plt.savefig(os.path.join(output_dir, f'binning_{feature}.png'))
        plt.close()
        logger.info(f"Binned {feature}:\n{bin_counts}")

# NOTE: this adds age_Binned and annual_premium_Binned columns to train_df in place.
binning(train_df)


2024-07-14 20:39:55,816 - __main__ - INFO - Binned age:
age_Binned
(-1.23, -0.383]    5200471
(-0.383, 0.46]     2641977
(0.46, 1.303]      2151902
(1.303, 2.146]     1158358
(2.146, 2.988]      312525
Name: count, dtype: int64
2024-07-14 20:39:55,816 - __main__ - INFO - Binned age:
age_Binned
(-1.23, -0.383]    5200471
(-0.383, 0.46]     2641977
(0.46, 1.303]      2151902
(1.303, 2.146]     1158358
(2.146, 2.988]      312525
Name: count, dtype: int64
2024-07-14 20:39:56,430 - __main__ - INFO - Binned annual_premium:
annual_premium_Binned
(-0.771, 0.199]     4317478
(0.199, 1.168]      3891166
(-1.745, -0.771]    2120287
(1.168, 2.138]      1001525
(2.138, 3.107]       134777
Name: count, dtype: int64
2024-07-14 20:39:56,430 - __main__ - INFO - Binned annual_premium:
annual_premium_Binned
(-0.771, 0.199]     4317478
(0.199, 1.168]      3891166
(-1.745, -0.771]    2120287
(1.168, 2.138]      1001525
(2.138, 3.107]       134777
Name: count, dtype: int64


In [11]:

# Cell 8: Feature Importance using Random Forest
def feature_importance(df, output_dir='plots', n_estimators=100, random_state=42, n_jobs=-1):
    """Rank features by random-forest importance, plot and log the ranking.

    The frame is one-hot encoded with ``pd.get_dummies``, split 80/20, and a
    ``RandomForestClassifier`` is fitted on the 80% portion only. The
    held-out portion is not used.

    Args:
        df: DataFrame containing the features and a ``response`` column.
        output_dir: directory for ``feature_importances.png``; created if missing.
        n_estimators: number of trees (default 100, as before).
        random_state: seed for both the split and the forest (default 42, as before).
        n_jobs: parallelism passed to the forest. Defaults to all cores;
            pass ``None`` for the previous single-threaded fit.

    Returns:
        pandas.DataFrame: columns ``Feature`` and ``Importance``, sorted from
        most to least important. Previously nothing was returned.
    """
    os.makedirs(output_dir, exist_ok=True)
    y = df['response']
    # One-hot encode any categorical columns.
    X = pd.get_dummies(df.drop(columns=['response']))

    # Keep the same 80/20 split so the model sees the same rows as before;
    # the validation part is intentionally discarded.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    model = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state, n_jobs=n_jobs
    )
    model.fit(X_train, y_train)
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]
    important_features = X_train.columns[indices]
    n_features = X_train.shape[1]

    plt.figure(figsize=(12, 8))
    plt.title("Feature Importances")
    plt.bar(range(n_features), importances[indices], align="center")
    plt.xticks(range(n_features), important_features, rotation=90)
    plt.xlim([-1, n_features])
    # Rotated tick labels can be long (e.g. one-hot column names); a tight
    # bounding box keeps them from being clipped in the saved image.
    plt.savefig(os.path.join(output_dir, 'feature_importances.png'), bbox_inches='tight')
    plt.close()

    ranking = pd.DataFrame({'Feature': important_features, 'Importance': importances[indices]})
    logger.info(f"Feature Importances:\n{ranking}")
    return ranking

# Fit the random forest on the training frame and log the ranked importances.
feature_importance(train_df)


2024-07-14 21:50:42,229 - __main__ - INFO - Feature Importances:
                                   Feature  Importance
0                           annual_premium    0.307406
1                                  vintage    0.303010
2                              region_code    0.100187
3                           vehicle_damage    0.074205
4                                      age    0.070622
5                     policy_sales_channel    0.055101
6                       previously_insured    0.050984
7                              vehicle_age    0.009925
8               age_Binned_(-1.23, -0.383]    0.007812
9                                   gender    0.005822
10               age_Binned_(-0.383, 0.46]    0.005117
11               age_Binned_(1.303, 2.146]    0.002345
12                age_Binned_(0.46, 1.303]    0.002125
13               age_Binned_(2.146, 2.988]    0.001997
14  annual_premium_Binned_(-1.745, -0.771]    0.000964
15   annual_premium_Binned_(-0.771, 0.199]    0.000845


In [12]:

# Cell 9: Principal Component Analysis (PCA)
def pca_analysis(df, output_dir='plots'):
    """Project the features onto two principal components and plot them.

    Features (everything except ``response``) are one-hot encoded,
    standardised, and reduced with a 2-component PCA. The scatter plot is
    coloured by ``response`` and saved as ``pca_components.png``; the
    explained variance ratio is logged.

    Args:
        df: DataFrame containing the features and a ``response`` column.
        output_dir: directory for the PNG file; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)
    target = df['response']
    encoded = pd.get_dummies(df.drop(columns=['response']))

    # Standardise, then reduce to two components.
    standardized = StandardScaler().fit_transform(encoded)
    reducer = PCA(n_components=2)
    projected = reducer.fit_transform(standardized)
    first_axis, second_axis = projected[:, 0], projected[:, 1]

    plt.figure(figsize=(10, 6))
    plt.scatter(first_axis, second_axis, c=target, cmap='viridis', alpha=0.5)
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.title('PCA - First two components')
    plt.colorbar()
    plt.savefig(os.path.join(output_dir, 'pca_components.png'))
    plt.close()

    logger.info(f"PCA Explained Variance Ratio:\n{reducer.explained_variance_ratio_}")

# Run the 2-component PCA on the training frame, save the scatter plot and log the explained variance.
pca_analysis(train_df)


2024-07-14 21:59:01,600 - __main__ - INFO - PCA Explained Variance Ratio:
[0.20637606 0.11814852]
2024-07-14 21:59:01,600 - __main__ - INFO - PCA Explained Variance Ratio:
[0.20637606 0.11814852]


In [13]:



# Cell 10: t-SNE and UMAP (only t-SNE shown for brevity)
def tsne_analysis(df, output_dir='plots', sample_size=10000, random_state=42):
    """Embed the features in 2-D with t-SNE and save a scatter plot.

    t-SNE is not practical on frames with millions of rows, so by default
    the frame is randomly down-sampled to ``sample_size`` rows before
    embedding. Pass ``sample_size=None`` to embed every row, which was the
    previous behaviour.

    Args:
        df: DataFrame containing the features and a ``response`` column.
            The caller's frame is not modified.
        output_dir: directory for ``tsne_components.png``; created if missing.
        sample_size: maximum number of rows to embed, or ``None`` for no cap.
        random_state: seed for both the sampling and t-SNE (default 42, as before).
    """
    os.makedirs(output_dir, exist_ok=True)

    if sample_size is not None and len(df) > sample_size:
        logger.info(f"t-SNE: sampling {sample_size} of {len(df)} rows.")
        # Rebinding the local name only; the caller's frame is untouched.
        df = df.sample(n=sample_size, random_state=random_state)

    y = df['response']
    X = pd.get_dummies(df.drop(columns=['response']))

    # Standardize features
    X_scaled = StandardScaler().fit_transform(X)

    # Apply t-SNE
    tsne = TSNE(n_components=2, random_state=random_state)
    X_tsne = tsne.fit_transform(X_scaled)

    plt.figure(figsize=(10, 6))
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y, cmap='viridis', alpha=0.5)
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')
    plt.title('t-SNE - First two components')
    plt.colorbar()
    plt.savefig(os.path.join(output_dir, 'tsne_components.png'))
    plt.close()

    logger.info("t-SNE analysis complete.")

# Run the t-SNE embedding for the training frame and save the scatter plot.
tsne_analysis(train_df)


In [None]:

# Cell 11: Recursive Feature Elimination (RFE)
def rfe_analysis(df, output_dir='plots'):
    """Select 10 features with recursive feature elimination and plot them.

    Features (everything except ``response``) are one-hot encoded and split
    80/20. RFE wrapped around a random forest is fitted on the 80% portion,
    the selected column names are logged, and the final estimator's
    importances for those columns are saved as
    ``rfe_feature_importances.png``.

    Args:
        df: DataFrame containing the features and a ``response`` column.
        output_dir: directory for the PNG file; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)
    encoded = pd.get_dummies(df.drop(columns=['response']))
    target = df['response']

    # Only the training portion of the split is used below.
    train_X, _val_X, train_y, _val_y = train_test_split(
        encoded, target, test_size=0.2, random_state=42
    )

    selector = RFE(
        estimator=RandomForestClassifier(n_estimators=100, random_state=42),
        n_features_to_select=10,
    )
    selector.fit(train_X, train_y)

    selected_features = train_X.columns[selector.support_]
    logger.info(f"Selected Features by RFE:\n{selected_features}")

    # Importances come from the estimator refitted on the selected columns.
    plt.figure(figsize=(12, 8))
    plt.title("Selected Feature Importances by RFE")
    plt.bar(selected_features, selector.estimator_.feature_importances_, align="center")
    plt.xticks(rotation=90)
    plt.savefig(os.path.join(output_dir, 'rfe_feature_importances.png'))
    plt.close()

# Run recursive feature elimination on the training frame and save the importance plot.
rfe_analysis(train_df)
