# Dataset Creation Notebook
#### Step 0. Notebook Setup

Purpose: Import required libraries and suppress unnecessary warnings so outputs stay readable.

In [2]:
## Dataset Module
import pandas as pd
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

#### Step 1. Load Raw Datasets

Purpose: Load the raw motor vehicle insurance data from a semicolon-delimited CSV file.
This dataset is the starting point for all downstream feature engineering.

In [3]:
## load datasets
# The raw file separates fields with semicolons rather than commas.
raw_insurance_path = '../../data/input/exp/Motor_vehicle_insurance_data.csv'
insurance = pd.read_csv(raw_insurance_path, sep=";")

#### Step 2. Remove Columns Unavailable for New Customers

Purpose: Drop variables that depend on future information or historical outcomes.
These fields would not exist at prediction time for a new customer and would cause data leakage.

In [4]:
## columns to drop which won't be available for new customers
cols_to_drop = [
    'ID',
    'Date_last_renewal',
    'Date_start_contract',
    'Date_next_renewal',
    'Seniority',
    'Policies_in_force',
    'Max_policies',
    'Max_products',
    'Lapse',
    'Date_lapse',
    'Cost_claims_year',
    'N_claims_year',
    'N_claims_history',
    'R_Claims_history',
]

# drop() returns a new frame, so the raw `insurance` frame is left untouched
# and this cell can be re-run safely.
insurance_new = insurance.drop(cols_to_drop, axis='columns')

#### Step 3. Define Categorical Variable Mappings

Purpose: Define the lookup dictionaries that translate encoded categorical variables into human-readable labels.
Readable labels improve interpretability and avoid passing opaque numeric codes downstream.

Variable encodings:

1. `Distribution_channel` - 0: Agent, 1: Broker
2. `Payment` - 0: half-yearly, 1: annually
3. `Type_risk` - 1: motorbikes, 2: vans, 3: passenger cars, 4: agricultural vehicles
4. `Area` - 0: rural, 1: urban
5. `Second_driver` - 0: No, 1: Yes
6. `Type_fuel` - P: petrol, D: diesel

In [None]:
### Categorical Variables Mapping Dictionaries

# Lookup tables: raw code stored in the data -> human-readable label.
distribution_channel_map = {
    0: 'Agent',
    1: 'Broker',
}
payment_method_map = {
    0: 'half-yearly',
    1: 'annually',
}
type_risk_map = {
    1: 'motorbikes',
    2: 'vans',
    3: 'passenger cars',
    4: 'agricultural vehicles',
}
area_type_map = {
    0: 'rural',
    1: 'urban',
}
second_driver_map = {
    0: 'No',
    1: 'Yes',
}
# Fuel type is keyed by a letter code rather than an integer.
type_fuel_map = {
    'P': 'petrol',
    'D': 'diesel',
}


#### Step 4. Apply Categorical Transformations

Purpose: Replace encoded categorical values in the dataset with their readable labels, using the mapping dictionaries defined above.

In [6]:
# Apply the mappings to the insurance dataframe using .map() for efficiency
categorical_mappings = {
    'Distribution_channel': distribution_channel_map,
    'Payment': payment_method_map,
    'Type_risk': type_risk_map,
    'Area': area_type_map,
    'Second_driver': second_driver_map,
    'Type_fuel': type_fuel_map,
}

for col, mapping in categorical_mappings.items():
    original = insurance_new[col]

    # Values that already hold a label (i.e. this cell was run before) are kept as-is,
    # so re-running the cell does not turn the whole column into NaN.
    already_labelled = original.isin(list(mapping.values()))
    relabelled = original.map(mapping).where(~already_labelled, original)

    # Check for any unmapped values: only entries that were present in the data but have
    # no key in the mapping are counted. Values that were already missing stay NaN and
    # are not reported as unmapped.
    unmapped_count = (original.notna() & relabelled.isna()).sum()

    insurance_new[col] = relabelled
    if unmapped_count > 0:
        print(f"Warning: {unmapped_count} unmapped values found in {col}")

#### Step 5. Convert Date Columns to Datetime

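Purpose: Ensure all date variables are stored in a proper datetime format.
Only columns whose names start with `Date_` are converted, preventing accidental type changes.
Dates are expected in day/month/year format (`%d/%m/%Y`); values that do not match are coerced to `NaT`.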

In [None]:

## convert date columns to datetime format for variables that start with 'Date_'
date_columns = [col for col in insurance_new.columns if col.startswith('Date_')]
for col in date_columns:
    raw_dates = insurance_new[col]
    parsed_dates = pd.to_datetime(raw_dates, format='%d/%m/%Y', errors='coerce')

    # errors='coerce' turns unparseable entries into NaT without raising, so count the
    # non-missing values that were lost to keep bad date strings from going unnoticed.
    failed_count = (raw_dates.notna() & parsed_dates.isna()).sum()
    if failed_count > 0:
        print(f"Warning: {failed_count} values in {col} could not be parsed as dates")

    insurance_new[col] = parsed_dates
    


#### Step 6. Sanity Check Final Dataset

Purpose: Confirm that the dataset loads correctly and inspect the first few rows.

In [None]:
# Preview the top rows of the prepared dataset (rendered as the cell output)
insurance_new.head(n=5)


#### Step 7. Completion Confirmation

Purpose: Confirm that all data preparation steps completed successfully.

In [None]:
# Reaching this cell means every preceding cell ran without raising
completion_message = "Dataset module loaded successfully."
print(completion_message)
