# Dataset Creation Notebook
#### Step 0. Notebook Setup

Purpose: Import required libraries and suppress unnecessary warnings so outputs stay readable.

In [None]:
## Dataset Module
import pandas as pd
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

#### Step 1. Load Raw Datasets

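Purpose: Load the raw insurance policy data from its semicolon-delimited CSV file.
The claims reference data is not loaded yet; its `read_csv` call is commented out and reserved for future expansions.
This dataset is the starting point for all downstream feature engineering.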

In [None]:
## Read the semicolon-separated raw policy file into a DataFrame
insurance = pd.read_csv(
    '../../data/input/exp/Motor_vehicle_insurance_data.csv',
    sep=';',
)
# TODO: for future expansions, also load the claims reference file
#       '../../data/input/exp/sample_type_claim.csv' (same ';' separator).

#### Step 2. Remove Columns Unavailable for New Customers

Purpose: Drop variables that depend on future information or historical outcomes.
These fields would not exist at prediction time for a new customer and would cause data leakage.

In [4]:
## Columns that won't be available for new customers, so they are removed
## before any downstream processing.
cols_to_drop = [
    'ID',
    'Date_last_renewal',
    'Date_start_contract',
    'Date_next_renewal',
    'Seniority',
    'Policies_in_force',
    'Max_policies',
    'Max_products',
    'Lapse',
    'Date_lapse',
    'Cost_claims_year',
    'N_claims_year',
    'N_claims_history',
    'R_Claims_history',
]

# drop() returns a new frame; the raw `insurance` frame is left untouched.
insurance_new = insurance.drop(columns=cols_to_drop)

#### Step 3. Define Categorical Variable Mappings

Purpose: Convert encoded categorical variables into human-readable labels.
This improves interpretability and avoids passing opaque numeric codes downstream.

Variable encodings:

- 1. Distribution_channel - 0: Agent, 1: Broker
- 2. Payment - 0: Half-yearly, 1: Annually
- 3. Type_risk - 1: Motorbikes, 2: Vans, 3: Passenger Cars, 4: Agricultural Vehicles
- 4. Area - 0: Rural, 1: Urban
- 5. Second_driver - 0: No, 1: Yes
- 6. Type_fuel - P: Petrol, D: Diesel

Any value outside these encodings (including missing values) is labelled `Others`.

In [None]:
### Categorical Variables Mapping Functions

# Function to map Distribution Channel
def distribution_channel(x):
    """Translate a Distribution_channel code into its label.

    Returns 'Agent' for 0, 'Broker' for 1, and 'Others' for any other value.
    """
    # Codes are tested in order with ``==``, exactly like an if/elif chain.
    for code, label in ((0, 'Agent'), (1, 'Broker')):
        if x == code:
            return label
    return 'Others'

# Function to map Payment
def payment_method(x):
    """Translate a Payment code into its label.

    Returns 'Half-yearly' for 0, 'Annually' for 1, and 'Others' otherwise.
    """
    return 'Half-yearly' if x == 0 else ('Annually' if x == 1 else 'Others')
    
# Function to map Type_risk
def type_risk(x):
    """Translate a Type_risk code into its vehicle-category label.

    Codes 1 to 4 map to 'Motorbikes', 'Vans', 'Passenger Cars' and
    'Agricultural Vehicles' respectively; anything else yields 'Others'.
    """
    # Position in the tuple (counting from 1) is the encoded value.
    labels = ('Motorbikes', 'Vans', 'Passenger Cars', 'Agricultural Vehicles')
    for code, label in enumerate(labels, start=1):
        if x == code:
            return label
    return 'Others'
    
# Function to map Area
def area_type(x):
    """Translate an Area code into its label.

    Returns 'Rural' for 0, 'Urban' for 1, and 'Others' for any other value.
    """
    known_areas = ((0, 'Rural'), (1, 'Urban'))
    # First pair whose code equals x wins; fall back to 'Others' when none do.
    return next((label for code, label in known_areas if x == code), 'Others')
    
# Function to map Second_driver
def second_driver(x):
    """Translate a Second_driver flag into its label.

    Returns 'No' for 0, 'Yes' for 1, and 'Others' for any other value.
    """
    if x == 0:
        return 'No'
    if x == 1:
        return 'Yes'
    # Neither known flag matched.
    return 'Others'
    
# Function to map Type_fuel
def type_fuel(x):
    """Translate a Type_fuel letter code into its label.

    Returns 'Petrol' for 'P', 'Diesel' for 'D', and 'Others' for any other
    value. The comparison is case-sensitive.
    """
    for code, label in (('P', 'Petrol'), ('D', 'Diesel')):
        if x == code:
            return label
    return 'Others'

#### Step 4. Apply Categorical Transformations

Purpose: Replace encoded categorical values in the dataset using the mapping functions defined above.

In [6]:
# Re-label each encoded column in place with its dedicated mapping function.
# The dict preserves insertion order, so columns are processed top to bottom.
column_mappers = {
    'Distribution_channel': distribution_channel,
    'Payment': payment_method,
    'Type_risk': type_risk,
    'Area': area_type,
    'Second_driver': second_driver,
    'Type_fuel': type_fuel,
}
for column_name, mapper in column_mappers.items():
    insurance_new[column_name] = insurance_new[column_name].apply(mapper)

#### Step 5. Convert Date Columns to Datetime

Purpose: Ensure all date variables are stored in a proper datetime format.
Only columns starting with Date_ are converted, preventing accidental type changes.

In [None]:

## Convert every column whose name starts with 'Date_' to datetime,
## parsing the values as day/month/year ('%d/%m/%Y').
date_columns = [col for col in insurance_new.columns if col.startswith('Date_')]
for col in date_columns:
    raw_values = insurance_new[col]
    parsed_values = pd.to_datetime(raw_values, format='%d/%m/%Y', errors='coerce')
    # errors='coerce' replaces unparseable entries with NaT without any signal,
    # so count the values that were present before parsing but are missing
    # afterwards and surface them instead of losing them silently.
    n_unparsed = int((raw_values.notna() & parsed_values.isna()).sum())
    if n_unparsed:
        warnings.warn(
            f"{col}: {n_unparsed} value(s) did not match '%d/%m/%Y' and were set to NaT"
        )
    insurance_new[col] = parsed_values
    


#### Step 6. Sanity Check Final Dataset

Purpose: Confirm that the dataset loads correctly and inspect the first few rows.

In [None]:
# Preview the top five rows of the processed frame (the notebook renders the
# value of the cell's last expression).
insurance_new.head(n=5)


#### Step 7. Save the Processed Dataset

Purpose: Save the preprocessed dataset as a CSV file so that other notebooks can use it.

In [None]:
## Persist the processed frame as CSV; the row index is not written out
output_path = '../../data/output/insurance_new.csv'
insurance_new.to_csv(output_path, index=False)