## Setup
Import libraries for:
- File and OS operations
- Data manipulation and analysis
- System interactions
- SQLite database connectivity
- Numerical computations
- Plotting and visualization

In [2]:
import os
import pandas as pd
import sys
import sqlite3
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Visualization Configuration
Configure the visual style for Seaborn plots and enable inline display for Matplotlib in the notebook.

In [4]:
# Apply seaborn's "whitegrid" style to the plots created after this cell.
sns.set_style("whitegrid")
# IPython magic: show Matplotlib figures inline in the notebook output.
%matplotlib inline

# READING THE DATA

## Data Path Configuration
Specify the directory containing the Excel data files.

In [5]:
# Directory that holds the Excel input files (the CSV export is written here too).
# Set the BDM_DATA_PATH environment variable to point the notebook at another
# folder; when it is not set, the original local folder is used unchanged.
data_path = os.environ.get(
    "BDM_DATA_PATH",
    r"C:\Users\dimet\Documents\GitHub\NOVAIMS_BDMwDS_PROJECT\00 Data",
)

Function: Load an Excel Sheet into a DataFrame
Defines load_excel_dataframe, which builds the full file path, attempts to read the given sheet into a pandas DataFrame, logs its shape on success, and returns an empty DataFrame on failure.

In [None]:
# Function to load each Excel file into its own DataFrame
def load_excel_dataframe(filename: str, sheet_name: str, *, data_dir=None) -> pd.DataFrame:
    """
    Load one sheet of an Excel workbook into a pandas DataFrame.

    Parameters
    ----------
    filename : str
        Workbook file name, joined onto the data directory.
    sheet_name : str
        Name of the sheet to read.
    data_dir : str or path-like, optional
        Directory that contains the workbook. When omitted, the notebook-level
        ``data_path`` is used, so existing two-argument calls behave as before.
        Passing it explicitly removes the dependency on that global.

    Returns
    -------
    pd.DataFrame
        The sheet contents on success. On ANY failure (missing file, unknown
        sheet, missing Excel engine, ...) the error is printed and an empty
        DataFrame is returned instead of raising, so callers that need the
        data should check ``df.empty``.
    """
    # Resolve the directory lazily so the global is only touched when needed.
    base_dir = data_path if data_dir is None else data_dir
    file_path = os.path.join(base_dir, filename)
    try:
        df = pd.read_excel(file_path, sheet_name=sheet_name)
    except Exception as e:
        # Deliberate best-effort: report the problem and hand back an empty frame.
        print(f"Error loading '{sheet_name}' from '{filename}': {e}")
        return pd.DataFrame()
    print(f"Loaded '{sheet_name}' from '{filename}' (shape: {df.shape})")
    return df

Calls the loader for each Excel file and sheet, assigning the result to appropriately named variables.

In [7]:
# Read the three source workbooks (one sheet each), in this fixed order
funnel_df, policies_df, regional_df = [
    load_excel_dataframe(workbook, sheet)
    for workbook, sheet in (
        ("Funnel.xlsx", "funnel_data"),
        ("Policy.xlsx", "policies_data"),
        ("Regional.xlsx", "regional_data"),
    )
]

Loaded 'funnel_data' from 'Funnel.xlsx' (shape: (9373, 18))
Loaded 'policies_data' from 'Policy.xlsx' (shape: (13365, 26))
Loaded 'regional_data' from 'Regional.xlsx' (shape: (15539, 34))


# Master DataFrame Construction
Sequentially enrich the base funnel_df by merging in policy-level data, then regional data, using the specified join strategy.

Function signature and parameters
Defines build_master_df, which accepts three DataFrames plus optional keys and join type.

## Step 1: Merge policies data
Joins policies_df (deduplicated on policy_key) into funnel_df, tagging any overlapping columns with _pol.

## Step 2: Merge regional data
Joins regional_df (deduplicated on region_key) into the intermediate result, tagging overlapping columns with _reg.

In [8]:
def build_master_df(funnel_df: pd.DataFrame,
                    policies_df: pd.DataFrame,
                    regional_df: pd.DataFrame,
                    *,
                    policy_key: str = "policy_number",
                    region_key: str = "zipcode_link",
                    how: str = "left") -> pd.DataFrame:
    """
    Enrich the funnel table with policy-level and then regional columns.

    Parameters
    ----------
    funnel_df : pd.DataFrame
        Base table; must contain ``policy_key`` and ``region_key``.
    policies_df : pd.DataFrame
        Lookup table keyed by ``policy_key``.
    regional_df : pd.DataFrame
        Lookup table keyed by ``region_key``.
    policy_key, region_key : str
        Join columns for the first and second merge respectively.
    how : str
        Join type passed to both ``DataFrame.merge`` calls.

    Returns
    -------
    pd.DataFrame
        The merged table. Columns of ``policies_df`` whose names already exist
        on the left get the suffix ``_pol``; columns of ``regional_df`` that
        clash get ``_reg``. Left-hand names are never renamed.

    Notes
    -----
    * Each lookup table is reduced to one row per key (first occurrence wins).
    * Lookup rows whose key is missing are excluded before merging. pandas
      matches null keys against each other, so without this a single
      null-key lookup row would be attached to every funnel row that has no
      key (e.g. offers that never became a policy).
    """
    # One row per non-null key on the right-hand side of each merge.
    policies_lookup = (
        policies_df
        .dropna(subset=[policy_key])
        .drop_duplicates(subset=policy_key)
    )
    regional_lookup = (
        regional_df
        .dropna(subset=[region_key])
        .drop_duplicates(subset=region_key)
    )

    merged = funnel_df.merge(
        policies_lookup,
        on=policy_key,
        how=how,
        suffixes=("", "_pol")
    )
    merged = merged.merge(
        regional_lookup,
        on=region_key,
        how=how,
        suffixes=("", "_reg")
    )
    return merged


In [9]:
# Assemble the master table: funnel rows enriched with policy and regional columns
master_df = build_master_df(
    funnel_df=funnel_df,
    policies_df=policies_df,
    regional_df=regional_df,
)

In [10]:
# Display the merged table (the rendered output is truncated for long/wide frames)
master_df

Unnamed: 0,affinity_name,status_report,offer_number,policy_number,zipcode_link,zip4,birth_date,brand,date_offer,date_request,...,FAM_CHILD_Y,FAM_CHILD_O,FAM_WCHILD_Y,FAM_WCHILD_MED,FAM_WCHILD_OLD,CIT_HOUSEHOLD,LOAN,SAVINGS,SHOP_ONLINE,CAR
0,Insuro,Requestwithdrawn,1000,10000.0,10000,2132,1985-01-01,HYUNDAI,2018-10-11,2018-10-11,...,,,,,,,,,,
1,other,Tailoredofferwithdrawn,1001,,10001,6027,1987-04-01,AUDI,2018-10-11,NaT,...,4.0,3.0,1.0,1.0,4.0,5.0,2.0,6.0,5.0,5.0
2,other,Incompleterequest,1002,,10002,3824,1972-11-01,VOLKSWAGEN,2018-10-11,NaT,...,4.0,4.0,1.0,2.0,2.0,6.0,3.0,5.0,6.0,5.0
3,other,Policycreated,1003,10002.0,10003,6921,1983-08-01,MAZDA,2018-10-11,2018-10-11,...,4.0,3.0,2.0,3.0,3.0,5.0,3.0,5.0,6.0,5.0
4,other,Policycreated,1004,10003.0,10004,8266,1990-04-01,VOLVO,2018-10-12,2018-10-12,...,3.0,3.0,2.0,3.0,3.0,5.0,4.0,4.0,6.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9368,other,Requestaccepted,9439,11946.0,15568,1161,1960-09-01,VOLKSWAGEN,2020-03-08,2020-03-08,...,2.0,4.0,1.0,2.0,3.0,6.0,4.0,5.0,6.0,5.0
9369,other,Waitforapproval,9440,11947.0,15582,5015,1953-04-01,NISSAN,2020-03-08,2020-03-08,...,3.0,3.0,1.0,3.0,4.0,5.0,4.0,6.0,6.0,5.0
9370,Insuro,Tailoredofferrequested,9441,,10332,3078,1976-04-01,TOYOTA,2020-03-08,NaT,...,4.0,3.0,2.0,3.0,3.0,5.0,5.0,4.0,6.0,4.0
9371,T&B,Calculatenewpremium,9442,,12968,1965,1951-07-01,TOYOTA,2020-03-08,NaT,...,1.0,3.0,1.0,3.0,4.0,3.0,2.0,3.0,3.0,5.0


## Conversion Indicator
Create a binary flag `conv`: 1 when `status_report` is exactly `Policycreated` (a policy was created), otherwise 0.

In [12]:
# 1 for rows whose status is exactly "Policycreated", 0 for every other status
master_df["conv"] = np.where(master_df["status_report"] == "Policycreated", 1, 0)

## Date Parsing
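Convert both start-date columns to datetime: the funnel's `policy_start_date` and the copy merged in from the policies table (`policy_start_date_pol`). Values that cannot be parsed become `NaT`.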

In [13]:
# Parse both start-date columns in place; unparseable entries turn into NaT
for date_col in ('policy_start_date', 'policy_start_date_pol'):
    master_df[date_col] = pd.to_datetime(master_df[date_col], errors='coerce')

## Cutoff Definition
Define the threshold date for churn classification

In [14]:
# Cutoff date used by the churn classification
threshold = pd.Timestamp('2019-11-01')

## Churn Classification
Assign `churn` using the first rule that applies:
- `-1` if `policy_start_date` is missing or falls on/after the cutoff;
- `1` if `policy_start_date_pol` is missing;
- `0` otherwise.

In [15]:
# Conditions are checked in order and the first match wins:
#   -1 -> start date missing, or on/after the cutoff
#    1 -> policy-table start date missing
#    0 -> everything else
master_df['churn'] = np.select(
    condlist=[
        (
            master_df['policy_start_date'].isna()
            | (master_df['policy_start_date'] >= threshold)
        ).to_numpy(),
        master_df['policy_start_date_pol'].isna().to_numpy(),
    ],
    choicelist=[-1, 1],
    default=0,
)

In [16]:
# Spot-check the two date columns next to the resulting churn label
master_df.loc[:, ['policy_start_date', 'policy_start_date_pol', 'churn']].head()

Unnamed: 0,policy_start_date,policy_start_date_pol,churn
0,2018-11-01,NaT,1
1,NaT,NaT,-1
2,NaT,NaT,-1
3,2018-10-12,NaT,1
4,2018-11-01,NaT,1


# MASTER DATA OUTPUT

In [18]:
csv_name = "Dataset.csv"                       # output file name
csv_file = os.path.join(data_path, csv_name)   # saved in the same folder as the input workbooks

# ── Persist ───────────────────────────────────────────────────────────────
# index=False keeps the DataFrame's row index out of the CSV
master_df.to_csv(path_or_buf=csv_file, index=False)

print(f"Saved {len(master_df):,} rows to {csv_file}")

Saved 9,373 rows to C:\Users\dimet\Documents\GitHub\NOVAIMS_BDMwDS_PROJECT\00 Data\Dataset.csv
