# Data Mining Project - Group XX 2025/2026

# Import Libraries

In [1]:
import sqlite3
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil

from itertools import product
from ydata_profiling import ProfileReport

# for better resolution plots
%config InlineBackend.figure_format = 'retina'

#o svg consegue ampliar infinitamente os gráficos sem perder qualidade mas às vezes é mais lento 
#por isso agora usamos retina


sns.set()

# Loading the Data

Import the datasets from csv files using commas as separators of the columns and setting the unique customer identifier as the index of both columns.

In [2]:
flightsDB = pd.read_csv('data/DM_AIAI_FlightsDB.csv', sep = ",", index_col= "Loyalty#")
customerDB = pd.read_csv('data/DM_AIAI_CustomerDB.csv', sep = ",", index_col= "Loyalty#")
metaData = pd.read_csv('data/DM_AIAI_Metadata.csv', sep = ";", header= None)

Remove the 'Unnamed' column referring to a sequential numbering of the rows, as we set the column "Loyalty#" as the index

In [3]:
customerDB = customerDB.iloc[:, 1:]
customerDB

Unnamed: 0_level_0,First Name,Last Name,Customer Name,Country,Province or State,City,Latitude,Longitude,Postal code,Gender,Education,Location Code,Income,Marital Status,LoyaltyStatus,EnrollmentDateOpening,CancellationDate,Customer Lifetime Value,EnrollmentType
Loyalty#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
480934,Cecilia,Householder,Cecilia Householder,Canada,Ontario,Toronto,43.653225,-79.383186,M2Z 4K1,female,Bachelor,Urban,70146.0,Married,Star,2/15/2019,,3839.14,Standard
549612,Dayle,Menez,Dayle Menez,Canada,Alberta,Edmonton,53.544388,-113.490930,T3G 6Y6,male,College,Rural,0.0,Divorced,Star,3/9/2019,,3839.61,Standard
429460,Necole,Hannon,Necole Hannon,Canada,British Columbia,Vancouver,49.282730,-123.120740,V6E 3D9,male,College,Urban,0.0,Single,Star,7/14/2017,1/8/2021,3839.75,Standard
608370,Queen,Hagee,Queen Hagee,Canada,Ontario,Toronto,43.653225,-79.383186,P1W 1K4,male,College,Suburban,0.0,Single,Star,2/17/2016,,3839.75,Standard
530508,Claire,Latting,Claire Latting,Canada,Quebec,Hull,45.428730,-75.713364,J8Y 3Z5,male,Bachelor,Suburban,97832.0,Married,Star,10/25/2017,,3842.79,2021 Promotion
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100012,Ethan,Thompson,Ethan Thompson,Canada,Quebec,Quebec City,46.759733,-71.141009,Y0C 7D6,male,Bachelor,Suburban,,Single,Star,2/27/2019,2/27/2019,,Standard
100013,Layla,Young,Layla Young,Canada,Alberta,Edmonton,53.524829,-113.546357,L3S 9Y3,female,Bachelor,Rural,,Married,Star,9/20/2017,9/20/2017,,Standard
100014,Amelia,Bennett,Amelia Bennett,Canada,New Brunswick,Moncton,46.051866,-64.825428,G2S 2B6,male,Bachelor,Rural,,Married,Star,11/28/2020,11/28/2020,,Standard
100015,Benjamin,Wilson,Benjamin Wilson,Canada,Quebec,Quebec City,46.862970,-71.133444,B1Z 8T3,female,College,Urban,,Married,Star,4/9/2020,4/9/2020,,Standard


# Metadata

**FlightsDB Database Variable Description**
- **Loyalty#:**	Unique customer identifier linking to CustomerDB
- **Year:**	Year of flight activity record
- **Month:**	Month of flight activity record (1-12)
- **YearMonthDate:**	First day of the month for the activity period
- **NumFlights:**	Total number of flights taken by customer in the month
- **NumFlightsWithCompanions:**	Number of flights where customer traveled with companions
- **DistanceKM:**	Total distance traveled in kilometers for the month
- **PointsAccumulated:**	Loyalty points earned by customer during the month
- **PointsRedeemed:**	Loyalty points spent/redeemed by customer during the month
- **DollarCostPointsRedeemed:**	Dollar value of points redeemed during the month

**CustomerDB Database Variable Description**
- **Loyalty#:**  Unique customer identifier for loyalty program members
- **First Name:**   Customer's first name
- **Last Name:**   Customer's last name 
- **Customer Name:** Customer's full name (concatenated)
- **Country:**	Customer's country of residence
- **Province or State:**	Customer's province or state
- **City:**	Customer's city of residence
- **Latitude:**	Geographic latitude coordinate of customer location
- **Longitude:**	Geographic longitude coordinate of customer locatio
- **Postal code:**	Customer's postal/ZIP code
- **Gender:**	Customer's gender
- **Education:**	Customer's highest education level (Bachelor, College, etc.)
- **Location:** Code	Urban/Suburban/Rural classification of customer residence
- **Income:**	Customer's annual income
- **Marital Status:**	Customer's marital status (Married, Single, Divorced)
- **LoyaltyStatus:**	Current tier status in loyalty program (Star > Nova > Aurora)
- **EnrollmentDateOpening:**	Date when customer joined the loyalty program
- **CancellationDate:**	Date when customer left the program
- **Customer Lifetime:** Value	Total calculated monetary value of customer relationship
- **EnrollmentType:**	Method of joining loyalty program

# Data Preparation

## Feature Engineering (FlightsDB)

### Features without using mean/median/mode

We have selected the following features to add aggregated by Loyalty#:
- `TotalFlights:` Total number of flights taken by the customer over their entire membership period.

- `TotalFlightsWithCompanions:` Total number of flights taken with at least one companion.

- `TotalDistanceKM:` Total distance flown (in kilometers) across all flights taken by the customer.

- `TotalPointsAccumulated:` Total loyalty points earned throughout the customer’s history.

- `TotalPointsRedeemed:` Total loyalty points redeemed by the customer.

- `TotalAvgDistancePerFlight:` Average distance per flight calculated across all trips in the customer’s history.


In [4]:
# Convert YearMonthDate to datetime
flightsDB['YearMonthDate'] = pd.to_datetime(flightsDB['YearMonthDate'])
flightsDB['Month'] = flightsDB['YearMonthDate'].dt.month
flightsDB['Year'] = flightsDB['YearMonthDate'].dt.year

# Aggregate by customer (lifetime summary metrics)
flights_agg = flightsDB.groupby('Loyalty#').agg({
    'NumFlights': 'sum',
    'NumFlightsWithCompanions': 'sum',
    'DistanceKM': 'sum',
    'PointsAccumulated': 'sum',
    'PointsRedeemed': 'sum',
}).reset_index()

# Rename aggregated columns directly
flights_agg.rename(columns={
    'NumFlights': 'TotalFlights',
    'NumFlightsWithCompanions': 'TotalFlightsWithCompanions',
    'DistanceKM': 'TotalDistanceKM',
    'PointsAccumulated': 'TotalPointsAccumulated',
    'PointsRedeemed': 'TotalPointsRedeemed',
}, inplace=True)

# Add new aggregated features to the flightsDB
flightsDB_expanded = flightsDB.merge(flights_agg, on='Loyalty#', how='left')

# Preview
flightsDB_expanded.describe().T


Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Loyalty#,608436.0,550037.873084,100018.0,326961.0,550834.0,772194.0,999986.0,258935.180575
Year,608436.0,2020.0,2019.0,2019.0,2020.0,2021.0,2021.0,0.816497
Month,608436.0,6.5,1.0,3.75,6.5,9.25,12.0,3.452055
YearMonthDate,608436.0,2020-06-16 02:40:00,2019-01-01 00:00:00,2019-09-23 12:00:00,2020-06-16 00:00:00,2021-03-08 18:00:00,2021-12-01 00:00:00,
NumFlights,608436.0,3.908107,0.0,0.0,0.0,7.2,21.0,5.057889
NumFlightsWithCompanions,608436.0,0.983944,0.0,0.0,0.0,0.9,11.0,2.003785
DistanceKM,608436.0,7939.341419,0.0,0.0,856.4,15338.175,42040.0,10260.421873
PointsAccumulated,608436.0,793.777781,0.0,0.0,85.275,1533.7125,4204.0,1025.918521
PointsRedeemed,608436.0,235.251678,0.0,0.0,0.0,0.0,7496.0,983.233374
DollarCostPointsRedeemed,608436.0,2.324835,0.0,0.0,0.0,0.0,74.0,9.725168


### Features without using mean/median/mode

We will add the following features to our model in order to better analyse the customers and hopefully better segment them in the future.

- `PointsRedemptionRatio:` The proportion of earned points that a customer has redeemed. Calculated as PointsRedeemed / PointsAccumulated. Shows engagement with the loyalty program and redemption behavior.

- `FlightWithCompanionRatio:` Ratio of flights where the customer traveled with companions. Calculated as NumFlightsWithCompanions / NumFlights. Reveals travel habits for group or family targeting.

- `FlightsVariance:` Variance or standard deviation of number of flights per customer over time. Calculated as groupby(Loyalty#)['NumFlights'].var(). Measures consistency in travel patterns.

- `Recency:` Number of months since the customer last took a trip. Calculated as the time difference between the dataset’s latest recorded date and the customer’s most recent active flight month. A core RFM metric used to evaluate engagement freshness and identify churn risk.

- `DistancePerFlight`: 

In [5]:
# === FEATURE ENGINEERING ON flightsDB_expanded === #

# Latest date in dataset 
latest_date = flightsDB['YearMonthDate'].max()

# Group by customer using aggregated values
grouped = flightsDB_expanded.groupby('Loyalty#')

# PointsRedemptionRatio (Total)
flightsDB_expanded['PointsRedemptionRatio'] = np.where(
    flightsDB_expanded['TotalPointsAccumulated'] > 0,
    flightsDB_expanded['TotalPointsRedeemed'] / flightsDB_expanded['TotalPointsAccumulated'],
    0
)

# Total Flight With Companion Ratio (lifetime)
flightsDB_expanded['TotalFlightWithCompanionRatio'] = np.where(
    flightsDB_expanded['TotalFlights'] > 0,
    flightsDB_expanded['TotalFlightsWithCompanions'] / flightsDB_expanded['TotalFlights'],
    0
)

# FlightsVariance (month-to-month consistency)
flightsDB_expanded['FlightsVariance'] = grouped['NumFlights'].transform('var').fillna(0)

# Recency (months since last active flight)
active_flights = flightsDB_expanded[flightsDB_expanded['NumFlights'] > 0]
last_active_flight = active_flights.groupby('Loyalty#')['YearMonthDate'].max()

flightsDB_expanded['Recency'] = (
    (latest_date - flightsDB_expanded['Loyalty#'].map(last_active_flight)).dt.days / 30
).fillna(999).astype(int)

# Total Distance per Flight
flightsDB_expanded['DistancePerFlight'] = np.where(
    flightsDB_expanded['TotalFlights'] > 0,
    flightsDB_expanded['TotalDistanceKM'] / flightsDB_expanded['TotalFlights'],
    0
)

flightsDB_expanded.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Loyalty#,608436.0,550037.873084,100018.0,326961.0,550834.0,772194.0,999986.0,258935.180575
Year,608436.0,2020.0,2019.0,2019.0,2020.0,2021.0,2021.0,0.816497
Month,608436.0,6.5,1.0,3.75,6.5,9.25,12.0,3.452055
YearMonthDate,608436.0,2020-06-16 02:40:00,2019-01-01 00:00:00,2019-09-23 12:00:00,2020-06-16 00:00:00,2021-03-08 18:00:00,2021-12-01 00:00:00,
NumFlights,608436.0,3.908107,0.0,0.0,0.0,7.2,21.0,5.057889
NumFlightsWithCompanions,608436.0,0.983944,0.0,0.0,0.0,0.9,11.0,2.003785
DistanceKM,608436.0,7939.341419,0.0,0.0,856.4,15338.175,42040.0,10260.421873
PointsAccumulated,608436.0,793.777781,0.0,0.0,85.275,1533.7125,4204.0,1025.918521
PointsRedeemed,608436.0,235.251678,0.0,0.0,0.0,0.0,7496.0,983.233374
DollarCostPointsRedeemed,608436.0,2.324835,0.0,0.0,0.0,0.0,74.0,9.725168


NÃO COLOCAR A PRÓXIMA CÉLULA EM CÓDIGO, GERA ERROS NO GITHUB!!!!!!

 Folder path
folder_path = "feature engineering"

 Create folder if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

 Save expanded dataset to CSV inside that folder
file_path = os.path.join(folder_path, "flightsDB_expanded.csv")
flightsDB_expanded.to_csv(file_path, index=False)

print(f"CSV file successfully created: {file_path}")


# Feature Engineering (CustomerDB)

### Features without using mean/median/mode

We have selected the following features.
- `IsMarried:` Indicates whether the customer is married (1 = married, 0 = not married).

- `IsUrban:` Indicates whether the customer resides in an urban area based on their location code.

- `IsHighIncome:` Flags customers whose income exceeds the 75th percentile within the dataset.

- `IncomeBins:` Groups customers into discrete income categories ranging from low to very high.

- `HouseholdType:` Combined indicator describing both marital status and location type of the customer.

- `YearsSinceEnrollment:` Number of years since the customer joined the loyalty program.

- `IsActive:` Indicates whether the customer is currently active in the loyalty program (no cancellation date).

- `CLVBins:` Categorizes customers into five groups based on their Customer Lifetime Value (CLV), from lowest to highest.


In [6]:
# Convert enrollment and cancellation dates to datetime
customerDB['EnrollmentDateOpening'] = pd.to_datetime(customerDB['EnrollmentDateOpening'], errors='coerce')
customerDB['CancellationDate'] = pd.to_datetime(customerDB['CancellationDate'], errors='coerce')

# Today reference (or max date from flightsDB if preferred)
reference_date = pd.Timestamp.today()

# IsMarried
customerDB['IsMarried'] = customerDB['Marital Status'].str.lower().eq('married').astype(int)

# IsUrban
customerDB['IsUrban'] = customerDB['Location Code'].str.lower().eq('urban').astype(int)

# IsHighIncome → Above 75th percentile
income_threshold = customerDB['Income'].quantile(0.75)
customerDB['IsHighIncome'] = (customerDB['Income'] > income_threshold).astype(int)

# IncomeBins (quartiles)
# Compute quartile bins with duplicate edges dropped
bins = pd.qcut(
    customerDB['Income'],
    q=4,
    duplicates='drop'
)

# Assign category labels based on number of bins actually created
bins_labels = ['Low', 'Med-Low', 'Med-High', 'High'][:bins.cat.categories.size]
customerDB['IncomeBins'] = pd.qcut(
    customerDB['Income'],
    q=4,
    labels=bins_labels,
    duplicates='drop'
)


# HouseholdType → Marital + Location combo
customerDB['HouseholdType'] = (
    customerDB['Location Code'].fillna('Unknown') + '-' +
    customerDB['Marital Status'].fillna('Unknown')
)

# YearsSinceEnrollment
customerDB['YearsSinceEnrollment'] = (
    (reference_date - customerDB['EnrollmentDateOpening']).dt.days / 365.25
).fillna(0).clip(lower=0)

# IsActive → No cancellation = still active
customerDB['IsActive'] = customerDB['CancellationDate'].isna().astype(int)

# CLVBins → Categorize Customer Lifetime Value into 5 bins
clv_bins = 5
customerDB['CLVBins'] = pd.qcut(
    customerDB['Customer Lifetime Value'],
    q=clv_bins,
    labels=["Very Low", "Low", "Medium", "High", "Very High"],
    duplicates='drop'  # handle case where not enough unique values exist
)


# Create expanded dataset (copy including new features)
customerDB_expanded = customerDB.copy()

# Preview result
customerDB_expanded.describe().T


Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Latitude,16921.0,47.1745,42.984924,44.231171,46.087818,49.28273,60.721188,3.307971
Longitude,16921.0,-91.814768,-135.05684,-120.23766,-79.383186,-74.596184,-52.712578,22.242429
Income,16901.0,37758.0384,0.0,0.0,34161.0,62396.0,99981.0,30368.992499
EnrollmentDateOpening,16921.0,2018-10-05 09:50:05.567046912,2015-01-27 00:00:00,2017-01-17 00:00:00,2018-10-31 00:00:00,2020-07-09 00:00:00,2021-12-30 00:00:00,
CancellationDate,2308.0,2019-12-14 06:51:09.670710528,2015-01-27 00:00:00,2019-01-29 18:00:00,2020-01-12 00:00:00,2021-02-13 00:00:00,2021-12-30 00:00:00,
Customer Lifetime Value,16901.0,7990.460188,1898.01,3979.72,5780.18,8945.69,83325.38,6863.173093
IsMarried,16921.0,0.581644,0.0,0.0,1.0,1.0,1.0,0.493304
IsUrban,16921.0,0.327227,0.0,0.0,0.0,1.0,1.0,0.469215
IsHighIncome,16921.0,0.249631,0.0,0.0,0.0,0.0,1.0,0.432812
YearsSinceEnrollment,16921.0,7.076222,3.841205,5.316906,7.00616,8.791239,10.765229,1.968457


### Features using mean/median/mode

We have selected these features using the mean/median/mode:
- `IncomeAboveMean`: Indicates whether the customer's income is above the overall average income.

- `IncomeAboveMedian`: Indicates whether the customer's income is above the overall median income.

- `CityMeanIncome`: The average income of all customers residing in the same city.

- `EducationMeanIncome`: The average income of customers who share the same education level.

- `IncomeVsEducationMean`: Difference between the customer’s income and the average income for their education group.

- `MaritalStatusEducationMode`: The most common education level among customers with the same marital status.

- `EducationMatchesMaritalMode`: Indicates whether the customer’s education matches the most common education for their marital status group.

- `CityMeanCLV`: The average Customer Lifetime Value (CLV) for customers living in the same city.

- `CLVvsCityMean`: Difference between the customer’s CLV and the average CLV for their city.


In [7]:

# Copy base dataset to expanded version
customerDB_expanded = customerDB.copy()

# --- 1 & 2. Income Above Mean / Median ---
income_mean = customerDB_expanded['Income'].mean()
income_median = customerDB_expanded['Income'].median()

customerDB_expanded['IncomeAboveMean'] = (customerDB_expanded['Income'] > income_mean).astype(int)
customerDB_expanded['IncomeAboveMedian'] = (customerDB_expanded['Income'] > income_median).astype(int)

# --- 3. City Mean Income ---
customerDB_expanded['CityMeanIncome'] = customerDB_expanded.groupby('City')['Income'] \
    .transform('mean')

# --- 4. Education Mean Income ---
customerDB_expanded['EducationMeanIncome'] = customerDB_expanded.groupby('Education')['Income'] \
    .transform('mean')

# --- 5. Income vs Education Mean ---
customerDB_expanded['IncomeVsEducationMean'] = (
    customerDB_expanded['Income'] - customerDB_expanded['EducationMeanIncome']
)

# --- 6. MaritalStatusEducationMode ---
marital_status_edu_mode = (
    customerDB_expanded.groupby('Marital Status')['Education']
    .agg(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
)

customerDB_expanded['MaritalStatusEducationMode'] = (
    customerDB_expanded['Marital Status']
    .map(marital_status_edu_mode)
)

# --- 7. Education Matches Marital Mode? (binary flag) ---
customerDB_expanded['EducationMatchesMaritalMode'] = (
    customerDB_expanded['Education'] == customerDB_expanded['MaritalStatusEducationMode']
).astype(int)

# --- 8. City Mean CLV ---
customerDB_expanded['CityMeanCLV'] = customerDB_expanded.groupby('City')['Customer Lifetime Value'] \
    .transform('mean')

# --- 9. CLV vs City Mean ---
customerDB_expanded['CLVvsCityMean'] = (
    customerDB_expanded['Customer Lifetime Value'] - customerDB_expanded['CityMeanCLV']
)

# ✅ Done — preview expanded table
customerDB_expanded.describe().T


Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Latitude,16921.0,47.1745,42.984924,44.231171,46.087818,49.28273,60.721188,3.307971
Longitude,16921.0,-91.814768,-135.05684,-120.23766,-79.383186,-74.596184,-52.712578,22.242429
Income,16901.0,37758.0384,0.0,0.0,34161.0,62396.0,99981.0,30368.992499
EnrollmentDateOpening,16921.0,2018-10-05 09:50:05.567046912,2015-01-27 00:00:00,2017-01-17 00:00:00,2018-10-31 00:00:00,2020-07-09 00:00:00,2021-12-30 00:00:00,
CancellationDate,2308.0,2019-12-14 06:51:09.670710528,2015-01-27 00:00:00,2019-01-29 18:00:00,2020-01-12 00:00:00,2021-02-13 00:00:00,2021-12-30 00:00:00,
Customer Lifetime Value,16901.0,7990.460188,1898.01,3979.72,5780.18,8945.69,83325.38,6863.173093
IsMarried,16921.0,0.581644,0.0,0.0,1.0,1.0,1.0,0.493304
IsUrban,16921.0,0.327227,0.0,0.0,0.0,1.0,1.0,0.469215
IsHighIncome,16921.0,0.249631,0.0,0.0,0.0,0.0,1.0,0.432812
YearsSinceEnrollment,16921.0,7.076222,3.841205,5.316906,7.00616,8.791239,10.765229,1.968457


 Folder path
folder_path = "feature engineering"

 Create folder if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

 Save expanded dataset to CSV inside that folder
file_path = os.path.join(folder_path, "customerDB_expanded.csv")
customerDB_expanded.to_csv(file_path, index=False)

print(f"CSV file successfully created: {file_path}")


# Feature Engineering combined with both datasets

## Merge Datasets

df_final = pd.merge(flightsDB, customerDB, on='Loyalty#', how='inner')
df_final.head()

df_final['FlightsPerIncome'] = df_final['NumFlights'] / df_final['Income']
df_final['NetPointsPerIncome'] = df_final['NetPoints'] / df_final['Income']
df_final['DistancePerLifetime'] = df_final['DistanceKM'] / df_final['CustomerLifetimeMonths']

In [8]:
# Create a second expanded flightsDB, keeping only one row per customer with aggregated features
flightsDB_expanded2 = flightsDB.merge(flights_agg, on='Loyalty#', how='right')

# Merge customerDB_expanded with flightsDB_expanded2 on Loyalty#
dfs_merged = customerDB_expanded.join(flightsDB_expanded2, on="Loyalty#", how="left")

# Fill only numeric columns with 0
num_cols = dfs_merged.select_dtypes(include=['number']).columns
dfs_merged[num_cols] = dfs_merged[num_cols].fillna(0)

# Keep categorical NA values untouched

from intertools import zip_longest

### Features without using mean/median

FlightsPerYearOfMembership
FlightsPerIncome
OutstandingPointsPerIncome
DistancePerCLV
PointsRedeemedPerCLVDollar
RedemptionRateByLoyaltyStatus
CompanionRatioByMaritalStatus
DistanceTraveledByEducationLevel
FlightsPerUrbanicityClass
HighValueCustomerActivityRate

In [9]:
# Defensive helper for zero-division
def safe_div(numerator, denominator):
    return (numerator / denominator).replace([np.inf, -np.inf], 0).fillna(0)


# FlightsPerYearOfMembership
dfs_merged["FlightsPerYearOfMembership"] = safe_div(
    dfs_merged["TotalFlights"],
    dfs_merged["YearsSinceEnrollment"]
)

# FlightsPerIncome
dfs_merged["FlightsPerIncome"] = safe_div(
    dfs_merged["TotalFlights"],
    dfs_merged["Income"]
)

# OutstandingPointsPerIncome
dfs_merged["TotalOutstandingPoints"] = (
    dfs_merged["TotalPointsAccumulated"] - dfs_merged["TotalPointsRedeemed"]
).clip(lower=0)

dfs_merged["OutstandingPointsPerIncome"] = safe_div(
    dfs_merged["TotalOutstandingPoints"],
    dfs_merged["Income"]
)

# DistancePerCLV
dfs_merged["DistancePerCLV"] = safe_div(
    dfs_merged["TotalDistanceKM"],
    dfs_merged["Customer Lifetime Value"]
)

# PointsRedeemedPerCLVDollar
dfs_merged["PointsRedeemedPerCLVDollar"] = safe_div(
    dfs_merged["TotalPointsRedeemed"],
    dfs_merged["TotalDollarCostPointsRedeemed"]
)

# RedemptionRateByLoyaltyStatus
loyalty_points_mean = dfs_merged.groupby("LoyaltyStatus")["TotalPointsRedeemed"].transform("mean")
dfs_merged["RedemptionRateByLoyaltyStatus"] = safe_div(
    dfs_merged["TotalPointsRedeemed"],
    loyalty_points_mean
)

# CompanionRatioByMaritalStatus
marital_comp_mean = dfs_merged.groupby("Marital Status")["TotalFlightsWithCompanions"].transform("mean")
dfs_merged["CompanionRatioByMaritalStatus"] = safe_div(
    dfs_merged["TotalFlightsWithCompanions"],
    marital_comp_mean
)

# DistanceTraveledByEducationLevel
edu_distance_mean = dfs_merged.groupby("Education")["TotalDistanceKM"].transform("mean")
dfs_merged["DistanceTraveledByEducationLevel"] = safe_div(
    dfs_merged["TotalDistanceKM"],
    edu_distance_mean
)

# FlightsPerUrbanicityClass
urban_flights_mean = dfs_merged.groupby("Location Code")["TotalFlights"].transform("mean")
dfs_merged["FlightsPerUrbanicityClass"] = safe_div(
    dfs_merged["TotalFlights"],
    urban_flights_mean
)

# HighValueCustomerActivityRate
dfs_merged["HighValueCustomerActivityRate"] = safe_div(
    dfs_merged["TotalFlights"],
    dfs_merged["Customer Lifetime Value"]
)

# Final result preview
dfs_merged.describe().T


KeyError: 'TotalDollarCostPointsRedeemed'

In [None]:
len(dfs_merged.columns)

78

### Features using mean/median

 Folder path
folder_path = "feature engineering"

 Create folder if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

 Save expanded dataset to CSV inside that folder
file_path = os.path.join(folder_path, "merged_datasets_expanded.csv")
dfs_merged.to_csv(file_path, index=False)

print(f"CSV file successfully created: {file_path}")
