# Data Mining Project - Group XX 2025/2026

# Import Libraries

In [None]:
import sqlite3
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil

from itertools import product
from ydata_profiling import ProfileReport

# for better resolution plots
%config InlineBackend.figure_format = 'retina'

# SVG output can be zoomed indefinitely without losing quality, but it is sometimes slower,
# so for now we use retina instead.


# Apply seaborn's default plot theme
sns.set()

# Loading the Data

Import the datasets from CSV files. The flights and customer tables use commas as column separators and take the unique customer identifier (`Loyalty#`) as their index; the metadata file is semicolon-separated and has no header row.

In [None]:
# Flights and customer tables: comma-separated, indexed by the customer id.
flightsDB = pd.read_csv(
    'DM_AIAI_FlightsDB.csv',
    sep=",",
    index_col="Loyalty#",
)
customerDB = pd.read_csv(
    'DM_AIAI_CustomerDB.csv',
    sep=",",
    index_col="Loyalty#",
)

# Metadata file: semicolon-separated and read without a header row.
metaData = pd.read_csv(
    'DM_AIAI_Metadata.csv',
    sep=";",
    header=None,
)

Remove the 'Unnamed' column referring to a sequential numbering of the rows, as we set the column "Loyalty#" as the index

In [None]:
# Drop the leftover row-number column by name rather than by position.
# A positional slice (iloc[:, 1:]) removes one more real column every time
# this cell is re-run; dropping by name is safe to execute repeatedly and is
# a no-op when no such column exists.
unnamed_cols = [col for col in customerDB.columns if str(col).startswith('Unnamed')]
customerDB = customerDB.drop(columns=unnamed_cols)
customerDB

# Metadata

**FlightsDB Database Variable Description**
- **Loyalty#:**	Unique customer identifier linking to CustomerDB
- **Year:**	Year of flight activity record
- **Month:**	Month of flight activity record (1-12)
- **YearMonthDate:**	First day of the month for the activity period
- **NumFlights:**	Total number of flights taken by customer in the month
- **NumFlightsWithCompanions:**	Number of flights where customer traveled with companions
- **DistanceKM:**	Total distance traveled in kilometers for the month
- **PointsAccumulated:**	Loyalty points earned by customer during the month
- **PointsRedeemed:**	Loyalty points spent/redeemed by customer during the month
- **DollarCostPointsRedeemed:**	Dollar value of points redeemed during the month

**CustomerDB Database Variable Description**
- **Loyalty#:**  Unique customer identifier for loyalty program members
- **First Name:**   Customer's first name
- **Last Name:**   Customer's last name 
- **Customer Name:** Customer's full name (concatenated)
- **Country:**	Customer's country of residence
- **Province or State:**	Customer's province or state
- **City:**	Customer's city of residence
- **Latitude:**	Geographic latitude coordinate of customer location
- **Longitude:**	Geographic longitude coordinate of customer location
- **Postal code:**	Customer's postal/ZIP code
- **Gender:**	Customer's gender
- **Education:**	Customer's highest education level (Bachelor, College, etc.)
- **Location Code:**	Urban/Suburban/Rural classification of customer residence
- **Income:**	Customer's annual income
- **Marital Status:**	Customer's marital status (Married, Single, Divorced)
- **LoyaltyStatus:**	Current tier status in loyalty program (Star > Nova > Aurora)
- **EnrollmentDateOpening:**	Date when customer joined the loyalty program
- **CancellationDate:**	Date when customer left the program
- **Customer Lifetime Value:**	Total calculated monetary value of customer relationship
- **EnrollmentType:**	Method of joining loyalty program

# Data Preparation

## Feature Engineering (FlightsDB)

In [None]:
# Ratio features below divide by NumFlights or PointsRedeemed. Rows where the
# denominator is 0 would otherwise yield +/-inf (or NaN for 0/0); mapping the
# zero denominators to NaN makes every such ratio consistently NaN, so later
# means and scaling steps are not distorted by infinite values.

# AvgDistancePerFlight = DistanceKM / NumFlights
flightsDB['AvgDistancePerFlight'] = flightsDB['DistanceKM'] / flightsDB['NumFlights'].replace(0, np.nan)

# PropFlightsWithCompanions = NumFlightsWithCompanions / NumFlights
flightsDB['PropFlightsWithCompanions'] = flightsDB['NumFlightsWithCompanions'] / flightsDB['NumFlights'].replace(0, np.nan)

# NetPoints = PointsAccumulated - PointsRedeemed
flightsDB['NetPoints'] = flightsDB['PointsAccumulated'] - flightsDB['PointsRedeemed']

# DollarPerPoint = DollarCostPointsRedeemed / PointsRedeemed
flightsDB['DollarPerPoint'] = flightsDB['DollarCostPointsRedeemed'] / flightsDB['PointsRedeemed'].replace(0, np.nan)

In [None]:
# Display the first rows of flightsDB, including the engineered columns
flightsDB.head()

**Note:** the cell below applies if we decide to build customer-level features from aggregate statistics (sum, mean, median, ...).

In [None]:
# Transformar YearMonthDate em datetime
flights['YearMonthDate'] = pd.to_datetime(flights['YearMonthDate'])
flights['Month'] = flights['YearMonthDate'].dt.month
flights['Year'] = flights['YearMonthDate'].dt.year

# Exemplo de agregações mensais/anuais por cliente
agg_flights = flights.groupby('Loyalty#').agg({
    'NumFlights': 'sum',
    'NumFlightsWithCompanions': 'sum',
    'DistanceKM': 'sum',
    'PointsAccumulated': 'sum',
    'PointsRedeemed': 'sum',
    'DollarCostPointsRedeemed': 'sum',
    'AvgDistancePerFlight': 'mean',
    'PropFlightsWithCompanions': 'mean',
    'NetPoints': 'sum',
    'DollarPerPoint': 'mean'
}).reset_index()

## Feature Engineering (customerDB)

In [None]:
# transform EnrollmentDateOpening in datetime and calculate CustomerLifetimeMonths
# Parse EnrollmentDateOpening as datetime and derive the customer lifetime in
# months, approximated as elapsed days / 30.
# NOTE(review): the lifetime is measured up to the day the notebook is run, so
# the value changes between runs and does not take CancellationDate into
# account — confirm this is intended.
customerDB['EnrollmentDateOpening'] = pd.to_datetime(customerDB['EnrollmentDateOpening'])
customerDB['CustomerLifetimeMonths'] = ((pd.Timestamp.today() - customerDB['EnrollmentDateOpening']).dt.days) / 30

# Encode LoyaltyStatus as an ordinal code (Aurora < Nova < Star)
status_map = {'Aurora': 0, 'Nova': 1, 'Star': 2}
customerDB['LoyaltyStatusCode'] = customerDB['LoyaltyStatus'].map(status_map)

# Bin Income into ordered categories.
# include_lowest=True closes the first bin on the left, so an income of
# exactly 0 is labelled 'Low' instead of falling outside (0, 40000] and
# becoming NaN.
income_bins = [0, 40000, 80000, 150000, float('inf')]
income_labels = ['Low', 'Medium', 'High', 'Very High']
customerDB['IncomeCategory'] = pd.cut(
    customerDB['Income'],
    bins=income_bins,
    labels=income_labels,
    include_lowest=True,
)


In [None]:
# Display the first rows of customerDB, including the engineered columns
customerDB.head()

## Merge Datasets

In [None]:
# Inner join on the shared customer identifier: only flight records whose
# Loyalty# is also present in customerDB are kept.
df_final = flightsDB.merge(customerDB, how='inner', on='Loyalty#')
df_final.head()

## Feature Engineering combined with both datasets

In [None]:
# Income-normalised features. An Income of 0 would make the ratio +/-inf
# (or NaN for 0/0); mapping zero incomes to NaN before dividing leaves the
# ratio undefined (NaN) instead.
income_nonzero = df_final['Income'].replace(0, np.nan)
df_final['FlightsPerIncome'] = df_final['NumFlights'] / income_nonzero
df_final['NetPointsPerIncome'] = df_final['NetPoints'] / income_nonzero

# Distance flown in the record relative to the customer's lifetime in months
df_final['DistancePerLifetime'] = df_final['DistanceKM'] / df_final['CustomerLifetimeMonths']

In [None]:
# Display the first rows of the merged table with the combined features
df_final.head()