# 0.0 Imports

In [39]:
import pandas as pd
import numpy  as np
import inflection
import umap.umap_ as umap
from typing import Tuple
from functools import reduce 
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import silhouette_score

In [2]:
# !pip install inflection

Collecting inflection
  Using cached inflection-0.5.1-py2.py3-none-any.whl (9.5 kB)
Installing collected packages: inflection
Successfully installed inflection-0.5.1


In [32]:
# !pip install umap-learn -q

# 1.0 Loading Data

In [4]:
%%bigquery df_raw
-- Load every column of the e-commerce table into the variable `df_raw`,
-- keeping only rows whose InvoiceDate is on or before 2023-05-15.

SELECT * FROM gcpproject-394221.ecommerce_database.ecommerce_table2 WHERE InvoiceDate <= "2023-05-15"

Query is running:   0%|          |

Downloading:   0%|          |

In [5]:
# Preview the first rows of the freshly loaded frame as a sanity check.
df_raw.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,562935,23199,JUMBO BAG APPLES,30,2017-08-08,2.08,14156.0,EIRE
1,562935,23200,JUMBO BAG PEARS,30,2017-08-08,2.08,14156.0,EIRE
2,562935,85123A,WHITE HANGING HEART T-LIGHT HOLDER,64,2017-08-08,2.55,14156.0,EIRE
3,562935,23322,LARGE WHITE HEART OF WICKER,40,2017-08-08,2.55,14156.0,EIRE
4,562935,22386,JUMBO BAG PINK POLKADOT,50,2017-08-08,2.08,14156.0,EIRE


## 1.1 Rename Columns

In [6]:
# Convert the original CamelCase column names to snake_case.
old_cols = [
    'InvoiceNo', 'StockCode', 'Description', 'Quantity',
    'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country',
]

# `inflection.underscore` already does the CamelCase -> snake_case conversion.
snake_case = inflection.underscore

new_cols = [snake_case(col) for col in old_cols]
df_raw.columns = new_cols

## 1.2 Data Dimensions

In [7]:
# Report the dataset dimensions: number of rows, then number of columns.
print(f"Número de linhas do conjunto de dados: {df_raw.shape[0]}")
print(f"Número de colunas do conjunto de dados: {df_raw.shape[1]}")

Número de linhas do conjunto de dados: 404772
Número de colunas do conjunto de dados: 8


## 1.3 Data Types

In [8]:
# Inspect the dtype of every column before deciding which ones need casting.
df_raw.dtypes

invoice_no       object
stock_code       object
description      object
quantity          Int64
invoice_date     dbdate
unit_price      float64
customer_id     float64
country          object
dtype: object

## 1.4 Change Types

In [9]:
# Parse invoice_date (loaded as `dbdate`) into pandas datetimes so date arithmetic works downstream.
df_raw['invoice_date'] = pd.to_datetime(df_raw['invoice_date'])
# Cast customer_id from float64 to int. astype(int) raises on NaN, so this
# presumably relies on the loaded table having no missing customer IDs — TODO confirm.
df_raw['customer_id'] = df_raw['customer_id'].astype(int)

# 2.0 Data Preparation

In [10]:
def coluna_para_int(dataframe: pd.DataFrame, nome_coluna: str) -> bool:
    """
    Convert one column of a dataframe to integer dtype, in place.

    Args:
        dataframe (pd.DataFrame): Dataframe to modify; the column is overwritten.
        nome_coluna (str): Name of the column to convert.

    Returns:
        bool: True when the conversion succeeds. Failures are raised, never
        reported as False.

    Raises:
        ValueError: If the column does not exist, or if its values cannot be
            cast to int (for example NaN or non-numeric strings). The
            underlying pandas error is attached as ``__cause__``.
    """
    if nome_coluna not in dataframe.columns:
        raise ValueError(f'Coluna {nome_coluna} nao encontrada')

    try:
        dataframe[nome_coluna] = dataframe[nome_coluna].astype(int)
    except (ValueError, TypeError, OverflowError) as exc:
        # Catch conversion failures only: a bare `except` would also trap
        # KeyboardInterrupt/SystemExit and hide the real cause of the error.
        raise ValueError(
            f'Nao foi possivel converter a coluna {nome_coluna} para inteiro.'
        ) from exc
    return True

In [11]:
# Make sure customer_id is stored as integers; the helper returns True on success and raises ValueError otherwise.
coluna_para_int(df_raw, 'customer_id')

True

In [12]:
def coluna_para_date(dataframe: pd.DataFrame, nome_coluna: str, formato_data: str = None) -> bool:
    """
    Convert one column of a dataframe to pandas datetimes, in place.

    Args:
        dataframe (pd.DataFrame): Dataframe to modify; the column is overwritten.
        nome_coluna (str): Name of the column to convert.
        formato_data (str, optional): strftime-style format passed to
            ``pd.to_datetime``. When omitted (or empty) pandas infers the format.

    Returns:
        bool: True when the conversion succeeds. Failures are raised, never
        reported as False.

    Raises:
        ValueError: If the column does not exist or its values cannot be
            parsed as dates. The underlying pandas error is attached as
            ``__cause__``.
    """
    if nome_coluna not in dataframe.columns:
        raise ValueError(f'Coluna {nome_coluna} nao foi encontrada no dataframe de entrada')

    try:
        # `format=None` is pandas' default, so a single call covers both the
        # explicit-format and the inferred-format cases.
        dataframe[nome_coluna] = pd.to_datetime(
            dataframe[nome_coluna], format=formato_data or None
        )
    except (ValueError, TypeError) as exc:
        # Chain the original error so the offending value stays visible.
        raise ValueError(f"Could not convert column '{nome_coluna}'") from exc

    return True

In [13]:
def keep_features(dataframe: pd.DataFrame, keep_columns: list) -> pd.DataFrame:
    """
    Select a subset of columns from a DataFrame.

    Args:
        dataframe (pd.DataFrame): Frame to select from.
        keep_columns (list): Names of the columns to keep, in the desired order.

    Returns:
        pd.DataFrame: The input restricted to ``keep_columns``.
    """
    selected = dataframe[keep_columns]
    return selected

def filtragem_features(dataframe_raw: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split the raw transactions into the three frames used downstream.

    Args:
        dataframe_raw (pd.DataFrame): Transactions with at least the columns
            invoice_no, stock_code, quantity, invoice_date, unit_price,
            customer_id and country.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
            - df_filtered: every input row, restricted to the seven columns
              listed above (no row filtering is applied).
            - df_purchases: rows with quantity >= 0, all columns.
            - df_returns: rows with quantity < 0, columns customer_id and
              quantity only.
    """
    quantity = dataframe_raw['quantity']

    # Negative quantities are treated as returns, everything else as purchases.
    returns = dataframe_raw.loc[quantity < 0, ['customer_id', 'quantity']]
    purchases = dataframe_raw.loc[quantity >= 0, :]

    # Main frame: same rows as the input, only the columns needed later.
    main_columns = [
        'invoice_no', 'stock_code', 'quantity', 'invoice_date',
        'unit_price', 'customer_id', 'country',
    ]
    filtered = keep_features(dataframe_raw, main_columns)

    return filtered, purchases, returns

In [14]:
# Confirm the snake_case column names are in place before splitting the data.
df_raw.columns

Index(['invoice_no', 'stock_code', 'description', 'quantity', 'invoice_date',
       'unit_price', 'customer_id', 'country'],
      dtype='object')

In [15]:
# Split the raw data into the column-trimmed frame, the purchases (quantity >= 0) and the returns (quantity < 0).
df_filtered, df_purchases, df_returns = filtragem_features(df_raw)

In [16]:
# Spot-check the column-trimmed frame.
df_filtered.head(1)

Unnamed: 0,invoice_no,stock_code,quantity,invoice_date,unit_price,customer_id,country
0,562935,23199,30,2017-08-08,2.08,14156,EIRE


In [17]:
# Spot-check the purchases frame (it keeps every original column).
df_purchases.head(1)

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
0,562935,23199,JUMBO BAG APPLES,30,2017-08-08,2.08,14156,EIRE


In [18]:
# Spot-check the returns frame (customer_id and negative quantity only).
df_returns.head(1)

Unnamed: 0,customer_id,quantity
138,13384,-5


# 3.0. Feature engineering

In [19]:
def calculate_gross_revenue(dataframe_purchases: pd.DataFrame) -> pd.DataFrame:
    """
    Compute the gross revenue of each customer.

    Revenue per row is ``quantity * unit_price``; rows are then summed per
    ``customer_id``. The input frame is not modified.

    Args:
        dataframe_purchases (pd.DataFrame): Purchases containing the columns
            'customer_id', 'quantity' and 'unit_price'.

    Returns:
        pd.DataFrame: One row per customer with the columns 'customer_id' and
        'gross_revenue'.

    Raises:
        ValueError: If any of the required columns is missing.
    """
    # Make sure every column needed for the computation is available
    needed = {'customer_id', 'quantity', 'unit_price'}
    absent = needed - set(dataframe_purchases.columns)
    if absent:
        raise ValueError(f"O DataFrame de entrada está faltando as seguintes colunas: {absent}")

    # Revenue per line item; `assign` returns a new frame, leaving the input untouched
    line_revenue = dataframe_purchases['quantity'] * dataframe_purchases['unit_price']
    with_revenue = dataframe_purchases.assign(gross_revenue=line_revenue)

    # Total revenue per customer, with customer_id back as a regular column
    per_customer = with_revenue.groupby('customer_id').agg({'gross_revenue': 'sum'})
    return per_customer.reset_index()

def create_recency(dataframe_purchases: pd.DataFrame, dataframe_filtered: pd.DataFrame) -> pd.DataFrame:
    """
    Compute how many days have passed since each customer's last purchase.

    Args:
        dataframe_purchases (pd.DataFrame): Purchases with the columns
            'customer_id' and 'invoice_date'.
        dataframe_filtered (pd.DataFrame): Frame whose latest 'invoice_date'
            is used as the reference date for the recency computation.

    Returns:
        pd.DataFrame: One row per customer with the columns 'customer_id' and
        'recency_days'.

    Raises:
        ValueError: If 'customer_id' or 'invoice_date' is missing from
            ``dataframe_purchases``.
    """
    needed = {'customer_id', 'invoice_date'}
    absent = needed - set(dataframe_purchases.columns)
    if absent:
        raise ValueError(f"O DataFrame de entrada está faltando as seguintes colunas: {absent}")

    # Date of the most recent purchase of each customer
    last_purchase = (
        dataframe_purchases
        .groupby('customer_id')['invoice_date']
        .max()
        .reset_index()
    )

    # Recency is measured against the most recent date in the filtered data
    reference_date = dataframe_filtered['invoice_date'].max()
    last_purchase['recency_days'] = (reference_date - last_purchase['invoice_date']).dt.days

    return last_purchase[['customer_id', 'recency_days']]

def create_quantity_purchased(dataframe_purchases: pd.DataFrame) -> pd.DataFrame:
    """
    Count the purchased line items of each customer.

    The count is the number of non-null 'stock_code' rows per customer, so a
    product bought on several invoice lines is counted once per line.

    Args:
        dataframe_purchases (pd.DataFrame): Purchases with the columns
            'customer_id' and 'stock_code'.

    Returns:
        pd.DataFrame: One row per customer with the columns 'customer_id' and
        'qty_products'.

    Raises:
        ValueError: If 'customer_id' or 'stock_code' is missing.
    """
    needed = {'customer_id', 'stock_code'}
    absent = needed - set(dataframe_purchases.columns)
    if absent:
        raise ValueError(f"O DataFrame de entrada está faltando as seguintes colunas: {absent}")

    # Number of stock_code entries per customer
    line_counts = dataframe_purchases.groupby('customer_id')['stock_code'].count()

    # Name the count column and turn customer_id back into a regular column
    return line_counts.rename('qty_products').reset_index()
def create_freq_purchases(dataframe_purchases: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the purchase frequency of each customer.

    Frequency is the number of distinct (invoice_no, invoice_date) pairs of a
    customer divided by the inclusive number of days between that customer's
    first and last purchase.

    Parameters
    ----------
    dataframe_purchases : pd.DataFrame
        Purchase history containing the columns 'customer_id', 'invoice_no'
        and 'invoice_date' (datetime).

    Returns
    -------
    pd.DataFrame
        One row per customer with the columns 'customer_id', 'max_' (last
        purchase date), 'min_' (first purchase date), 'days_' (inclusive span
        in days), 'buy_' (number of purchases) and 'frequency'.

    Raises
    ------
    ValueError
        If any of the required columns is missing.
    """
    required_columns = {'customer_id', 'invoice_no', 'invoice_date'}
    missing_columns = required_columns - set(dataframe_purchases.columns)
    if missing_columns:
        raise ValueError(f"O DataFrame de entrada está faltando as seguintes colunas: {missing_columns}")

    # One row per invoice: line items of the same invoice must not inflate the count
    invoices = (dataframe_purchases[['customer_id', 'invoice_no', 'invoice_date']]
                .drop_duplicates())

    # First/last purchase date and number of purchases per customer
    df_aux = (invoices
              .groupby('customer_id')
              .agg(max_=('invoice_date', 'max'),
                   min_=('invoice_date', 'min'),
                   buy_=('invoice_no', 'count'))
              .reset_index())

    # Inclusive span in days. The "+ 1" keeps the span >= 1, so the division
    # below can never hit zero and needs no guard.
    df_aux['days_'] = (df_aux['max_'] - df_aux['min_']).dt.days + 1

    # Vectorized division instead of a row-wise apply
    df_aux['frequency'] = df_aux['buy_'] / df_aux['days_']

    # Keep the historical column order expected by callers
    return df_aux[['customer_id', 'max_', 'min_', 'days_', 'buy_', 'frequency']]
def create_qty_returns(dataframe_returns: pd.DataFrame) -> pd.DataFrame:
    """
    Compute the total quantity of returned products for each customer.

    Args:
        dataframe_returns: A pandas DataFrame with the columns 'customer_id'
            and 'quantity'. Quantities are summed per customer and the sign is
            flipped, so negative return quantities become positive totals.

    Returns:
        A pandas DataFrame with one row per customer and the columns
        'customer_id' and 'qty_returns'.

    Raises:
        ValueError: If 'customer_id' or 'quantity' is missing.
    """
    # Validate input data. The message names the real (snake_case) columns so
    # the caller can see exactly what is missing.
    required_columns = {'customer_id', 'quantity'}
    missing_columns = required_columns - set(dataframe_returns.columns)
    if missing_columns:
        raise ValueError(
            "Input DataFrame must contain 'customer_id' and 'quantity' columns; "
            f"missing: {sorted(missing_columns)}"
        )

    # Sum the quantities per customer
    df_returns = (dataframe_returns[['customer_id', 'quantity']]
                  .groupby('customer_id')
                  .sum()
                  .reset_index()
                  .rename(columns={'quantity': 'qty_returns'}))

    # Flip the sign so the feature reads as "units returned"
    df_returns['qty_returns'] = df_returns['qty_returns'] * -1

    return df_returns
def run_feature_engineering(dataframe_filtered: pd.DataFrame, dataframe_purchases: pd.DataFrame, dataframe_returns: pd.DataFrame) -> pd.DataFrame:
    """
    Build the per-customer feature table from the filtered, purchase and return frames.

    Args:
        dataframe_filtered: Column-trimmed transactions; supplies the list of
            customers and the reference date for recency.
        dataframe_purchases: Purchase transactions used for revenue, recency,
            product count and frequency.
        dataframe_returns: Return transactions ('customer_id', 'quantity').

    Returns:
        A pandas DataFrame with the columns 'customer_id', 'gross_revenue',
        'recency_days', 'qty_products', 'frequency' and 'qty_returns'.
        Customers without returns get qty_returns = 0; customers still missing
        any other feature after the merges are dropped.

    Raises:
        ValueError: If the filtered or purchases frame is empty, or if any
            required column is missing from one of the inputs.
    """
    named_inputs = (
        ('dataframe_filtered', dataframe_filtered),
        ('dataframe_purchases', dataframe_purchases),
    )

    # Reject empty inputs first
    for frame_name, frame in named_inputs:
        if frame.empty:
            raise ValueError(f"Input DataFrame '{frame_name}' is empty")

    # Then verify the columns each frame has to provide
    needed = ('customer_id', 'invoice_date', 'stock_code', 'quantity', 'unit_price')
    for frame_name, frame in named_inputs:
        absent = set(needed) - set(frame.columns)
        if absent:
            raise ValueError(f"Missing columns {absent} in input DataFrame '{frame_name}'")
    for column, label in (('customer_id', 'CustomerID'), ('quantity', 'Quantity')):
        if column not in dataframe_returns.columns:
            raise ValueError(f"Column '{label}' not found in input DataFrame 'dataframe_returns'")

    # Base table: one row per customer
    customers = keep_features(dataframe_filtered, ['customer_id']).drop_duplicates(ignore_index=True)

    # One feature table per engineered feature, all keyed by customer_id
    feature_tables = [
        calculate_gross_revenue(dataframe_purchases),
        create_recency(dataframe_purchases, dataframe_filtered),
        create_quantity_purchased(dataframe_purchases),
        create_freq_purchases(dataframe_purchases),
        create_qty_returns(dataframe_returns),
    ]

    # Left-join every feature table onto the customer list, in order
    merged = customers
    for table in feature_tables:
        merged = pd.merge(merged, table, on='customer_id', how='left')

    # Customers that never returned anything have no row in the returns table
    merged['qty_returns'] = merged['qty_returns'].fillna(0)

    # Keep only the final features and drop customers with any remaining gap
    final_columns = ['customer_id', 'gross_revenue', 'recency_days', 'qty_products', 'frequency', 'qty_returns']
    return keep_features(merged, final_columns).dropna()

In [20]:
# Build the per-customer feature table and preview it.
df_fengi = run_feature_engineering(df_filtered, df_purchases, df_returns)
df_fengi.head()

Unnamed: 0,customer_id,gross_revenue,recency_days,qty_products,frequency,qty_returns
0,14156,117379.63,9.0,1400.0,0.151515,859
1,12594,3208.22,37.0,114.0,0.031469,0
2,12571,519.61,39.0,34.0,0.024096,0
3,12444,4085.46,21.0,172.0,0.033113,0
4,13919,348.81,59.0,26.0,0.031746,37


In [21]:
# Number of customers (rows) and columns left after feature engineering.
df_fengi.shape

(4330, 6)

# 4.0. EDA

In [22]:
# Keep only the numeric features; customer_id is an identifier and is left out.
df_eda = keep_features(df_fengi, ['gross_revenue', 'recency_days', 'qty_products', 'frequency', 'qty_returns'])
df_eda.head()

Unnamed: 0,gross_revenue,recency_days,qty_products,frequency,qty_returns
0,117379.63,9.0,1400.0,0.151515,859
1,3208.22,37.0,114.0,0.031469,0
2,519.61,39.0,34.0,0.024096,0
3,4085.46,21.0,172.0,0.033113,0
4,348.81,59.0,26.0,0.031746,37


# 5.0. Experimentação

## 5.1 K-Clusters

In [23]:
# Candidate numbers of clusters to evaluate: 2, 3 and 4 (the stop value 5 is exclusive).
k_clusters = np.arange(2, 5, 1)
k_clusters

array([2, 3, 4])

## 5.2. Scaler

In [26]:
# Single scaler instance shared by the train_* helpers below; each helper refits it via fit_transform.
scaler = MinMaxScaler()

## 5.3 Dimensionality reduction algorithms

In [28]:
def train_pca(
    scaler, dataframe: pd.DataFrame, to_scaling: bool = True, **kwargs
) -> pd.DataFrame:
    """
    Fit a PCA on the input data and return the projected components.

    Args:
        scaler: Object exposing ``fit_transform``; used to scale the data
            before the PCA when ``to_scaling`` is True.
        dataframe (pd.DataFrame): Input features.
        to_scaling (bool): Whether to scale the data first (default True).
            When False a copy of the dataframe is used unscaled.
        **kwargs: Extra arguments forwarded to the ``PCA`` constructor.

    Returns:
        pd.DataFrame: The principal components, one column per component,
        with default integer column labels and a fresh default index.

    Raises:
        ValueError: If the input dataframe is empty.
    """
    if dataframe.empty:
        raise ValueError("Input dataframe is empty.")

    features = scaler.fit_transform(dataframe) if to_scaling else dataframe.copy()
    components = PCA(**kwargs).fit_transform(features)
    return pd.DataFrame(components)


def train_umap(
    scaler, dataframe: pd.DataFrame, to_scaling: bool = True, **kwargs
) -> pd.DataFrame:
    """
    Fit a UMAP reducer on the input data and return the embedding.

    Args:
        scaler: Object exposing ``fit_transform``; used to scale the data
            before the reduction when ``to_scaling`` is True.
        dataframe (pd.DataFrame): Input features.
        to_scaling (bool): Whether to scale the data first (default True).
            When False a copy of the dataframe is used unscaled.
        **kwargs: Extra arguments forwarded to ``umap.UMAP``. ``random_state``
            defaults to 42 and can be overridden here.

    Returns:
        pd.DataFrame: One column per embedding dimension. The first three are
        named 'embedding_x', 'embedding_y' and 'embedding_z'; any further
        dimension is named 'embedding_<index>'.

    Raises:
        ValueError: If the input dataframe is empty.
    """
    if dataframe.empty:
        raise ValueError("Input dataframe is empty.")

    if to_scaling:
        X = scaler.fit_transform(dataframe)
    else:
        X = dataframe.copy()

    # Merge the default seed with the caller's kwargs: passing random_state
    # both explicitly and via **kwargs would raise a TypeError.
    umap_params = {"random_state": 42, **kwargs}
    reducer = umap.UMAP(**umap_params)
    embedding = reducer.fit_transform(X)

    # Name every returned dimension instead of assuming exactly two, so
    # n_components != 2 neither fails nor silently loses columns.
    axis_names = ["embedding_x", "embedding_y", "embedding_z"]
    columns = [
        axis_names[i] if i < len(axis_names) else f"embedding_{i}"
        for i in range(embedding.shape[1])
    ]
    return pd.DataFrame(embedding, columns=columns)


def train_tsne(
    scaler, dataframe: pd.DataFrame, to_scaling: bool = True, **kwargs
) -> pd.DataFrame:
    """
    Fit a t-SNE reducer on the input data and return the embedding.

    Args:
        scaler: Object exposing ``fit_transform``; used to scale the data
            before the reduction when ``to_scaling`` is True.
        dataframe (pd.DataFrame): Input features.
        to_scaling (bool): Whether to scale the data first (default True).
            When False a copy of the dataframe is used unscaled.
        **kwargs: Extra arguments forwarded to ``TSNE``. ``random_state``
            defaults to 42 and can be overridden here.

    Returns:
        pd.DataFrame: One column per embedding dimension. The first three are
        named 'embedding_x', 'embedding_y' and 'embedding_z'; any further
        dimension is named 'embedding_<index>'.

    Raises:
        ValueError: If the input dataframe is empty.
    """
    if dataframe.empty:
        raise ValueError("Input dataframe is empty.")

    if to_scaling:
        X = scaler.fit_transform(dataframe)
    else:
        X = dataframe.copy()

    # Merge the default seed with the caller's kwargs: passing random_state
    # both explicitly and via **kwargs would raise a TypeError.
    tsne_params = {"random_state": 42, **kwargs}
    reducer = TSNE(**tsne_params)
    embedding = reducer.fit_transform(X)

    # Name every returned dimension instead of assuming exactly two, so
    # n_components != 2 neither fails nor silently loses columns.
    axis_names = ["embedding_x", "embedding_y", "embedding_z"]
    columns = [
        axis_names[i] if i < len(axis_names) else f"embedding_{i}"
        for i in range(embedding.shape[1])
    ]
    return pd.DataFrame(embedding, columns=columns)

def train_tree_embedding(scaler, dataframe: pd.DataFrame,
    target: str = "gross_revenue", **kwargs
) -> pd.DataFrame:
    """
    Build a tree-based embedding from a random forest fitted on the target column.

    A ``RandomForestRegressor`` is trained to predict ``target`` from the
    remaining (scaled) columns; the embedding of each sample is the leaf it
    lands in for every tree of the forest.

    Args:
        scaler: Object exposing ``fit_transform``; always applied to the
            feature columns.
        dataframe (pd.DataFrame): Input data, including the target column.
        target (str): Column used as regression target (default 'gross_revenue').
        **kwargs: Extra arguments forwarded to ``RandomForestRegressor``.
            ``random_state`` defaults to 42 and can be overridden here.

    Returns:
        pd.DataFrame: The result of the forest's ``apply`` — one row per
        sample and one column per tree, holding leaf indices.

    Raises:
        ValueError: If the input dataframe is empty.
    """
    # Same empty-input guard as the other train_* helpers
    if dataframe.empty:
        raise ValueError("Input dataframe is empty.")

    # Separate features and target variable
    X = dataframe.drop(columns=[target])
    X = pd.DataFrame(scaler.fit_transform(X.values),
                     columns=X.columns,
                     index=X.index)
    y = dataframe[target]

    # Merge the default seed with the caller's kwargs: passing random_state
    # both explicitly and via **kwargs would raise a TypeError.
    rf_params = {"random_state": 42, **kwargs}
    rf_model = RandomForestRegressor(**rf_params)
    rf_model.fit(X, y)

    # Leaf index of every sample in every tree
    leaf_indices = rf_model.apply(X)

    return pd.DataFrame(leaf_indices)

In [36]:
# Project the scaled features onto the first two principal components.
df_pca = train_pca(scaler, df_eda, n_components=2)

In [37]:
# UMAP embedding of the scaled features, using the reducer's default settings.
df_umap = train_umap(scaler, df_eda)

In [40]:
# t-SNE embedding of the scaled features, using the reducer's default settings.
df_tsne = train_tsne(scaler, df_eda)



In [41]:
# Tree-based embedding: leaf indices of a random forest trained to predict gross_revenue (the default target).
df_tree = train_tree_embedding(scaler, df_eda)

In [42]:
# Each row is a customer; each column holds the leaf index reached in one tree of the forest.
df_tree.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,5378,5431,5380,5449,5367,5472,5346,5361,5431,5473,...,5472,5401,5397,5379,5372,5365,5393,5411,5433,5435
1,4458,4731,4307,4388,4450,4621,4507,4469,3893,4785,...,4657,3864,4690,4369,3408,4375,4380,4099,4164,4666
2,822,769,837,813,748,767,1160,806,1162,780,...,799,439,835,557,619,821,984,1254,1005,574
3,4633,4733,4306,4411,4402,4739,4726,4498,4131,4792,...,4739,4185,4701,4451,4588,4777,4471,4106,4254,4692
4,829,636,787,897,1151,969,846,648,824,602,...,856,996,589,628,636,837,538,607,1061,549


## 5.4. Algoritmos de clusterização

In [43]:
def train_kmeans(dataframe: pd.DataFrame, k_clusters: list, **kwargs) -> list:
    """
    Fit KMeans for several cluster counts and score each fit with the silhouette.

    Args:
        dataframe: Training features.
        k_clusters: Iterable of integers, the numbers of clusters to try.
        **kwargs: Extra arguments forwarded to the ``KMeans`` constructor.
            ``random_state`` defaults to 42 and can be overridden here.

    Returns:
        A list with one silhouette score (euclidean metric) per value of
        ``k_clusters``, in the same order.
    """
    # Merge the default seed with the caller's kwargs: passing random_state
    # both explicitly and via **kwargs would raise a TypeError.
    model_params = {"random_state": 42, **kwargs}

    silhouette_scores = []

    for k in k_clusters:
        # Model training
        kmeans_model = KMeans(n_clusters=k, **model_params).fit(dataframe)

        # Model predict
        labels = kmeans_model.predict(dataframe)

        # Model performance
        silhouette = silhouette_score(dataframe, labels, metric="euclidean")
        silhouette_scores.append(silhouette)

    return silhouette_scores

def train_gmm(dataframe: pd.DataFrame, k_clusters: list, **kwargs) -> list:
    """
    Fit a Gaussian mixture for several component counts and score each fit with the silhouette.

    Args:
        dataframe: Training features.
        k_clusters: Iterable of integers, the numbers of mixture components to try.
        **kwargs: Extra arguments forwarded to the ``GaussianMixture``
            constructor. ``random_state`` defaults to 42 and can be
            overridden here.

    Returns:
        A list with one silhouette score (euclidean metric) per value of
        ``k_clusters``, in the same order.
    """
    # Merge the default seed with the caller's kwargs: passing random_state
    # both explicitly and via **kwargs would raise a TypeError.
    model_params = {"random_state": 42, **kwargs}

    silhouette_scores = []

    for k in k_clusters:
        # Model training
        gmm_model = GaussianMixture(n_components=k, **model_params).fit(dataframe)

        # Model predict
        labels = gmm_model.predict(dataframe)

        # Model performance
        silhouette = silhouette_score(dataframe, labels, metric="euclidean")
        silhouette_scores.append(silhouette)

    return silhouette_scores


In [44]:
# Silhouette score of KMeans on the PCA projection for each candidate k; n_init is forwarded to KMeans.
kmeans_list = train_kmeans(df_pca, k_clusters, n_init=10)

In [45]:
# Silhouette score of a Gaussian mixture on the PCA projection for each candidate k.
# n_init=100 is forwarded to GaussianMixture; presumably this makes the cell noticeably slower than the KMeans one — TODO confirm it is intended.
gmm_list = train_gmm(df_pca, k_clusters, n_init=100)