In [3]:
import pandas as pd
from sklearn.cluster import KMeans

In [4]:
def rfm_variables(df):
  """Summarise a block of order rows into frequency, monetary and recency figures.

  Reads the columns 'Profit', 'Sales', 'Quantity', 'Shipping Cost' and
  'Order Date' from ``df``.

  Returns a 7-tuple, in this order:
    f_sales   -- number of rows in ``df``
    f_profit  -- number of rows whose 'Profit' is strictly positive
    m_sales   -- sum of 'Sales', rounded to 2 decimals
    m_profit  -- sum of 'Profit', rounded to 2 decimals
    m_qty     -- sum of 'Quantity' (not rounded)
    m_deliver -- sum of 'Shipping Cost', rounded to 2 decimals
    r_days    -- mean gap, in whole days, between consecutive distinct
                 order dates, rounded to 2 decimals (NaN when there are
                 fewer than two distinct dates)
  """
  def _total(coluna):
    # Column total rounded to two decimal places.
    return round(df[coluna].sum(), 2)

  # Frequency: all sales, and sales that made a profit.
  n_vendas = len(df)
  n_lucrativas = int((df['Profit'] > 0).sum())

  # Monetary totals: sales, profit, quantity and shipping cost.
  total_vendas = _total('Sales')
  total_lucro = _total('Profit')
  total_qtde = df['Quantity'].sum()
  total_entrega = _total('Shipping Cost')

  # Periodicity: distinct order dates in ascending order, then the gap
  # (in days) between each date and the one before it. The first date has
  # no predecessor, so its gap is missing and is skipped by mean().
  datas = df['Order Date'].sort_values().drop_duplicates()
  intervalos = datas.diff().dt.days
  media_dias = round(intervalos.mean(), 2)

  return (
    n_vendas, n_lucrativas, total_vendas, total_lucro,
    total_qtde, total_entrega, media_dias
  )

In [5]:
def fit_data(data, variable):
  """Build one row of RFM aggregates for every distinct value of ``data[variable]``.

  Parameters:
    data     -- DataFrame holding the raw rows; must contain ``variable``
                plus the columns read by ``rfm_variables``.
    variable -- name of the column whose distinct values define the groups.

  Returns a DataFrame with one row per distinct value and the columns
  'referencia' (the group value), 'm_vendas', 'm_lucro', 'm_qtde',
  'm_entrega', 'r_dias', 'f_vendas' and 'f_lucro'.
  """
  registros = []
  for variavel in data[variable].unique():
    dados = data[data[variable] == variavel]
    f_vendas, f_lucro, m_vendas, m_lucro, m_qtde, m_entrega, r_dias = rfm_variables(dados)
    registros.append(
        {
          'referencia': variavel,
          'm_vendas': m_vendas,
          'm_lucro': m_lucro,
          'm_qtde': m_qtde,
          'm_entrega': m_entrega,
          'r_dias': r_dias,
          'f_vendas': f_vendas,
          'f_lucro': f_lucro
        }
    )
  # DataFrame.append was removed in pandas 2.0, and appending inside a loop
  # copied the whole frame on every iteration. Collecting plain dicts and
  # building the frame once works on every pandas version and is linear.
  return pd.DataFrame(registros)

In [None]:
# Load the source dataset from a Feather file; the path is relative to the
# working directory the notebook is run from.
data = pd.read_feather('tavbase/gs.feather')
# Bare last expression so Jupyter renders the loaded frame as the cell output.
data

In [8]:
# Columns of the RFM table used as clustering features. The order of this
# list is the column order of the matrix handed to KMeans, and therefore the
# order of the coordinates in each fitted cluster centre.
variaveis = [
    'f_vendas', 'f_lucro', 'm_entrega', 'm_lucro', 'm_qtde', 
    'm_vendas', 'r_dias'
]

In [None]:
## Lesson 09 - Clustering (grouping)
# Aggregate the raw rows per country and replace missing values with 0.
country_rfm = fit_data(data, 'Country').fillna(0)
# Label every country with one of three KMeans clusters; the fixed
# random_state keeps the assignment the same from run to run.
country_rfm['cluster'] = KMeans(n_clusters=3, random_state=0).fit_predict(
    country_rfm[variaveis]
)
country_rfm

In [None]:
# Refit the same model (same parameters and seed as the labelling step) to
# obtain the cluster centres: one row per cluster, one coordinate per entry
# of `variaveis`, in that order.
centros = KMeans(n_clusters=3, random_state=0).fit(
    country_rfm[variaveis]
).cluster_centers_

# Output labels for the centre coordinates, positionally aligned with
# `variaveis`.
# NOTE(review): 'cls_lucro' breaks the 'cl' + feature-name pattern of the
# other labels ('clf_lucro' would be expected); it is kept unchanged because
# it is the column name that ends up in the exported table.
colunas_centro = [
    'clf_vendas', 'cls_lucro', 'clm_entrega', 'clm_lucro',
    'clm_qtde', 'clm_vendas', 'clr_dias'
]

# Build the frame in a single call: DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row is quadratic anyway.
cluster = pd.DataFrame(centros, columns=colunas_centro)
# Row position i of cluster_centers_ is the centre of label i; storing it as
# an integer keeps it the same kind of key as the KMeans labels.
cluster.insert(0, 'cluster', range(len(cluster)))
cluster

In [None]:
# Attach to every country the coordinates of the centre of its cluster.
# Centre columns left over from a previous run of this cell are dropped
# first; without that, re-running the cell would make merge() duplicate them
# with _x/_y suffixes. errors='ignore' makes the drop a no-op on a first run.
centro_cols = [coluna for coluna in cluster.columns if coluna != 'cluster']
country_rfm = country_rfm.drop(columns=centro_cols, errors='ignore').merge(
    cluster,
    on='cluster',
    how='left'
)
country_rfm

In [18]:
# Persist the per-country table to a Feather file.
country_rfm.to_feather('tavbase/clusterizacao_pais.feather')