In [43]:
# Packages required by this analysis (the duplicated "cluster" entry was
# dropped; loading a package twice has no additional effect).
libraries <- c("tidyverse", "cluster", "dendextend", "fpc",
               "factoextra", "gridExtra", "lubridate", "dplyr")

# Install whatever is missing. Package names are the row names of the matrix
# returned by installed.packages(); testing `%in%` against the whole matrix
# would also match unrelated cells (versions, dependency fields, ...).
instalador <- setdiff(libraries, rownames(installed.packages()))
if (length(instalador) > 0) {
  # install.packages() is vectorised, so a single call handles every package.
  install.packages(instalador, dependencies = TRUE)
}

# Attach every package. library() stops with an error when a package cannot
# be loaded, whereas require() only returns FALSE and lets the script go on.
invisible(lapply(libraries, library, character.only = TRUE))

In [44]:
# Read the four raw CSV files (customers, order items, orders, products).
# Every file has a header row; row.names = NULL keeps default row numbering.
olist_customers_dataset <- read.csv(
  "../input/brazilian-ecommerce/olist_customers_dataset.csv",
  header = TRUE,
  row.names = NULL
)
olist_order_items_dataset <- read.csv(
  "../input/brazilian-ecommerce/olist_order_items_dataset.csv",
  header = TRUE,
  row.names = NULL
)
olist_orders_dataset <- read.csv(
  "../input/brazilian-ecommerce/olist_orders_dataset.csv",
  header = TRUE,
  row.names = NULL
)
olist_products_dataset <- read.csv(
  "../input/brazilian-ecommerce/olist_products_dataset.csv",
  header = TRUE,
  row.names = NULL
)

In [45]:
# Join orders -> order items -> products -> customers into a single table.
# The key is given through `by`, so both sides are matched on exactly that
# column instead of relying on the implicit default for the right-hand side.
database <- merge(olist_orders_dataset, olist_order_items_dataset,
                  by = "order_id")
database <- merge(database, olist_products_dataset, by = "product_id")
database <- merge(database, olist_customers_dataset, by = "customer_id")

# Convert the approval timestamp to a Date. Blank values are turned into NA
# first so that date parsing does not depend on how the installed R version
# treats empty strings; as.Date() then detects the format as before.
approved_at <- as.character(database$order_approved_at)
approved_at[!is.na(approved_at) & !nzchar(approved_at)] <- NA
database$order_approved_at <- as.Date(approved_at)

In [46]:
# frequencia (frequency): number of times the same customer bought from the
#   store, regardless of the number of items.
#   NOTE(review): n() counts rows of the merged table (one per order item),
#   not distinct orders -- confirm which of the two is intended.
# recencia (recency): days elapsed between the latest date in the data set
#   and the customer's latest purchase date.
# receita (monetary): amount spent, in BRL.

# Reference date: the most recent approval date in the whole data set.
date_max <- max(database$order_approved_at, na.rm = TRUE)

# One row per customer. Recency is measured from the customer's most recent
# approved order; passing the whole date column to difftime() would return
# one value per row and make summarise() emit several rows per customer.
rfm <- database %>%
  group_by(customer_id) %>%
  summarise(
    frequencia = n(),
    receita = sum(price),
    recencia = if (all(is.na(order_approved_at))) {
      # No approved order at all: leave NA so na.omit() drops the customer.
      NA_real_
    } else {
      as.numeric(difftime(date_max,
                          max(order_approved_at, na.rm = TRUE),
                          units = "days"))
    }
  )

# Drop customers with any missing RFM component.
rfm <- na.omit(rfm)

summary(rfm)

In [47]:
# Collapse `rfm` to exactly one row per customer.
# frequencia and receita are per-customer totals, so if a customer appears on
# several rows the same value is repeated on each of them: take it once (max)
# instead of summing it again. Recency is the smallest number of days since a
# purchase, i.e. the most recent one.
# Resulting column order: customer_id, frequencia, receita, recencia.
rfm_agg <- merge(
  aggregate(cbind(frequencia, receita) ~ customer_id, rfm, FUN = max),
  aggregate(recencia ~ customer_id, rfm, FUN = min),
  by = "customer_id"
)

In [48]:
# Recency score: fewer days since the last purchase gives a higher score.
#   recencia <= 121 -> 4, (121, 226) -> 3, [226, 354) -> 2, >= 354 -> 1.
# All conditions read rfm_agg; rfm_std does not exist yet at this point of
# the script, so it must not be referenced here.
recency_days <- rfm_agg$recencia
rfm_agg$R_score <- 0
rfm_agg$R_score[recency_days >= 354] <- 1
rfm_agg$R_score[recency_days >= 226 & recency_days < 354] <- 2
rfm_agg$R_score[recency_days > 121 & recency_days < 226] <- 3
rfm_agg$R_score[recency_days <= 121] <- 4

# Frequency score: 0 below 1, then 1 / 2 / 3 / 4 from the lower bounds
# 1, 2, 3 and 21. findInterval() returns how many lower bounds are <= x.
rfm_agg$F_score <- as.numeric(findInterval(rfm_agg$frequencia, c(1, 2, 3, 21)))

# Monetary score: 0 below 49, then 1 / 2 / 3 / 4 from the lower bounds
# 49, 91, 152 and 164.
rfm_agg$M_score <- as.numeric(findInterval(rfm_agg$receita, c(49, 91, 152, 164)))

head(rfm_agg)

In [49]:
# Standardise every numeric column (centre and scale), using customer_id as
# row names so that only numeric columns are passed to scale().
rfm_std <- data.frame(
  scale(
    column_to_rownames(rfm_agg, "customer_id")
  )
)

In [50]:
# Keep only the three standardised score columns for at most the first 5000
# customers. Selecting by name (rather than by position 4:6) keeps this cell
# safe to re-run, and head() does not create NA rows when fewer than 5000
# rows are available.
rfm_std <- head(rfm_std[, c("R_score", "F_score", "M_score")], 5000)
rfm_std <- na.omit(rfm_std)

In [51]:
# Elbow check: total within-cluster sum of squares for a range of k.
# The clustering function is passed with its full argument name
# (FUNcluster) instead of relying on partial matching of `FUN`.
# NOTE(review): the elbow is computed with hierarchical clustering (hcut)
# while the models fitted afterwards use kmeans -- confirm this is intended.
factoextra::fviz_nbclust(rfm_std, FUNcluster = hcut, method = "wss")

In [52]:
# Fit k-means for several candidate numbers of clusters.
# NOTE: the object suffixes (.k3/.k4/.k5) do not match the number of centers
# (4/5/6); the names are kept unchanged and the plot titles carry the real k.
# A fixed seed plus several random starts makes the partitions reproducible
# and less sensitive to the randomly chosen initial centers.
set.seed(123)
rfm_std.k3 <- kmeans(rfm_std, centers = 4, nstart = 25)
rfm_std.k4 <- kmeans(rfm_std, centers = 5, nstart = 25)
rfm_std.k5 <- kmeans(rfm_std, centers = 6, nstart = 25)

# One cluster plot per fitted model.
G2 <- fviz_cluster(rfm_std.k3, geom = "point", data = rfm_std) + ggtitle("k = 4")
G3 <- fviz_cluster(rfm_std.k4, geom = "point", data = rfm_std) + ggtitle("k = 5")
G4 <- fviz_cluster(rfm_std.k5, geom = "point", data = rfm_std) + ggtitle("k = 6")

# Arrange the three plots on a two-row grid.
grid.arrange(G2, G3, G4, nrow = 2)