In [1]:
# Package setup for the notebook.
# plyr is attached BEFORE tidyverse: plyr warns that attaching it after dplyr
# masks dplyr verbs of the same name (summarise, mutate, arrange, ...).
library(plyr)
library(tidyverse)
library(ggplot2)  # already attached by tidyverse; kept for explicitness
library(ggpubr)
library(modeest)
library(gridExtra)
# library() stops with an error when the package is missing, whereas require()
# only returns FALSE and lets the script fail later at the first Mclust() call.
library(mclust)

# List the files available in the input directory.
list.files(path = "../input")

In [2]:
# Load the semicolon-separated data set.
path <- "../input/bcancer006/fdata.csv"
# TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
data <- read.csv(path, header = TRUE, sep = ";")

In [3]:
# First rows of the raw table.
head(data)

In [4]:
# Number of rows, then number of columns.
nrow(data); ncol(data)

In [5]:
# Per-column summary.
summary(data)

In [6]:
# Bar chart: number of rows for each value of `outcome`.
ggplot(data = data, mapping = aes(x = as.factor(outcome))) +
  geom_bar(fill = rgb(0.1, 0.4, 0.5, 0.7), color = "blue") +
  scale_x_discrete(
    breaks = c("R", "N"),
    labels = c("Recurrente", "No-Recurrente")
  ) +
  labs(
    x = "Pacientes recurrentes antes de los 24 meses",
    y = "# Pacientes",
    color = "Legend"
  )

In [7]:
# The same counts in tabular form.
table(data$outcome)

In [8]:
# Histogram of `time`.
hist(
  x = data$time,
  breaks = 10,
  col = "orange",
  border = "purple",
  main = "Distribución de tiempo",
  xlab = "Tiempo"
)

In [9]:
# Histogram of `radius_mean` (8 suggested breaks instead of 10).
hist(
  x = data$radius_mean,
  breaks = 8,
  col = "orange",
  border = "purple",
  main = "Distribución de Radio promedios",
  xlab = "Radio promedio"
)

In [10]:
# Histogram of `perimeter_mean`.
hist(
  x = data$perimeter_mean,
  breaks = 10,
  col = "orange",
  border = "purple",
  main = "Distribución de Perímetros promedios",
  xlab = "Perímetro promedio"
)

In [11]:
# Histogram of `area_mean`.
hist(
  x = data$area_mean,
  breaks = 10,
  col = "orange",
  border = "purple",
  main = "Distribución de Areas promedios",
  xlab = "Area promedio"
)

In [12]:
# Histogram of `smoothness_mean`.
hist(
  x = data$smoothness_mean,
  breaks = 10,
  col = "orange",
  border = "purple",
  main = "Distribución de Suavidad promedios",
  xlab = "Suavidad promedio"
)

In [13]:
# Histogram of `fractal_dimension_mean`.
hist(
  x = data$fractal_dimension_mean,
  breaks = 10,
  col = "orange",
  border = "purple",
  main = "Distribución de dimensión fractal promedio",
  xlab = "Dimensión fractal promedio"
)

In [14]:
# Working table: column 2 of the raw data together with columns 4 to 13.
data_frame <- data[, c(2, 4:13)]


In [15]:
head(data_frame)

In [16]:

# Recode outcome labels as integers: "R" -> 1L, "N" -> 2L.
#
# Args:
#   col: character vector or factor of outcome labels.
# Returns:
#   Integer vector of the same length as `col`. NA and any label other than
#   "R"/"N" become NA_integer_.
#
# Labels are matched by their text. The previous version called as.integer()
# directly on the relabelled vector; for a factor that returns the internal
# level codes (N = 1, R = 2 with alphabetical levels), not the intended
# numbers, so the coding was swapped whenever `col` was a factor.
parsingOutcome <- function(col) {
  codes <- c("R" = 1L, "N" = 2L)
  unname(codes[as.character(col)])
}

In [17]:

  # Replace the outcome labels with their integer codes.
  data_frame$outcome <- parsingOutcome(data_frame$outcome)


In [18]:
head(data_frame)

In [19]:
library(corrplot)

In [20]:
# Correlation matrix of all columns of data_frame, drawn as a coloured grid
# with the coefficients printed in each cell.
corrplot(
  cor(data_frame),
  method = "color",
  order = "AOE",
  addCoef.col = "grey",
  number.cex = 0.75
)

In [21]:
# Feature matrix for the PCA.
# data_frame has 11 columns (built from data[c(2, 4:13)]) and the rest of the
# notebook treats columns 2:11 as the features (see data_frame_nc). The
# previous selection 2:10 left the last feature column out of this analysis.
data_frame.sub <- data_frame[, 2:11]
# Create the PCA object (variables scaled to unit variance).
data_frame.pca <- prcomp(data_frame.sub, scale. = TRUE)

In [22]:
summary(data_frame.pca)

In [23]:
library("FactoMineR")
library("factoextra")

In [24]:
# PCA of the feature subset, keeping 8 components and drawing the default
# graphs.
pca <- PCA(
  data_frame.sub,
  ncp = 8,
  scale.unit = TRUE,
  graph = TRUE
)

In [25]:
# Eigenvalue table of the fit.
pca$eig

In [26]:
# Scree plot with labelled bars.
fviz_eig(pca, ylim = c(0, 60), addlabels = TRUE)

In [27]:
# cos2 of each variable on each component, shown as a grid.
pca_var <- get_pca_var(pca)
corrplot(pca_var$cos2, is.corr = FALSE)

In [28]:
# Wider table: column 2 of the raw data together with columns 4 to 33.
data_full <- data[, c(2, 4:33)]
head(data_full)

In [29]:
# Columns 2 to 31 of data_full are the PCA input.
data_full.sub <- data_full[, 2:31]
# Create the PCA object (variables scaled to unit variance).
data_full.pca <- prcomp(data_full.sub, scale. = TRUE)

In [30]:
summary(data_full.pca)

In [31]:
# The same PCA through FactoMineR, keeping 8 components.
pca_full <- PCA(
  data_full.sub,
  ncp = 8,
  scale.unit = TRUE,
  graph = TRUE
)

In [32]:
pca_full$eig

In [33]:
# cos2 of each variable on each component, shown as a grid.
pca_var <- get_pca_var(pca_full)
corrplot(pca_var$cos2, is.corr = FALSE)

In [34]:
# Box plot (with jittered points) of one column grouped by another.
#
# Args:
#   table:  data frame holding the columns named by `x` and `y`.
#   x:      name of the grouping column; its values "1" and "2" are shown on
#           the axis as "R" and "NR".
#   y:      name of the column to plot.
#   xlab:   x-axis label.
#   ylab:   y-axis label.
#   tittle: plot title (optional). It used to be accepted but ignored; it is
#           now forwarded to ggboxplot() so the title is actually drawn.
# Returns:
#   The plot object built by ggboxplot() plus the discrete x scale.
plot.box <- function(
  table,
  x,
  y,
  xlab,
  ylab,
  tittle = NULL
) {
  ggboxplot(
    table, x = x, y = y,
    color = x,
    palette = c("#00AFBB", "#E7B800", "#FC4E07"),
    title = tittle,
    xlab = xlab,
    add = "jitter",
    ylab = ylab
  ) + scale_x_discrete(
    breaks = c("1", "2"),
    labels = c("R", "NR")
  )
}

In [35]:
# Length of the outcome column.
length(data_frame$outcome)

In [36]:
# Box plot of concave.points_mean by class.
class <- c(data_frame$outcome)
# The column is taken by name, like the sibling cells, instead of
# c(data_frame[(9)]), which depended on column order and wrapped the column
# in a list.
concavepoints <- c(data_frame$concave.points_mean)
table <- data.frame('class' = class, 'concave.points_mean' = concavepoints)
plot.box(table, "class", "concave.points_mean", "Pacientes", "Concave.Points_mean", "Grafico 1")

In [37]:
# Box plot of concavity_mean by class.
concavity_mean <- c(data_frame$concavity_mean)
table <- data.frame('class' = class, 'concavity_mean' = concavity_mean)
plot.box(table, "class", "concavity_mean", "Pacientes", "concavity_mean", "Grafico 2")

In [38]:
# Box plot of radius_mean by class.
radius_mean <- c(data_frame$radius_mean)
table <- data.frame('class' = class, 'radius_mean' = radius_mean)
plot.box(table, "class", "radius_mean", "Pacientes", "radius_mean", "Grafico 3")

In [39]:
# Box plot of area_mean by class.
# The column is now named 'area_mean' (it was mislabelled 'radius_mean', so
# plot.box() was asked for a column that did not exist), and the title is
# numbered 4 instead of repeating "Grafico 3".
area_mean <- c(data_frame$area_mean)
table <- data.frame('class' = class, 'area_mean' = area_mean)
plot.box(table, "class", "area_mean", "Pacientes", "area_mean", "Grafico 4")

In [40]:
# Scatter plots of feature pairs, coloured through `outcome`
# (1 -> "red", 2 -> "blue").
# with() evaluates the call inside data_frame without modifying the search
# path, so nothing is left attached if plot() fails and global variables with
# the same names as columns cannot shadow them.
with(data_frame, plot(area_mean, perimeter_mean, col = c("red", "blue")[outcome]))

In [41]:
with(data_frame, plot(area_mean, smoothness_mean, col = c("red", "blue")[outcome]))

In [42]:
with(data_frame, plot(concave.points_mean, perimeter_mean, col = c("red", "blue")[outcome]))

In [43]:
with(data_frame, plot(fractal_dimension_mean, smoothness_mean, col = c("red", "blue")[outcome]))

In [44]:
library(cluster)
library(BBmisc)
# Standardise data_frame.
# NOTE(review): data_frame still contains `outcome`, so the label takes part
# in the clustering below; the later *_nc objects exclude it.
data_frame.scaled <- normalize(data_frame, method = "standardize")

In [45]:
head(data_frame.scaled)

In [46]:

# Choosing k: within-cluster sum of squares.
fviz_nbclust(data_frame.scaled, FUNcluster = kmeans, method = "wss")

In [47]:
# Choosing k: silhouette.
fviz_nbclust(data_frame.scaled, FUNcluster = kmeans, method = "silhouette")

In [48]:
# Choosing k: gap statistic.
fviz_nbclust(data_frame.scaled, FUNcluster = kmeans, method = "gap_stat")

In [49]:
# k-means fits, 25 random starts each. The fit order (3, 2, 4, 5) is kept
# because it determines which random numbers each fit consumes.
df.3k <- kmeans(data_frame.scaled, centers = 3, nstart = 25)
df.2k <- kmeans(data_frame.scaled, centers = 2, nstart = 25)
df.4k <- kmeans(data_frame.scaled, centers = 4, nstart = 25)
df.5k <- kmeans(data_frame.scaled, centers = 5, nstart = 25)

In [50]:
# Cluster plots for k = 2 to 5.
fviz_cluster(df.2k, data = data_frame.scaled)
fviz_cluster(df.3k, data = data_frame.scaled)
fviz_cluster(df.4k, data = data_frame.scaled)
fviz_cluster(df.5k, data = data_frame.scaled)

In [51]:
# Columns 2 to 11 of data_frame, i.e. everything except the first column.
data_frame_nc <- data_frame[, 2:11]

In [52]:
head(data_frame_nc)

In [53]:
# Standardised copy used for clustering.
data_frame_nc.scaled <- normalize(data_frame_nc, method = "standardize")

In [54]:
head(data_frame_nc.scaled)

In [55]:
# Choosing k: within-cluster sum of squares, then silhouette.
fviz_nbclust(data_frame_nc.scaled, FUNcluster = kmeans, method = "wss")
fviz_nbclust(data_frame_nc.scaled, FUNcluster = kmeans, method = "silhouette")

In [56]:
# k-means fits (order 3, 2, 4, 5 kept) followed by their cluster plots.
# These overwrite the df.*k objects fitted on data_frame.scaled.
df.3k <- kmeans(data_frame_nc.scaled, centers = 3, nstart = 25)
df.2k <- kmeans(data_frame_nc.scaled, centers = 2, nstart = 25)
df.4k <- kmeans(data_frame_nc.scaled, centers = 4, nstart = 25)
df.5k <- kmeans(data_frame_nc.scaled, centers = 5, nstart = 25)
fviz_cluster(df.2k, data = data_frame_nc.scaled)
fviz_cluster(df.3k, data = data_frame_nc.scaled)
fviz_cluster(df.4k, data = data_frame_nc.scaled)
fviz_cluster(df.5k, data = data_frame_nc.scaled)

## PCA y Cluster

In [57]:
# Six of the standardised columns (positions 1, 2, 3, 4, 7 and 8).
dfncscaled_pca <- data_frame_nc.scaled[, c(1, 2, 3, 4, 7, 8)]
head(dfncscaled_pca)

In [58]:
# k-means on the reduced table (order 3, 2, 4, 5 kept) and cluster plots.
# These overwrite the df.*k objects once more.
df.3k <- kmeans(dfncscaled_pca, centers = 3, nstart = 25)
df.2k <- kmeans(dfncscaled_pca, centers = 2, nstart = 25)
df.4k <- kmeans(dfncscaled_pca, centers = 4, nstart = 25)
df.5k <- kmeans(dfncscaled_pca, centers = 5, nstart = 25)
fviz_cluster(df.2k, data = dfncscaled_pca)
fviz_cluster(df.3k, data = dfncscaled_pca)
fviz_cluster(df.4k, data = dfncscaled_pca)
fviz_cluster(df.5k, data = dfncscaled_pca)

In [59]:
head(data_frame)

## Agrupamiento

In [60]:
# Model-based clustering with Mclust's default settings.
# Columns 2:11 are used, matching data_frame_nc; the previous 2:10 left the
# last feature column of data_frame out of the model.
mod1 <- Mclust(data_frame[, 2:11])

In [61]:
summary(mod1)

### Sin la columna de clase, usando solo las variables seleccionadas con el PCA

In [62]:
# Unscaled counterpart of the six-column subset (positions 1, 2, 3, 4, 7, 8).
data_frame_nc_reduced <- data_frame_nc[, c(1, 2, 3, 4, 7, 8)]

In [63]:
# Mclust with default settings on those six columns.
mod2 <- Mclust(data_frame_nc_reduced[1:6])

In [64]:
summary(mod2)

In [65]:
# Classification plot of the fitted model.
plot(mod2, what = "classification")

In [66]:
# Reference labels for the cross-tabulation at the end of this section.
class <- data_frame$outcome

In [67]:
# BIC of the candidate models on the six reduced columns, using the default
# prior with shrinkage 0.1.
BIC <- mclustBIC(
  data_frame_nc_reduced[, 1:6],
  prior = priorControl(functionName = "defaultPrior", shrinkage = 0.1)
)

In [68]:
plot(BIC)
summary(BIC)

In [69]:
# Fit Mclust reusing the BIC values computed above.
modBest <- Mclust(data_frame_nc_reduced[, 1:6], x = BIC)

In [70]:
summary(modBest)

In [71]:
plot(modBest, what = "classification")

In [72]:
# Outcome codes against the cluster assignment.
table(class, modBest$classification)

In [73]:
# Same procedure on all ten columns of data_frame_nc.
BIC2 <- mclustBIC(
  data_frame_nc[1:10],
  prior = priorControl(functionName = "defaultPrior", shrinkage = 0.1)
)

In [74]:
# Overwrites the modBest fitted on the reduced columns.
modBest <- Mclust(data_frame_nc[1:10], x = BIC2)

In [75]:
plot(BIC2)
summary(BIC2)
# Outcome codes against the cluster assignment.
table(class, modBest$classification)

In [76]:
plot(modBest, what = "classification")

In [77]:
library(ROSE)

In [78]:
# Working copy for the ROSE resampling.
data_frame2 <- data_frame

In [79]:
# Add the outcome as a factor column `class`, then keep columns 2 to 12
# (dropping the first column).
data_frame2$class <- factor(data_frame$outcome)
data_frame2 <- data_frame2[, 2:12]
head(data_frame2)

In [80]:
# Synthetic sample generated by ROSE with a fixed seed.
data_frame.syn <- ROSE(class ~ ., data = data_frame2, seed = 1)$data

# Class counts before ...
table(data_frame$outcome)

# ... and after resampling.
table(data_frame.syn$class)


In [81]:
head(data_frame.syn)

In [82]:
# PCA of the first ten columns of the synthetic sample (scaled variables).
data_frame.syn_pca <- prcomp(data_frame.syn[1:10], scale. = TRUE)

In [83]:
summary(data_frame.syn_pca)

In [84]:
# The same PCA through FactoMineR; overwrites the earlier pca_full.
pca_full <- PCA(
  data_frame.syn[1:10],
  ncp = 8,
  scale.unit = TRUE,
  graph = TRUE
)

In [85]:
pca_full$eig

In [86]:
# cos2 of each variable on each component, shown as a grid.
pca_var <- get_pca_var(pca_full)
corrplot(pca_var$cos2, is.corr = FALSE)

In [87]:
# Scatter plot of the synthetic sample, coloured by its own `class` column.
# With attach(), the global variable `class` (assigned earlier from
# data_frame$outcome) shadowed the attached column, so the colours came from
# the original rows rather than the synthetic ones. with() looks names up in
# the data frame first.
with(
  data_frame.syn[1:11],
  plot(area_mean, perimeter_mean, col = c("red", "blue")[class])
)

In [88]:
# Standardised copy of the synthetic sample (columns 1 to 11).
df_syn_scaled <- normalize(data_frame.syn[1:11], method = "standardize")

In [89]:
# Choosing k on the first ten columns: WSS, then silhouette.
fviz_nbclust(df_syn_scaled[1:10], FUNcluster = kmeans, method = "wss")
fviz_nbclust(df_syn_scaled[1:10], FUNcluster = kmeans, method = "silhouette")

In [90]:
# First ten columns only, used as clustering input below.
df_syn_scaled_nc <- df_syn_scaled[1:10]

In [91]:
# k-means fits (order 3, 2, 4, 5 kept) and their cluster plots.
dfs.3k <- kmeans(df_syn_scaled_nc, centers = 3, nstart = 25)
dfs.2k <- kmeans(df_syn_scaled_nc, centers = 2, nstart = 25)
dfs.4k <- kmeans(df_syn_scaled_nc, centers = 4, nstart = 25)
dfs.5k <- kmeans(df_syn_scaled_nc, centers = 5, nstart = 25)
fviz_cluster(dfs.2k, data = df_syn_scaled_nc)
fviz_cluster(dfs.3k, data = df_syn_scaled_nc)
fviz_cluster(dfs.4k, data = df_syn_scaled_nc)
fviz_cluster(dfs.5k, data = df_syn_scaled_nc)

In [92]:
# Silhouette information for the k = 2, 3, 4 fits on the synthetic sample.
sil_coef2 <- silhouette(dfs.2k$cluster, dist(df_syn_scaled_nc))
sil_coef3 <- silhouette(dfs.3k$cluster, dist(df_syn_scaled_nc))
sil_coef4 <- silhouette(dfs.4k$cluster, dist(df_syn_scaled_nc))
# One silhouette plot per k, each printing its summary.
fviz_silhouette(
  sil.obj = sil_coef2,
  print.summary = TRUE,
  palette = "Set2",
  ggtheme = theme_classic()
)
fviz_silhouette(
  sil.obj = sil_coef3,
  print.summary = TRUE,
  palette = "Set2",
  ggtheme = theme_classic()
)
fviz_silhouette(
  sil.obj = sil_coef4,
  print.summary = TRUE,
  palette = "Set2",
  ggtheme = theme_classic()
)

In [93]:
# Silhouette analysis for k-means on the non-synthetic standardised features.
# df.2k/df.3k/df.4k were last refitted on `dfncscaled_pca`, so pairing their
# cluster labels with distances from data_frame_nc.scaled mixed two feature
# sets. The models are refitted here on the same table the distances are
# computed from.
nc_dist <- dist(data_frame_nc.scaled)
nc.2k <- kmeans(data_frame_nc.scaled, 2, nstart = 25)
nc.3k <- kmeans(data_frame_nc.scaled, 3, nstart = 25)
nc.4k <- kmeans(data_frame_nc.scaled, 4, nstart = 25)
sil_coef2 <- silhouette(nc.2k$cluster, nc_dist)
sil_coef3 <- silhouette(nc.3k$cluster, nc_dist)
sil_coef4 <- silhouette(nc.4k$cluster, nc_dist)
fviz_silhouette(sil.obj = sil_coef2, print.summary = TRUE, palette = "Set2",
                ggtheme = theme_classic())
fviz_silhouette(sil.obj = sil_coef3, print.summary = TRUE, palette = "Set2",
                ggtheme = theme_classic())
fviz_silhouette(sil.obj = sil_coef4, print.summary = TRUE, palette = "Set2",
                ggtheme = theme_classic())

In [94]:
head(df_syn_scaled_nc)

In [95]:
# Six of the standardised synthetic columns (positions 2, 4, 5, 6, 8, 10).
df_syn_pca <- df_syn_scaled_nc[, c(2, 4, 5, 6, 8, 10)]
head(df_syn_pca)

In [96]:
# k-means fits (order 3, 2, 4, 5 kept) and their cluster plots.
dfsp.3k <- kmeans(df_syn_pca, centers = 3, nstart = 25)
dfsp.2k <- kmeans(df_syn_pca, centers = 2, nstart = 25)
dfsp.4k <- kmeans(df_syn_pca, centers = 4, nstart = 25)
dfsp.5k <- kmeans(df_syn_pca, centers = 5, nstart = 25)
fviz_cluster(dfsp.2k, data = df_syn_pca)
fviz_cluster(dfsp.3k, data = df_syn_pca)
fviz_cluster(dfsp.4k, data = df_syn_pca)
fviz_cluster(dfsp.5k, data = df_syn_pca)

In [97]:
# Silhouette information and plots for k = 2, 3, 4 on the reduced columns.
sil_coef2 <- silhouette(dfsp.2k$cluster, dist(df_syn_pca))
sil_coef3 <- silhouette(dfsp.3k$cluster, dist(df_syn_pca))
sil_coef4 <- silhouette(dfsp.4k$cluster, dist(df_syn_pca))
fviz_silhouette(
  sil.obj = sil_coef2,
  print.summary = TRUE,
  palette = "Set2",
  ggtheme = theme_classic()
)
fviz_silhouette(
  sil.obj = sil_coef3,
  print.summary = TRUE,
  palette = "Set2",
  ggtheme = theme_classic()
)
fviz_silhouette(
  sil.obj = sil_coef4,
  print.summary = TRUE,
  palette = "Set2",
  ggtheme = theme_classic()
)

In [98]:
# Attach the class column of the synthetic sample to the reduced table.
df_syn_pca$class <- df_syn_scaled$class

In [99]:
# `df_syn` is never defined in this notebook, so table(df_syn) failed.
# Tabulate the class column that was just added instead.
table(df_syn_pca$class)

In [None]:
head(data_frame.syn)

In [None]:

# Mclust with default settings on the first ten columns of the synthetic
# sample.
modsynpca <- Mclust(data_frame.syn[1:10])

In [None]:
summary(modsynpca)
plot(modsynpca, what = "classification")