In [2]:
# Load the packages used below and list the files in the input directory.
library(tidyverse)
list.files(path = "../input")
library(ggplot2)
library(ggpubr)
library(plyr)
library(modeest)
library(gridExtra)
# library() rather than require(): a missing mclust stops here with an error
# instead of returning FALSE and failing later at the first Mclust() call.
library(mclust)

In [3]:
# Read the semicolon-separated dataset; the first row holds the column names.
path <- "../input/bcancer006/fdata.csv"
data <- read.csv(path, header = TRUE, sep = ";")

In [4]:
# Preview the first rows.
head(data)

In [5]:
# Number of rows, then number of columns.
nrow(data)
ncol(data)

In [6]:
# Per-column summary.
summary(data)

In [7]:
# Bar chart with the number of patients per outcome value.
ggplot(data, aes(x = as.factor(outcome))) +
  geom_bar(color = "blue", fill = rgb(0.1, 0.4, 0.5, 0.7)) +
  labs(
    x = "Pacientes recurrentes antes de los 24 meses",
    y = "# Pacientes",
    color = "Legend"
  ) +
  scale_x_discrete(
    breaks = c("R", "N"),
    labels = c("Recurrente", "No-Recurrente")
  )

In [8]:
# The same counts as a table.
table(data$outcome)

In [9]:
# Histograms of individual columns, with Shapiro-Wilk normality tests for
# some of them.
hist(
  data$time,
  breaks = 10,
  main = "Distribución de tiempo",
  xlab = "Tiempo",
  border = "purple",
  col = "orange"
)

In [10]:
hist(
  data$radius_mean,
  breaks = 8,
  main = "Distribución de Radio promedios",
  xlab = "Radio promedio",
  border = "purple",
  col = "orange"
)

In [11]:
hist(
  data$perimeter_mean,
  breaks = 18,
  main = "Distribución de Perímetros promedios",
  xlab = "Perímetro promedio",
  border = "purple",
  col = "orange"
)

In [12]:
shapiro.test(data$perimeter_mean)

In [13]:
hist(
  data$area_mean,
  breaks = 10,
  main = "Distribución de Areas promedios",
  xlab = "Area promedio",
  border = "purple",
  col = "orange"
)

In [14]:
hist(
  data$smoothness_mean,
  breaks = 10,
  main = "Distribución de Suavidad promedios",
  xlab = "Suavidad promedio",
  border = "purple",
  col = "orange"
)

In [15]:
hist(
  data$texture_mean,
  breaks = 12,
  main = "Distribución de textura promedios",
  xlab = "Textura promedio",
  border = "purple",
  col = "orange"
)

In [16]:
shapiro.test(data$texture_mean)

In [17]:
hist(
  data$fractal_dimension_mean,
  breaks = 10,
  main = "Distribución de dimensión fractal promedio",
  xlab = "Dimensión fractal promedio",
  border = "purple",
  col = "orange"
)

In [18]:
shapiro.test(data$fractal_dimension_mean)

In [19]:
hist(
  data$concavity_mean,
  breaks = 10,
  main = "Distribución de concavidad promedio",
  xlab = "Concavidad promedio",
  border = "purple",
  col = "orange"
)

In [20]:
shapiro.test(data$concavity_mean)

In [21]:
# Keep column 2 together with columns 4 through 13 of the raw data.
data_frame <- data[c(2, 4:13)]


In [22]:
# Preview the reduced data frame.
head(data_frame)

In [23]:

# Recode outcome labels as integers: "R" becomes 1 and "N" becomes 2.
#
# col: vector of outcome labels; character vectors and factors are both
#      accepted.
# Returns an integer vector as long as `col`. Labels that are neither "R",
# "N" nor an integer written as text become NA (as.integer() warns then).
parsingOutcome <- function(col) {
  # Work on the label text. Calling as.integer() directly on a factor returns
  # its internal level codes (N = 1, R = 2 for alphabetically ordered
  # levels), which is the reverse of the intended coding.
  labels <- as.character(col)
  labels[labels %in% "N"] <- "2"
  labels[labels %in% "R"] <- "1"
  as.integer(labels)
}

In [24]:

  # Replace the outcome labels with their integer codes.
  data_frame$outcome <- parsingOutcome(data_frame$outcome)


In [25]:
head(data_frame)

In [26]:
library(corrplot)

In [27]:
# Colour-coded correlation matrix of every data_frame column, with the
# coefficients printed in grey.
corrplot(
  cor(data_frame),
  method = "color",
  addCoef.col = "grey",
  order = "AOE",
  number.cex = 0.75
)

In [28]:
# Columns 2 through 10 of data_frame.
# NOTE(review): column 11 is left out here, while later cells select 2:11 -
# confirm the omission is intended.
data_frame.sub <- data_frame[, 2:10]
# Build a PCA object
data_frame.pca <- prcomp(data_frame.sub, scale. = TRUE)

In [29]:
summary(data_frame.pca)

In [30]:
library(FactoMineR)
library(factoextra)

In [31]:
# Same subset through FactoMineR, keeping 8 components.
pca <- PCA(data_frame.sub, scale.unit = TRUE, ncp = 8, graph = TRUE)

In [32]:
# Eigenvalue table of the PCA result.
pca$eig

In [33]:
# Scree plot with labelled bars.
fviz_eig(pca, addlabels = TRUE, ylim = c(0, 60))

In [34]:
# cos2 of each variable on each component.
pca_var <- get_pca_var(pca)
corrplot(pca_var$cos2, is.corr = FALSE)

In [35]:
# Column 2 together with columns 4 through 33 of the raw data.
data_full <- data[c(2, 4:33)]
head(data_full)

In [36]:
# Everything except the first column of data_full.
data_full.sub <- data_full[, 2:31]
# Build a PCA object
data_full.pca <- prcomp(data_full.sub, scale. = TRUE)

In [37]:
# Colour-coded correlation matrix of the selected columns.
corrplot(
  cor(data_full.sub),
  method = "color",
  addCoef.col = "grey",
  order = "AOE",
  number.cex = 0.75
)


In [38]:
summary(data_full.pca)

In [39]:
# Same subset through FactoMineR, keeping 8 components.
pca_full <- PCA(data_full.sub, scale.unit = TRUE, ncp = 8, graph = TRUE)

In [40]:
pca_full$eig

In [41]:
# cos2 of each variable on each component.
pca_var <- get_pca_var(pca_full)
corrplot(pca_var$cos2, is.corr = FALSE)

In [42]:
# Draw a jittered box plot of column `y` grouped by column `x` with
# ggboxplot().
#
# table:  data frame holding the columns to plot.
# x:      name of the grouping column; it also drives the box colour. On the
#         x axis the values "1" and "2" are relabelled "R" and "NR".
# y:      name of the column whose distribution is drawn.
# xlab:   x-axis label.
# ylab:   y-axis label.
# tittle: plot title; NULL (the default) draws no title.
# Returns the plot object.
plot.box <- function(
  table,
  x,
  y,
  xlab,
  ylab,
  tittle = NULL
) {
  ggboxplot(
    table,
    x = x,
    y = y,
    color = x,
    palette = c("#00AFBB", "#E7B800", "#FC4E07"),
    add = "jitter",
    # The title argument used to be accepted and then silently dropped.
    title = tittle,
    xlab = xlab,
    ylab = ylab
  ) +
    scale_x_discrete(
      breaks = c("1", "2"),
      labels = c("R", "NR")
    )
}

In [43]:
# Number of outcome values.
length(data_frame[["outcome"]])

In [44]:
# Box plot of the ninth data_frame column, grouped by outcome class.
class <- c(data_frame[["outcome"]])
concavepoints <- c(data_frame[9])
table <- data.frame(class = class, concave.points_mean = concavepoints)
plot.box(
  table, "class", "concave.points_mean",
  "Pacientes", "Concave.Points_mean", "Grafico 1"
)

In [45]:
# Box plot of concavity_mean, grouped by outcome class.
concavity_mean <- c(data_frame[["concavity_mean"]])
table <- data.frame(class = class, concavity_mean = concavity_mean)
plot.box(
  table, "class", "concavity_mean",
  "Pacientes", "concavity_mean", "Grafico 2"
)

In [46]:
# Box plot of radius_mean, grouped by outcome class.
radius_mean <- c(data_frame[["radius_mean"]])
table <- data.frame(class = class, radius_mean = radius_mean)
plot.box(
  table, "class", "radius_mean",
  "Pacientes", "radius_mean", "Grafico 3"
)

In [47]:
# Box plot of area_mean, grouped by outcome class. The value column has to be
# called "area_mean" because plot.box() looks the y column up by that name;
# it was previously labelled "radius_mean", so the lookup failed.
area_mean <- c(data_frame$area_mean)
table <- data.frame(class = class, area_mean = area_mean)
plot.box(table, "class", "area_mean", "Pacientes", "area_mean", "Grafico 4")

In [48]:
# Scatter plots of feature pairs, coloured by outcome (1 = red, 2 = blue).
# with() evaluates each call inside data_frame without attach()/detach(), so
# nothing is left on the search path if a plot call fails.
with(
  data_frame,
  plot(area_mean, perimeter_mean, col = c("red", "blue")[outcome])
)

In [49]:
with(
  data_frame,
  plot(area_mean, smoothness_mean, col = c("red", "blue")[outcome])
)

In [50]:
with(
  data_frame,
  plot(concave.points_mean, perimeter_mean, col = c("red", "blue")[outcome])
)

In [51]:
with(
  data_frame,
  plot(fractal_dimension_mean, smoothness_mean, col = c("red", "blue")[outcome])
)

In [52]:
library(cluster)
library(BBmisc)
# Standardize data_frame; at this point it still contains the outcome column.
data_frame.scaled <- normalize(data_frame, method = "standardize")

In [53]:
head(data_frame.scaled)

In [54]:

# Diagnostics for choosing the number of k-means clusters.
fviz_nbclust(data_frame.scaled, kmeans, method = "wss")

In [55]:
fviz_nbclust(data_frame.scaled, kmeans, method = "silhouette")

In [56]:
fviz_nbclust(data_frame.scaled, kmeans, method = "gap_stat")

In [57]:
# k-means fits for k = 3, 2, 4 and 5, each with 25 random starts. The call
# order is kept as is: no seed is set, so it affects the random draws.
df.3k <- kmeans(data_frame.scaled, centers = 3, nstart = 25)
df.2k <- kmeans(data_frame.scaled, centers = 2, nstart = 25)
df.4k <- kmeans(data_frame.scaled, centers = 4, nstart = 25)
df.5k <- kmeans(data_frame.scaled, centers = 5, nstart = 25)

In [58]:
# Cluster plots for each fit.
fviz_cluster(df.2k, data = data_frame.scaled)
fviz_cluster(df.3k, data = data_frame.scaled)
fviz_cluster(df.4k, data = data_frame.scaled)
fviz_cluster(df.5k, data = data_frame.scaled)

In [59]:
# Columns 2 through 11 of data_frame, i.e. everything but the first column.
data_frame_nc <- data_frame[2:11]

In [60]:
head(data_frame_nc)

In [61]:
# Standardized copy used for clustering.
data_frame_nc.scaled <- normalize(data_frame_nc, method = "standardize")

In [62]:
head(data_frame_nc.scaled)

In [63]:
# Diagnostics for choosing the number of k-means clusters.
fviz_nbclust(data_frame_nc.scaled, kmeans, method = "wss")
fviz_nbclust(data_frame_nc.scaled, kmeans, method = "silhouette")

In [64]:
# k-means fits for k = 3, 2, 4 and 5 (call order kept: no seed is set),
# followed by one cluster plot per fit.
df.3k <- kmeans(data_frame_nc.scaled, centers = 3, nstart = 25)
df.2k <- kmeans(data_frame_nc.scaled, centers = 2, nstart = 25)
df.4k <- kmeans(data_frame_nc.scaled, centers = 4, nstart = 25)
df.5k <- kmeans(data_frame_nc.scaled, centers = 5, nstart = 25)
fviz_cluster(df.2k, data = data_frame_nc.scaled)
fviz_cluster(df.3k, data = data_frame_nc.scaled)
fviz_cluster(df.4k, data = data_frame_nc.scaled)
fviz_cluster(df.5k, data = data_frame_nc.scaled)

## PCA y Cluster

In [65]:
# Columns 1, 2, 3, 4, 7 and 8 of the standardized data.
dfncscaled_pca <- data_frame_nc.scaled[c(1, 2, 3, 4, 7, 8)]
head(dfncscaled_pca)

In [66]:
# k-means fits for k = 3, 2, 4 and 5 on the selected columns (call order
# kept: no seed is set), followed by one cluster plot per fit.
df.3k <- kmeans(dfncscaled_pca, centers = 3, nstart = 25)
df.2k <- kmeans(dfncscaled_pca, centers = 2, nstart = 25)
df.4k <- kmeans(dfncscaled_pca, centers = 4, nstart = 25)
df.5k <- kmeans(dfncscaled_pca, centers = 5, nstart = 25)
fviz_cluster(df.2k, data = dfncscaled_pca)
fviz_cluster(df.3k, data = dfncscaled_pca)
fviz_cluster(df.4k, data = dfncscaled_pca)
fviz_cluster(df.5k, data = dfncscaled_pca)

In [108]:
# Outcome class against the k = 5 cluster assignment.
table(class, df.5k$cluster)

In [67]:
head(data_frame)

## Agrupamiento

In [68]:
# Silhouette widths for the k = 2, 3 and 4 partitions. df.2k, df.3k and df.4k
# were last fitted on dfncscaled_pca, so the distances are taken from that
# same data rather than from data_frame_nc.scaled; the distance matrix is
# computed once and shared.
# NOTE(review): if the partitions of data_frame_nc.scaled were meant instead,
# refit k-means on that data before this cell.
pca_dist <- dist(dfncscaled_pca)
sil_coef2 <- silhouette(df.2k$cluster, pca_dist)
sil_coef3 <- silhouette(df.3k$cluster, pca_dist)
sil_coef4 <- silhouette(df.4k$cluster, pca_dist)
fviz_silhouette(
  sil.obj = sil_coef2, print.summary = TRUE, palette = "Set2",
  ggtheme = theme_classic()
)
fviz_silhouette(
  sil.obj = sil_coef3, print.summary = TRUE, palette = "Set2",
  ggtheme = theme_classic()
)
fviz_silhouette(
  sil.obj = sil_coef4, print.summary = TRUE, palette = "Set2",
  ggtheme = theme_classic()
)

In [69]:
# Columns 3, 7, 8, 9, 11, 23 and 31 of data_full.
data_full_pca <- data_full[c(3, 7, 8, 9, 11, 23, 31)]

In [70]:
head(data_full_pca)

In [71]:
# Colour-coded correlation matrix of the selected columns.
corrplot(
  cor(data_full_pca),
  method = "color",
  addCoef.col = "grey",
  order = "AOE",
  number.cex = 0.75
)

### MClust + PCA Full + Correlation filter

In [72]:
# Fit Mclust with its default settings to columns 1, 3 and 5 of
# data_full_pca.
mclustfullpca_filter <- Mclust(data_full_pca[c(1, 3, 5)])

In [73]:
summary(mclustfullpca_filter)

In [74]:
plot(mclustfullpca_filter, what = "classification")

In [75]:
# BIC for the same columns, using the default prior with shrinkage 0.1.
mclustfullpca_filter.bic <- mclustBIC(
  data_full_pca[c(1, 3, 5)],
  prior = priorControl(functionName = "defaultPrior", shrinkage = 0.1)
)

In [76]:
plot(mclustfullpca_filter.bic)
summary(mclustfullpca_filter.bic)
## Class 1 corresponds to the patients who had a recurrence and class 2 to
## those who did not.
table(class, mclustfullpca_filter$classification)

In [77]:
# Two-component fit with the "VEE" model on the same three columns.
mclustfullpca_filter_best <- Mclust(
  data_full_pca[c(1, 3, 5)],
  modelNames = "VEE",
  G = 2
)

In [78]:
summary(mclustfullpca_filter_best)

In [79]:
plot(mclustfullpca_filter_best, what = "classification")

In [80]:
# Three-component fit with the "VEV" model.
mclustfullpca_filter_best3 <- Mclust(
  data_full_pca[c(1, 3, 5)],
  modelNames = "VEV",
  G = 3
)

In [81]:
summary(mclustfullpca_filter_best3)

In [82]:
plot(mclustfullpca_filter_best3, what = "classification")

In [83]:
# Outcome class against the three-component classification.
table(class, mclustfullpca_filter_best3$classification)

In [84]:
# Diagnostics for choosing the number of k-means clusters on columns 1, 3 and
# 5 of data_full_pca.
fviz_nbclust(data_full_pca[c(1, 3, 5)], kmeans, method = "wss")
fviz_nbclust(data_full_pca[c(1, 3, 5)], kmeans, method = "silhouette")

In [85]:
# k-means fits for k = 2 and k = 3, each with 25 random starts.
data_full_pca.2k <- kmeans(data_full_pca[c(1, 3, 5)], centers = 2, nstart = 25)
data_full_pca.3k <- kmeans(data_full_pca[c(1, 3, 5)], centers = 3, nstart = 25)

In [107]:
# Outcome class against the k = 3 cluster assignment.
table(class, data_full_pca.3k$cluster)


In [87]:
fviz_cluster(data_full_pca.2k, data = data_full_pca[c(1, 3, 5)])
fviz_cluster(data_full_pca.3k, data = data_full_pca[c(1, 3, 5)])

In [88]:
# Silhouette widths of both partitions, measured on the clustered columns.
sil_coef2 <- silhouette(
  data_full_pca.2k$cluster,
  dist(data_full_pca[c(1, 3, 5)])
)
sil_coef3 <- silhouette(
  data_full_pca.3k$cluster,
  dist(data_full_pca[c(1, 3, 5)])
)
fviz_silhouette(
  sil.obj = sil_coef2, print.summary = TRUE, palette = "Set2",
  ggtheme = theme_classic()
)
fviz_silhouette(
  sil.obj = sil_coef3, print.summary = TRUE, palette = "Set2",
  ggtheme = theme_classic()
)

In [89]:
# Colour-coded correlation matrix of data_frame columns 2 through 11.
corrplot(
  cor(data_frame[, 2:11]),
  method = "color",
  addCoef.col = "grey",
  order = "AOE",
  number.cex = 0.75
)

In [90]:
# Columns 3, 4, 6 and 8 of data_frame.
data_frame_pca4 <- data_frame[c(3, 4, 6, 8)]

In [91]:
head(data_frame_pca4)

In [92]:
# Fit Mclust with its default settings to the four selected columns.
mclustpca4 <- Mclust(data_frame_pca4)

In [93]:
summary(mclustpca4)

In [94]:
plot(mclustpca4, what = "classification")

In [95]:
# BIC for the same data, using the default prior with shrinkage 0.1.
mclustpca4.bic <- mclustBIC(
  data_frame_pca4,
  prior = priorControl(functionName = "defaultPrior", shrinkage = 0.1)
)

In [96]:
plot(mclustpca4.bic)
summary(mclustpca4.bic)
## Class 1 corresponds to the patients who had a recurrence and class 2 to
## those who did not.
table(class, mclustpca4$classification)

In [97]:
# Two-component fit with the "VEE" model.
mclustfullpca_filter_best4 <- Mclust(
  data_frame_pca4,
  modelNames = "VEE",
  G = 2
)

In [98]:
plot(mclustfullpca_filter_best4)

In [99]:
summary(mclustfullpca_filter_best4)

In [100]:
# Outcome class against the two-component classification.
table(class, mclustfullpca_filter_best4$classification)

In [101]:
# Columns 3, 4 and 8 of data_frame.
data_frame_pca3 <- data_frame[c(3, 4, 8)]

In [102]:
head(data_frame_pca3)

In [103]:
# Default Mclust fit, its summary, and outcome class against the resulting
# classification.
mclustpca3 <- Mclust(data_frame_pca3)
summary(mclustpca3)
table(class, mclustpca3$classification)

In [104]:
# BIC for the same data, using the default prior with shrinkage 0.1.
data_frame_pca3.bic <- mclustBIC(
  data_frame_pca3,
  prior = priorControl(functionName = "defaultPrior", shrinkage = 0.1)
)

In [105]:
plot(data_frame_pca3.bic)
summary(data_frame_pca3.bic)

In [2]:
# R version information for this session.
version