 # golub dataset

In [None]:
# install.packages("binom") # Installed from R studio as needs compilation
# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")

# BiocManager::install("cancerclass")

In [None]:
library("binom") 
library("evaluomeR")
library("cancerclass")
library("dplyr")
library("caret")

options(scipen=10)

# Table of contents
* [Dataset](#dataset)
    * [Removing correlated](#correlated)
    * [Top 200](#top)
    * [evaluomeR](#evaluomer)
* [Clest](#clest)
* [PCA](#pca)
* [Sensitivity](#sensitivity)
* [CER](#cer)

# Dataset <a class="anchor" id="dataset"></a>

In [None]:
# Load the saved workspace and expose the `leukemia` object (presumably
# provided by the .RData file) as a plain data frame.
load("leukemia.RData")
golub <- as.data.frame(leukemia)

In [2]:
library("evaluomeR") 
library("cancerclass")

# Reload the data and strip the annotation columns so that only the
# identifier column and the remaining feature columns are left.
load("leukemia.RData")
golub <- as.data.frame(leukemia)
golub[["Class"]] <- NULL
golub[["sample"]] <- NULL
golub[["type"]] <- NULL
golub[["FAB"]] <- NULL
golub[["gender"]] <- NULL

# The identifier column is renamed from "Case" to "Description".
names(golub)[names(golub) == "Case"] <- "Description"

# Run configuration: seed, range of k to explore and clustering method.
seed <- 13606
k.range <- c(3, 10)
cbi <- "clara"

# Stability across the k range
stab_range <- stabilityRange(
  data = golub,
  k.range = k.range,
  bs = 100,
  seed = seed,
  all_metrics = TRUE,
  cbi = cbi
)
stab <- standardizeStabilityData(stab_range)

# Quality across the same k range
qual_range <- qualityRange(
  data = golub,
  k.range = k.range,
  all_metrics = TRUE,
  seed = seed,
  cbi = cbi
)
qual <- standardizeQualityData(qual_range)

# Optimal k computed from the stability and quality results
k_opt <- getOptimalKValue(stab_range, qual_range, k.range = k.range)
optimal_k <- k_opt$Global_optimal_k
optimal_k_str <- paste0("k_", optimal_k)
print(paste0("Optimal k: ", optimal_k))

Loading required package: SummarizedExperiment
Loading required package: GenomicRanges
Loading required package: stats4
Loading required package: S4Vectors

Attaching package: 'S4Vectors'

The following object is masked from 'package:base':

    expand.grid

Loading required package: IRanges

Attaching package: 'IRanges'

The following object is masked from 'package:grDevices':

    windows

Loading required package: GenomeInfoDb
Loading required package: DelayedArray
Loading required package: matrixStats

Attaching package: 'matrixStats'

The following objects are masked from 'package:Biobase':

    anyMissing, rowMedians

Loading required package: BiocParallel

Attaching package: 'DelayedArray'

The following objects are masked from 'package:matrixStats':

    colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges

The following objects are masked from 'package:base':

    aperm, apply, rowsum

Loading required package: MultiAssayExperiment
Loading required package: cluster
Loading

[1] "Optimal k: 3"


In [None]:
# Preview the first rows of the current golub data frame.
head(golub)

In [None]:
# List the distinct class labels.
# Read them from the raw `leukemia` object rather than from `golub`: an
# earlier cell removes the "Class" column from `golub`, after which
# golub["Class"] fails with "undefined columns selected".
unique(as.data.frame(leukemia)["Class"])

In [None]:
# https://rdrr.io/bioc/cancerclass/man/GOLUB.html

In [None]:
# data(GOLUB1)
# golub = as.data.frame(rdata_golub)

In [None]:
# Preview the data and report its dimensions.
head(golub)
print(paste0("Rows: ", nrow(golub)))
print(paste0("Columns: ", ncol(golub)))
# The article states: "The data are then summarized by a 72 × 3,571" (presumably a samples-by-genes matrix — compare with the counts printed above)

In [None]:
# Remove the annotation columns, then report the resulting dimensions.
#golub["class"] = NULL
golub[["Class"]] <- NULL # leukemia
golub[["sample"]] <- NULL
golub[["type"]] <- NULL
golub[["FAB"]] <- NULL
golub[["gender"]] <- NULL
print(paste0("Rows: ", nrow(golub)))
print(paste0("Columns: ", ncol(golub)))
# Disabled alternative: add a "labels" column from the row names and move it first.
#golub["labels"] = rownames(golub)
#golub = golub[ , c("labels", names(golub)[names(golub) != "labels"])] 

In [None]:
# Preview the data frame after the annotation columns were removed.
head(golub)

# Removing correlated <a class="anchor" id="correlated"></a>

In [None]:
# Pairwise correlation matrix of every column except the first one.
# presumably the first column is the sample identifier — verify.
data = golub[-1]
R = cor(data)
head(R)

In [None]:
# Names of the columns that findCorrelation() (presumably caret's) flags for
# removal at the given cutoff, followed by how many were flagged.
# NOTE(review): cutoff = 1 is the largest possible absolute correlation, so
# this presumably flags (almost) nothing — confirm the intended threshold.
cor_metrics = findCorrelation(R, cutoff = 1, verbose = FALSE, names=TRUE)
length(cor_metrics)

# Top 200 <a class="anchor" id="top"></a>

In [None]:
# Variance of every column except the first, sorted from largest to smallest.
# vapply() replaces sapply() so the result is always a named numeric vector,
# whatever the input looks like (sapply() may silently return a list).
variance <- sort(
  vapply(golub[-1], var, numeric(1), na.rm = TRUE),
  decreasing = TRUE
)
gene_var <- as.data.frame(variance)
# Keep the column names alongside their variance.
gene_var["gene"] <- rownames(gene_var)
head(gene_var)

In [None]:
# Keep the first `top_number` rows of the variance-sorted table.
# NOTE(review): the section heading says "Top 200" while 100 is used here —
# confirm which one is intended.
top_number <- 100
top_genes <- gene_var[seq_len(top_number), ]
head(top_genes)

In [None]:
# Convert the one-column "gene" data frame to a list; the result is a
# single-element list (named "gene") holding the vector of gene names.
gene_list = as.list(top_genes["gene"])

In [None]:
# Selected genes that actually exist as columns of golub.
# intersect() is the operation needed here: setdiff() removes the names that
# ARE present in golub, and only returned the genes because gene_list is a
# one-element list that never matched (with a single gene it returned nothing).
top_gene_list <- intersect(unlist(gene_list, use.names = FALSE), names(golub))
top_gene_list

In [None]:
# Subset golub to the selected genes and put a "labels" column (the row
# names) first. drop = FALSE keeps a data frame even when only one (or no)
# gene is selected; without it the subset collapses to a plain vector.
top_golub <- golub[, top_gene_list, drop = FALSE]
top_golub["labels"] <- rownames(top_golub)
top_golub <- top_golub[
  ,
  c("labels", names(top_golub)[names(top_golub) != "labels"]),
  drop = FALSE
]

In [None]:
# Preview the gene-subset data frame.
head(top_golub)

# evaluomeR K analysis <a class="anchor" id="evaluomer"></a>

In [None]:
# Run configuration: seed, range of k to explore and clustering method.
seed <- 13606
k.range <- c(3, 10)
cbi <- "clara"

# From here on the analysis uses the full golub data frame; this replaces
# any top_golub built earlier. The identifier column is named "Description".
top_golub <- golub
names(top_golub)[names(top_golub) == "Case"] <- "Description"

# Stability across the k range
stab_range <- stabilityRange(
  data = top_golub,
  k.range = k.range,
  bs = 100,
  seed = seed,
  all_metrics = TRUE,
  cbi = cbi
)
stab <- standardizeStabilityData(stab_range)

# Quality across the same k range
qual_range <- qualityRange(
  data = top_golub,
  k.range = k.range,
  all_metrics = TRUE,
  seed = seed,
  cbi = cbi
)
qual <- standardizeQualityData(qual_range)

# Optimal k computed from the stability and quality results
k_opt <- getOptimalKValue(stab_range, qual_range, k.range = k.range)
optimal_k <- k_opt$Global_optimal_k
optimal_k_str <- paste0("k_", optimal_k)
print(paste0("Optimal k: ", optimal_k))

In [None]:
# Show the standardized stability and quality tables for the chosen
# clustering method, then the entries stored under the optimal-k column name.
print(paste0("W/ ", cbi))
stab
qual
print(paste0("Stab in k=", optimal_k,": ", stab[optimal_k_str]))
print(paste0("Qual in k=", optimal_k,": ", qual[optimal_k_str]))

# Clusters

In [None]:
# Build a two-column table pairing each individual with its cluster.
#
# qualityResult: object whose assay() converts to a data frame containing
#   the columns "Cluster_position" and "Cluster_labels", each holding a
#   comma-separated string.
# Returns a data frame with columns "Individual" (tokens of Cluster_labels)
#   and "InCluster" (tokens of Cluster_position). Only the first entry of
#   each column is split.
individuals_per_cluster <- function(qualityResult) {
  metrics_df <- as.data.frame(assay(qualityResult))

  # Split the first entry of a comma-separated column into a list of tokens.
  split_field <- function(column_name) {
    raw_text <- as.character(unlist(metrics_df[column_name]))
    as.list(strsplit(raw_text, ",")[[1]])
  }

  positions <- split_field("Cluster_position")
  individuals <- split_field("Cluster_labels")

  membership <- as.data.frame(cbind(individuals, positions))
  colnames(membership) <- c("Individual", "InCluster")
  membership
}

In [None]:
# Recompute the quality results and print, for the selected k, which
# individuals fall in each cluster.
qual_cbi <- qualityRange(
  data = top_golub,
  k.range = k.range,
  all_metrics = TRUE,
  seed = seed,
  cbi = cbi
)
cluster_individuals <- individuals_per_cluster(assay(qual_cbi[optimal_k_str]))
print(paste0("CBI: ", cbi, " - k: ", optimal_k))
standardizeQualityData(qual_cbi)
for (cluster_i in seq_len(optimal_k)) {
  # Comma-joined names of the individuals assigned to this cluster.
  ind_in_cluster <- paste(
    unlist(cluster_individuals[cluster_individuals$InCluster == cluster_i, "Individual"]),
    collapse = ","
  )
  print(paste("Cluster", cluster_i, ":", ind_in_cluster))
  print("---")
}

# Clest <a class="anchor" id="clest"></a>

In [None]:
#top_golub_copy = top_golub
#top_golub[-1]

In [None]:
# -- Clest
#golub_clest = Clest(as.matrix(top_golub_copy[-1]), maxK = 6, alpha=0, B=3, B0=10, beta = 0.01, nstart=100,pca=FALSE,L1=3,silent=TRUE);

In [None]:
#names(golub_clest)
#golub_clest$observedCERs
#golub_clest$K

In [None]:
#?Clest

# PCA <a class="anchor" id="pca"></a>

In [None]:
#PCA
print(length(cluster_individuals$InCluster))
print(nrow(top_golub))
top_golub["inCluster"] = as.numeric(cluster_individuals$InCluster)
unique(cluster_individuals$InCluster)

In [None]:
# Feature matrix for PCA: everything except the identifier and the
# cluster assignment.
pca_matrix <- dplyr::select(top_golub, -Description, -inCluster)
head(pca_matrix)

In [None]:
# Principal component analysis of the feature matrix; scale. = TRUE scales
# every column to unit variance before the decomposition.
pca_result <- prcomp(pca_matrix, scale. = TRUE)

In [None]:
# Principal-component scores together with each row's cluster and identifier.
pca_df <- data.frame(pca_result$x)
pca_df[["Cluster"]] <- as.factor(top_golub[["inCluster"]])
pca_df[["Individual"]] <- top_golub[["Description"]]

# Scatter plot of the first two components, coloured by cluster and
# labelled with the identifier.
ggplot2::ggplot(
  data = pca_df,
  mapping = ggplot2::aes(x = PC1, y = PC2, color = Cluster, label = Individual)
) +
  ggplot2::geom_point(size = 3) +
  ggplot2::geom_text(hjust = 1, vjust = 1) +
  ggplot2::labs(
    title = "PCA of Features",
    x = "Principal Component 1",
    y = "Principal Component 2"
  ) +
  ggplot2::theme_minimal()

# Sensitivity <a class="anchor" id="sensitivity"></a>

In [None]:
library("MLmetrics")

In [None]:
# Bring the class label back from the raw data and show it next to the
# identifier for the first rows.
top_golub["Class"] <- as.data.frame(leukemia)["Class"]
head(top_golub[, c("Description", "Class")])

In [None]:
# Class label -> number used when comparing against cluster assignments.
# The order of the entries matters: map_strings_to_numbers() uses
# names(level_mapping) as the factor levels.
# Mapping used for kmeans with k = 4 (disabled):
#level_mapping <- c("M" = 1, "T" = 2, "MM" = 3, "B" = 4)
# Mapping used for clara with k = 3:
level_mapping <- c("B" = 1, "T" = 2, "M" = 3)


In [None]:
# Translate class labels into their numeric codes.
#
# strings: character vector (or factor) of class labels.
# mapping: named numeric vector, label -> code. Defaults to the global
#   `level_mapping`, so existing one-argument calls keep working.
# Returns a numeric vector the same length as `strings`; labels that are not
#   among names(mapping) become NA.
#
# The code is looked up by name, so the values stored in `mapping` are
# honoured rather than the position of each label within the vector.
map_strings_to_numbers <- function(strings, mapping = level_mapping) {
  as.numeric(unname(mapping[as.character(strings)]))
}
# Store the numeric class code next to the label and display both with
# the identifier.
top_golub[["Class_n"]] <- map_strings_to_numbers(top_golub[["Class"]])
top_golub[, c("Description", "Class", "Class_n")]

In [None]:
# Getting a vector of clusters
actual = as.factor(as.vector(unlist(top_golub["Class_n"])))
predicted <- factor(as.vector(unlist(top_golub["inCluster"])))

print("actual")
actual
print("predicted")
predicted

In [None]:
# Sensitivity of the cluster assignment against the reference classes,
# printed as a percentage with two decimals.
sens <- MLmetrics::Sensitivity(y_true = actual, y_pred = predicted)
sens <- format(round(100 * sens, 2), nsmall = 2)
print(paste0("Sensitivity: ", sens, "%"))

# CER <a class="anchor" id="cer"></a>

In [None]:
# CER between the cluster assignment and the reference classes, printed as
# a percentage with two decimals.
# NOTE(review): CER() is not defined in this file; presumably it comes from
# an attached package — confirm which one and its argument order.
cer <- CER(predicted, actual)
cer = format(round(cer*100, 2), nsmall = 2)
print(paste0("CER: ", cer, "%"))