# Big Data Analytics - Project - 2021

### Authors: Kirill ??, Lilith Feer, Luca Guenin, Mark Martori Lopez, Remo ??
### Matrikel-Nr.: ???, 16720799, 16609521, 19759869, ???

## Goal

### Goal Description

# 

## Packages & Libraries

In [None]:
# One-time setup cell. The commented-out lines document how the remaining
# dependencies were installed; uncomment them on a fresh machine.
#install.packages("BiocManager")
#BiocManager::install("Rgraphviz")

#install.packages('CePa', .libPaths(), repos='http://cran.us.r-project.org')
# or
#BiocManager::install("cmapR")


#install.packages("data.table")
#install.packages("dplyr")
# Install Hmisc only when it is missing, so that re-running this cell does not
# download and reinstall the package every time.
if (!requireNamespace("Hmisc", quietly = TRUE)) {
  install.packages("Hmisc")
}
#BiocManager::install("outliers")

In [None]:
library(ggplot2)
#library(CePa)
library(data.table)
library(dplyr) 
library(tidyr)
library(cmapR)
library(outliers)
library(Hmisc)

## Dataset
### Description:
#### Pancreatic cancer arises from the abnormal and uncontrolled growth of cells in the tissues of the pancreas. Pancreatic adenocarcinoma (PAAD) is the most common type of pancreatic cancer, accounting for approximately 85% of all types of pancreatic cancer. This cancer is the twelfth most common cancer and the seventh leading cause of cancer-related death. [01]

#### The dataset contains normalized RNA sequencing reads for pancreatic cancer tumors. The measurements cover ~20,000 genes for 185 pancreatic cancer tumors. The file format is GCT, a tab-delimited format used for sharing gene expression data together with metadata (details for each sample).

### Link:
### https://www.kaggle.com/abhiparashar/cancer-prediction?select=PAAD.gct

### Load:

In [None]:
# Two ways to load the GCT file are kept here; only the second one is active.
# Option 1 - dedicated GCT reader:
# original_dataset <- read.gct("Data/PAAD.gct")

# Option 2 - generic tab-separated import with fread(). The first three lines
# of the file are skipped so that the following line is used as the header
# (intended to set the participant IDs as column names).
original_dataset <- fread(
  "Data/PAAD.gct",
  sep = "\t",
  header = TRUE,
  quote = "",
  skip = 3
)

In [None]:
# Size of the raw table (rows, columns)
dim(original_dataset)

# Column-by-column structure (disabled)
#str(original_dataset)

# Column names of the table, kept as the patient identifiers
patients_IDs <- names(original_dataset)

# First column of the table, kept as the feature labels
rownames <- original_dataset[, 1L]

# Object attributes (disabled)
#attributes(original_dataset)

# Preview of the first 20 rows
head(original_dataset, n = 20)

### Separate the two data sets (meta data and expression data)

In [None]:
# Rows 1-123 are taken as the patient meta data, every row from 124 to the end
# of the table as the expression data.
Patient_data <- original_dataset[1:123, ]
Expression_data <- original_dataset[124:nrow(original_dataset), ]

# Display both parts
Patient_data
Expression_data

### Transpose the data sets

In [None]:
# Transpose a table that carries its row labels in a `participant_id` column.
#
# The values of `participant_id` become the column names of the result. The
# first row of the transposed table is dropped: it holds the transposed
# `participant_id` values, assuming `participant_id` is the first column of
# `dataset`.
#
# dataset: table (data.table or data.frame) with a `participant_id` column.
# Returns: the transposed table with nrow = ncol(dataset) - 1 and one column
#          per row of `dataset`.
transposefunction <- function(dataset) {
    if (!"participant_id" %in% names(dataset)) {
        stop("`dataset` must contain a `participant_id` column", call. = FALSE)
    }
    new_colnames <- dataset[["participant_id"]]
    transposed_dataset <- transpose(dataset)
    # Number the rows from the table being transposed, not from the global
    # `original_dataset`, so the function does not depend on outside state.
    rownames(transposed_dataset) <- seq_len(nrow(transposed_dataset))
    colnames(transposed_dataset) <- new_colnames
    # Drop row 1; seq_len(n)[-1] is also safe when only one row exists.
    keep_rows <- seq_len(nrow(transposed_dataset))[-1]
    transposed_dataset[keep_rows, ]
}

# Transpose both parts with the helper defined in this cell
t_patient_data <- transposefunction(dataset = Patient_data)
t_expression_data <- transposefunction(dataset = Expression_data)

# Dimensions (rows, columns) of the transposed tables
dim(t_expression_data)
dim(t_patient_data)

In [None]:
# Coerce every column of the expression table to numeric, going through
# character first; entries that cannot be parsed as numbers become NA.
t_expression_data[] <- lapply(
  t_expression_data,
  function(column) as.numeric(as.character(column))
)

# Total number of cells (rows * columns)
dimensions <- dim(t_expression_data)
dimensions[[1]] * dimensions[[2]]

# Count of missing (TRUE) versus present (FALSE) cells
table(is.na(t_expression_data))

# Summary of the missing-value indicator
summary(is.na(t_expression_data))
## there are 134'954 missing values (from 3'379'095 total) in the expression data set

In [None]:
# Count of missing (TRUE) versus present (FALSE) cells in the patient meta data
table(is.na(t_patient_data))

# Total number of cells (rows * columns)
dimensions <- dim(t_patient_data)
dimensions[[1]] * dimensions[[2]]

# Summary of the missing-value indicator
summary(is.na(t_patient_data))
## 9654 of 22632 are missing values --> but some have a meaning (e.g. when patients haven't died days_to_death is NA)

In [None]:
# Preview the first 20 rows of the transposed patient meta data
head(t_patient_data, n = 20)

In [None]:
# Preview the first 20 rows of the transposed expression data
head(t_expression_data, n = 20)

### Histograms and outlier check using Grubbs' test

In [None]:
# Draw a histogram and print a Grubbs outlier test for every numeric column.
#
# df: table (data.frame or data.table) whose columns are tested one by one.
# Returns: NULL, invisibly; the function is called for its plots and printed
#          test results.
#
# Columns are read with `[[`, which works for both data.frame and data.table.
# Missing values are removed before plotting and testing. Columns that are not
# numeric, have fewer than three non-missing values, or hold a single distinct
# value are skipped with a message, because an outlier test is not meaningful
# for them.
outlier_fun <- function(df) {
  column_names <- names(df)
  # seq_len() yields an empty loop for a table without columns (1:0 would not)
  for (columnindex in seq_len(ncol(df))) {
    column_label <- column_names[columnindex]
    values <- df[[columnindex]]
    if (!is.numeric(values)) {
      message("Skipping non-numeric column: ", column_label)
      next
    }
    values <- values[!is.na(values)]
    if (length(values) < 3L || length(unique(values)) < 2L) {
      message("Skipping column without enough distinct values: ", column_label)
      next
    }
    hist(values, main = paste("Histogram of", column_label), xlab = column_label)
    print(grubbs.test(values))
  }
  invisible(NULL)
}

# Run the histogram + Grubbs check on every column of the expression table
outlier_fun(df = t_expression_data)
# if the p-value is less than the chosen significance threshold (generally α = 0.05) then the 
# null hypothesis is rejected and we will conclude that the lowest/highest value is an outlier.

## Patient meta data histograms 

### Numeric columns

In [None]:
# Infer a type for every patient meta data column.
# `as.is = TRUE` is passed explicitly: without it, type.convert() warns for
# each column on R >= 4.0 and, on older R versions, turns text into factors.
# With it, text columns stay character on every R version.
t_patient_data <- t_patient_data %>%
    mutate_all(type.convert, as.is = TRUE) %>%
    mutate_if(is.factor, as.character)

# only numeric values
df_patient_num <- t_patient_data %>% select_if(is.numeric)

# Report how many numeric columns were found
sprintf("%d numeric columns", ncol(df_patient_num))

In [None]:
# Histograms (bin width 1) of numeric patient columns 1-9, one facet per column
df_patient_num[, 1:9] %>%
  gather(cols, value) %>%
  ggplot(aes(x = value)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(. ~ cols, scales = "free")

In [None]:
# Histograms (bin width 1) of numeric patient columns 10-18, one facet per column
df_patient_num[, 10:18] %>%
  gather(cols, value) %>%
  ggplot(aes(x = value)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(. ~ cols, scales = "free")

In [None]:
# Histograms (bin width 1) of numeric patient columns 19-28, one facet per column
df_patient_num[, 19:28] %>%
  gather(cols, value) %>%
  ggplot(aes(x = value)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(. ~ cols, scales = "free")

## Correlations - Pearson - Patients Data

In [None]:
# All Character Fields as Factors? 

# Re-infer the column types, then turn every remaining character column into
# a factor. `as.is = TRUE` is passed explicitly so that type.convert() does
# not warn per column (R >= 4.0) and behaves the same on every R version; the
# factor conversion is done by the mutate_if() step.
t_patient_data <- t_patient_data %>%
    mutate_all(type.convert, as.is = TRUE) %>%
    mutate_if(is.character, as.factor)

In [None]:
# WAY 1: base R cor()

# Pearson correlations between the numeric patient features, computed on
# pairwise complete observations and rounded to two decimals
corrPatient <- round(
  cor(df_patient_num, method = "pearson", use = "pairwise.complete.obs"),
  digits = 2
)


# Keep only the columns that are not NA in every row
has_value <- colSums(is.na(corrPatient)) < nrow(corrPatient)
corrPatientnoNA <- corrPatient[, has_value]


# Replace the remaining NA correlations by 0 (for the correlation output only)
corrPatient[is.na(corrPatient)] <- 0



In [None]:
# First 10 rows of the correlation matrix.
# A value printed as 0 was NA before the replacement; 0.00 is a true zero.
head(corrPatient, n = 10)

In [None]:
# WAY 2: Hmisc
# Install Hmisc only when it is missing, so re-running the cell does not
# reinstall the package every time.
if (!requireNamespace("Hmisc", quietly = TRUE)) {
  install.packages("Hmisc")
}
library(Hmisc)


# rcorr() returns a list (r = correlations, n = observation counts,
# P = p-values). round() cannot be applied to that list, so the result is
# stored as returned and only the coefficients are rounded to two decimals.
# The p-values are left unrounded so that small values are not shown as 0.
corrPatient2 <- rcorr(as.matrix(df_patient_num)) # It already is numeric - rcorr uses pairwise deletion by default
corrPatient2$r <- round(corrPatient2$r, 2)



In [None]:
# WAY 2

# This function returns a Table with Rownames, Colnames, Correlation Coefficient, p-value
# Flatten a correlation matrix and its p-value matrix into a long table.
#
# cormat: square matrix of correlation coefficients with row names.
# pmat:   matrix of p-values with the same layout as `cormat`.
# Returns: data.frame with one row per cell of the upper triangle (diagonal
#          excluded) and the columns row, column, cor and p.
flattenCorrMatrix <- function(cormat, pmat) {
  upper_mask <- upper.tri(cormat)
  labels <- rownames(cormat)
  # Row and column position of every upper-triangle cell
  row_position <- row(cormat)[upper_mask]
  col_position <- col(cormat)[upper_mask]
  data.frame(
    row = labels[row_position],
    column = labels[col_position],
    cor = cormat[upper_mask],
    p = pmat[upper_mask]
  )
}

### Table of correlations

In [None]:
# WAY 2

# Correlation table: one row per feature pair, with the correlation
# coefficient and its p-value

flattenCorrMatrix(cormat = corrPatient2$r, pmat = corrPatient2$P)

### Plot

In [None]:
# WAY 1: base R heatmap of the correlation matrix

heatmap(x = corrPatient)


In [None]:
# PLOT WAY 2:
# Install corrplot only when it is missing, so re-running the cell does not
# reinstall the package every time.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
library(corrplot)

# corrplot() needs the correlation matrix itself, not the whole rcorr() result
# (a list holding r, n and P).
corrPatient2_r <- corrPatient2$r
# Undefined correlations are set to 0, as done for WAY 1, because the "hclust"
# ordering cannot cluster a matrix that contains NA.
corrPatient2_r[is.na(corrPatient2_r)] <- 0

corrplot(corrPatient2_r, type = "upper", order = "hclust", 
         tl.col = "black", tl.srt = 45)

In [None]:
# PLOT WAY 3:

#install.packages("PerformanceAnalytics")
library(PerformanceAnalytics)
# chart.Correlation() computes the correlations itself from raw observations,
# so it is given the first five numeric patient columns instead of a slice of
# the already-computed correlation matrix. as.data.frame() makes the column
# selection independent of whether the table is a data.table or a data.frame.
chart.Correlation(as.data.frame(df_patient_num)[, 1:5], histogram = TRUE, pch = 19)



## References

#### 01 - Baek, B., Lee, H. Prediction of survival and recurrence in patients with pancreatic cancer by integrating multi-omics data. Sci Rep 10, 18951 (2020). https://doi.org/10.1038/s41598-020-76025-1