# PCA plotting and generating analysis files
Note: The workflow is shown for the RNA data; the same principles apply to the proteomic and metabolomic data.

All code cells in this notebook are written in R.

In [None]:
# Working directory: the relative paths used in the cells below (input CSVs,
# the saved plot and the exported table) are resolved against this directory.
setwd("/data/RNA/")

In [None]:
# Load packages
library(ggplot2)
library(ggfortify)
library(dplyr)
library(cluster)

## Generate PCA plot for IPS + Neuron + MGL

In [None]:
# Read in data: one Salmon gene table (CSV) per cell type.
# `header = TRUE` is written out in full because the shorthand `T` is an
# ordinary variable that can be reassigned, whereas `TRUE` is a reserved word.
ips <- read.csv(
  "./IPSC_PROCESSED_DATA/salmon/all_samples_salmon_genes_ips.csv",
  header = TRUE
)
neuron <- read.csv(
  "./NEURON_PROCESSED_DATA/salmon/all_samples_salmon_genes_neuron.csv",
  header = TRUE
)
mgl <- read.csv(
  "./MGL_PROCESSED_DATA/salmon/all_samples_salmon_genes_mgl.csv",
  header = TRUE
)

In [None]:
# The three tables are combined column-wise by position, so their rows must
# carry the same ids in the same order. Check this before the id columns are
# discarded; otherwise cbind() would silently pair values from different rows.
stopifnot(
  nrow(ips) == nrow(neuron),
  nrow(ips) == nrow(mgl),
  identical(as.character(ips[[1]]), as.character(neuron[[1]])),
  identical(as.character(ips[[1]]), as.character(mgl[[1]]))
)

# Remove the id column (column 1) from the neuron and mgl datasets so that
# only the copy carried by ips remains
neuron <- neuron[, -1]
mgl <- mgl[, -1]

# Append the neuron and mgl sample columns to ips
ips_neuron_mgl <- cbind(ips, neuron, mgl)

In [None]:
# Sample columns are every column except the first (id) column. TPM_average is
# excluded as well so that re-running this cell does not average the average.
# Deriving the mask replaces the hard-coded range 2:55.
is_sample_col <- seq_along(ips_neuron_mgl) > 1 &
  colnames(ips_neuron_mgl) != "TPM_average"

# Calculate mean TPM across all sample columns, one value per row
row_means <- rowMeans(ips_neuron_mgl[, is_sample_col, drop = FALSE])

# Add column of mean TPM
ips_neuron_mgl$TPM_average <- row_means

# Subset by mean TPM: keep rows whose mean is at least 2
subset_data <- ips_neuron_mgl[ips_neuron_mgl$TPM_average >= 2, ]

In [None]:
# Add status column
# Classify each column name by cell type: names containing "IPSC" give "IPS";
# of the remaining names, those containing "KOLF" give "NEURON"; every other
# name falls through to "MGL".
create_new_row <- function(column_names) {
  is_ips <- grepl("IPSC", column_names)
  is_neuron <- grepl("KOLF", column_names)
  ifelse(is_ips, "IPS", ifelse(is_neuron, "NEURON", "MGL"))
}

# Label every column of subset_data by its name. Columns whose names contain
# neither "IPSC" nor "KOLF" (this includes TPM_average) receive the
# fall-through label "MGL".
new_row_values <- create_new_row(colnames(subset_data))

# Append the labels as one extra row at the bottom of the table.
# NOTE(review): the labels are character strings, so this presumably turns the
# numeric sample columns into character columns; a later cell converts the
# values back with as.numeric().
subset_data <- rbind(subset_data, new_row_values)

In [None]:
# Edit down column names
# Strip the naming prefixes one pattern after another, in this order (the more
# specific pattern of each pair is applied before the shorter one).
prefix_patterns <- c(
  "^TPM_KOLF_IPSC_GBA1_",
  "^TPM_KOLF_IPSC_",
  "^TPM_KOLF_GBA1_",
  "^TPM_KOLF_"
)

new_column_names <- colnames(subset_data)
for (prefix_pattern in prefix_patterns) {
  new_column_names <- gsub(prefix_pattern, "", new_column_names)
}
colnames(subset_data) <- new_column_names

# Print the updated column names
print(colnames(subset_data))

In [None]:
# Need to transpose data so that each row is one original column (a sample)
# and each column is one original row
transposed_matrix <- t(subset_data)

# Convert the matrix into a dataframe
transposed_df <- as.data.frame(transposed_matrix)

# The first row holds the values of the original first (id) column; use them
# as column names. They are read from the matrix as a plain character vector
# instead of assigning a one-row dataframe as names.
col_names <- unname(transposed_matrix[1, ])

# Set column names
colnames(transposed_df) <- col_names

# Remove the first row (now redundant with the column names)
transposed_df <- transposed_df[-1, ]

# Remove the TPM average row, located by name rather than by a fixed row index
transposed_df <- transposed_df[rownames(transposed_df) != "TPM_average", ]

# The status labels were appended as the last row of subset_data, so after
# transposing they form the last column. Its name comes from the label given
# to the id column ("MGL"); rename it to STATUS by position so the rename does
# not depend on that incidental name.
colnames(transposed_df)[ncol(transposed_df)] <- "STATUS"

In [None]:
# Keep the sample names (currently the row names) in an explicit ID column
transposed_df[["ID"]] <- rownames(transposed_df)

# Report the table size as c(rows, columns)
dimensions <- dim(transposed_df)
print(dimensions)

In [None]:
# Make PCA df: keep every expression column and leave out the two annotation
# columns (ID and STATUS). They are excluded by name, which replaces the
# hard-coded column count 1:12205.
is_annotation_col <- colnames(transposed_df) %in% c("ID", "STATUS")
pca_df <- transposed_df[, !is_annotation_col, drop = FALSE]
rownames(pca_df) <- NULL

# Convert every column to numeric. vapply() fixes the result type (one numeric
# vector of length nrow per column), unlike sapply(), whose return type depends
# on the input.
pca_df <- vapply(pca_df, as.numeric, numeric(nrow(pca_df)))

In [None]:
# Plot PCA
# Run the PCA on the numeric matrix, with every variable rescaled to unit
# variance
pca_res <- prcomp(pca_df, scale. = TRUE)

# Ensure STATUS is a factor and ordered correctly.
# The status labels were created as "IPS", while the factor levels and the
# colour keys below use "IPSC". Without recoding, factor() turns every "IPS"
# entry into NA and the iPSC samples lose their group. as.character() also
# makes this safe if the cell is re-run on an existing factor.
status_labels <- as.character(transposed_df$STATUS)
status_labels[status_labels %in% "IPS"] <- "IPSC"
transposed_df$STATUS <- factor(status_labels, levels = c("IPSC", "NEURON", "MGL"))

# CHANGE TO D409!
# Relabel D448 as D409 in both the row names and the ID column used for labels
rownames(transposed_df) <- gsub("^D448", "D409", rownames(transposed_df))
transposed_df$ID <- gsub("D448", "D409", transposed_df$ID)

# Define custom colors for cell types
custom_colors <- c("IPSC" = "#E41A1C", "NEURON" = "#377EB8", "MGL" = "#4DAF4A")  # Red, Blue, Green

# PCA Plot with modified settings
# The label is mapped to the ID column of the data passed to autoplot(), not
# to an external vector pulled in with `$` inside aes().
# NOTE(review): recent ggplot2 releases prefer `linewidth` over `size` in
# element_rect(); `size` is kept so older installations keep working.
pca_plot <- autoplot(pca_res, data = transposed_df, frame = TRUE, frame.colour = "STATUS") +
  geom_text(aes(label = ID), size = 2, check_overlap = TRUE) +
  ggtitle("Differentiations RNAseq") +
  scale_color_manual(name = "Cell Type", values = custom_colors) +  # Rename legend key and set colors
  scale_fill_manual(name = "Cell Type", values = custom_colors) +   # Apply colors to filled areas
  theme_minimal() +  # Use a minimal theme with a white background
  theme(
    panel.background = element_rect(fill = "white", color = NA),  # White background
    plot.background = element_rect(fill = "white", color = NA),
    legend.title = element_text(size = 14, face = "bold"),  # Format legend title
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    legend.text = element_text(size = 10),
    legend.position = "right",  # Keep legend on the right
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)  # no trailing comma: avoids an empty argument
  )

# Display the plot, then save it as a PNG in the working directory
pca_plot
ggsave("RNA_PCA_all.png", plot = pca_plot, width = 8, height = 6, dpi = 600)


### Now that the PCA has been plotted, generate the analysis table, which combines the expression data, the sample group annotations and the principal components (PCs)
Only the iPSC dataset is shown here.

In [None]:
# Read in data (`TRUE` spelled out; the shorthand `T` can be reassigned)
data <- read.csv(
  "./IPSC_PROCESSED_DATA/salmon/all_samples_salmon_genes_ips.csv",
  header = TRUE
)

# Positions of the sample columns: every column except the first (id) column.
# Deriving them replaces the hard-coded range 2:19.
sample_cols <- seq_along(data)[-1]

# Calculate mean TPM
row_means <- rowMeans(data[, sample_cols, drop = FALSE])
# Add column of mean TPM (appended last, so sample_cols stays valid)
data$TPM_average <- row_means
# Subset by mean TPM: keep rows whose mean is at least 2
subset_data <- data[data$TPM_average >= 2, ]

# Edit down column names for cleaner plot
new_column_names <- gsub("^TPM_KOLF_IPSC_GBA1_", "", colnames(subset_data))
colnames(subset_data) <- new_column_names
new_column_names <- gsub("^TPM_KOLF_IPSC_", "", colnames(subset_data))
colnames(subset_data) <- new_column_names

# Print the updated column names
print(colnames(subset_data))

# Need to transpose data so that rows are samples and columns are ids.
# Only the numeric sample columns are transposed. Transposing the whole table
# would coerce every value to text because of the id column, sending the
# numbers through a formatted-string round trip. It would also leave an id row
# and a TPM_average row to delete by position afterwards.
expression_matrix <- t(as.matrix(subset_data[, sample_cols, drop = FALSE]))

# Use the values of the first (id) column as column names
colnames(expression_matrix) <- as.character(subset_data[[1]])

# Convert the matrix into a dataframe (row names are the sample names)
transposed_df <- as.data.frame(expression_matrix)

# Check dimensions
dimensions <- dim(transposed_df)

# Print the dimensions
print(dimensions)

In [None]:
# Make PCA df: use every expression column instead of the hard-coded range
# 1:12109. A sample-label column (ID or Sample) is skipped if one is already
# present, e.g. when this cell is re-run after the later cells.
is_label_col <- colnames(transposed_df) %in% c("ID", "Sample")
pca_df <- transposed_df[, !is_label_col, drop = FALSE]
rownames(pca_df) <- NULL

# Convert every column to numeric. vapply() fixes the result type (one numeric
# vector of length nrow per column), unlike sapply().
pca_df <- vapply(pca_df, as.numeric, numeric(nrow(pca_df)))

In [None]:
# Run the PCA (variables rescaled to unit variance) and keep the sample scores
pca_res <- prcomp(pca_df, scale. = TRUE)
pc_scores <- pca_res[["x"]]

# Table of the first ten PCs, one row per sample, keyed by the sample names
# taken from the row names of transposed_df
sample_names <- rownames(transposed_df)
pc_data <- data.frame(Sample = sample_names, pc_scores[, seq_len(10)])

print(pc_data)

In [None]:
# Turn the row names of the transposed dataset back into a regular first column
# Copy the row names into an ID column
transposed_df[["ID"]] <- rownames(transposed_df)

# Move that column to the front and rename it to Sample in one select() call
transposed_df <- select(transposed_df, Sample = ID, everything())

# The names now live in Sample, so reset the row names to the default sequence
rownames(transposed_df) <- NULL

head(transposed_df)

In [None]:
# Make sure transposed entries are numeric except sample.
# across() is the current dplyr idiom; it replaces the superseded
# mutate_at()/vars() pair and converts every column other than Sample.
transposed_df <- transposed_df %>%
  mutate(across(-Sample, as.numeric))

head(transposed_df)

In [None]:
# Merge with PCs: join the expression table and the PC table on their shared
# Sample column
pc_genes <- merge(x = transposed_df, y = pc_data, by = "Sample")

In [None]:
# Check that dimensions match expected
# Print c(rows, columns) for the expression table, the PC table and the merged
# table, in that order
for (dimensions in list(dim(transposed_df), dim(pc_data), dim(pc_genes))) {
  print(dimensions)
}

In [None]:
# Now add in column with GBA groups 1-4
# The conditions are tested top to bottom and the first match wins:
# "KO" -> 4, "HOM" -> 3, "HET" -> 2, "WT" -> 1, anything else -> NA.
# The codes are integer literals (4L, ...) so every branch has the same type as
# NA_integer_; dplyr versions before 1.1 reject a mix of double and integer
# branches in case_when().
pc_genes <- pc_genes %>%
  mutate(GBA_Group = case_when(
    grepl("KO", Sample) ~ 4L,
    grepl("HOM", Sample) ~ 3L,
    grepl("HET", Sample) ~ 2L,
    grepl("WT", Sample) ~ 1L,
    TRUE ~ NA_integer_
  )) %>%
  # Put Sample and the new group code first, followed by all other columns
  select(Sample, GBA_Group, everything())

head(pc_genes)

In [None]:
# Now add in group column without replicates
# Group labels in priority order: when a sample name contains several of them,
# the label listed first is the one that is kept.
group_labels <- c(
  "KO",
  "D448H_HOM",
  "D448V_HOM",
  "D448V_HET",
  "D448H_HET",
  "WT"
)

# Start with NA (no label found) and apply the labels from lowest to highest
# priority, so that a higher-priority match overwrites a lower one
group_of_sample <- rep(NA_character_, nrow(pc_genes))
for (label in rev(group_labels)) {
  group_of_sample[grepl(label, pc_genes$Sample)] <- label
}
pc_genes$Group <- group_of_sample

# Put Sample and Group first, followed by all other columns
pc_genes <- select(pc_genes, Sample, Group, everything())

head(pc_genes)

In [None]:
# Reorganize: sample annotations first, then PC1 to PC10 in order, then all
# remaining columns
pc_columns <- paste0("PC", 1:10)
pc_genes <- select(
  pc_genes,
  Sample, Group, GBA_Group, all_of(pc_columns), everything()
)
head(pc_genes)

In [None]:
# Write table: save the final analysis table as CSV in the working directory,
# without a row-name column
write.csv(x = pc_genes, file = "analysis_table_ips.csv", row.names = FALSE)