In [2]:
import os
import torch
import numpy as np
from google.colab import drive
# Mount Google Drive and make the shared project folder the working directory,
# so the relative file names used by the later cells resolve inside it.
DRIVE_MOUNT_POINT = '/content/drive'
PROJECT_DIR = '/content/drive/Shareddrives/Strawberries/Image experiment'

drive.mount(DRIVE_MOUNT_POINT, force_remount=True)
os.chdir(PROJECT_DIR)

# Used to build the '<seed>_embeddings_<size>' file names in the next cell.
embedding_size = 16

Mounted at /content/drive


In [4]:
# Convert the saved torch tensors (genotypes and per-seed embeddings) to CSV.
# The R cells further down read 'fullGeno.csv' and
# '<seed>_embeddings_<size>.csv' with fread().
#
# NOTE: torch.load unpickles its input; only run this on .pt files you trust.

N_SEEDS = 50  # seeds are numbered 1..N_SEEDS; the R loop below uses the same range
CPU = torch.device('cpu')

geno = torch.load('fullGeno.pt', map_location=CPU).type(torch.float32)
np.savetxt('fullGeno.csv', geno.detach().numpy(), delimiter=',')

for random_seed in range(1, N_SEEDS + 1):
    # Build the shared file stem once so the .pt input and .csv output cannot drift apart.
    stem = f'{random_seed}_embeddings_{embedding_size}'
    embeddings = torch.load(f'{stem}.pt', map_location=CPU).type(torch.float32)
    np.savetxt(f'{stem}.csv', embeddings.detach().numpy(), delimiter=',')

In [5]:
# Load rpy2's IPython extension; the cells below that start with %%R depend on it.
%load_ext rpy2.ipython

In [7]:
%%R
# Install rrBLUP and plotly only when they are not already available, so that
# re-running this cell does not download and rebuild them from source each time.
for (pkg in c("rrBLUP", "plotly")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# Attach with library() rather than require(): library() stops with an error
# when a package is missing, whereas require() only warns and returns FALSE,
# which would postpone the failure to the first call of a missing function.
# The attach order is the same as before, so name masking is unchanged.
library(tidyverse)
library(magrittr)
library(rrBLUP)
library(data.table)
library(plotly)

# All file names used by the following R cells are relative to this folder.
setwd("/content/drive/Shareddrives/Strawberries/Image experiment/")

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/src/contrib/rrBLUP_4.6.3.tar.gz'
Content type 'application/x-gzip' length 17817 bytes (17 KB)
downloaded 17 KB


The downloaded source packages are in
	‘/tmp/RtmpG2TUGR/downloaded_packages’
Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/src/contrib/plotly_4.10.4.tar.gz'
Content type 'application/x-gzip' length 3903133 bytes (3.7 MB)
downloaded 3.7 MB


The downloaded source packages are in
	‘/tmp/RtmpG2TUGR/downloaded_packages’


In [8]:
%%R

# Average every column of `x` within accession groups.
#
# Args:
#   x:          table (data frame / tibble / data.table) whose rows are grouped.
#   accessions: grouping labels with one entry per row of `x`; attached to `x`
#               as a temporary `Accession` column. The caller in this notebook
#               passes the one-column table built from the train/test key files.
#
# Returns:
#   A tibble with one row per distinct accession containing the column-wise
#   means (NA values are ignored). The `Accession` column itself is dropped.
takeMean <- function(x, accessions){
  x %>%
    mutate(Accession = accessions) %>%
    group_by(Accession) %>%
    # TRUE instead of T: `T` is an ordinary variable that can be reassigned.
    summarise(across(everything(), ~mean(.x, na.rm = TRUE))) %>%
    select(!Accession)
}

# Genomic prediction (GBLUP via rrBLUP::mixed.solve) of every embedding
# component, repeated for each random train/test split.
#
# For each seed the loop
#   1. reads the train/test key files and the embeddings for that seed,
#   2. averages keys, genotypes and embeddings within accession (takeMean),
#   3. fits one mixed model per embedding component with the test accessions'
#      phenotypes masked as NA,
#   4. prints the mean and max test r^2 and the mean test MSE over components,
#   5. writes the predicted embeddings to
#      '<seed>_rrBLUPpredictedEmbeddings_<embedding_size>.csv'.
embedding_size <- 16

# The genotype file does not depend on the seed, so it is read once here rather
# than once per iteration. takeMean() returns a new table, so `genotypes_raw`
# is left untouched by the per-seed averaging below.
genotypes_raw <- "fullGeno.csv" %>%
  fread() %>%
  tibble()

for(random_seed in 1:50){
  print(random_seed)
  print(embedding_size)

  trainKey <- paste0(random_seed, "_trainKey.csv") %>%
    fread(.)

  testKey <- paste0(random_seed, "_testKey.csv") %>%
    fread(.)

  # Start from the train key and overwrite the rows whose test key is
  # non-negative with the test key, giving one grouping label per row.
  # presumably a negative key marks "row not in this split" -- TODO confirm
  accessions <- trainKey
  accessions[testKey$V1 >= 0] <- testKey$V1[testKey$V1 >= 0]

  embeddings <- paste0(random_seed, '_embeddings_', embedding_size, '.csv') %>%
    fread() %>%
    tibble()

  # Collapse everything to one row per accession; the keys become plain vectors.
  trainKey %<>% takeMean(., accessions) %>% {.$V1}
  testKey %<>% takeMean(., accessions) %>% {.$V1}
  genotypes <- genotypes_raw %>% takeMean(., accessions)
  embeddings %<>% takeMean(., accessions)

  embeddingComponents <- names(embeddings)

  trainingSet <- trainKey >= 0
  testingSet <- testKey >= 0
  genotype_matrix <- genotypes %>%
    as.matrix()
  # Relationship matrix between accessions, used as K in mixed.solve below.
  GRM <- A.mat(genotype_matrix)

  nObservations <- nrow(embeddings)

  # Per-seed accumulators, one entry per embedding component.
  knownEmbeddings <- list()
  predictedEmbeddings <- list()
  trainCorrelations <- c()   # squared correlations (r^2), despite the name
  testCorrelations <- c()    # squared correlations (r^2), despite the name
  testMSEs <- c()

  for (embeddingComponent in embeddingComponents){

    # Re-initialised for every component, so it only ever holds the
    # heritability estimate of the component currently being fitted.
    estimated_h2 <- c()

    phenotype_vector <- embeddings %>% pull(embeddingComponent)
    # # randomize phenotype_vector
    # phenotype_vector <- phenotype_vector[sample(1:length(phenotype_vector), length(phenotype_vector))]

    # Hide the test accessions from the fit.
    # NOTE(review): only rows in testingSet are masked; an accession that is in
    # neither set (if any exist) would still contribute to the fit -- confirm.
    phenotype_vector[testingSet] <- NA

    # Mean of the unmasked phenotypes; removed before fitting and added back to
    # the estimated random effects afterwards.
    training_mean <- mean(phenotype_vector, na.rm = TRUE)

    fit <- mixed.solve(y = (phenotype_vector - training_mean), K = GRM)

    var_genetic <- fit$Vu
    var_residual <- fit$Ve
    h2 <- var_genetic / (var_genetic + var_residual)
    estimated_h2 <- c(estimated_h2, h2)
    # fit$u is indexed below with both trainingSet and testingSet, i.e. it is
    # used as a prediction for the masked accessions as well.
    gblups <- (fit$u) + training_mean
    predictedEmbeddings[[length(predictedEmbeddings) + 1]] <- gblups

    # Reload the unmasked phenotypes to score the predictions.
    phenotype_vector <- embeddings %>% pull(embeddingComponent)
    trainCorrelations <- c(trainCorrelations, cor(gblups[trainingSet], phenotype_vector[trainingSet])[1] ^ 2)
    testCorrelations <- c(testCorrelations, cor(gblups[testingSet], phenotype_vector[testingSet])[1] ^ 2)
    testMSEs <- c(testMSEs, mean((phenotype_vector[testingSet] - gblups[testingSet]) ^ 2))
    # plot(gblups[testingSet], phenotype_vector[testingSet], main = random_seed)

    # print(embeddingComponent)
    # print(paste0("h2: ", estimated_h2))
    # print(paste0("train r2: ", last(trainCorrelations)))
    # print(paste0("test r2: ", last(testCorrelations)))

    knownEmbeddings[[length(knownEmbeddings) + 1]] <- phenotype_vector
  }

  # seq_len() instead of 1:ncol(): 1:0 would yield c(1, 0) for an empty frame.
  predictedEmbeddings <- as.data.frame(predictedEmbeddings)
  names(predictedEmbeddings) <- paste0("V", seq_len(ncol(predictedEmbeddings)))
  knownEmbeddings <- as.data.frame(knownEmbeddings)
  names(knownEmbeddings) <- paste0("V", seq_len(ncol(knownEmbeddings)))

  # Summary over components: mean test r^2, best test r^2, mean test MSE.
  print(mean(testCorrelations))
  print(max(testCorrelations))
  print(mean(testMSEs))

  write_csv(predictedEmbeddings, paste0(random_seed, "_rrBLUPpredictedEmbeddings_", embedding_size, ".csv"))
}

[1] 1
[1] 16
[1] 0.2483369
[1] 0.4150634
[1] 0.2237007
[1] 2
[1] 16
[1] 0.2775886
[1] 0.4614635
[1] 0.2395254
[1] 3
[1] 16
[1] 0.2020326
[1] 0.3047148
[1] 0.2235235
[1] 4
[1] 16
[1] 0.3028441
[1] 0.4874488
[1] 0.2096472
[1] 5
[1] 16
[1] 0.2513002
[1] 0.4748002
[1] 0.2711449
[1] 6
[1] 16
[1] 0.2608804
[1] 0.4837067
[1] 0.2257629
[1] 7
[1] 16
[1] 0.2377897
[1] 0.4182971
[1] 0.2644987
[1] 8
[1] 16
[1] 0.2579146
[1] 0.4686866
[1] 0.2942913
[1] 9
[1] 16
[1] 0.2180009
[1] 0.3812452
[1] 0.2239476
[1] 10
[1] 16
[1] 0.2370858
[1] 0.4652013
[1] 0.2207754
[1] 11
[1] 16
[1] 0.2599135
[1] 0.463293
[1] 0.2434727
[1] 12
[1] 16
[1] 0.2807947
[1] 0.4847596
[1] 0.2479107
[1] 13
[1] 16
[1] 0.2838083
[1] 0.4843348
[1] 0.2045231
[1] 14
[1] 16
[1] 0.2820006
[1] 0.5601331
[1] 0.2051088
[1] 15
[1] 16
[1] 0.2427416
[1] 0.3869353
[1] 0.2110047
[1] 16
[1] 16
[1] 0.2451956
[1] 0.5749733
[1] 0.227413
[1] 17
[1] 16
[1] 0.1954718
[1] 0.4368628
[1] 0.2614778
[1] 18
[1] 16
[1] 0.2575817
[1] 0.4049664
[1] 0.250909
[1] 

|--------------------------------------------------|
|--------------------------------------------------|
|--------------------------------------------------|
|--------------------------------------------------|
|--------------------------------------------------|
|--------------------------------------------------|
|--------------------------------------------------|
|--------------------------------------------------|
|--------------------------------------------------|
|--------------------------------------------------|
|--------------------------------------------------|
|--------------------------------------------------|
|--------------------------------------------------|
|--------------------------------------------------|
|--------------------------------------------------|
|--------------------------------------------------|
|--------------------------------------------------|
|--------------------------------------------------|
|---------------------------------------------