In [None]:
import torch
import re
import torchvision
import pandas as pd
import cv2
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import transforms
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import io
import os
import copy
from sklearn.neighbors import NearestNeighbors

# --- Size constants used by the notebook ---
batch_size = 2 * 10  # evaluates to 20
sentenceSize = 10
wordEmbeddingSize = 300
# Product of the two sizes above.
sentenceEmbeddingSize = sentenceSize * wordEmbeddingSize

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Colab only: (re)mount Google Drive and make the shared project folder the
# working directory.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
os.chdir('/content/drive/Shareddrives/Strawberries')

Mounted at /content/drive


In [None]:
# Load the rpy2 IPython extension; the following cells start with the %%R cell
# magic and need it to be loaded first.
%load_ext rpy2.ipython

In [None]:
%%R
# Install the CRAN packages this notebook needs only when they are not already
# present, so re-running the cell does not download and rebuild them each time.
for (pkg in c("rrBLUP", "plotly")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# library() stops with an error if a package is missing, whereas require()
# only warns and returns FALSE, which would surface later as an obscure
# "could not find function" failure. Attach order is kept as before so that
# name masking between packages is unchanged.
library(tidyverse)
library(magrittr)
library(rrBLUP)
library(data.table)
library(plotly)

setwd("/content/drive/Shareddrives/Strawberries/")

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/src/contrib/rrBLUP_4.6.3.tar.gz'
Content type 'application/x-gzip' length 17817 bytes (17 KB)
downloaded 17 KB


The downloaded source packages are in
	‘/tmp/Rtmp2seBOv/downloaded_packages’
Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)
also installing the dependencies ‘lazyeval’, ‘crosstalk’

trying URL 'https://cran.rstudio.com/src/contrib/lazyeval_0.2.2.tar.gz'
trying URL 'https://cran.rstudio.com/src/contrib/crosstalk_1.2.1.tar.gz'
trying URL 'https://cran.rstudio.com/src/contrib/plotly_4.10.4.tar.gz'

The downloaded source packages are in
	‘/tmp/Rtmp2seBOv/downloaded_packages’
Loading required package: tidyverse
Loading required package: magrittr

Attaching package: ‘magrittr’

The following object is masked from ‘package:purrr’:

    set_names

The following object is masked from ‘package:tidyr’:

    extract

Loading required p

In [None]:
%%R

# Collapse `x` to one row per accession by averaging every column.
#
# Args:
#   x:          data frame-like table whose columns are to be averaged.
#   accessions: grouping key attached to `x` as the column `Accession`
#               (must line up with the rows of `x`).
#
# Returns:
#   The per-group column means (NA values ignored) with the `Accession`
#   grouping column removed.
takeMean <- function(x, accessions){
  keyed <- mutate(x, Accession = accessions)
  byAccession <- group_by(keyed, Accession)
  groupMeans <- summarise(
    byAccession,
    across(everything(), function(column) mean(column, na.rm = TRUE))
  )
  select(groupMeans, !Accession)
}

# Results table: one row per (seed, embedding component) holding the
# heritability estimate and the squared train/test correlations between the
# GBLUP predictions and the observed embedding values.
correlations <- data.frame(seed = integer(), trait = character(), h2 = numeric(), train_r2 = numeric(), test_r2 = numeric())

for(random_seed in 47:50){
  print(random_seed)

  trainKey <- paste0("Image experiment/", random_seed, "_trainKey.csv") %>%
    fread(file = .)

  testKey <- paste0("Image experiment/", random_seed, "_testKey.csv") %>%
    fread(file = .)

  # Single grouping key for all rows: start from the train key and overwrite
  # the entries for which the test key is non-negative with the test key value.
  accessions <- trainKey
  accessions[testKey$V1 >= 0] <- testKey$V1[testKey$V1 >= 0]

  genotypes <- "Image experiment/fullGeno.csv" %>%
    fread() %>%
    tibble()

  embeddings <- "Text experiment/embeddings.csv" %>%
    fread(file = ., header = F) %>%
    tibble()

  # Average everything within accession so keys, genotypes and embeddings all
  # have one row per accession.
  trainKey %<>% takeMean(., accessions) %>% {.$V1}
  testKey %<>% takeMean(., accessions) %>% {.$V1}
  genotypes %<>% takeMean(., accessions)
  embeddings %<>% takeMean(., accessions)

  embeddingComponents <- names(embeddings)

  # A non-negative averaged key marks membership in the respective set.
  trainingSet <- trainKey >= 0
  testingSet <- testKey >= 0
  genotype_matrix <- genotypes %>%
    as.matrix()
  GRM <- A.mat(genotype_matrix)

  nObservations <- nrow(embeddings)

  knownEmbeddings <- list()
  predictedEmbeddings <- list()
  trainCorrelations <- c()
  testCorrelations <- c()
  testMSEs <- c()

  for (embeddingComponent in embeddingComponents){

    phenotype_vector <- embeddings %>% pull(embeddingComponent)
    # Optional permutation control: shuffle the phenotypes before fitting.
    # phenotype_vector <- phenotype_vector[sample(1:length(phenotype_vector), length(phenotype_vector))]

    # Mask the test rows so the model is fitted on training rows only.
    phenotype_vector[testingSet] <- NA

    # Fit on the mean-centred phenotype; the mean is added back to the random
    # effects below to obtain predictions on the original scale.
    fit <- mixed.solve(y = (phenotype_vector - mean(phenotype_vector, na.rm = T)), K = GRM)
    var_genetic <- fit$Vu
    var_residual <- fit$Ve
    h2 <- var_genetic / (var_genetic + var_residual)
    gblups <- (fit$u) + mean(phenotype_vector, na.rm = T)
    predictedEmbeddings[[length(predictedEmbeddings) + 1]] <- gblups

    # Reload the unmasked phenotype to score the predictions on both sets.
    phenotype_vector <- embeddings %>% pull(embeddingComponent)
    train_r2 <- cor(gblups[trainingSet], phenotype_vector[trainingSet])[1] ^ 2
    test_r2 <- cor(gblups[testingSet], phenotype_vector[testingSet])[1] ^ 2
    trainCorrelations <- c(trainCorrelations, train_r2)
    testCorrelations <- c(testCorrelations, test_r2)
    testMSEs <- c(testMSEs, mean((phenotype_vector[testingSet] - gblups[testingSet]) ^ 2))

    print(embeddingComponent)
    print(paste0("h2: ", h2))
    print(paste0("train r2: ", train_r2))
    print(paste0("test r2: ", test_r2))

    knownEmbeddings[[length(knownEmbeddings) + 1]] <- phenotype_vector

    # Append exactly ONE row for this component. Passing the whole running
    # vectors (trainCorrelations / testCorrelations) here would add one row per
    # component processed so far, all labelled with the current trait and h2.
    correlations <- rbind(correlations, data.frame(seed = random_seed, trait = embeddingComponent, h2 = h2, train_r2 = train_r2, test_r2 = test_r2))
    gc()
  }

  predictedEmbeddings <- as.data.frame(predictedEmbeddings)
  names(predictedEmbeddings) <- paste0("V", 1:ncol(predictedEmbeddings))
  knownEmbeddings <- as.data.frame(knownEmbeddings)
  names(knownEmbeddings) <- paste0("V", 1:ncol(knownEmbeddings))

  print(mean(testCorrelations))
  print(max(testCorrelations))
  print(mean(testMSEs))

  # NOTE(review): `correlations` is not reset between seeds, so each per-seed
  # file contains the rows of every seed processed so far — confirm intended.
  write.csv(correlations, paste0("Text experiment/", random_seed, "_rrBLUPextractedTraitCorrelations.csv"))
  write_csv(predictedEmbeddings, paste0("Text experiment/", random_seed, "_rrBLUPpredictedEmbeddings.csv"))
}

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[1] "V288"
[1] "h2: 0.172745025102196"
[1] "train r2: 0.333400273934752"
[1] "test r2: 0.280800344820043"
[1] "V289"
[1] "h2: 0.0727499917960722"
[1] "train r2: 0.179742993449439"
[1] "test r2: 0.0708252177677938"
[1] "V290"
[1] "h2: 0.187448493000024"
[1] "train r2: 0.324850483747117"
[1] "test r2: 0.291919714363788"
[1] "V291"
[1] "h2: 0.362970772398223"
[1] "train r2: 0.475602392172458"
[1] "test r2: 0.260223810204036"
[1] "V292"
[1] "h2: 0.109417826476657"
[1] "train r2: 0.208050958530703"
[1] "test r2: 0.0117916107552643"
[1] "V293"
[1] "h2: 0.10367590704736"
[1] "train r2: 0.200757593754422"
[1] "test r2: 0.1337816810186"
[1] "V294"
[1] "h2: 0.139061355574081"
[1] "train r2: 0.269216559784415"
[1] "test r2: 0.343610165867924"
[1] "V295"
[1] "h2: 0.0503978414062505"
[1] "train r2: 0.127276150230699"
[1] "test r2: 0.00324281334972744"
[1] "V296"
[1] "h2: 0.211582363597289"
[1] "train r2: 0.360310799046194"
[1] "test r

|--------------------------------------------------|
