In [None]:
import os
import numpy as np
import pandas as pd

from google.colab import drive
# Mount Google Drive (forcing a fresh mount), then make the shared drive the
# working directory so the relative paths used in later cells resolve.
drive.mount('/content/drive', force_remount=True)
os.chdir('/content/drive/Shareddrives/Strawberries')

Mounted at /content/drive


In [None]:
# Create df of raw text features
# Each description becomes a row of Unicode code points (one column per
# character position), right-padded with zeros to a common width, and the
# resulting matrix is written out as a header-less CSV.
path = "Text experiment/textPhenotypes.csv"
sentences = list(pd.read_csv(path, header=0)['AI_Description'].to_numpy())
rawTextFeatures = [np.array([ord(j) for j in i]) for i in sentences]
# Keep the historical minimum of 40 columns, but widen when a description is
# longer: np.pad raises on a negative pad width, so a fixed 40 would fail on
# any description with more than 40 characters.
padWidth = max([40] + [len(i) for i in rawTextFeatures])
rawTextFeatures = [np.pad(i, (0, padWidth - len(i)), 'constant') for i in rawTextFeatures]
rawTextFeatures = np.array(rawTextFeatures)
np.savetxt("Text experiment/rawTextFeatures.csv", rawTextFeatures, delimiter=",")

In [None]:
# Load the rpy2 IPython extension; it provides the %%R cell magic used by the R cells in this notebook.
%load_ext rpy2.ipython

In [None]:
%%R
# Install rrBLUP and plotly into the R session's library.
install.packages("rrBLUP")
install.packages("plotly")

# Attach the packages used by the R cells, in the same order as before so
# that function masking between packages is unchanged.
require(tidyverse)
require(magrittr)
require(rrBLUP)
library(data.table)
require(plotly)

# Resolve the relative paths used below against the shared drive root.
setwd("/content/drive/Shareddrives/Strawberries/")

In [None]:
%%R

takeMean <- function(x, accessions){
  # Average the rows of `x` that share an accession label.
  #
  # Args:
  #   x:          data frame whose columns are averaged with mean().
  #   accessions: grouping labels, one per row of `x`.
  #
  # Returns:
  #   A data frame with one row per distinct label holding the per-group
  #   column means (NA values are ignored). The label column itself is
  #   dropped, so the result has the same columns as `x`.
  #
  # `TRUE` is spelled out because `T` is an ordinary, re-assignable binding
  # and not a reserved word.
  x %>%
    mutate(Accession = accessions) %>%
    group_by(Accession) %>%
    summarise(across(everything(), ~mean(.x, na.rm = TRUE))) %>%
    select(!Accession)
}

# Genomic prediction of every raw text feature column, repeated for 50
# train/test splits. For each split and each feature column a mixed model is
# fitted with rrBLUP::mixed.solve on the training accessions only, and the
# resulting predictions are scored against the observed values.

# Running results table: exactly one row per (seed, trait) is appended below.
# The table is cumulative across seeds.
correlations <- data.frame(seed = integer(), trait = character(), h2 = numeric(), train_r2 = numeric(), test_r2 = numeric())

# The marker table and the raw text-feature table do not depend on the seed,
# so they are read once here instead of once per iteration.
rawGenotypes <- "Image experiment/fullGeno.csv" %>%
  fread() %>%
  tibble()

rawPhenotypes <- "Text experiment/rawTextFeatures.csv" %>%
  fread(file = ., header = FALSE) %>%
  tibble()

for(random_seed in 1:50){
  print(random_seed)

  # Split keys for this seed. Below, a value >= 0 marks membership of the
  # respective set.
  # NOTE(review): the non-negative values are presumably accession ids - confirm.
  trainKey <- paste0("Image experiment/", random_seed, "_trainKey.csv") %>%
      fread(file = .)

  testKey <- paste0("Image experiment/", random_seed, "_testKey.csv") %>%
    fread(file = .)

  # Row-wise grouping label: the train key, overwritten by the test key
  # wherever the test key is >= 0.
  accessions <- trainKey
  accessions[testKey$V1 >= 0] <- testKey$V1[testKey$V1 >= 0]

  # Collapse keys, markers and features to one row per accession label.
  trainKey %<>% takeMean(., accessions) %>% {.$V1}
  testKey %<>% takeMean(., accessions) %>% {.$V1}
  genotypes <- takeMean(rawGenotypes, accessions)
  phenotypes <- takeMean(rawPhenotypes, accessions)

  phenotypeComponents <- names(phenotypes)

  trainingSet <- trainKey >= 0
  testingSet <- testKey >= 0
  genotype_matrix <- genotypes %>%
    as.matrix()
  # Relationship matrix computed from the markers; used as the covariance
  # structure K of the random effect in mixed.solve.
  GRM <- A.mat(genotype_matrix)

  nObservations <- nrow(phenotypes)

  knownPhenotypes <- list()
  predictedPhenotypes <- list()
  trainCorrelations <- c()
  testCorrelations <- c()
  testMSEs <- c()

  for (phenotypeComponent in phenotypeComponents){

    # Hide the test accessions from the model by setting them to NA.
    phenotype_vector <- phenotypes %>% pull(phenotypeComponent)
    phenotype_vector[testingSet] <- NA
    trainMean <- mean(phenotype_vector, na.rm = TRUE)

    # Fit on the centred training values, then add the training mean back.
    fit <- mixed.solve(y = (phenotype_vector - trainMean), K = GRM)
    var_genetic <- fit$Vu
    var_residual <- fit$Ve
    h2 <- var_genetic / (var_genetic + var_residual)
    estimated_h2 <- h2
    gblups <- (fit$u) + trainMean
    predictedPhenotypes[[length(predictedPhenotypes) + 1]] <- gblups

    # Score against the full observed column (test values restored).
    phenotype_vector <- phenotypes %>% pull(phenotypeComponent)
    trainCorrelations <- c(trainCorrelations, cor(gblups[trainingSet], phenotype_vector[trainingSet])[1] ^ 2)
    testCorrelations <- c(testCorrelations, cor(gblups[testingSet], phenotype_vector[testingSet])[1] ^ 2)
    testMSEs <- c(testMSEs, mean((phenotype_vector[testingSet] - gblups[testingSet]) ^ 2))

    print(phenotypeComponent)
    print(paste0("h2: ", estimated_h2))
    print(paste0("train r2: ", last(trainCorrelations)))
    print(paste0("test r2: ", last(testCorrelations)))

    knownPhenotypes[[length(knownPhenotypes) + 1]] <- phenotype_vector

    # Append exactly one row for this trait. Passing the whole accumulating
    # r2 vectors here would add one row per trait seen so far, all labelled
    # with the current trait name.
    correlations <- rbind(correlations, data.frame(seed = random_seed, trait = phenotypeComponent, h2 = estimated_h2, train_r2 = last(trainCorrelations), test_r2 = last(testCorrelations)))

  }

  # Written once per seed: the file holds the cumulative table up to and
  # including this seed.
  write.csv(correlations, paste0("Text experiment/", random_seed, "_rrBLUPRawFeaturesCorrelations.csv"))

  predictedPhenotypes <- as.data.frame(predictedPhenotypes)
  names(predictedPhenotypes) <- paste0("V", 1:ncol(predictedPhenotypes))
  knownPhenotypes <- as.data.frame(knownPhenotypes)
  names(knownPhenotypes) <- paste0("V", 1:ncol(knownPhenotypes))

  # NOTE(review): cor() yields NA for a constant column (e.g. a position that
  # is zero padding for every row), which would make these summaries NA -
  # confirm whether na.rm = TRUE is wanted here.
  print(mean(testCorrelations))
  print(max(testCorrelations))
  print(mean(testMSEs))

  write_csv(predictedPhenotypes, paste0("Text experiment/", random_seed, "_rrBLUPpredictedRawFeatures.csv"))
}

In [None]:
# prompt: read predicted raw features and convert those back to text

import pandas as pd
import numpy as np

def convert_to_text(predicted_features):
  """Converts predicted raw features back to text.

  Each value is rounded to the nearest integer and, when that integer lies
  strictly between 0 and 256, mapped to the character with that code point.
  All other values are skipped, so a decoded string can be shorter than its
  feature row. This includes 0 (used as padding when the features were
  built), out-of-range values, and NaN or infinite values.

  Args:
    predicted_features: A NumPy array or pandas DataFrame of predicted raw
      features, one row per text and one column per character position.

  Returns:
    A list of strings, where each string is the reconstructed text.
  """
  if isinstance(predicted_features, pd.DataFrame):
    predicted_features = predicted_features.values

  text_list = []
  for feature_vector in predicted_features:
    chars = []
    for x in feature_vector:
      # int(round(...)) raises on NaN and +/-inf; treat those values like any
      # other undecodable value instead of aborting the whole conversion.
      if not np.isfinite(x):
        continue
      code = int(round(x))
      if 0 < code < 256:
        chars.append(chr(code))
    text_list.append(''.join(chars))
  return text_list

# Load the rrBLUP predictions that were written for seed 1.
predicted_features_df = pd.read_csv(
  'Text experiment/1_rrBLUPpredictedRawFeatures.csv'
)

# Decode every predicted row into a string and keep the results as an array.
reconstructed_text = np.array(convert_to_text(predicted_features_df))

# Show the decoded strings as the cell output.
reconstructed_text

In [None]:
# Side-by-side table of source descriptions and decoded predictions.
# NOTE(review): rows are paired purely by position. The predictions were
# averaged per accession in the R cell, so row i of the prediction file may
# not correspond to row i of textPhenotypes.csv - confirm the ordering before
# reading this table as a like-for-like comparison.
n = len(reconstructed_text)
path = "Text experiment/textPhenotypes.csv"
sentences = list(pd.read_csv(path, header=0)['AI_Description'].to_numpy())
df = pd.DataFrame({
  'original': sentences[:n],
  'predicted': list(reconstructed_text)[:n],
})
df

Unnamed: 0,original,predicted
0,"Short, deep red strawberry.",Qinpg//hfecGXfXC[qmeiebilf<
1,"Short, pale strawberry.",Qinof/1hfecKXeWH[qkddd_dfb;
2,"Long, deep red strawberry.",Qinog.0hfddJVfYGXqldee_egd>
3,"Short, light red strawberry.",Qinpg./gffdE[eX@]qnejfcjmg<
4,"Long, light red strawberry.",Qinpg.0hfddHXfXEZqldee`fhd<
...,...,...
558,"Short, pale strawberry.",Qinof/0gfedDZeX@^qmfhebjkf<
559,"Short, deep red strawberry.",Qinpg./gfedC\fY?_qnfieckmg:
560,"Short, deep red strawberry.",Rinpi..hfddJShZDWqmegeagjf@
561,"Short, deep red strawberry.",Qinof/0gfecG[eWC^qmegeahje:
