In [1]:
!pip install evaluate sacrebleu bert_score rouge_score datasets accelerate transformers sentence_transformers openai > /dev/null

In [None]:
!pip install vec2text

In [None]:
import os
import math
import torch
import openai
import vec2text
import numpy as np
import pandas as pd
from openai import OpenAI
from google.colab import drive
# Mount Google Drive (forcing a remount) and switch into the shared project
# folder so the relative "Text experiment/..." and "Image experiment/..." paths
# used by the cells below resolve.
drive.mount('/content/drive', force_remount = True)
os.chdir('/content/drive/Shareddrives/Strawberries')

# Read the OpenAI key from Colab's user secrets instead of writing it in the notebook.
from google.colab import userdata
OPENAI_KEY = userdata.get('OPENAI_KEY')

# Module-level client; get_embeddings_openai below sends its requests through it.
client = OpenAI(
  api_key=OPENAI_KEY
)

# Also expose the key through the environment.
# NOTE(review): presumably needed by libraries that build their own OpenAI client — confirm.
os.environ["OPENAI_API_KEY"]=OPENAI_KEY

# Pretrained vec2text corrector for "text-embedding-ada-002"; it is the
# `corrector` argument of every vec2text.invert_embeddings call in this notebook.
corrector = vec2text.load_pretrained_corrector("text-embedding-ada-002")

def get_embeddings_openai(text_list, model="text-embedding-ada-002", batch_size=128):
    """Embed a sequence of texts with the OpenAI embeddings endpoint.

    The texts are sent through the module-level ``client`` in consecutive
    slices of at most ``batch_size`` items, and the returned vectors are
    collected in the order the API returns them.

    Args:
        text_list: Sized, sliceable sequence of texts (for example a list).
        model: Name of the embedding model passed to the API.
        batch_size: Maximum number of texts per request. Defaults to 128,
            the value that used to be hard-coded.

    Returns:
        A ``torch.Tensor`` built from the collected embedding vectors, one row
        per returned embedding (an empty tensor when ``text_list`` is empty).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    batches = math.ceil(len(text_list) / batch_size)
    outputs = []
    for batch in range(batches):
        # print which batch we're in
        print(f"Processing batch {batch + 1} of {batches}")
        text_list_batch = text_list[batch * batch_size : (batch + 1) * batch_size]
        response = client.embeddings.create(
            input=text_list_batch,
            model=model,
            encoding_format="float",  # override default base64 encoding...
        )
        outputs.extend([e.embedding for e in response.data])
    return torch.tensor(outputs)

In [None]:
# Embed every AI-generated description and persist the vectors as a headerless CSV
path = "Text experiment/textPhenotypes.csv"
phenotype_table = pd.read_csv(path, header=0)
sentences = list(phenotype_table['AI_Description'].to_numpy())

embeddings = get_embeddings_openai(sentences)
embeddings_df = pd.DataFrame(embeddings.numpy())
embeddings_df.to_csv(
    f'Text experiment/embeddings.csv',
    index=False,
    header=False,
)

In [None]:
# Load the text phenotypes and their saved embeddings, invert the embeddings
# back to text in batches, and checkpoint the results after every batch.
path = "Text experiment/textPhenotypes.csv"
sentences = list(pd.read_csv(path, header=0)['AI_Description'].to_numpy())

embeddings_df = pd.read_csv(f'Text experiment/embeddings.csv', header=None)

embeddings = torch.tensor(embeddings_df.to_numpy()).type(torch.float32)

batch_size = 100 #100
# First embedding row processed by this run. A non-zero value skips the rows
# before it (e.g. to resume an interrupted run); set to 0 for a full run.
start_row = 35 * batch_size
inverted_sentences = []

for i in range(start_row, len(embeddings), batch_size):
  print(f"Processing batch {i // batch_size + 1} of {math.ceil(len(embeddings)/batch_size)}")

  torch.cuda.empty_cache()
  inverted_sentences_batch = vec2text.invert_embeddings(
      embeddings=embeddings[i:i+batch_size].cuda(),
      corrector=corrector,
      num_steps=5,
      sequence_beam_width=2
  )

  inverted_sentences.extend(inverted_sentences_batch)
  del inverted_sentences_batch

  # inverted_sentences only holds rows from start_row onward, so the originals
  # must be sliced from the same offset. Slicing from 0 (as before) paired each
  # decoded sentence with the wrong original whenever start_row was non-zero.
  originals_so_far = sentences[start_row:start_row + len(inverted_sentences)]
  df = pd.DataFrame({'Original Sentence': originals_so_far, 'Encoded-decoded Sentence': inverted_sentences})
  # save df to csv with indices of batches in path name
  df.to_csv(f"Text experiment/{i}_rrBLUPpredictedText.csv", index=False)

# Create mosaic with decoded predicted embeddings

In [None]:
# Decode the rrBLUP-predicted embeddings back into sentences, one seed at a time.
path = "Text experiment/textPhenotypes.csv"
phenotypes = pd.read_csv(path, header=0)
sentences = list(phenotypes['AI_Description'].to_numpy())

n = 563
batch_size = 100
for random_seed in range(1, 51):
  print(random_seed)
  predicted_df = pd.read_csv(f"Text experiment/{random_seed}_rrBLUPpredictedEmbeddings.csv")
  # Keep the first n rows and cast them to float32 for the corrector.
  predicted_embeddings = torch.tensor(predicted_df.to_numpy())[:n].type(torch.float32)
  predicted_sentences = []
  for i in range(0, len(predicted_embeddings), batch_size):
    print(f"Processing batch {i // batch_size + 1} of {math.ceil(len(predicted_embeddings)/batch_size)}")
    torch.cuda.empty_cache()
    gpu_batch = predicted_embeddings[i:i+batch_size].cuda()
    predicted_sentences_batch = vec2text.invert_embeddings(
        embeddings=gpu_batch,
        corrector=corrector,
        num_steps=5,
        sequence_beam_width=2
    )
    predicted_sentences += predicted_sentences_batch
  # Pair each original sentence with its decoded prediction and save as csv.
  df = pd.DataFrame({'original': sentences[:n], 'predicted': predicted_sentences[:n]})
  df.to_csv(f"Text experiment/{random_seed}_rrBLUPpredictedText.csv", index=False)
  # Stop after the first seed; the following cells read the `random_seed` and
  # `predicted_sentences` values left behind by this iteration.
  break

In [None]:
# Read the train/test keys for the current `random_seed` and merge them:
# start from the train key and overwrite every position whose test key is >= 0.
trainKey = np.genfromtxt(f"Image experiment/{random_seed}_trainKey.csv", delimiter=',', skip_header=0)
testKey = np.genfromtxt(f"Image experiment/{random_seed}_testKey.csv", delimiter=',', skip_header=0)
has_test_value = testKey >= 0
accessions = trainKey.copy()
accessions[has_test_value] = testKey[has_test_value]

In [None]:
# Pick four entries from the distinct testKey values by position.
# NOTE(review): set iteration order is arbitrary, so these positional picks are
# not guaranteed to select the same values on every run — consider np.unique.
which_genos = np.array([i for i in set(testKey)]).astype(int)
# which_genos = which_genos[[0, 1, 2, 3]]
which_genos = which_genos[[9, 13, 15, 105]]

# Build the sentence array once instead of on every loop iteration.
sentence_array = np.array(sentences)

originals = []
encodedDecodeds = []
meanEncodedDecodeds = []
endToEnds = []
for i in accessions[which_genos].astype(int):
  # All sentences whose accession entry equals i.
  original = sentence_array[np.where(accessions == i)]
  originals.append("\n ".join(original))

  # get_embeddings_openai already returns a torch tensor, so it is used
  # directly; re-wrapping it in torch.tensor() only made a redundant copy and
  # triggered PyTorch's copy-construct warning.
  theseEmbeddings = get_embeddings_openai(original)
  encodedDecoded = vec2text.invert_embeddings(
        embeddings=theseEmbeddings.cuda(),
        corrector=corrector,
        num_steps=5,
        sequence_beam_width=2
    )
  encodedDecodeds.append("\n ".join(encodedDecoded))

  # Invert the mean of this group's embeddings as a single-row batch.
  theseEmbeddingsMean = torch.mean(theseEmbeddings, dim=0)
  encodedDecodedMean = vec2text.invert_embeddings(
        embeddings=torch.unsqueeze(theseEmbeddingsMean, dim=0).cuda(),
        corrector=corrector,
        num_steps=5,
        sequence_beam_width=2
    )
  # Stored as returned (a list), unlike the joined strings in the other columns.
  meanEncodedDecodeds.append(encodedDecodedMean)

  # Sentence decoded from the predicted embedding at position i.
  endToEnd = predicted_sentences[i]
  endToEnds.append(endToEnd)

In [None]:
# Collect the four result lists built above into one side-by-side table.
comparison_columns = {
    'Originals': originals,
    'Encoded-Decoded': encodedDecodeds,
    'Mean Encoded-Decoded': meanEncodedDecodeds,
    'End-to-End': endToEnds,
}
df_results = pd.DataFrame(comparison_columns)

df_results

Unnamed: 0,Originals,Encoded-Decoded,Mean Encoded-Decoded,End-to-End
0,"Long, deep red strawberry.\n Long, deep red st...","Long, deep red strawberry.\n Long, deep red st...","[Long, deep red strawberry.]","Long, deep red strawberry."
1,"Long, light red strawberry.\n Short, pale stra...","Long, light red strawberry.\n Short, pale stra...","[Short, light red strawberry.]","Long, light red short strawberry."
2,"Long, light red strawberry.\n Short, deep red ...","Long, light red strawberry.\n Short, deep red ...","[Long, deep red strawberry.]","Long, light red strawberry."
3,"Medium-long, light red strawberry.\n Long, lig...","Medium-long, light red strawberry.\n Long, lig...","[Long, light red strawberry.]","Long, deep red strawberry."


# Encoded and decoded text accuracy

In [15]:
df = pd.read_csv(f"Text experiment/{1400}_rrBLUPpredictedText.csv")
# Keep only the rows whose encoded-decoded sentence differs from the original one
sentence_changed = df['Original Sentence'] != df['Encoded-decoded Sentence']
df = df[sentence_changed]
df

Unnamed: 0,Original Sentence,Encoded-decoded Sentence
103,"Bright, long, medium red.","Bright, medium, long red."
266,"""Long, light red strawberry.""","''Long, light red strawberry.''"
670,"""Long, light red strawberry.""","''Long, light red strawberry.''"
701,"""Medium long, light red""","''Medium-long, light red''"
1197,"""Long, deep red strawberry.""","''Long, deep red strawberry.''"


In [30]:
df = pd.read_csv(f"Text experiment/{1400}_rrBLUPpredictedText.csv")
# Restrict to the rows where the encoded-decoded sentence is not identical to the original
sentence_changed = df['Original Sentence'] != df['Encoded-decoded Sentence']
df = df[sentence_changed]

# Build the OpenAI client from the key loaded in the setup cell
api_key = OPENAI_KEY
client = OpenAI(api_key=api_key)

def generate_phenotype_from_description(phenotype_description):
    """Ask gpt-4o-mini to score a phenotype description for length and redness.

    Sends a single user message (instructions plus the description) through
    the module-level ``client`` and returns the model's raw reply, which the
    prompt requests in the form 'Length: __, Redness: __'.

    Args:
        phenotype_description: Text description to score; it is concatenated
            onto the "Description: " prefix, so it must be a string.

    Returns:
        The content of the first returned choice, as a string.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Based on the provided phenotype description, generate numeric values for length and redness. Length should range from 0 (short) to 1 (long), and redness should range from 0 (not red) to 1 (red). Your response should be strictly of the form "'Length: __, Redness: __'},
                    {"type": "text", "text": "Description: " + phenotype_description},
                ],
            }
        ],
        # Sampling temperature is a request-level parameter; it previously sat
        # inside the message dict (as the string "0"), where it does not
        # configure sampling.
        temperature=0,
    )
    return response.choices[0].message.content

# Score every encoded-decoded sentence with the model
predicted_df = df['Encoded-decoded Sentence'].apply(generate_phenotype_from_description)

# Pull the two numbers out of replies of the form 'Length: __, Redness: __'.
# The named groups always yield exactly these two columns, and a reply that
# does not match becomes a NaN row. The previous comma split raised on the
# column assignment whenever a reply did not contain exactly one comma.
predicted_df = predicted_df.str.extract(
    r'(?is)Length\D*?(?P<Extracted_Length>-?\d*\.?\d+).*?Redness\D*?(?P<Extracted_Redness>-?\d*\.?\d+)'
)

# Convert the captured text to numbers
for col in predicted_df.columns:
    predicted_df[col] = pd.to_numeric(predicted_df[col], errors='coerce') # Convert to numbers, handling errors

predicted_df = predicted_df[['Extracted_Length', 'Extracted_Redness']]
predicted_df

Unnamed: 0,Extracted_Length,Extracted_Redness
103,0.8,1.0
266,0.8,0.4
670,0.8,0.4
701,0.6,0.4
1197,0.9,1.0


In [26]:
df = pd.read_csv(f"Text experiment/{1400}_rrBLUPpredictedText.csv")
# get rows of df where "Original Sentence" column != "Encoded-decoded Sentence" column
df = df[df['Original Sentence'] != df['Encoded-decoded Sentence']]

# SECURITY: a literal OpenAI secret key used to be pasted here. Credentials must
# never be stored in the notebook; the key that was exposed should be revoked.
# Use the key loaded from Colab's user secrets in the setup cell instead.
api_key = OPENAI_KEY
client = OpenAI(
    api_key=api_key,
)

def generate_phenotype_from_description(phenotype_description):
    """Ask gpt-4o-mini to score a phenotype description for length and redness.

    Sends a single user message (instructions plus the description) through
    the module-level ``client`` and returns the model's raw reply, which the
    prompt requests in the form 'Length: __, Redness: __'.

    Args:
        phenotype_description: Text description to score; it is concatenated
            onto the "Description: " prefix, so it must be a string.

    Returns:
        The content of the first returned choice, as a string.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Based on the provided phenotype description, generate numeric values for length and redness. Length should range from 0 (short) to 1 (long), and redness should range from 0 (not red) to 1 (red). Your response should be strictly of the form "'Length: __, Redness: __'},
                    {"type": "text", "text": "Description: " + phenotype_description},
                ],
            }
        ],
        # Request temperature 0 so the scores for the original sentences are
        # produced under the same sampling setting intended for the
        # encoded-decoded sentences they are compared against.
        temperature=0,
    )
    return response.choices[0].message.content

# Score every original sentence with the model
predicted_df = df['Original Sentence'].apply(generate_phenotype_from_description)

# Pull the two numbers out of replies of the form 'Length: __, Redness: __'.
# The named groups always yield exactly these two columns, and a reply that
# does not match becomes a NaN row. The previous comma split raised on the
# column assignment whenever a reply did not contain exactly one comma.
predicted_df = predicted_df.str.extract(
    r'(?is)Length\D*?(?P<Extracted_Length>-?\d*\.?\d+).*?Redness\D*?(?P<Extracted_Redness>-?\d*\.?\d+)'
)

# Convert the captured text to numbers
for col in predicted_df.columns:
    predicted_df[col] = pd.to_numeric(predicted_df[col], errors='coerce') # Convert to numbers, handling errors

predicted_df = predicted_df[['Extracted_Length', 'Extracted_Redness']]
predicted_df

Unnamed: 0,Extracted_Length,Extracted_Redness
103,0.8,0.6
266,0.8,0.4
670,0.8,0.4
701,0.5,0.3
1197,1.0,1.0
