In [1]:
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

# Specify the model name and revision
model_name = "google/flan-t5-large"
model_revision = "main"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name, revision=model_revision)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, revision=model_revision)

# Load the summarization pipeline
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# Use a library like requests to fetch the webpage content
import requests
from bs4 import BeautifulSoup

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def info(id):
    url = f"https://parts.igem.org/Part:{id}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    article_text = soup.find("div", id="mw-content-text").get_text()

    # Set the maximum length of the summary
    max_length = 25  # Adjust this value as needed

    # Generate the summary
    summary = summarizer(article_text, max_length=max_length, min_length=15, do_sample=False)

    return summary[0]["summary_text"]

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import json
import os

In [5]:
# check if data.json exists, and if it does, load it into selected_parts
if os.path.exists("data.json"):
    with open("data.json", "r") as f:
        selected_parts = json.load(f)
        
else:
    with open("parts.txt") as f:
        text = f.read()

    parts_raw = [x.strip() for x in text.split(">")][1:]
    parts = []
    desc_lens = {}

    for p in parts_raw:
        id = p.split(maxsplit=1)[0]
        part_type = p.split("\n", maxsplit=1)[0].split()[3]
        description = p.split('"', 1)[1].split('"', 1)[0]

        if len(description) not in desc_lens:
            desc_lens[len(description)] = 0
        desc_lens[len(description)] += 1

        parts.append({
            "id": id,
            "type": part_type,
            "description": description
        })

    # Longer descriptions are found to be more meaningful and so we will limit our 
    # analysis to parts with descriptions longer than 101 characters (105 such parts)

    selected_parts = [x for x in parts if len(x["description"]) > 101]

    # Add website summaries to each entry
    for i, p in enumerate(selected_parts):
        p["info"] = info(p["id"])
        print(i, p["id"], p["info"])

    # dump selected_parts to a json file
    with open("data.json", "w") as f:
        json.dump(selected_parts, f)

In [7]:
# split selected_parts into training and testing sets (80-20 split)
import random
random.seed(0)
random.shuffle(selected_parts)

train_size = int(0.8 * len(selected_parts))

train_data = selected_parts[:train_size]
test_data = selected_parts[train_size:]

In [9]:
training_examples = []
shots = []

for p in train_data:
    inp = f"Type: {p['type']}, Information: {p['info']}"
    outp = p["description"]
    
    training_examples.append({"input": inp, "target": outp})
    shots.append(f"{inp}, Description: {outp}")

In [10]:
class DescriptionDataset(Dataset):
    def __init__(self, tokenizer, examples):
        self.examples = examples
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        example = self.examples[idx]
        inputs = self.tokenizer.encode(example["input"] + ", Description: ", return_tensors="pt", padding=True, truncation=True, max_length=512)
        targets = self.tokenizer.encode(example["target"], return_tensors="pt", padding=True, truncation=True, max_length=512)
        return {"input_ids": inputs, "attention_mask": inputs.ne(0), "labels": targets}

    def collate_fn(self, batch):
        inputs = pad_sequence([item["input_ids"].squeeze(0) for item in batch], batch_first=True)
        attention_masks = pad_sequence([item["attention_mask"].squeeze(0) for item in batch], batch_first=True)
        labels = pad_sequence([item["labels"].squeeze(0) for item in batch], batch_first=True, padding_value=self.tokenizer.pad_token_id)
        
        return {"input_ids": inputs, "attention_mask": attention_masks, "labels": labels}

In [11]:
# Prepare the dataset and data loader
dataset = DescriptionDataset(tokenizer, training_examples)
batch_size = 4  # Adjust this based on your resources
train_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=dataset.collate_fn, shuffle=True)

# Fine-tuning settings
num_epochs = 3
learning_rate = 5e-5

# Define the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()

# Fine-tuning loop
model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        input_ids = batch["input_ids"].squeeze(1).to(model.device)
        attention_mask = batch["attention_mask"].squeeze(1).to(model.device)
        labels = batch["labels"].squeeze(1).to(model.device)
            
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Save the fine-tuned model
model.save_pretrained("fine_tuned_description_model")
tokenizer.save_pretrained("fine_tuned_description_model")

('fine_tuned_description_model/tokenizer_config.json',
 'fine_tuned_description_model/special_tokens_map.json',
 'fine_tuned_description_model/tokenizer.json')

In [12]:
# Generate descriptions
for p in test_data:
    input_text = f"Type: {p['type']}, Information: {p['info']}, Description: "
    
    input_ids = tokenizer.encode(input_text + ", Description: ", return_tensors="pt")
    output = model.generate(input_ids, max_length=60, num_beams=4, early_stopping=True)
    description = tokenizer.decode(output[0], skip_special_tokens=True)

    print(f"ID: {p['id']}")
    print(f"Original Description: {p['description']}")
    print(f"Description: {description}")
    print("--------------")

ID: BBa_K395603
Original Description: alcohol acetyltransferase%3B converts butanol or 2-methylbutanol to butyl acetate or 2-methylbutyl ace
Description: DISPLAY FUNCTIONALLY WITH RFC[10]
--------------
ID: BBa_K086016
Original Description: modified Lutz-Bujard LacO promoter,with alternative sigma factor  %26%23963%3B38  followed by YFP reporter
Description: , rudimentary
--------------
ID: BBa_K531010
Original Description: Constitutive promoter and RBS BBa_K081005 + %3Ci%3ECaulobacter%3C/i%3E optimized %3Ci%3EdspB%3C/i%3E + %3Ci%3ErsaA%3C/i%3E C
Description: Promoter, %3C/%3C/%3C/%3C/%3C/%3C
--------------
ID: BBa_J45004
Original Description: SAM%3Abenzoic acid/salicylic acid carboxyl methyltransferase I%3B converts salicylic acid to methyl sali
Description: identifies a new assembly method for BSMT1
--------------
ID: BBa_K987001
Original Description: This is a composite part which has the function to invert the temperature activation by the part%3A BB
Description: Laboratory of X-Ra