In [7]:
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint to load and the revision of its model repository
model_name = "google/flan-t5-large"
model_revision = "main"

# Tokenizer and seq2seq model are loaded from the same checkpoint and revision
load_kwargs = {"revision": model_revision}
tokenizer = AutoTokenizer.from_pretrained(model_name, **load_kwargs)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, **load_kwargs)

# Summarization pipeline wrapping the model and tokenizer objects created above
summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
)

# Use a library like requests to fetch the webpage content
import requests
from bs4 import BeautifulSoup

In [8]:
def info(id, max_length=25, min_length=15, timeout=30):
    """Summarize the iGEM Registry page of a part.

    Downloads ``https://parts.igem.org/Part:<id>``, extracts the text of the
    ``<div id="mw-content-text">`` element and summarizes it with the
    notebook-level ``summarizer`` pipeline.

    Args:
        id: Part identifier inserted into the URL (e.g. "BBa_J01062"). The
            name shadows the builtin but is kept so keyword callers still work.
        max_length: Passed to the summarizer as ``max_length``.
        min_length: Passed to the summarizer as ``min_length``.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        The ``summary_text`` of the first summary returned by the pipeline.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
        ValueError: If the page contains no "mw-content-text" element.
    """
    url = f"https://parts.igem.org/Part:{id}"

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=timeout)

    # The HTTP status is intentionally not checked: whatever body the server
    # returns is summarized, exactly as before.
    soup = BeautifulSoup(response.text, "html.parser")

    content = soup.find("div", id="mw-content-text")
    if content is None:
        raise ValueError(f"No 'mw-content-text' element found at {url}")
    article_text = content.get_text()

    # Deterministic decoding (no sampling) so repeated runs give the same summary
    summary = summarizer(
        article_text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
    )

    return summary[0]["summary_text"]

In [24]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import json
import os

In [25]:
# Build `selected_parts`, the list of part records used by the following cells.
# If data.json exists it is loaded as-is; otherwise the records are parsed from
# parts.txt, filtered, enriched with a page summary and written to data.json so
# the per-part download and summarization only has to run once.

def _parse_part(record):
    """Turn one record of parts.txt (the text after a ">") into a dict.

    The id is the first whitespace-separated token of the record, the type is
    the fourth token of its first line, and the description is the text between
    the first pair of double quotes.
    """
    header = record.split("\n", maxsplit=1)[0]
    return {
        "id": record.split(maxsplit=1)[0],
        "type": header.split()[3],
        "description": record.split('"', 1)[1].split('"', 1)[0],
    }


if os.path.exists("data.json"):
    with open("data.json", "r") as f:
        selected_parts = json.load(f)

else:
    with open("parts.txt") as f:
        text = f.read()

    # Whatever precedes the first ">" is not a record and is dropped.
    parts_raw = [x.strip() for x in text.split(">")][1:]

    # Blank chunks (for example after a trailing ">") hold no record and would
    # otherwise fail to parse, so they are skipped.
    parts = [_parse_part(record) for record in parts_raw if record]

    # Histogram: description length -> number of parts with that length
    desc_lens = {}
    for part in parts:
        length = len(part["description"])
        desc_lens[length] = desc_lens.get(length, 0) + 1

    # Longer descriptions are found to be more meaningful and so we will limit our
    # analysis to parts with descriptions longer than 101 characters (105 such parts)
    selected_parts = [x for x in parts if len(x["description"]) > 101]

    # Add website summaries to each entry (one HTTP request + one model call each)
    for i, p in enumerate(selected_parts):
        p["info"] = info(p["id"])
        print(i, p["id"], p["info"])

    # Cache the enriched records so the next run takes the fast path above
    with open("data.json", "w") as f:
        json.dump(selected_parts, f)

In [None]:
# store the data with summaries
# Serializes the two working lists to JSON strings so they can be archived
# (the strings pasted as comments below appear to be such an archive).
# NOTE(review): no visible cell assigns `for_improvement` or `for_training`;
# the only assignments are the commented-out json.loads lines further down in
# this cell, so on a fresh kernel this cell raises NameError. How the two lists
# are derived from the data needs to be confirmed and restored.

x = json.dumps(for_improvement)
y = json.dumps(for_training)

# X = '[{"id": "BBa_I726001", "type": "Intermediate", "description": "I", "info": "iGEM 2007 Team Parts I luxI Usage and Biology Sequence and Features Assembly Compatibility: 10COMPATIBLE WITH RFC[10] . 12compatible with rfc[12"}, {"id": "BBa_I726002", "type": "Intermediate", "description": "R", "info": "iGEM 2007 Team Parts R luxR Usage and Biology Sequence and Features Assembly Compatibility: 10Compatible WITH RFC[10] 12COMPATIBLE WITH rfc[12] 21"}, {"id": "BBa_I741049", "type": "Regulatory", "description": " ", "info": "Sequence and Features Assembly Compatibility: 10Compatible with rfc[10] 12COMPATIBLE WITH RFC[12] 21Compatible with Rf[21] 23Compatiable with RFC"}, {"id": "BBa_I741108", "type": "Temporary", "description": "a", "info": "a Sequence and Features Assembly Compatibility: 10Compatible WITH RFC[10] 12COMPATIBLE WITH rfc[12] 21Compatible with rc[21] 23Compat"}, {"id": "BBa_J17004", "type": "Temporary", "description": "a", "info": "a a Sequence and Features Assembly Compatibility: 10COMPATIBLE WITH RFC[10] 12Compatible with rfc[12] 21INCOMPATIBIBLE WITH rc[21"}, {"id": "BBa_J29043", "type": "Reporter", "description": "a", "info": "a a Sequence and Features Assembly Compatibility: 10INCOMPATIBLE WITH RFC[10]Illegal EcoRI site found at 316 12INCOMPATIBILITY WITH rfc["}, {"id": "BBa_K245138", "type": "Coding", "description": "x", "info": "x x Sequence and Features Assembly Compatibility: 10compatible with rfc[10] 12compatible with RFC[12] 21compatibile with Rf[21] 23compat"}, {"id": "BBa_K252015", "type": "RBS", "description": "f", "info": "f ff sequence and features assembly compatibility: 10compatible with rfc[10] 12compatible with RFC[12] 21compatible with regc[21] 23compat"}, {"id": "BBa_K361010", "type": "Regulatory", "description": " ", "info": "Sequence and Features Assembly Compatibility: 10Compatible WITH RFC[10] 12COMPATIBLE WITH rfc[12] 21Compatible with rc[21] 23Compatiable with "}, {"id": "BBa_K404245", "type": "Project", "description": " ", 
"info": "Sequence and Features Assembly Compatibility: 10Compatible WITH RFC[10] 12COMPATIBLE WITH rfc[12] 21Compatible with rc[21] 23Compatiable with "}, {"id": "BBa_K404249", "type": "Project", "description": " ", "info": "Sequence and Features Assembly Compatibility: 10Compatible WITH RFC[10] 12COMPATIBLE WITH rfc[12] 21Compatible with rc[21] 23Compatiable with "}, {"id": "BBa_K404307", "type": "Composite", "description": " ", "info": "x Sequence and Features Assembly Compatibility: 10Compatible WITH RFC[10] 12COMPATIBLE WITH rfc[12] 21Compatible with rc[21] 23Compatiable"}, {"id": "BBa_K519004", "type": "Composite", "description": "a", "info": "a a Sequence and Features Assembly Compatibility: 10Compatible with rfc[10] 12INCOMPATIBLE WITH RFC[12]Illegal nheI site found at 7I"}, {"id": "BBa_K519005", "type": "Composite", "description": "a", "info": "a a Sequence and Features Assembly Compatibility: 10Compatible with rfc[10] 12INCOMPATIBLE WITH RFC[12]Illegal nheI site found at 7I"}, {"id": "BBa_K879927", "type": "Plasmid", "description": " ", "info": "this is a plasmid backbone that was designed to have its copy number controlled by the presence or absence of the inducer molecule IPTG . this copy number corresponds to the expression level of the genes on the "}, {"id": "BBa_K879928", "type": "Plasmid", "description": " ", "info": "this biobrick has been tested in an experiment looking at plasmid loss from a culture of cells that are not growing under selection . the culture containing IPTG produced approximately 25 times as many colonies as the culture without IP"}]'
# Y = '[{"id": "BBa_J01062", "type": "Intermediate", "description": "Mobilized OriTF with Lock1ed cI --%3E pRM --%3E GFP %3D %5BOriTF%5D%5BOnLock1%5D%5BcI%5D%5BDblTerminator%5D%5BpRM GFP%5D", "info": "Mobilized OriTF with Lock1ed cI--> pRM --> GFP = [OriTF][OnLock1][cI][dblTerminator]["}, {"id": "BBa_J23040", "type": "Signalling", "description": "%5BTetR%5D%5Brbs%5D%5BLuxR%5D%5BdblTerm%5D%5BLuxPR%5D+%5Brbs%5D%5BLacI%5D%5BdblTerm%5D  %22AHL-dependent inverter%22", "info": "F2620+P0412 sequence and features assembly compatibility: 10Compatible with rfc[10] 12Compatible with RFC[12] 21INCOMPATIBLE WITH RFC [21]"}, {"id": "BBa_J72103", "type": "Composite", "description": "%7BaraC-Pbad%7D%7Brbs.coi%7D%7Bdouble terminator%7D%7Bcin, no start, frameshift%7D%7BrepL no start%7D%7BLPsit%7D%7Brbs.pacA%7D%7BT", "info": "this part is in BBb Format . it is flanked by BglII and BamHI sites instead of XbaI and SpeI . the phagemid is referred to internally as"}, {"id": "BBa_J72116", "type": "Composite", "description": "%7BaraC-Pbad%7D%7Brbs.coi%7D%7Bdouble terminator%7D%7Bcin, no start, frameshift%7D%7BLPsit%7D%7Brbs.pacA%7D%7BTrnpB%7D", "info": "this part is in BBb Format . it is flanked by BglII and BamHI sites instead of XbaI and SpeI . the part is referred to internally as jt"}, {"id": "BBa_K078004", "type": "Coding", "description": "2%26%2365292%3B2%91%26%2365292%3B3-Trihydroxybiphenyl 1,2-dioxygenase. The second step enzyme in dixon degradation", "info": "there is currently no text in this page . you can search for this page title in other pages, or search the related logs, but you do not have permission to create this page. if you want to create a new page"}, {"id": "BBa_K078005", "type": "Coding", "description": "2%26%2365292%3B2%91%26%2365292%3B3-Trihydroxybiphenyl 1,2-dioxygenase. The second step enzyme in dixon degradation", "info": "2,2\\u2018,3-trihydroxybiphenyl 1,2-dioxygenase is one enzyme in dioxin degradation pathway . 
it is one kind of estradoil dioxygenese repon"}, {"id": "BBa_K531007", "type": "Composite", "description": "P%3Csub%3ErsaA%3C/sub%3E constitutive promoter, %3Ci%3Eesp%3C/i%3E optimized for %3Ci%3ECaulobacter%3C/i%3E, and %3Ci%3ErsaA%3C/i%3E", "info": "most of our chimeric RsaA-fusion proteins were expressed and secreted from Caulobacter . both DspB and Esp constructs with pxyl inducible promoter and chi"}, {"id": "BBa_K531008", "type": "Composite", "description": "P%3Csub%3Exyl%3C/sub%3E + %3Ci%3ECaulobacter%3C/i%3E optimized %3Ci%3EdspB%3C/i%3E + %3Ci%3ErsaA%3C/i%3E C-term", "info": "most of our chimeric RsaA-fusion proteins were expressed and secreted from Caulobacter . both DspB and Esp constructs with Pxyl inducible promoter can inhibit the bio"}, {"id": "BBa_K531009", "type": "Composite", "description": "P%3Csub%3ErsaA%3C/sub%3E + %3Ci%3ECaulobacter%3C/i%3E optimized %3Ci%3EdspB%3C/i%3E + %3Ci%3ErsaA%3C/i%3E C-term", "info": "most of our chimeric RsaA-fusion proteins were expressed and secreted from Caulobacter . both DspB and Esp constructs with Pxyl inducible promoter can inhibit the bio"}, {"id": "BBa_K531010", "type": "Composite", "description": "Constitutive promoter and RBS BBa_K081005 + %3Ci%3ECaulobacter%3C/i%3E optimized %3Ci%3EdspB%3C/i%3E + %3Ci%3ErsaA%3C/i%3E C", "info": "most of our chimeric RsaA-fusion proteins were expressed and secreted from Caulobacter . both DspB and Esp constructs with Pxyl inducible promoter can inhibit the bio"}, {"id": "BBa_K531011", "type": "Composite", "description": "Constitutive promoter and RBS BBa_K081005 + %3Ci%3ECaulobacter%3C/i%3E optimized %3Ci%3Eesp%3C/I%3E + C-terminal sec", "info": "most of our chimeric RsaA-fusion proteins were expressed and secreted from Caulobacter . both DspB and Esp can inhibit the biofilm growth . 
biofilm assay data showed a"}, {"id": "BBa_K733007", "type": "Composite", "description": "%3Ci%3EPveg%3C/i%3E + spoVG RBS + %3Ci%3ElytC%3C/i%3E + linker + RPMrel + consensus RBS + GFP + double terminator", "info": "project seeks to design recombinant bacteria that specifically target and suppress the growth of colorectal carcinoma cells in a controllable way . our proposed solution requires the phage display peptide \\u2018RPM"}, {"id": "BBa_K733012", "type": "Composite", "description": "%3Ci%3ExylR%3C/i%3E+%3Ci%3EPxylA%3C/i%3E+RBS+%3Ci%3EydcE%3C/i%3E+%3Ci%3EPtms%3C/i%3E+RBS+%3Ci%3EydcD%3C/i%3E%3A Cell Growth Inhibition Device", "info": "ptms+RBS+ydcD is for the stabilization of the cell growth inhibition system . the rationale for including this growth inhibition device is that over-dose of BMP2 can cause unexpected proliferation of"}, {"id": "BBa_K823023", "type": "Plasmid_Backbone", "description": "pSB%3Csub%3EBs%3C/sub%3E1C%3A Empty backbone for integration into %3Ci%3EBacillus subtilis%3C/i%3E %3Ci%3EamyE%3C/i%3E locus", "info": "pSBBs1C is an empty backbone vector for the usage in Bacillus subtilis . it integrates in the amyE locus and can be selected with chloramphenicol (cat gene) this"}, {"id": "BBa_K823024", "type": "Plasmid_Backbone", "description": "pSB%3Csub%3EBs%3C/sub%3E4S-P%3Ci%3E%3Csub%3Exyl%3C/i%3E%3C/sub%3E%3A Integrative expression vector for %3Ci%3EBacillus subtilis%3C/i", "info": "pSBBs4S-Pxyl is an expression vector for Bacillus subtilis . it integrates at the thrC locus and has a chloramphenicol resistance for selection in B"}, {"id": "BBa_K823026", "type": "Plasmid_Backbone", "description": "pSB%3Csub%3EBs%3C/sub%3E0K-P%3Csub%3Espac%3C/sub%3E (replicative Bacillus subtilis expression vector%3B IPTG inducible", "info": "pSBBs0K-Pspac is a replicative expression vector for Bacillus subtilis . 
it has an ampicillin resistance for cloning in E.coli and kanamycin resistance for selection"}, {"id": "BBa_K823030", "type": "Regulatory", "description": "P%3Csub%3E%3Ci%3EcotYZ%3C/i%3E%3C/sub%3E%3A %3Ci%3EB. subtilis%3C/i%3E promoter regulating expression of spore crust proteins", "info": "PcotYZ lies within cotVWXYZ gene cluster, which regulates expression of CotYZ . to test the activity we cloned this promoter into the lux-reporter vector"}, {"id": "BBa_K823033", "type": "Regulatory", "description": "P%3Csub%3E%3Ci%3EcotV%3C/i%3E%3C/sub%3E%3A  %3Ci%3EB. subtilis%3C/i%3E promoter regulates spore crust protein cotV", "info": "PcotV lies within cotVWXYZ gene cluster, which regulates expression of spore crust protein CotV . to test the activity we cloned this promoter into the lux-re"}, {"id": "BBa_M10220", "type": "Composite", "description": "%7BPbad.rbs.prepro.StrepTag%7D%7B%26%23706%3BGS5-IILK%26%23707%3B%7D%7B%26%23706%3BupaG_short%21%7D%7BdblTerm%7D", "info": "recombinant leucine zipper constructs were tested for their ability to bind streptavidin . the constructs did not appear to be able to clump at the bottom of the LB plates ."}, {"id": "BBa_M10221", "type": "Device", "description": "%7BPbad.rbs.prepro.StrepTag%7D%7D%7B%26%23706%3BGS5-IILK%26%23707%3B%7D%7B%26%23706%3BAg43_short%21%7D%7BdblTerm%7D", "info": "toxicity can be inferred from the differences in growth rate (as determined by changes in OD 600 over time) between samples containing LB and LB+arabinose . pbca9145-b"}, {"id": "BBa_M10222", "type": "Device", "description": " %7BPbad.rbs.prepro.StrepTag%7D%7B%26%23706%3BGS5-IILK%26%23707%3B%7D%7B%26%23706%3BespP(beta)%21%7D%7BdblTerm%7D ", "info": "recombinant leucine zipper constructs were tested for their ability to bind streptavidin . the constructs did not appear to clump in the presence of arabinose . 
a method for quantify"}, {"id": "BBa_S03679", "type": "Intermediate", "description": "(2,-1) RFP%3Csub%3Erev%3C/sub%3E-RBS%3Csub%3Erev%3C/sub%3E-HixC-RBS-TetF-HixC %3A pLac%3Csub%3Erev%3C/sub%3E-HixC", "info": "RFPrev-RBSrev-HixC : pLacrev-hixc Construction intermediate Sequence and Features Assembly Compatibility: 10COMPATIBLE WITH RFC[10] 12INCOM"}, {"id": "BBa_S03680", "type": "Intermediate", "description": "(-2,1) RFP%3Csub%3Erev%3C/sub%3E-RBS%3Csub%3Erev%3C/sub%3E-HixC-TB-RBS%3Csub%3Erev%3C/sub%3E-HixC %3A pLac-HixC", "info": "RFPrev-RBSrev-HixC : pLac-hixc Construction intermediate Sequence and Features Assembly Compatibility: 10COMPATIBLE WITH RFC[10] 12INCOMPA"}, {"id": "BBa_S03681", "type": "Intermediate", "description": "(-2,-1) RFP%3Csub%3Erev%3C/sub%3E-RBS%3Csub%3Erev%3C/sub%3E-HixC-TB-RBS%3Csub%3Erev%3C/sub%3E-HixC %3A pLac%3Csub%3Erev%3C/sub%3E-HixC", "info": "RFPrev-RBSrev-HixC : pLacrev-hixc Construction intermediate Sequence and Features Assembly Compatibility: 10COMPATIBLE WITH RFC[10] 12INCOM"}, {"id": "BBa_S03685", "type": "Intermediate", "description": "(1,-2) RFP%3Csub%3Erev%3C/sub%3E-RBS%3Csub%3Erev%3C/sub%3E-HixC-pLac-HixC %3A TetB-RBS%3Csub%3Erev%3C/sub%3E-HixC", "info": "RFPrev-RBSrev-HixC-pLac-hixc : construction intermediate Sequence and features Assembly Compatibility: 10COMPATIBLE WITH RFC[10] 12INCOMPA"}, {"id": "BBa_S03687", "type": "Intermediate", "description": "(-1,2) RFP%3Csub%3Erev%3C/sub%3ERBS%3Csub%3Erev%3C/sub%3E-HixC-pLac%3Csub%3Erev%3C/sub%3E-HixC %3A RBS-TetF-HixC", "info": "RFPrevRBSrev-HixC-pLacrev-hixc : RBS-TetF-hxc Construction intermediate Sequence and features Assembly Compatibility: 10COMPAT"}, {"id": "BBa_S03688", "type": "Intermediate", "description": "(-1,-2) RFP%3Csub%3Erev%3C/sub%3ERBS%3Csub%3Erev%3C/sub%3E-HixC-pLac%3Csub%3Erev%3C/sub%3E-HixC %3A TB-RBS%3Csub%3Erev%3C/sub%3E-HixC", "info": "TB-RBSrev-HixC-pLacrev-hixc Construction intermediate Sequence and features Assembly Compatibility: 10Compatible WITH 
RFC[10] 12INCOMPATIBLE WITH R"}]'

# To restore the two lists from the archived strings, uncomment the X / Y
# assignments above together with these two lines:
# for_improvement = json.loads(X)
# for_training = json.loads(Y)

# Show both JSON strings, separated by one empty line
print("\n\n".join((x, y)))

In [None]:
# One training pair per part: the prompt carries the part type and the page
# summary, the target is the part's existing description.
training_examples = [
    {
        "input": f"Type: {part['type']}, Information: {part['info']}",
        "target": part["description"],
    }
    for part in for_training
]

# The same pairs rendered as single strings (prompt followed by its description)
shots = [
    f"{example['input']}, Description: {example['target']}"
    for example in training_examples
]

In [None]:
class DescriptionDataset(Dataset):
    """Tokenized (prompt, description) pairs for seq2seq fine-tuning.

    Each example is a dict with an "input" string (the prompt, to which
    ", Description: " is appended before tokenizing) and a "target" string
    (the text the model should produce).

    Args:
        tokenizer: Object providing ``encode(...)`` and ``pad_token_id``.
        examples: Sequence of ``{"input": str, "target": str}`` dicts.
        max_length: Token limit applied to both prompt and target.
    """

    # Label value ignored by the loss of Hugging Face seq2seq models. Padding
    # the labels with the pad token id instead would make every padded
    # position count as a prediction target.
    LABEL_PAD_ID = -100

    def __init__(self, tokenizer, examples, max_length=512):
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            Dict with "input_ids", "attention_mask" and "labels"; each tensor
            keeps the leading batch dimension of size 1 produced by
            ``return_tensors="pt"``.
        """
        example = self.examples[idx]
        inputs = self.tokenizer.encode(
            example["input"] + ", Description: ",
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_length,
        )
        targets = self.tokenizer.encode(
            example["target"],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_length,
        )
        # Mask out pad tokens using the tokenizer's own pad id rather than a literal 0
        attention_mask = inputs.ne(self.tokenizer.pad_token_id)
        return {"input_ids": inputs, "attention_mask": attention_mask, "labels": targets}

    def collate_fn(self, batch):
        """Pad a list of items to a common length and stack them.

        Returns:
            Dict of (batch, seq_len) tensors. Inputs are padded with the
            tokenizer's pad id, the attention mask with 0 and the labels with
            ``LABEL_PAD_ID``.
        """
        inputs = pad_sequence(
            [item["input_ids"].squeeze(0) for item in batch],
            batch_first=True,
            padding_value=self.tokenizer.pad_token_id,
        )
        attention_masks = pad_sequence(
            [item["attention_mask"].squeeze(0) for item in batch],
            batch_first=True,
        )
        labels = pad_sequence(
            [item["labels"].squeeze(0) for item in batch],
            batch_first=True,
            padding_value=self.LABEL_PAD_ID,
        )

        return {"input_ids": inputs, "attention_mask": attention_masks, "labels": labels}

In [None]:
# Prepare the dataset and data loader
dataset = DescriptionDataset(tokenizer, training_examples)
batch_size = 4  # Adjust this based on your resources
train_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=dataset.collate_fn, shuffle=True)

# Fine-tuning settings
num_epochs = 3
learning_rate = 5e-5

# Only an optimizer is needed: the model returns its own loss when `labels`
# are passed, so no separate loss function is defined.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Fine-tuning loop
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for batch in train_loader:
        # collate_fn already yields (batch, seq_len) tensors; squeezing dim 1
        # would wrongly drop the sequence axis for a batch of length-1 sequences.
        input_ids = batch["input_ids"].to(model.device)
        attention_mask = batch["attention_mask"].to(model.device)
        labels = batch["labels"].to(model.device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # One line per epoch so progress and convergence are visible
    mean_loss = epoch_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch + 1}/{num_epochs} - mean training loss: {mean_loss:.4f}")

# Leave training mode so dropout is off for any later use of the model
model.eval()

# Save the fine-tuned model
model.save_pretrained("fine_tuned_description_model")
tokenizer.save_pretrained("fine_tuned_description_model")

In [None]:
# Generate descriptions for the parts in `for_improvement`

# model.train() is called for fine-tuning; make sure dropout is disabled here
model.eval()

for p in for_improvement:
    # Same prompt layout as the training examples: the ", Description: " suffix
    # must appear exactly once, otherwise the prompt differs from what the
    # model was fine-tuned on.
    input_text = f"Type: {p['type']}, Information: {p['info']}, Description: "

    # Truncate like the training inputs and keep the ids on the model's device
    input_ids = tokenizer.encode(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(model.device)
    output = model.generate(input_ids, max_length=60, num_beams=4, early_stopping=True)
    description = tokenizer.decode(output[0], skip_special_tokens=True)

    print(f"ID: {p['id']}")
    print(f"Original Description: {p['description']}")
    print(f"Description: {description}")
    print("--------------")