# Transcription Factor Review

In [1]:
import os

from oaklib.interfaces.association_provider_interface import AssociationProviderInterface

from aurelian.agents.goann.goann_agent import goann_agent
from aurelian.agents.goann.goann_config import GOAnnotationDependencies


In [3]:
from dotenv import load_dotenv
load_dotenv()  # You'll need to install python-dotenv: pip install python-dotenv


True

In [6]:
cborg_api_key = os.environ.get("CBORG_CONTEXTUALIZER_API_KEY")

In [7]:
from linkml_store import Client

In [8]:
client = Client()
db = client.attach_database("mongodb://localhost:27017/tf_reviews", "tf_reviews")


In [9]:
collection = db.get_collection("claude_sonnet_v1", create_if_not_exists=True)

['tf_reviews']


In [10]:
from pydantic_ai.settings import ModelSettings
from pydantic_ai.providers.openai import OpenAIProvider
from pydantic_ai.models.openai import OpenAIModel


# not to be confused with GO-CAM Model
ai_model = OpenAIModel(
    "anthropic/claude-sonnet",
    provider=OpenAIProvider(
        base_url="https://api.cborg.lbl.gov",
        api_key=cborg_api_key),
)

settings = ModelSettings(
    max_tokens=64000,
)


In [11]:
from oaklib import get_adapter

amigo = get_adapter("amigo:NCBITaxon:9606")
if not isinstance(amigo, AssociationProviderInterface):
    raise ValueError("AMIGO is not an association provider")

In [12]:
TF = "GO:0003700"

In [13]:
from oaklib.datamodels.vocabulary import IS_A

subjs = list(amigo.associations_subjects(objects=[TF], object_closure_predicates=[IS_A]))

In [14]:
len(subjs)

1648

In [15]:
subjs[0:10]

['UniProtKB:Q6ZNC4',
 'UniProtKB:Q6ZNB6',
 'UniProtKB:Q01201',
 'UniProtKB:Q6ZMY9',
 'UniProtKB:P86452',
 'UniProtKB:P78347',
 'UniProtKB:Q8TF47',
 'UniProtKB:Q8TF45',
 'UniProtKB:Q8IYN0',
 'UniProtKB:Q92826']

In [16]:
query_template = """review TF-related annotations for {subject}, check against what is in uniprot, check
all PMIDs for the TF, and check against guidelines. Make specific recommendations if any existing annotations
should be changed or added
"""

In [17]:
import nest_asyncio
nest_asyncio.apply()

In [18]:
!mkdir tf-reviews

mkdir: tf-reviews: File exists


In [19]:
deps = GOAnnotationDependencies()

In [25]:
from datetime import datetime

for gene in subjs:
    if collection.find({"gene": gene}).num_rows:
        print(f"already reviewed {gene}")
        continue
    print(f"Q: {gene}")
    command = query_template.format(subject=gene)
    try:
        result = goann_agent.run_sync(command, deps=deps, model=ai_model, model_settings=settings)
    except Exception as e:
        print(f"Error: {e}")
        continue
    mjb = result.all_messages_json()
    # decode messages from json bytes to dict:
    if isinstance(mjb, bytes):
        mjb = mjb.decode()
    # print the messages
    import json
    all_messages = json.loads(mjb)
    collection.insert(
        {
            "gene": gene,
            "review": result.data,
            "messages": all_messages,
            "date": str(datetime.now()),
        }
    )

already reviewed UniProtKB:Q6ZNC4
already reviewed UniProtKB:Q6ZNB6
already reviewed UniProtKB:Q01201
already reviewed UniProtKB:Q6ZMY9
Q: UniProtKB:P86452
FIND GENE ANNOTATIONS: UniProtKB:P86452
LOOKUP PMID: 24043816
FETCH DOCUMENT: Transcription Factors Annotation Guidelines
FETCH DOCUMENT: Transcription Factors Annotation Guidelines Paper
FETCH DOCUMENT: How_to_annotate_TFs
FETCH DOCUMENT: Transcription_Factors_Annotation_Guidelines
FETCH DOCUMENT: Transcription_Factors_Annotation_Guidelines
FETCH DOCUMENT: Transcription Factors Annotation Guidelines
LITERATURE WEB SEARCH: GO gene ontology transcription factor annotation guidelines
LITERATURE WEB SEARCH: gene ontology transcription factor annotation guidelines ZBED6
FETCH LITERATURE URL: https://www.sciencedirect.com/science/article/pii/S1874939921000705
FETCH LITERATURE URL: https://academic.oup.com/database/article/doi/10.1093/database/bat062/340860
Error: Tool exceeded max retries count of 1
Q: UniProtKB:P78347
FIND GENE ANNOTATI



FIND GENE ANNOTATIONS: Q13485
Q: UniProtKB:P15336
FIND GENE ANNOTATIONS: UniProtKB:P15336
FETCH DOCUMENT: Transcription Factors Annotation Guidelines
FETCH DOCUMENT: Transcription Factors Annotation Guidelines Paper
LOOKUP PMID: 19861239
LOOKUP PMID: 18671972
LOOKUP PMID: 10821277
FETCH DOCUMENT: Transcription Factors Annotation Guidelines
Q: UniProtKB:P56178
FETCH DOCUMENT: Transcription Factors Annotation Guidelines
FETCH DOCUMENT: Transcription Factors Annotation Guidelines Paper
FIND GENE ANNOTATIONS: UniProtKB:P56178
LOOKUP PMID: 19497851
LITERATURE WEB SEARCH: DLX5 homeobox protein transcription factor activity
FETCH LITERATURE URL: https://pmc.ncbi.nlm.nih.gov/articles/PMC2742824/
FETCH LITERATURE URL: https://pmc.ncbi.nlm.nih.gov/articles/PMC4826628/
Q: UniProtKB:Q00978
FIND GENE ANNOTATIONS: UniProtKB:Q00978
LOOKUP PMID: 1630447
LOOKUP PMID: 24882218
LOOKUP PMID: 24065129
LOOKUP PMID: 28473536
FETCH DOCUMENT: Transcription Factors Annotation Guidelines
FETCH DOCUMENT: Transcri



LOOKUP PMID: 26733313
FETCH DOCUMENT: Transcription Factors Annotation Guidelines Paper
FETCH DOCUMENT: Transcription Factors Annotation Guidelines
Q: UniProtKB:Q6ZN57
FIND GENE ANNOTATIONS: UniProtKB:Q6ZN57
FETCH DOCUMENT: Transcription Factors Annotation Guidelines
FETCH DOCUMENT: Transcription Factors Annotation Guidelines
FETCH DOCUMENT: Transcription Factors Annotation Guidelines Paper
FETCH DOCUMENT: Transcription Factors Annotation Guidelines Paper
LOOKUP PMID: 25416956
LITERATURE WEB SEARCH: PMID 25416956 protein interaction
LOOKUP PMID: 31515488
Error: Tool exceeded max retries count of 1
Q: UniProtKB:Q6ZMS7
FIND GENE ANNOTATIONS: UniProtKB:Q6ZMS7
FETCH DOCUMENT: Transcription Factors Annotation Guidelines
FETCH DOCUMENT: Transcription Factors Annotation Guidelines
FETCH DOCUMENT: Transcription Factors Annotation Guidelines Paper
FETCH DOCUMENT: Transcription Factors Annotation Guidelines Paper
LOOKUP PMID: 34673265
LITERATURE WEB SEARCH: PMID:34673265 ZNF783 transcription fac



LOOKUP PMID: 28473536
LOOKUP PMID: 32296183
FETCH DOCUMENT: Transcription Factors Annotation Guidelines
FETCH DOCUMENT: Transcription Factors Annotation Guidelines
FETCH DOCUMENT: Transcription Factors Annotation Guidelines Paper
Q: UniProtKB:P52737
FIND GENE ANNOTATIONS: UniProtKB:P52737
FETCH DOCUMENT: Transcription Factors Annotation Guidelines
FETCH DOCUMENT: Transcription Factors Annotation Guidelines Paper
LOOKUP PMID: 7649249
Q: UniProtKB:P52736
FIND GENE ANNOTATIONS: P52736
FIND GENE ANNOTATIONS: UniProtKB:P52736
FETCH DOCUMENT: Transcription Factors Annotation Guidelines
FETCH DOCUMENT: Transcription Factors Annotation Guidelines
FETCH DOCUMENT: Transcription Factors Annotation Guidelines Paper
FETCH DOCUMENT: Transcription Factors Annotation Guidelines Paper
FETCH DOCUMENT: How_to_annotate_TFs
LITERATURE WEB SEARCH: GO transcription factor annotation guidelines
FETCH LITERATURE URL: https://www.sciencedirect.com/science/article/pii/S1874939921000705
LOOKUP PMID: 7649249
LOOKU

UnsupportedFormatException: Could not convert '/var/folders/nc/m4tx21912kv1b8nk3zzx9plr0000gn/T/tmpache7af0' to Markdown. The formats ['.xml', '.docbook', '.qtl', '.rng'] are not supported.

In [24]:
print("done")

done
