In [1]:
import chromadb
import numpy as np
import pandas as pd

from utils.funcs import upsert_to_collection
from datetime import datetime
from sentence_transformers import SentenceTransformer

In [2]:
# Load the PubMed article table, then show its dimensions and first rows.
articles_path = "data/pubmed_articles.parquet"
articles = pd.read_parquet(articles_path)
print(articles.shape)
articles.head()

(29107, 7)


Unnamed: 0,pubmed_id,doi,title,abstract,journal,publication_date,electronic_publication_date
0,37957539,[10.1002/adbi.202300453 [doi]],Identification and Validation of Glomeruli Cel...,Accumulating evidence indicates that cellular ...,Adv Biol (Weinh),2023 Nov 13,20231113
1,37956978,[10.1055/a-2209-6357 [doi]],The potential of anti-coronavirus plant second...,"In early 2020, a global pandemic was announced...",Planta Med,2023 Nov 13,20231113
2,37956941,"[S0014-4835(23)00342-1 [pii], 10.1016/j.exer.2...",Blockade of interleukin-6 trans-signaling prev...,Interleukin-6 (IL-6) is a multifaceted cytokin...,Exp Eye Res,2023 Nov 11,20231111
3,37956927,"[S1568-1637(23)00281-7 [pii], 10.1016/j.arr.20...",Pharmacological modulation of vascular ageing:...,"Vascular ageing, characterized by structural a...",Ageing Res Rev,2023 Nov 11,20231111
4,37956894,"[S0006-2952(23)00505-1 [pii], 10.1016/j.bcp.20...",DEL-1 deficiency aggravates pressure overload-...,Recent studies have shown that neutrophils pla...,Biochem Pharmacol,2023 Nov 11,20231111


In [3]:
# reformat the dates
def format_date(date):
    """Convert a ``YYYYMMDD`` date string to ``MM/DD/YYYY``.

    Parameters
    ----------
    date : str
        Date written in ``%Y%m%d`` form, e.g. ``"20231113"``.

    Returns
    -------
    str or None
        The same date rendered as ``%m/%d/%Y`` (e.g. ``"11/13/2023"``), or
        ``None`` when ``date`` is not a string or does not parse as ``%Y%m%d``.
    """
    try:
        return datetime.strptime(date, "%Y%m%d").strftime("%m/%d/%Y")
    except (TypeError, ValueError):
        # TypeError: non-string input such as None/NaN; ValueError: text that
        # does not match the format. Anything else (e.g. KeyboardInterrupt)
        # is deliberately allowed to propagate instead of being swallowed.
        return None


# Rewrite the electronic publication date column through format_date.
# NOTE: running this cell a second time feeds already-formatted values back
# into format_date, which no longer match "%Y%m%d" and therefore become None.
date_column = "electronic_publication_date"
articles[date_column] = articles[date_column].map(format_date)
articles.head()

Unnamed: 0,pubmed_id,doi,title,abstract,journal,publication_date,electronic_publication_date
0,37957539,[10.1002/adbi.202300453 [doi]],Identification and Validation of Glomeruli Cel...,Accumulating evidence indicates that cellular ...,Adv Biol (Weinh),2023 Nov 13,11/13/2023
1,37956978,[10.1055/a-2209-6357 [doi]],The potential of anti-coronavirus plant second...,"In early 2020, a global pandemic was announced...",Planta Med,2023 Nov 13,11/13/2023
2,37956941,"[S0014-4835(23)00342-1 [pii], 10.1016/j.exer.2...",Blockade of interleukin-6 trans-signaling prev...,Interleukin-6 (IL-6) is a multifaceted cytokin...,Exp Eye Res,2023 Nov 11,11/11/2023
3,37956927,"[S1568-1637(23)00281-7 [pii], 10.1016/j.arr.20...",Pharmacological modulation of vascular ageing:...,"Vascular ageing, characterized by structural a...",Ageing Res Rev,2023 Nov 11,11/11/2023
4,37956894,"[S0006-2952(23)00505-1 [pii], 10.1016/j.bcp.20...",DEL-1 deficiency aggravates pressure overload-...,Recent studies have shown that neutrophils pla...,Biochem Pharmacol,2023 Nov 11,11/11/2023


In [4]:
# Discard every row that has a missing value in any column; this also removes
# rows whose date came back as None from format_date.
articles = articles.dropna(axis="index", how="any")
articles.shape

(29107, 7)

In [5]:
# One text per article: title and abstract joined by a literal "[SEP]" marker.
title_col = articles["title"]
abstract_col = articles["abstract"]
combined_text = title_col + "[SEP]" + abstract_col
sentences = combined_text.tolist()
print(len(sentences))
print(sentences[0])

29107
Identification and Validation of Glomeruli Cellular Senescence-Related Genes in Diabetic Nephropathy by Multiomics.[SEP]Accumulating evidence indicates that cellular premature senescence of the glomerulus, including endothelial cells, mesangial cells, and podocytes leads to diabetic nephropathy (DN), and DN is regarded as a clinical model of premature senescence. However, the role of cellular senescence-associated genes in the glomerulus in DN progression remains unclear. Therefore, this work aims to identify and validate potential cellular aging-related genes in the glomerulus in DN to provide novel clues for DN treatment based on anti-aging. The microarray GSE96804 dataset, including 41 diabetic glomeruli and 20 control glomeruli, is retrieved from the Gene Expression Omnibus (GEO) database and cellular senescence-related genes (CSRGs) are obtained from the GeneCards database and literature reports. Subsequently, PPI, GO, and KEGG enrichment are analyzed by screening the inters

In [6]:
# Load a small sentence-embedding checkpoint (384-dimensional vectors in this run).
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

In [7]:
# Embed every title/abstract text, 128 texts per batch, with a progress bar.
embeddings = model.encode(
    sentences,
    batch_size=128,
    show_progress_bar=True,
)

Batches:   0%|          | 0/228 [00:00<?, ?it/s]

In [8]:
# Sanity check: expect (number of articles, embedding dimension).
np.shape(embeddings)

(29107, 384)

In [9]:
# Convert the embedding matrix to nested Python lists before it is handed to
# the upsert helper.
# Going through np.asarray keeps this cell re-runnable: once `embeddings` has
# been rebound to a list, a plain `embeddings.tolist()` raises AttributeError
# on the next execution of the cell.
embeddings = np.asarray(embeddings).tolist()
len(embeddings)

29107

In [10]:
# Use pubmed_id as the record id.
# Chroma requires ids to be strings, so cast explicitly instead of depending on
# how the column happens to be stored in the parquet file (a no-op for values
# that are already strings).
ids = articles["pubmed_id"].astype(str).tolist()

In [11]:
# Build one metadata dict per article from the columns stored alongside each vector.
metadata_columns = ["title", "abstract", "journal", "electronic_publication_date"]
metadatas = articles[metadata_columns].to_dict(orient="records")
metadatas[:3]

[{'title': 'Identification and Validation of Glomeruli Cellular Senescence-Related Genes in Diabetic Nephropathy by Multiomics.',
  'abstract': 'Accumulating evidence indicates that cellular premature senescence of the glomerulus, including endothelial cells, mesangial cells, and podocytes leads to diabetic nephropathy (DN), and DN is regarded as a clinical model of premature senescence. However, the role of cellular senescence-associated genes in the glomerulus in DN progression remains unclear. Therefore, this work aims to identify and validate potential cellular aging-related genes in the glomerulus in DN to provide novel clues for DN treatment based on anti-aging. The microarray GSE96804 dataset, including 41 diabetic glomeruli and 20 control glomeruli, is retrieved from the Gene Expression Omnibus (GEO) database and cellular senescence-related genes (CSRGs) are obtained from the GeneCards database and literature reports. Subsequently, PPI, GO, and KEGG enrichment are analyzed by s

In [12]:
# Open the persistent Chroma store under "data" and fetch the "papers"
# collection, creating it with the cosine setting for "hnsw:space" if it does
# not exist yet.
client = chromadb.PersistentClient(path="data")
collection_metadata = {"hnsw:space": "cosine"}
collection = client.get_or_create_collection(
    name="papers",
    metadata=collection_metadata,
)

In [13]:
# Record count before loading anything (0 in this run, i.e. a fresh collection).
collection.count()

0

In [14]:
# Push ids, vectors and metadata into the collection via the project helper;
# batch_size presumably caps how many records go into each upsert call
# (verify in utils.funcs).
upsert_batch_size = 10000
upsert_to_collection(
    collection,
    ids,
    embeddings,
    metadatas,
    batch_size=upsert_batch_size,
)

In [15]:
# Record count after the upsert; it should match the number of articles (29107 in this run).
collection.count()

29107