In [1]:
import numpy as np
import pandas as pd
from datasets import Dataset
from sentence_transformers import SentenceTransformer
import minsearch
import ollama
from datetime import datetime

# Load dataset

In [2]:
# load dataframe
df = pd.read_csv('../data/data-kb.csv', sep='\t', dtype=str) # they are all strings!
df

Unnamed: 0,pmid,elocationid,title,journal,year,author,affiliation,abstract
0,40770710,pii: 769,The dynamic trajectory of autistic life and it...,BMC psychiatry,2025,"Leshata Winter Mokhwelepa, Gsakani Olivia Sumb...","School of Medicine, Faculty of Health Science,...",There is a noticeable knowledge vacuum on the ...
1,40690320,,[Quantitative analysis of autism online forums].,Psychiatria Hungarica : A Magyar Pszichiatriai...,2025,"Brigitta Kakuszi, Szilvia Hetesy, Pál Czobor",,Social media platforms are becoming increasing...
2,40637642,doi: 10.5152/TurkArchPediatr.2025.25127,The Work of Grunya Efimovna Sukhareva in the F...,Turkish archives of pediatrics,2025,"Annio Posar, Paola Visconti",IRCCS Istituto delle Scienze Neurologiche di B...,Despite several articles that in recent years ...
3,40597832,pii: 484,Autism spectrum disorders and childhood caries...,BMC pediatrics,2025,"Qiufang Jin, Zexiu He, Dongfang Xu, Ruihua Lin...","Department of Otolaryngology, The Second Hospi...",This study aimed to investigate the causal rel...
4,40527486,pii: S0021-7557(25)00099-3,Psychometric characteristics of the Mini-TEA s...,Jornal de pediatria,2025,"Cassiano Mateus Forcelini, Regina Ampese, Hele...",Associação de Pais e Amigos dos Excepcionais (...,Early diagnosis of autism spectrum disorder (A...
...,...,...,...,...,...,...,...,...
2995,7866673,,"Linguistics, human communication and psychiatry.",The British journal of psychiatry : the journa...,1994,"P Thomas, W Fraser",,Psycholinguistics and sociolinguistics have ex...
2996,7794327,,Prevalence of Asperger's syndrome in a secure ...,The British journal of psychiatry : the journa...,1994,"P Scragg, A Shah",,The hypothesis that Asperger's syndrome (AS) m...
2997,29871460,doi: 10.1007/BF01978114,A preliminary study of right hemisphere cognit...,European child & adolescent psychiatry,1994,"Hadyn D Ellis, Diane M Ellis, William Fraser, ...","Department of Psychological Medicine, Universi...",Seven children and young adults with definite ...
2998,7926319,,Developmental prosopagnosia in Asperger syndro...,Developmental medicine and child neurology,1994,I Kracke,"Department of Psychology, Hollymoor Hospital, ...",The case of a young man is presented who initi...


# Embed data

## Preprocessing

In [3]:
# count missing in each field
[df[column_name].isna().value_counts() for column_name in df.columns]

[pmid
 False    3000
 Name: count, dtype: int64,
 elocationid
 False    1758
 True     1242
 Name: count, dtype: int64,
 title
 False    2994
 True        6
 Name: count, dtype: int64,
 journal
 False    2995
 True        5
 Name: count, dtype: int64,
 year
 False    2972
 True       28
 Name: count, dtype: int64,
 author
 False    2939
 True       61
 Name: count, dtype: int64,
 affiliation
 True     1910
 False    1090
 Name: count, dtype: int64,
 abstract
 False    2644
 True      356
 Name: count, dtype: int64]

In [4]:
# fill in missing cells with empty string
for column_name in df.columns:
    df[column_name] = df[column_name].fillna('')

In [5]:
# combine text fields to be embedded
df_toembed = pd.DataFrame(df['journal']+' / '+df['title']+' / '+df['abstract'], columns=['text'])
df_toembed

Unnamed: 0,text
0,BMC psychiatry / The dynamic trajectory of aut...
1,Psychiatria Hungarica : A Magyar Pszichiatriai...
2,Turkish archives of pediatrics / The Work of G...
3,BMC pediatrics / Autism spectrum disorders and...
4,Jornal de pediatria / Psychometric characteris...
...,...
2995,The British journal of psychiatry : the journa...
2996,The British journal of psychiatry : the journa...
2997,European child & adolescent psychiatry / A pre...
2998,Developmental medicine and child neurology / D...


In [6]:
# how many words are the texts?
print(df_toembed['text'].apply(lambda x : len(x.split())).quantile([0.5, 0.75, 0.99, 1.0])) # percentiles

0.50    136.00
0.75    185.00
0.99    387.02
1.00    983.00
Name: text, dtype: float64


## Work, in batches

In [7]:
def batch_embed(batch, model_handle_vectorizer='all-MiniLM-L6-v2'):
    vectorizer = SentenceTransformer(model_name_or_path=model_handle_vectorizer)
    return {'embedding' : vectorizer.encode(batch['text'])}

In [8]:
# by batch, embed text
print(datetime.now())
dataset_to_embed = Dataset.from_pandas(df_toembed)
list_of_docs_with_embeddings = dataset_to_embed.map(batch_embed, batched=True, batch_size=32)
print(datetime.now())

2025-08-25 10:50:33.257416


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

HTTP Error 429 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 2s [Retry 2/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 4s [Retry 3/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 8s [Retry 4/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 8s [Retry 5/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
HTTP Error 429 thrown while requesting HEAD https://hugg

2025-08-25 10:56:58.249359


In [9]:
# numpy format embeddings
embeddings = np.array([doc['embedding'] for doc in list_of_docs_with_embeddings])
embeddings

array([[-0.02014046, -0.01249517, -0.03239842, ...,  0.05398439,
         0.02627808,  0.05590913],
       [-0.04988108, -0.05125703, -0.04357366, ...,  0.0861038 ,
        -0.00495032,  0.04734036],
       [-0.06211504, -0.01873365, -0.11244083, ...,  0.02616481,
        -0.02174573,  0.09641958],
       ...,
       [-0.00082186,  0.01687531, -0.08099026, ...,  0.1024764 ,
        -0.05948333,  0.00125555],
       [ 0.02463584,  0.04507218, -0.08503982, ...,  0.08198918,
         0.03921755,  0.04240407],
       [ 0.01920908, -0.09527313, -0.01468155, ...,  0.0642341 ,
        -0.01229316,  0.0252928 ]], shape=(3000, 384))

# CSV file

In [10]:
# write CSV file
np.savetxt('../data/embed-kb.csv', embeddings, delimiter=',')

In [11]:
print(datetime.now())

2025-08-25 10:56:59.108726
