In [1]:
import os

from opensearchpy import OpenSearch, helpers


In [2]:
# Connect to the OpenSearch node.
# The endpoint (including credentials) can be supplied through the
# OPENSEARCH_URL environment variable so real passwords never have to be
# saved in the notebook; the fallback is the local development URL.
client = OpenSearch(
    hosts=[os.environ.get("OPENSEARCH_URL", "https://admin:admin@localhost:9200/")],
    http_compress=True,
    use_ssl=True,
    verify_certs=False,  # DONT USE IN PRODUCTION: skips TLS certificate validation
    ssl_assert_hostname=False,  # DONT USE IN PRODUCTION: skips hostname verification
    ssl_show_warn=False,  # hide the warning emitted for unverified TLS connections
)


In [76]:
# Index definition: turn on the `knn` index setting and declare the
# document schema. The four free-text fields share one mapping, so they are
# generated together; `embedding` is the 384-dimensional vector field.
settings = {
    "settings": {"index": {"knn": True}},
    "mappings": {
        "properties": {
            "id": {"type": "long"},
            **{
                text_field: {"type": "text"}
                for text_field in ("doi", "authors", "title", "abstract")
            },
            "embedding": {"type": "knn_vector", "dimension": 384},
        }
    },
}

In [77]:
# Name of the OpenSearch index that the cells below create, fill and query.
INDEX_NAME = "pubmed-processed-data"

In [78]:

# Create the index using the mapping defined in `settings`.
# ignore=[400] tells the client not to raise on an HTTP 400 response --
# presumably so re-running this cell when the index already exists is
# harmless. NOTE(review): any other 400 (e.g. an invalid mapping) is
# suppressed as well, so check the printed response.
res = client.indices.create(index=INDEX_NAME, body=settings, ignore=[400])
print(res)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'pubmed-processed-data'}


In [10]:
import pandas as pd
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Sentence-embedding model used both for the documents and for the search
# query. The vectors it produces must have the length declared as
# `dimension` for the `embedding` field of the index mapping (384).
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [79]:
# Append-only journal of ingestion progress and of document ids that failed
# to index. The handle is shared by later cells, so it is left open here.
# buffering=1 (line buffering) flushes every completed line, keeping the
# journal current even if a long run is interrupted; the explicit encoding
# makes the file independent of the platform default.
dataFile = open("./missedIds.txt", "a+", buffering=1, encoding="utf-8")

In [112]:
def getCurrentFileName(index):
    """Record a STARTED marker for chunk `index` and return its CSV path.

    Side effect: writes one marker line to the module-level `dataFile`
    journal before returning.

    NOTE(review): the marker names ``extracted_text_<index>.csv`` while the
    returned path is ``../Preprocessed/file_<index>.csv`` -- confirm the
    label is intentional.

    Args:
        index: chunk number interpolated into both the marker and the path.

    Returns:
        Relative path of the preprocessed CSV for this chunk.
    """
    start_marker = f"extracted_text_{index}.csv ---------------------------------------------------- STARTED \n"
    dataFile.write(start_marker)
    return f"../Preprocessed/file_{index}.csv"

In [114]:
# Load one preprocessed chunk (index 4); getCurrentFileName also writes a
# STARTED marker to the journal.
# Columns are renamed by their source name rather than by position:
# read_csv returns columns in file order regardless of the order given in
# `usecols`, so a positional `reader.columns = [...]` would silently
# mislabel fields if the CSV layout differed from the list.
reader = pd.read_csv(
    getCurrentFileName(4),
    usecols=["PMID", "DOI", "Authors", "Title", "Content", "cleaned"],
).rename(
    columns={
        "PMID": "id",
        "DOI": "doi",
        "Authors": "authors",
        "Title": "title",
        "Content": "contentUnprocessed",
        "cleaned": "abstract",
    }
)

In [115]:
# Discard the raw text column; only the cleaned text (`abstract`) is kept.
# Rebinding (instead of inplace=True) together with errors="ignore" makes
# the cell safe to re-run: a second execution is a no-op, not a KeyError.
reader = reader.drop(columns=["contentUnprocessed"], errors="ignore")

In [116]:
# (rows, columns) of the loaded chunk.
reader.shape

(8255, 5)

In [117]:
# Preview the first rows of the loaded chunk.
reader.head()

Unnamed: 0,id,doi,authors,title,abstract
0,37166765,,['Martina Stefanini'],What is your intelligence type?,
1,36736583,10.1016/j.drudis.2023.103516,"['Carmen Cerchia', 'Antonio Lavecchia']",New avenues in artificial-intelligence-assiste...,past decade biomedical data available grown un...
2,35203086,10.1213/ANE.0000000000005952,"['Kamal Maheshwari', 'Jacek B Cywinski', 'Fran...",Artificial Intelligence for Perioperative Medi...,anesthesiologist role expanded operating room ...
3,37140229,10.1017/pls.2023.2,['Craig Douglas Albert'],Epidemic intelligence studies: A research agen...,research letter introduces readers health inte...
4,36907624,10.1016/j.csm.2022.11.008,"['Bobbie Ann Adair White', 'Joann Farrell Quinn']",Personal Growth and Emotional Intelligence: Fo...,emotional intelligence (emotional intelligence...


In [118]:
def rows_to_dict(dataframe):
    """Return the rows of `dataframe` as a list of ``{column: value}`` dicts.

    Uses ``DataFrame.to_dict(orient="records")`` instead of iterating with
    ``iterrows()``: it avoids building one Series per row and keeps each
    column's own dtype (``iterrows`` upcasts every row to a single dtype,
    e.g. integer ids become floats in an all-numeric frame).

    Args:
        dataframe: the pandas DataFrame to convert.

    Returns:
        A list with one dict per row, in row order; empty for an empty frame.
    """
    return dataframe.to_dict(orient="records")


In [119]:
# Convert the loaded chunk into a list of per-row dicts.
conv_data = rows_to_dict(reader)

In [120]:
# Spot-check one converted record.
conv_data[17]

{'id': 37032263,
 'doi': '10.1016/j.jmir.2023.02.014',
 'authors': "['Jonathan P McNulty', 'Yurgos Politis']",
 'title': 'Empathy, emotional intelligence and interprofessional skills in healthcare education.',
 'abstract': 'according world health organization (world health organization ) health professionals maintain health citizens evidencebased medicine caring students enroled health professional programmes required successfully attained core learning outcomes reaching key milestones course studies demonstrating developed required graduate skills attributes completion programme knowledge skills competencies learning outcomes discipline specific general professional skills disciplines difficult define empathy emotional intelligence interprofessional skills heart health professional programmes defined mapped curricula evaluated literature presented professional skills empathy emotional intelligence interprofessional skills based studies focussed primarily health professional programmes

In [121]:
# Number of rows in the current chunk.
lines = len(reader.index)
print(lines)

8255


In [124]:
# Embed and index every preprocessed chunk in the range below.
# For each chunk: load the CSV, normalise the column names, encode
# "authors title abstract" for every row with `model`, and index the row
# together with its vector into INDEX_NAME. A failed upload is recorded in
# the `dataFile` journal and does not abort the run. pd.read_csv raises
# FileNotFoundError when a chunk file is missing, which stops the loop.
for counter_index in range(8, 52):
    print(f"currently working on {counter_index} ________________________________________")
    # Rename by source column name: read_csv returns columns in file order,
    # not in `usecols` order, so a positional `reader.columns = [...]`
    # would mislabel fields if the CSV layout differed from the list.
    reader = pd.read_csv(
        getCurrentFileName(counter_index),
        usecols=["PMID", "DOI", "Authors", "Title", "Content", "cleaned"],
    ).rename(
        columns={
            "PMID": "id",
            "DOI": "doi",
            "Authors": "authors",
            "Title": "title",
            "Content": "contentUnprocessed",
            "cleaned": "abstract",
        }
    )
    # Only the cleaned text is indexed; the raw content column is dropped.
    reader = reader.drop(columns=["contentUnprocessed"])
    conv_data = rows_to_dict(reader)
    lines = reader.shape[0]

    for paper in tqdm(conv_data, total=lines):
        # NOTE(review): empty CSV cells are read as NaN and are formatted
        # as the literal text "nan" here -- confirm whether missing fields
        # should be blanked before embedding/indexing.
        embedding = model.encode(f"{paper['authors']} {paper['title']} {paper['abstract']}")
        paper["embedding"] = embedding

        # Upload the document.
        # NOTE(review): no explicit document id is passed, so re-running
        # this loop over the same chunk presumably adds duplicates -- verify.
        try:
            res = client.index(index=INDEX_NAME, body=paper)
        except Exception as e:
            # Best-effort upload: keep going, but record the reason next to
            # the id instead of discarding the exception. repr() keeps the
            # journal entry on a single line.
            dataFile.write(f"missed {paper['id']} ({e!r}) \n")


currently working on 8 ________________________________________


100%|██████████| 8924/8924 [04:22<00:00, 33.95it/s]


currently working on 9 ________________________________________


100%|██████████| 9120/9120 [05:02<00:00, 30.18it/s]


currently working on 10 ________________________________________


100%|██████████| 8935/8935 [04:47<00:00, 31.04it/s]


currently working on 11 ________________________________________


100%|██████████| 9576/9576 [08:23<00:00, 19.03it/s]  


currently working on 12 ________________________________________


100%|██████████| 4607/4607 [02:38<00:00, 29.07it/s]


currently working on 13 ________________________________________


100%|██████████| 6950/6950 [03:45<00:00, 30.86it/s]


currently working on 14 ________________________________________


100%|██████████| 7168/7168 [03:37<00:00, 33.03it/s]


currently working on 15 ________________________________________


100%|██████████| 7102/7102 [04:03<00:00, 29.15it/s]


currently working on 16 ________________________________________


100%|██████████| 7517/7517 [03:45<00:00, 33.28it/s]


currently working on 17 ________________________________________


100%|██████████| 7606/7606 [04:14<00:00, 29.83it/s]


currently working on 18 ________________________________________


FileNotFoundError: [Errno 2] No such file or directory: '../Preprocessed/file_18.csv'

In [198]:
# Encode the free-text question with the same model used for the documents,
# so the query vector is comparable with the indexed `embedding` vectors.
embedded_search = model.encode("Has Augmented intelligence become the focus of clinical interest")

In [199]:
# k-NN search request against the `embedding` field.
# `_source` is disabled and the listed `fields` are requested instead, so
# hits carry only those fields (the stored vector is not among them).
# NOTE(review): `size` (3) is larger than `k` (2) -- confirm which of the
# two limits is the intended number of results.
query = {
    "size": 3,
    "query": {"knn": {"embedding": {"vector": embedded_search, "k": 2}}},
    "_source": False,
    "fields": ["id","doi","title", "abstract", "authors"],
}

In [200]:
# Run the k-NN query against the index.
response = client.search(body=query, index=INDEX_NAME)

In [201]:
# Show the raw search response.
response

{'took': 184,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 12, 'relation': 'eq'},
  'max_score': 0.63422644,
  'hits': [{'_index': 'pubmed-processed-data',
    '_id': 'VcyC-IwBIwUwAZCzEBjS',
    '_score': 0.63422644,
    'fields': {'id': [35020064],
     'abstract': ['augmented intelligence (augmented intelligence ) systems power transform health care bring closer quadruple aim enhancing patient experience improving population health reducing costs improving work life health care providers earning physicians trust critical accelerating adoption augmented intelligence patient care technology evolves medical community need develop standards innovative technologies visit current regulatory systems physicians patients rely ensure health care augmented intelligence responsible evidencebased free bias designed deployed promote equity develop actionable guidance trustworthy augmented intelligence health care ama review