In [22]:
import torch, json, csv, numpy as np
from transformers import AutoTokenizer, AutoModel
from SPARQLWrapper import SPARQLWrapper, JSON
# from sentence_transformers import SentenceTransformer

dimension = 768
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [24]:
from pinecone import Pinecone

pc = Pinecone(api_key="pcsk_vf69v_QZr7gBargpH86fScRFUdXTs1RirNspiT7sq5UN8QGe1rYfhrvexjKS2JccdKkL")
pc_idx = pc.Index("prismgpt")

In [None]:
# Read medications from the CSV file
medications = []
with open('medications.csv', 'r', encoding='utf-8') as csv_file:
    reader = csv.reader(csv_file)
    for row in reader:
        medications.append(row[0])
medications

In [12]:
embeddings = []
for med in medications:
    inputs = tokenizer(med, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    embeddings.append(embedding)

In [33]:
embeddings = np.array(embeddings)
len(embeddings[0])

768

In [46]:
char_mapping = {
    'α': 'alpha',  
    'γ': 'gamma',  
    '−': '-',      
}

def sanitize_id(vector_id):
    # Replace non-ASCII characters based on the mapping
    for non_ascii, ascii_equiv in char_mapping.items():
        vector_id = vector_id.replace(non_ascii, ascii_equiv)
    # Remove any remaining non-ASCII characters
    return ''.join(char for char in vector_id if ord(char) < 128)

batch_size = 100  
for i in range(0, len(medications), batch_size):
    batch_ids = medications[i:i + batch_size]
    batch_embeddings = embeddings[i:i + batch_size].tolist()
    sanitized_batch_ids = [sanitize_id(id) for id in batch_ids]
    pc_idx.upsert(vectors=zip(sanitized_batch_ids, batch_embeddings), namespace="medications")

print("Medications have been pushed to the Pinecone index in batches.")

Medications have been pushed to the Pinecone index in batches.


In [58]:
query_med = "zopiclone"
query_inputs = tokenizer(query_med, return_tensors='pt', padding=True, truncation=True, max_length=512)

with torch.no_grad():
    query_outputs = model(**query_inputs)

query_embedding = query_outputs.last_hidden_state.mean(dim=1).squeeze().numpy().tolist()

response = pc_idx.query(
    namespace="medications",
    vector=query_embedding,
    top_k=5,  
    include_values=True,
    include_metadata=True
)

if response and 'matches' in response:
    for match in response['matches']:
        vector_id = match['id'] 
        vector_values = match['values'] 

        print(f"ID: {vector_id}")
        print(f"Vector Values: {vector_values}")
else:
    print("No matches found in the response.")


ID: zopiclone
Vector Values: [-0.0333627164, -0.0570174418, -0.219023332, 0.143539026, 0.191717744, -0.239314124, 0.145805731, 0.343239754, 0.149997592, -0.0382855423, -0.143918902, -0.0346423872, -0.0716419369, 0.0399202071, -0.168665633, 0.0465171151, -0.11011523, 0.0945236161, -0.218815267, 0.0452121794, 0.253554851, -0.193518355, -0.11175099, 0.430213571, 0.0974388495, -0.0544295721, -0.197492644, 0.0639132932, 0.0204301123, -0.168727472, 0.195948362, 0.218884468, -0.180054, -0.0798295215, 0.151326418, 0.125945911, -0.21008043, 0.00117035955, -0.316212147, -0.0697358474, -0.301496983, -0.114906192, 0.0328438468, 0.0870269611, 0.207638368, 0.171192154, -0.0867441297, 0.155680239, -0.0369899906, -0.20013921, -0.637298644, 0.149554491, 0.0935861394, -0.0340566486, 0.357943773, 0.257471859, -0.0752457902, -0.317891091, -0.0469215512, -0.181699947, -0.196321726, -0.0172354188, -0.245643154, -0.143320039, 0.323627084, 0.11181774, 0.00352661801, 0.0599719584, -0.648935556, 0.0782547444, -

In [None]:
# Retrieve and print the top matches
print("Top 5 matches for 'zopiclone':")
for match in query_response['matches']:
    print(f"Medication ID: {match['id']}, Score: {match['score']}")

In [3]:
def build_faiss_index(drug_side_effects):
    drug_list = list(drug_side_effects.keys())
    embeddings = model.encode(drug_list)
    index.add(embeddings)
    return drug_list

In [4]:
def query_faiss(user_input, drug_list):
    user_embedding = model.encode([user_input])
    distances, indices = index.search(user_embedding, k=1)
    closest_match = drug_list[indices[0][0]] if indices[0][0] < len(drug_list) else None
    return closest_match

In [15]:
def query_sparql_for_interaction(drug):
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

    interaction_property = "p:P769"
    interaction_statement = "ps:P769"
    side_effect_qualifier = "pq:P1909"

    query = f"""
    SELECT ?interactingDrugLabel (GROUP_CONCAT(?sideEffectLabel; separator=", ") AS ?sideEffects) WHERE {{
      {drug} {interaction_property} ?statement.
      ?statement {interaction_statement} ?interactingDrug.
      ?interactingDrug rdfs:label ?interactingDrugLabel.
      FILTER (LANG(?interactingDrugLabel) = "en")

      OPTIONAL {{
        ?statement {side_effect_qualifier} ?sideEffect.
        ?sideEffect rdfs:label ?sideEffectLabel.
        FILTER (LANG(?sideEffectLabel) = "en")
      }}
    }} GROUP BY ?interactingDrugLabel
    """

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    output_data = {}

    for binding in results['results']['bindings']:
        interacting_drug_label = binding['interactingDrugLabel']['value']
        side_effects = binding.get('sideEffects', {}).get('value', 'No side effects listed')
        side_effects_list = side_effects.split(', ') if side_effects != 'No side effects listed' else []
        output_data[interacting_drug_label] = side_effects_list

    return output_data

drug_side_effects = query_sparql_for_interaction("wd:Q220426")
print(drug_side_effects)

{'loxapine': ['syncope', 'hypotension', 'sedation', 'cognitive disorder', 'central respiratory depression'], 'alfentanil': ['hypotension', 'depressed level of consciousness', 'central respiratory depression'], 'sufentanil': ['hypotension', 'depressed level of consciousness'], 'pethidine': ['depressed level of consciousness', 'central respiratory depression'], 'periciazine': ['depressed level of consciousness'], 'tramadol': ['depressed level of consciousness', 'central respiratory depression'], '(RS)-methadone': ['depressed level of consciousness', 'central respiratory depression'], 'morphine': ['depressed level of consciousness', 'central respiratory depression'], 'oxycodone': ['depressed level of consciousness', 'central respiratory depression'], 'hydromorphone': ['depressed level of consciousness', 'central respiratory depression'], 'dihydrocodeine': ['depressed level of consciousness', 'central respiratory depression'], 'codeine': ['depressed level of consciousness', 'central respir

In [14]:
with open('drug_side_effects.json', 'r') as file:
    drug_side_effects = json.load(file)


{'alfentanil': ['hypotension', 'central respiratory depression', 'depressed level of consciousness'], 'sufentanil': ['hypotension', 'depressed level of consciousness'], 'loxapine': ['hypotension', 'syncope', 'cognitive disorder', 'central respiratory depression', 'sedation'], 'pethidine': ['central respiratory depression', 'depressed level of consciousness'], 'tramadol': ['central respiratory depression', 'depressed level of consciousness'], '(RS)-methadone': ['central respiratory depression', 'depressed level of consciousness'], 'morphine': ['central respiratory depression', 'depressed level of consciousness'], 'oxycodone': ['central respiratory depression', 'depressed level of consciousness'], 'hydromorphone': ['central respiratory depression', 'depressed level of consciousness'], 'dihydrocodeine': ['central respiratory depression', 'depressed level of consciousness'], 'codeine': ['central respiratory depression', 'depressed level of consciousness'], 'tapentadol': ['central respirato

In [17]:
# Example query
current_drug = "tramadol"
proposed_drug = "zopiclone"

In [20]:
def cosine_similarity(vec1, vec2):
  """Calculates the cosine similarity between two vectors.

  Args:
    vec1: The first vector.
    vec2: The second vector.

  Returns:
    The cosine similarity between the two vectors.
  """

  return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

In [21]:
current_drug_match = query_faiss(current_drug, drug_list)
proposed_drug_match = query_faiss(proposed_drug, drug_list)

NameError: name 'drug_list' is not defined

In [None]:
def get_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # use mean pooling to get the sentence vector
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.squeeze().numpy()