In [None]:
# %pip install pandas requests beautifulsoup4 openai dashscope tqdm

In [None]:
import functools
import getpass
import json
import random
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import dashscope
import openai
import pandas as pd
import requests
from bs4 import BeautifulSoup
from openai import OpenAI
from tqdm import tqdm

In [None]:
def retry_request(func, retries=3, wait_time=2):
    """Wrap ``func`` so that it is called again when it raises.

    Used in this notebook as a bare decorator (``@retry_request``), in which
    case the defaults for ``retries`` and ``wait_time`` apply.

    Args:
        func: Callable to wrap.
        retries: Maximum number of times ``func`` is called per invocation.
        wait_time: Seconds to sleep between two consecutive attempts.

    Returns:
        A wrapper that forwards its arguments to ``func`` and returns its
        result, or ``None`` once every attempt has raised.
    """
    @functools.wraps(func)  # keep the wrapped function's __name__ / __doc__
    def wrapper(*args, **kwargs):
        for attempt in range(1, retries + 1):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print(f"Error: {e}. Retrying {attempt}/{retries}...")
                # Only pause when another attempt will actually follow.
                if attempt < retries:
                    time.sleep(wait_time)
        return None
    return wrapper

In [None]:
@retry_request
def embed_with_str(text, model='text_embedding_v3'):
    """Embed ``text`` with a DashScope text-embedding model.

    Args:
        text: Value passed to ``dashscope.TextEmbedding.call`` as ``input``.
        model: Attribute name on ``dashscope.TextEmbedding.Models`` that
            selects the model.

    Returns:
        The ``'embedding'`` entry of the first item in the response's
        ``'embeddings'`` output. Because of ``@retry_request``, ``None`` is
        returned when every attempt raises.
    """
    # getattr performs the same attribute lookup the original eval() did,
    # without executing an arbitrary ``model`` string as code.
    model_id = getattr(dashscope.TextEmbedding.Models, model)
    resp = dashscope.TextEmbedding.call(model=model_id, input=text)
    return resp.output.get('embeddings')[0].get('embedding')

In [None]:
def generate_embeddings_multithread(text_df, text_column='Text', embedding_column='Embedding', max_workers=10):
    """Embed every value of ``text_column`` concurrently with ``embed_with_str``.

    Args:
        text_df: DataFrame holding the texts. It is modified in place: the
            embeddings are written to ``embedding_column``.
        text_column: Name of the column whose values are embedded.
        embedding_column: Name of the column that receives the results.
        max_workers: Size of the thread pool (previously fixed at 10).

    Returns:
        ``text_df`` itself, with ``embedding_column`` set. Entries whose task
        raised, or for which ``embed_with_str`` returned ``None``, are ``None``.
    """
    # Pre-sized so each result lands at the position of its input text,
    # regardless of the order in which the futures complete.
    embeddings = [None] * len(text_df)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {
            executor.submit(embed_with_str, text): idx
            for idx, text in enumerate(text_df[text_column])
        }

        for future in as_completed(future_to_index):
            idx = future_to_index[future]
            try:
                embeddings[idx] = future.result()
            except Exception as e:
                # The slot keeps its initial None.
                print(f"Error processing text at index {idx}: {e}")

    text_df[embedding_column] = embeddings
    return text_df

In [None]:
# Prompt for the key interactively so it is not written into the notebook.
api_key = getpass.getpass("Enter your API key: ")
# client = OpenAI(api_key=api_key)
# Set the key on the dashscope module (presumably read by the
# dashscope.TextEmbedding.call requests made later -- confirm).
dashscope.api_key = api_key

# DrugBank

## Db2Text

In [None]:
def _drugbank_dd(soup, dt_id, sibling_only=True):
    """Return the ``<dd>`` that follows ``<dt id=dt_id>``, or ``None`` if absent.

    ``sibling_only`` selects ``find_next_sibling`` (True) or ``find_next``
    (False) for locating the ``<dd>``.
    """
    dt = soup.find('dt', {'id': dt_id})
    if dt is None:
        return None
    return dt.find_next_sibling('dd') if sibling_only else dt.find_next('dd')


@retry_request
def db2text(drugbank_accession_number="DB00460", proxies=None):
    """Scrape a DrugBank drug page and condense it into one descriptive text.

    Downloads ``https://go.drugbank.com/drugs/<accession number>`` and joins
    the page's summary, its background (with ``<sup>`` elements removed), and
    a sentence stating the generic name, type and accession number, followed
    by the chemical formula when the page has one.

    Args:
        drugbank_accession_number: DrugBank ID appended to the base URL.
        proxies: Optional proxy mapping passed to ``requests.get``. When
            ``None``, the local proxy at ``127.0.0.1:7890`` that this function
            originally hard-coded is used.

    Returns:
        The assembled text. Because of ``@retry_request``, a failing call is
        retried and ``None`` is returned once every attempt has raised.

    Raises:
        requests.exceptions.RequestException: The download failed.
        ValueError: The page has no generic name or no type entry.
        (Both are raised to, and handled by, the retry wrapper.)
    """
    url = "https://go.drugbank.com/drugs/" + drugbank_accession_number
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    if proxies is None:
        proxies = {
            "http": "http://127.0.0.1:7890",
            "https": "http://127.0.0.1:7890",
        }

    try:
        response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {drugbank_accession_number}: {e}")
        # Re-raise so the retry wrapper sees the real failure; previously the
        # error was swallowed and the function then failed on an unbound local.
        raise

    soup = BeautifulSoup(response.content, 'html.parser')

    # Generic name and type are required to build the sentence.
    name_dd = _drugbank_dd(soup, 'generic-name')
    type_dd = _drugbank_dd(soup, 'type')
    if name_dd is None or type_dd is None:
        raise ValueError(
            f"Could not parse DrugBank page for {drugbank_accession_number}: "
            "generic name or type not found"
        )
    drug_name = name_dd.text.strip()
    type_value = type_dd.text.strip()

    # Background is optional; default to '' so a page without it still
    # produces a text instead of failing on an undefined variable.
    background = ''
    background_dd = _drugbank_dd(soup, 'background', sibling_only=False)
    if background_dd is not None:
        for sup in background_dd.find_all('sup'):
            sup.decompose()
        background = background_dd.get_text(separator=" ").replace("\n", " ").strip()

    # Chemical formula (optional)
    formula_dd = _drugbank_dd(soup, 'chemical-formula')
    chemical_formula = formula_dd.text.strip() if formula_dd is not None else ''

    # Summary (optional)
    summary_dd = _drugbank_dd(soup, 'summary', sibling_only=False)
    summary = summary_dd.text.strip() if summary_dd is not None else ''

    sentence = f"{drug_name} is of the type {type_value}, number {drugbank_accession_number}"
    # Skip empty parts so a missing section does not leave stray spaces.
    drug_text = " ".join(part for part in (summary, background, sentence) if part)

    if chemical_formula:
        drug_text += f" and has the molecular formula {chemical_formula}."

    return drug_text

In [None]:
def extract_drug_info_multithread(drug_id_df, id_column=0, max_workers=10):
    """Run ``db2text`` concurrently for every ID in ``drug_id_df``.

    Args:
        drug_id_df: DataFrame holding the DrugBank IDs.
        id_column: Label of the column with the IDs (previously fixed at 0).
        max_workers: Size of the thread pool (previously fixed at 10).

    Returns:
        A new DataFrame with columns ``'DrugBank ID'`` and ``'Text'``, one row
        per input ID in input order. ``'Text'`` is ``None`` where the task
        raised or ``db2text`` returned ``None``.
    """
    drug_ids = list(drug_id_df[id_column])
    # Pre-sized so each result lands at the position of its ID; rows no longer
    # depend on which download happens to finish first.
    drug_texts = [None] * len(drug_ids)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {
            executor.submit(db2text, drug_id): idx
            for idx, drug_id in enumerate(drug_ids)
        }

        for future in as_completed(future_to_index):
            idx = future_to_index[future]
            try:
                drug_texts[idx] = future.result()
            except Exception as e:
                # The slot keeps its initial None.
                print(f"Error processing {drug_ids[idx]}: {e}")

    result_df = pd.DataFrame({'DrugBank ID': drug_ids, 'Text': drug_texts})
    return result_df

In [None]:
# Run the scraper on a single accession number; the return value is the cell output.
db2text('DB00313')

In [None]:
# Same check on a second accession number.
db2text('DB00010')

In [None]:
# Wrname.csv is read without a header row, so its columns get integer labels
# (the IDs are later accessed as column 0).
drug_id = pd.read_csv('Wrname.csv', header=None)
drug_id.head()

In [None]:
# drug_id['text'] = drug_id[0].apply(db2text)
# drug_id.head()

In [None]:
# Fetch the text for every ID in drug_id with the thread-pool helper.
result_df = extract_drug_info_multithread(drug_id)
result_df.head()

In [None]:
# Order rows by 'DrugBank ID'; ignore_index=True renumbers the index from 0.
result_df = result_df.sort_values('DrugBank ID', ignore_index=True)
result_df.head()

In [None]:
# (rows, columns) of the sorted table.
result_df.shape

In [None]:
# Persist the scraped texts; index=None is meant to keep the row index out of the file.
result_df.to_csv('db2text.csv', index=None)

## Text2Emb

In [None]:
db2text = pd.read_csv('db2text.csv')
db2text.head()

In [None]:
# # openai
# @retry_request
# def get_embedding(text, model="text-embedding-3-small"):
#    text = text.replace("\n", " ")
#    return client.embeddings.create(input = [text], model=model).data[0].embedding

In [None]:
len(embed_with_str(db2text['Text'][0]))

In [None]:
result_df = generate_embeddings_multithread(db2text)
result_df.head()

In [None]:
embeddings_expanded = pd.DataFrame(result_df['Embedding'].tolist())

result_df = pd.concat([result_df.drop('Embedding', axis=1), embeddings_expanded], axis=1)

In [None]:
result_df.head()

In [None]:
result_df.to_csv('db2emb.csv', index=None)

In [None]:
db2text[db2text['DrugBank ID'] == 'DB00015']

In [None]:
embed_with_str(db2text[db2text['DrugBank ID'] == 'DB00015']['Text'].values[0])

In [None]:
# result_df['embedding'] = result_df['Text'].apply(lambda x: get_embedding(x, model='text-embedding-3-small'))

# OMIM

## ID2Text

In [None]:
# Wdname.csv is read without a header row; its single column is named 'ID'.
wdname = pd.read_csv('Wdname.csv', header=None)
wdname.columns = ['ID']
wdname.head()

In [None]:
# The MIM number is taken as the integer after the last 'D' in each ID string.
wdname['MIM Number'] = wdname['ID'].apply(lambda x: int(x.split('D')[-1]))
wdname.head()

In [None]:
# Tab-separated file without a header row; '#' starts a comment, so the rest
# of such a line is ignored by the parser.
omim2title = pd.read_csv('mimTitles.txt', comment='#', sep='\t', header=None)
# Keep only the 2nd and 3rd columns and label them.
omim2title = omim2title.iloc[:, 1:3]
omim2title.columns = ['MIM Number', 'Title']
omim2title.head()

In [None]:
# Inspect the column dtypes of the title table.
omim2title.dtypes

In [None]:
# Inspect the column dtypes of the ID table.
wdname.dtypes

In [None]:
# No key is given, so merge() joins on the columns both frames share, using
# its default inner join: IDs without a matching title row are dropped.
wdname = wdname.merge(omim2title)
wdname.head()

In [None]:
# Persist the ID/title table; index=None is meant to keep the row index out of the file.
wdname.to_csv('omim2text.csv', index=None)

## Text2Emb

In [None]:
# Reload the table written by the ID2Text section.
omim2text = pd.read_csv('omim2text.csv')
omim2text.head()

In [None]:
# NOTE: re-definition of the embed_with_str helper that already appears
# earlier in this notebook.
@retry_request
def embed_with_str(text, model='text_embedding_v3'):
    """Embed ``text`` with a DashScope text-embedding model.

    Args:
        text: Value passed to ``dashscope.TextEmbedding.call`` as ``input``.
        model: Attribute name on ``dashscope.TextEmbedding.Models`` that
            selects the model.

    Returns:
        The ``'embedding'`` entry of the first item in the response's
        ``'embeddings'`` output. Because of ``@retry_request``, ``None`` is
        returned when every attempt raises.
    """
    # getattr performs the same attribute lookup the original eval() did,
    # without executing an arbitrary ``model`` string as code.
    model_id = getattr(dashscope.TextEmbedding.Models, model)
    resp = dashscope.TextEmbedding.call(model=model_id, input=text)
    return resp.output.get('embeddings')[0].get('embedding')

In [None]:
# Embed the first title and show the length of the returned embedding.
len(embed_with_str(omim2text['Title'][0]))

In [None]:
# Embed every title; the helper writes the 'Embedding' column into omim2text
# itself and returns that same frame.
result_df = generate_embeddings_multithread(omim2text, text_column='Title', embedding_column='Embedding')
result_df.head()

In [None]:
# Spread each embedding list over its own numbered columns ...
embeddings_expanded = pd.DataFrame(result_df['Embedding'].tolist())

# ... and put them next to the original columns in place of 'Embedding'.
result_df = pd.concat([result_df.drop('Embedding', axis=1), embeddings_expanded], axis=1)

In [None]:
# Persist the OMIM embeddings; index=None is meant to keep the row index out of the file.
result_df.to_csv('omim2emb.csv', index=None)