<a href="https://colab.research.google.com/github/mansfire/Zebra/blob/main/Zebra_Forca.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install requests beautifulsoup4 pandas



In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Root URLs of the two sources used by this notebook.
wikipedia_base_url = "https://en.wikipedia.org/wiki/"
orphanet_base_url = "https://www.orpha.net/en/disease/detail/"

# IDs to visit. This is a short trial range; a wider sweep would be
# range(1, 3475).
disease_ids = range(1, 75)

# Container for the collected records.
disease_data = []

def get_wikipedia_summary(disease_name):
    """Return the first substantial paragraph of a disease's Wikipedia page.

    The page URL is ``wikipedia_base_url`` followed by the disease name with
    spaces replaced by underscores and reserved characters percent-encoded.

    Args:
        disease_name: Title of the disease to look up.

    Returns:
        The stripped text of the first ``<p>`` element longer than 50
        characters, or ``"No Wikipedia summary available"`` when the request
        fails, the response status is not 200, or no such paragraph exists.
    """
    # Function-scope import so the block does not depend on a file-level one.
    from urllib.parse import quote

    # Percent-encode the title: a raw "?", "#" or "%" in a disease name would
    # otherwise be parsed as a query string / fragment / escape sequence.
    search_url = wikipedia_base_url + quote(disease_name.replace(" ", "_"))

    try:
        # A timeout keeps one stalled connection from hanging a long scrape.
        response = requests.get(search_url, timeout=30)
    except requests.RequestException:
        # Network failures are treated like a missing article so the caller's
        # loop keeps going.
        return "No Wikipedia summary available"

    if response.status_code == 200:
        wiki_soup = BeautifulSoup(response.text, 'html.parser')
        # Walk the paragraphs in document order and take the first real one.
        for p in wiki_soup.find_all("p"):
            text = p.text.strip()
            if len(text) > 50:  # Avoid empty or too short descriptions
                return text
    return "No Wikipedia summary available"
# Base URL for Orphanet diseases
base_url = "https://www.orpha.net/en/disease/detail/"

# Example disease IDs (Replace this with dynamic fetching)
disease_ids = range(1, 3000)  # Adjust range as needed

disease_data = []

for disease_id in disease_ids:
    url = f"{base_url}{disease_id}"

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Treat network errors like any other failed fetch and move on.
        print(f"Failed to fetch disease ID {disease_id}")
        continue

    if response.status_code != 200:
        print(f"Failed to fetch disease ID {disease_id}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract disease name. soup.find() returns None when the container is
    # absent, so each lookup is guarded before chaining the next one.
    detail_box = soup.find("div", class_="result-detail")
    title_tag = detail_box.find("h2") if detail_box else None
    disease_name = title_tag.text.strip() if title_tag else "Unknown"

    # Extract description (same None-guarding as above).
    desc_box = soup.find("div", class_="service-color-box")
    desc_tag = desc_box.find("p") if desc_box else None
    description = desc_tag.text.strip() if desc_tag else "No description available"

    # Fetch Wikipedia summary. The "Unknown" placeholder is not a disease
    # name, so looking it up would store an unrelated article's text.
    if title_tag:
        wikipedia_summary = get_wikipedia_summary(disease_name)
    else:
        wikipedia_summary = "No Wikipedia summary available"

    # Save data
    disease_data.append({
        "ID": disease_id,
        "Disease Name": disease_name,
        "Description": description,
        "Wikipedia Summary": wikipedia_summary
    })

# Convert to DataFrame and save to CSV
df = pd.DataFrame(disease_data)
df.to_csv("/content/diseases_wikipedia.csv", index=False)

print("Scraping complete. Data saved to diseases_wikipedia.csv.")


Failed to fetch disease ID 1
Failed to fetch disease ID 2
Failed to fetch disease ID 3
Failed to fetch disease ID 4
Failed to fetch disease ID 12
Failed to fetch disease ID 21
Failed to fetch disease ID 66
Failed to fetch disease ID 75
Failed to fetch disease ID 89
Failed to fetch disease ID 121
Failed to fetch disease ID 149
Failed to fetch disease ID 152
Failed to fetch disease ID 153
Failed to fetch disease ID 161
Failed to fetch disease ID 196
Failed to fetch disease ID 197
Failed to fetch disease ID 203
Failed to fetch disease ID 208
Failed to fetch disease ID 228
Failed to fetch disease ID 259
Failed to fetch disease ID 260
Failed to fetch disease ID 271
Failed to fetch disease ID 299
Failed to fetch disease ID 311
Failed to fetch disease ID 339


In [None]:
import pandas as pd
import re
import torch
from torch.utils.data import Dataset, DataLoader

# Read the disease table back from the CSV on disk.
df = pd.read_csv(filepath_or_buffer="/content/diseases_wikipedia.csv")

# Combine the text for training
def clean_text(text):
    """Return ``text`` as a string with whitespace normalised.

    The value is converted with ``str()``, every run of whitespace is
    collapsed to a single space, and leading/trailing whitespace is removed.
    """
    collapsed = re.sub(r'\s+', ' ', str(text))
    return collapsed.strip()

# Build one training line per disease: "<name>: <description> <summary>".
# Empty CSV cells are read back as NaN; concatenating a NaN makes the whole
# row NaN (later stringified to "nan"), so blank them out first.
df["text"] = (
    df["Disease Name"].fillna("").astype(str)
    + ": "
    + df["Description"].fillna("").astype(str)
    + " "
    + df["Wikipedia Summary"].fillna("").astype(str)
)
df["text"] = df["text"].apply(clean_text)

# Save cleaned text
dataset_text = "\n".join(df["text"].tolist())

# Save for tokenization
with open("rare_diseases.txt", "w", encoding="utf-8") as f:
    f.write(dataset_text)

print("Preprocessing Complete. Saved 'rare_diseases.txt'.")


In [None]:
import sentencepiece as spm

# Fit a SentencePiece model on the preprocessed corpus file.
spm.SentencePieceTrainer.train(
    vocab_size=10000,
    model_prefix="tokenizer",
    input="rare_diseases.txt",
)

# Load the tokenizer model file.
sp = spm.SentencePieceProcessor(model_file="tokenizer.model")

# Smoke test: print the subword pieces of a sample disease name.
print(
    sp.encode(
        "Long chain 3-hydroxyacyl-CoA dehydrogenase deficiency",
        out_type=str,
    )
)


['▁Long', '▁chain', '▁3-', 'hydroxy', 'acyl', '-', 'CoA', '▁dehydrogenase', '▁deficiency']


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import sentencepiece as spm

import os

# Train a tokenizer (only needed once): skip the training run when a saved
# model file is already present instead of refitting it on every execution.
if not os.path.exists("tokenizer.model"):
    spm.SentencePieceTrainer.train(input="rare_diseases.txt", model_prefix="tokenizer", vocab_size=10000)

# Load trained tokenizer
sp = spm.SentencePieceProcessor(model_file="tokenizer.model")

# Check if it's working
print("✅ Tokenizer loaded successfully!")
print("Vocab size:", sp.get_piece_size())  # This should now work

class TransformerLM(nn.Module):
    """Causal Transformer language model over token IDs.

    Token IDs are embedded, passed through a stack of Transformer encoder
    layers under a causal attention mask, and projected to vocabulary logits.

    Args:
        vocab_size: Number of distinct token IDs; sets the embedding table
            size and the width of the output logits.
        d_model: Width of the embeddings and hidden states.
        num_heads: Number of attention heads per layer.
        num_layers: Number of stacked encoder layers.

    NOTE(review): no positional encoding is added to the embeddings; consider
    adding one if token order needs to be modelled more explicitly.
    """

    def __init__(self, vocab_size, d_model=256, num_heads=8, num_layers=6):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        # batch_first=True: callers pass (batch, seq) tensors. With the
        # default (seq, batch) layout attention would mix tokens of different
        # samples instead of tokens of the same sequence.
        # The layer stays an attribute so state_dict keys are unchanged.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=num_heads, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return next-token logits for a batch of token IDs.

        Args:
            x: Integer tensor of token IDs with shape (batch, seq).

        Returns:
            Float tensor of shape (batch, seq, vocab_size).
        """
        seq_len = x.size(1)
        x = self.embedding(x)
        # Causal mask: -inf above the diagonal so position i only attends to
        # positions <= i. Without it, position i can read token i+1, which is
        # exactly the target it is trained to predict.
        causal_mask = torch.triu(
            torch.full((seq_len, seq_len), float("-inf"), dtype=x.dtype, device=x.device),
            diagonal=1,
        )
        x = self.transformer(x, mask=causal_mask)
        x = self.fc_out(x)
        return x

# Size the model's vocabulary from the loaded tokenizer.
vocab_size = sp.get_piece_size()

# Build the language model with its default hyperparameters.
model = TransformerLM(vocab_size=vocab_size)


In [None]:
from torch.utils.data import Dataset, DataLoader

# Custom dataset for tokenized text
class RareDiseaseDataset(Dataset):
    """Sliding-window next-token dataset over a single tokenized text.

    The text is encoded once; item ``i`` is the window of ``seq_length``
    tokens starting at position ``i`` paired with the same window shifted one
    token to the right.

    Args:
        text: Raw text to tokenize.
        tokenizer: Object whose ``encode(text)`` returns a list of token IDs.
        seq_length: Number of tokens in each input/target window.
    """

    def __init__(self, text, tokenizer, seq_length=128):
        self.tokenizer = tokenizer
        self.tokens = tokenizer.encode(text)
        self.seq_length = seq_length

    def __len__(self):
        # Each item needs seq_length + 1 tokens. Clamp at zero because a
        # negative length is an error when the text is shorter than a window.
        return max(0, len(self.tokens) - self.seq_length)

    def __getitem__(self, idx):
        """Return ``(input_ids, target_ids)`` tensors for window ``idx``.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if idx < 0:
            idx += size
        # Without this check, out-of-range indices silently return short or
        # empty windows, and plain iteration over the dataset never stops.
        if not 0 <= idx < size:
            raise IndexError("RareDiseaseDataset index out of range")

        input_ids = self.tokens[idx : idx + self.seq_length]
        target_ids = self.tokens[idx + 1 : idx + self.seq_length + 1]
        return torch.tensor(input_ids), torch.tensor(target_ids)

# Wrap the corpus text in 128-token windows.
dataset = RareDiseaseDataset(text=dataset_text, tokenizer=sp, seq_length=128)

# Serve the windows in shuffled mini-batches of 32.
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=32)


import torch.optim as optim

# Cross-entropy loss, optimised with Adam.
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=3e-4)  # Adjust learning rate if needed

import torch.nn.functional as F

def generate_text(prompt, model, tokenizer, max_len=100):
    """Sample a continuation of ``prompt`` from ``model``.

    Tokens are drawn one at a time from the softmax of the model's logits at
    the last position until ``max_len`` tokens have been added or a stop
    token is sampled.

    Args:
        prompt: Text to continue.
        model: Callable module mapping a (1, seq) tensor of token IDs to
            logits of shape (1, seq, vocab).
        tokenizer: Object with ``encode``, ``decode`` and ``pad_id``; its
            ``eos_id`` is used as a stop token when present.
        max_len: Maximum number of tokens to sample.

    Returns:
        The decoded prompt plus the sampled continuation.

    Raises:
        ValueError: If ``prompt`` encodes to no tokens.

    The model is switched to eval mode and left that way.
    """
    model.eval()
    tokens = tokenizer.encode(prompt)
    if not tokens:
        # An empty ID list would become an empty float tensor that cannot be
        # fed to the embedding layer.
        raise ValueError("prompt must encode to at least one token")

    # Run on the model's own device instead of relying on a global `device`.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")
    input_ids = torch.tensor(tokens, dtype=torch.long, device=run_device).unsqueeze(0)

    # Stop on EOS as well as PAD. SentencePiece disables the pad token by
    # default (pad_id() == -1), so checking PAD alone never stops generation.
    stop_ids = set()
    for getter_name in ("eos_id", "pad_id"):
        getter = getattr(tokenizer, getter_name, None)
        if callable(getter):
            token_id = getter()
            if token_id is not None and token_id >= 0:
                stop_ids.add(token_id)

    with torch.no_grad():
        for _ in range(max_len):
            logits = model(input_ids)[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            if next_token.item() in stop_ids:
                break

            input_ids = torch.cat((input_ids, next_token), dim=1)

    return tokenizer.decode(input_ids.squeeze(0).tolist())

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The model has to be on the same device as the batches *before* training;
# otherwise the first forward pass fails on a GPU runtime.
model.to(device)
model.train()

# Train the model
num_epochs = 3
for epoch in range(num_epochs):
    running_loss = 0.0
    num_batches = 0

    for input_ids, target_ids in dataloader:
        input_ids, target_ids = input_ids.to(device), target_ids.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids)
        # Flatten (batch, seq, vocab) logits and (batch, seq) targets so the
        # loss is computed per token.
        loss = criterion(outputs.reshape(-1, vocab_size), target_ids.reshape(-1))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Report the mean loss over the epoch rather than the last batch's loss,
    # and do not fail when the dataloader yields no batches.
    if num_batches:
        print(f"Epoch {epoch+1} Loss: {running_loss / num_batches}")
    else:
        print(f"Epoch {epoch+1} Loss: n/a (no batches)")

# Save trained model
torch.save(model.state_dict(), "rare_disease_transformer.pth")
print("Model trained and saved successfully.")

# Reload the saved weights and sample from the model.
model.load_state_dict(torch.load("rare_disease_transformer.pth", map_location=device))
model.to(device)

prompt = "What is Long chain 3-hydroxyacyl-CoA dehydrogenase deficiency?"
print(generate_text(prompt, model, sp))
