# 🔗 Accessing Google Drive

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive; the
# checkpoint, input CSV and output ZIP paths used later in this notebook all
# live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 🧰 Necessary Dependencies

In [15]:
import torch
import polars as pl
from transformers import DistilBertTokenizer
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import os
import zipfile
import shutil

# 🤖 DistilBERT predictions on PubMed abstracts

In [3]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# map_location makes the CPU fallback above actually usable: without it a
# checkpoint that was saved from a CUDA device cannot be deserialized on a
# CPU-only runtime.
# NOTE(review): weights_only=False unpickles the complete saved object, which
# can execute arbitrary code embedded in the file. Only load checkpoints that
# come from a trusted source.
model = torch.load(
    "/content/drive/MyDrive/final_abstract_classifier.pt",
    map_location=device,
    weights_only=False,
)
model.to(device)
# Inference only: eval() disables dropout. As the cell's last expression it
# also displays the module structure.
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [4]:
# Load the article table from Drive into a polars DataFrame. Later cells read
# its "abstract" column (classification) and "full_text" column (semantic search).
pmc_file_path = "/content/drive/MyDrive/PubMed_cikkek.csv"
df_pmc = pl.read_csv(pmc_file_path)

In [5]:
class AbstractDataset(Dataset):
    """Map-style dataset that tokenizes one abstract per item.

    Args:
        abstracts: Indexable, sized collection of abstract texts. Entries are
            coerced with ``str``; a missing entry (``None``) is tokenized as
            the empty string rather than as the literal word ``"None"``.
        tokenizer: Callable Hugging Face-style tokenizer; it is invoked with
            ``return_tensors='pt'`` and must return ``input_ids`` and
            ``attention_mask`` tensors.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, abstracts, tokenizer, max_length):
        self.abstracts = abstracts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of abstracts."""
        return len(self.abstracts)

    def __getitem__(self, idx):
        """Tokenize the abstract at ``idx``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors
            (the tokenizer's batch dimension is flattened away).
        """
        raw_abstract = self.abstracts[idx]
        # str(None) would feed the word "None" to the classifier; use an
        # empty text for missing abstracts instead.
        abstract = "" if raw_abstract is None else str(raw_abstract)

        # Calling the tokenizer directly is the supported entry point;
        # encode_plus is deprecated in transformers.
        encoding = self.tokenizer(
            abstract,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }


In [6]:
# Token budget per abstract: AbstractDataset pads shorter abstracts and
# truncates longer ones to exactly this many tokens.
MAX_LEN = 256
dataset = AbstractDataset(df_pmc["abstract"].to_list(), tokenizer, MAX_LEN)
# shuffle is left at the DataLoader default (off), so predictions come back in
# the same order as the rows of df_pmc and can be attached as a column later.
data_loader = DataLoader(dataset, batch_size=32)

In [7]:
def get_predictions(model, data_loader, device):
    """Run the classifier over every batch and collect the arg-max class ids.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keyword
            arguments whose output exposes a ``.logits`` tensor.
        data_loader: Iterable of dict batches holding ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        list: One predicted class index (numpy integer scalar) per sample, in
        the order the loader yielded them.
    """
    model.eval()
    predicted_labels = []

    # Gradients are never needed for inference.
    with torch.no_grad():
        for batch in data_loader:
            batch_logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits
            # Highest-scoring class along the class dimension, brought back to host memory.
            batch_labels = batch_logits.argmax(dim=1).cpu().numpy()
            predicted_labels.extend(batch_labels)

    return predicted_labels

In [8]:
# Classify every abstract; one predicted class id per row of df_pmc.
preds = get_predictions(model, data_loader, device)

In [9]:
# Attach the predictions as a "label" column. The rows line up because the
# DataLoader iterated the abstracts without shuffling.
df_pmc = df_pmc.with_columns([
    pl.Series(name="label", values=preds)
    ])

In [10]:
# Keep only the articles the classifier assigned to class 1; the following
# sections treat these as the relevant set.
relevant = df_pmc.filter(pl.col("label") == 1)
# Optional CSV export of the filtered set (left disabled):
# relevant.write_csv("relevant_pubmed_articles.csv")

# 🧩 Expand Topics & Initialize Embedding Model

In [13]:
def expand_terms(seed_terms):
    """
    Build an expanded, de-duplicated vocabulary from a list of seed terms.

    For every seed term the result contains its lowercase form, a naive
    "-e"/"-ing" counterpart, hyphenated and concatenated forms of multi-word
    terms, and the hand-curated synonyms for the covid, smoking and obesity
    topics. Returns the unique terms as an alphabetically sorted list.
    """
    # 'smoke' and 'smoking' map to the same synonym list.
    smoking_synonyms = ['smoking behavior', 'tobacco use', 'cigarette consumption',
                        'nicotine exposure', 'secondhand smoke', 'smoking cessation']

    # Curated synonyms, keyed by lowercase seed term.
    medical_variants = {
        # Smoking
        'smoke': smoking_synonyms,
        'smoking': smoking_synonyms,
        'tobacco': ['tobacco use', 'smoking', 'nicotine', 'cigarette'],
        'nicotine': ['nicotine exposure', 'smoking', 'tobacco', 'cigarette'],
        'cigarette': ['cigarette consumption', 'smoking', 'tobacco', 'nicotine'],

        # Obesity
        'obesity': ['overweight', 'adiposity', 'morbid obesity', 'excessive body weight',
                    'obesity disorder', 'high BMI', 'body fat accumulation'],
        'overweight': ['obesity', 'excess weight', 'adiposity', 'weight problem',
                       'high body mass', 'increased BMI'],
        'adiposity': ['obesity', 'overweight', 'body fat', 'fat accumulation'],

        # COVID
        'covid': ['covid-19', 'coronavirus', 'sars-cov-2', 'pandemic', 'post-covid syndrome',
                  'acute respiratory syndrome', 'covid infection'],
        'covid-19': ['covid', 'coronavirus', 'sars-cov-2', 'pandemic', 'long covid'],
        'coronavirus': ['covid', 'covid-19', 'sars-cov-2', 'corona virus', 'novel coronavirus'],
        'sars-cov-2': ['covid', 'covid-19', 'coronavirus'],
    }

    collected = set()

    for seed in seed_terms:
        key = seed.lower()
        variants = {key}

        # Purely mechanical suffix swap ('smoke' <-> 'smoking'); it is applied
        # to every term, so e.g. 'nicotine' also yields 'nicotining'.
        if key.endswith('e'):
            variants.add(key[:-1] + 'ing')
        if key.endswith('ing'):
            variants.add(key[:-3] + 'e')

        # Multi-word terms also get a hyphenated and a run-together spelling.
        if ' ' in seed:
            variants.add(key.replace(' ', '-'))
            variants.add(key.replace(' ', ''))

        # Curated synonyms, when the term has an entry.
        variants.update(medical_variants.get(key, ()))

        collected |= variants

    return sorted(collected)

# Seed vocabularies for the three topics of interest.
smoking_seed_terms = ['smoke', 'smoking', 'tobacco', 'nicotine', 'cigarette']
obesity_seed_terms = ['obesity', 'overweight', 'adiposity']
covid_seed_terms = ['covid', 'covid-19', 'coronavirus', 'sars-cov-2', 'pandemic', 'long covid']

smoking_terms = expand_terms(smoking_seed_terms)
obesity_terms = expand_terms(obesity_seed_terms)
covid_terms = expand_terms(covid_seed_terms)

# Merge the three vocabularies into one sorted list of unique query terms.
relevant_terms = sorted(set(smoking_terms + obesity_terms + covid_terms))

# The documents to search are the classifier-filtered rows that are already in
# memory; nothing is read from disk here, so there is no file error to catch.
PubMed_df = relevant
print(f"\nSuccessfully loaded {len(PubMed_df)} documents.")

# The search runs on the 'full_text' column. Fail with a regular exception if
# it is missing: calling exit() inside a notebook shuts the kernel down instead
# of reporting the problem in the cell.
text_column = 'full_text'
if text_column not in PubMed_df.columns:
    raise ValueError(
        f"DataFrame doesn't contain required '{text_column}' column for searching; "
        f"semantic search can only run on the '{text_column}' column."
    )
print(f"Searching in '{text_column}' column.")

# Using 'all-mpnet-base-v2' model which generally performs better than 'all-MiniLM-L6-v2'
# NOTE(review): this rebinds the notebook-level name `model`, which until now
# held the DistilBERT classifier. semantic_search reads this global, so the
# name is kept; re-running the classification cells after this one requires
# reloading the classifier first.
model = SentenceTransformer('all-mpnet-base-v2', device=device)
print("SentenceTransformer model loaded: 'all-mpnet-base-v2'")

# Embed every query term once; each document is compared against these vectors.
relevant_terms_embeddings = model.encode(relevant_terms, convert_to_tensor=True)
relevant_terms_embeddings = relevant_terms_embeddings.to(device)


Successfully loaded 14588 documents.
Searching in 'full_text' column.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

SentenceTransformer model loaded: 'all-mpnet-base-v2'


# 🧩 Semantic Search on Document Collection

In [16]:
def semantic_search(dataframe, text_column, query_embeddings, query_terms, threshold=0.35, top_k=5):
    """
    Match every document against the query terms by cosine similarity.

    Each non-empty string in ``text_column`` is embedded with the notebook-level
    ``model`` and moved to the notebook-level ``device`` (both are read as
    globals), then compared with every row of ``query_embeddings``. For each
    document the ``top_k`` best-scoring terms are inspected and those with a
    similarity of at least ``threshold`` are emitted, so a document contributes
    between zero and ``top_k`` result rows.

    Args:
        dataframe (pl.DataFrame): Documents to search. Missing 'title', 'doi'
            or 'abstract' columns are filled with empty strings.
        text_column (str): Name of the column holding the text to embed.
        query_embeddings: Tensor with one embedding per query term.
        query_terms (list[str]): Terms in the same order as ``query_embeddings``.
        threshold (float): Minimum cosine similarity for a match to be kept.
        top_k (int): Maximum number of terms considered per document.

    Returns:
        pl.DataFrame: Columns 'title', 'doi', 'abstract', 'fulltext',
        'matched_term' and 'similarity_score'. When nothing matches, an empty
        frame with these same columns is returned so downstream column access
        keeps working.
    """
    result_schema = {
        'title': pl.Utf8,
        'doi': pl.Utf8,
        'abstract': pl.Utf8,
        'fulltext': pl.Utf8,
        'matched_term': pl.Utf8,
        'similarity_score': pl.Float64,
    }
    results = []

    # Ensure required columns exist
    for col in ['title', 'doi', 'abstract']:
        if col not in dataframe.columns:
            dataframe = dataframe.with_columns(pl.lit("").alias(col))

    # Search through documents
    for row in tqdm(dataframe.iter_rows(named=True), total=dataframe.height, desc="Searching documents"):
        text = row.get(text_column, "")

        # Skip null, non-string and whitespace-only texts.
        if not isinstance(text, str) or not text.strip():
            continue

        try:
            # NOTE(review): the text is passed to the encoder as-is; presumably
            # the model truncates inputs beyond its maximum sequence length, so
            # only the beginning of a long full text is embedded - confirm.
            doc_embedding = model.encode(text, convert_to_tensor=True).to(device)
            cos_scores = util.cos_sim(doc_embedding, query_embeddings)[0]
            # Never ask for more results than there are query terms.
            top_scores, top_indices = torch.topk(cos_scores, k=min(top_k, len(cos_scores)))

            for score, idx in zip(top_scores.tolist(), top_indices.tolist()):
                if score >= threshold:
                    results.append({
                        'title': row.get('title', 'N/A'),
                        'doi': row.get('doi', 'N/A'),
                        'abstract': row.get('abstract', 'N/A'),
                        'fulltext': text,
                        'matched_term': query_terms[idx],
                        'similarity_score': score,
                    })
        except Exception as e:
            # Best effort: report the failing document and carry on. Title and
            # DOI may be null, so build the label without string concatenation
            # on raw values (that used to raise inside this handler).
            title = row.get('title') or 'N/A'
            doi = row.get('doi')
            doc_identifier = f"{title}" + (f" (DOI: {doi})" if doi else "")
            print(f"\nError processing document {doc_identifier}: {e}")
            continue

    if not results:
        return pl.DataFrame(schema=result_schema)
    return pl.DataFrame(results)

In [17]:
# Search the classifier-filtered articles. threshold=0.4 overrides the
# function's default of 0.35; top_k is left at its default of 5.
print("\nSearching for documents related to covid, smoking and obesity...")
relevant_results = semantic_search(
    dataframe=PubMed_df,
    text_column='full_text',
    query_embeddings=relevant_terms_embeddings,
    query_terms=relevant_terms,
    threshold=0.4
)


Searching for documents related to covid, smoking and obesity...


Searching documents: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 14588/14588 [25:50<00:00,  9.41it/s]



# 🧹 Post-Processing Search Results (Deduplication + CSV Export)



In [18]:
# Work on the in-memory search results (one row per document/term match).
df = relevant_results

# Report the weakest match that still passed the similarity threshold.
lowest_similarity = df['similarity_score'].min()
print(f"The lowest similarity_score in the entire DataFrame: {lowest_similarity}")

# Keep only the best-scoring match per DOI: sort by score, then keep the first
# row of every DOI. maintain_order=True keeps the result sorted by score
# instead of leaving the row order unspecified.
# NOTE(review): rows that share the same DOI value - presumably including
# rows whose DOI is missing or empty - collapse into a single row; confirm
# that articles without a DOI may be dropped this way.
df = df.sort("similarity_score", descending=True).unique(
    subset="doi", keep="first", maintain_order=True
)

The lowest similarity_score in the entire DataFrame: 0.40000176429748535


In [None]:
def save_texts_to_zip(df, name, output_dir):
    """
    Writes each article's full text as a .txt entry of a single ZIP archive.

    The archive is written to ``<output_dir>/<name>_texts.zip``. Entries are
    streamed straight into the archive, so no temporary folder is created in
    the working directory. Entry names are derived from the article title
    (alphanumerics, spaces and underscores only, at most 50 characters); when
    two articles end up with the same name a numeric suffix is appended so that
    neither overwrites the other.

    Args:
        df (pl.DataFrame): Article data; only ``iter_rows(named=True)`` is
            used, and the 'fulltext' and 'title' fields of each row are read.
        name (str): Base name for the archive
        output_dir (str): Existing directory path for saving the ZIP file
    """
    zip_filename = f"{name}_texts.zip"
    zip_path = os.path.join(output_dir, zip_filename)
    saved_count = 0
    # Lower-cased entry names already used, so that names differing only by
    # case do not clash when extracted on a case-insensitive filesystem.
    used_names = set()

    # ZIP_DEFLATED actually compresses; the zipfile default only stores.
    with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        for row in df.iter_rows(named=True):
            text = row.get('fulltext')
            # Only process rows with valid text content
            if text is None:
                continue

            # Handle missing or non-string titles
            title = row.get('title')
            if not isinstance(title, str):
                title = "untitled"

            # Create a safe filename by removing special characters
            safe_title = "".join(
                c for c in title
                if c.isalnum() or c in (' ', '_')
            ).rstrip()

            # Truncate long names; fall back when nothing usable is left.
            base_name = safe_title[:50].rstrip() or "untitled"

            # Disambiguate colliding names instead of silently overwriting.
            entry_name = base_name
            duplicate_index = 1
            while entry_name.lower() in used_names:
                duplicate_index += 1
                entry_name = f"{base_name}_{duplicate_index}"
            used_names.add(entry_name.lower())

            zipf.writestr(f"{entry_name}.txt", str(text).encode('utf-8'))
            saved_count += 1

    # Print completion message
    print(f"\n{saved_count} articles saved to {zip_path}")
    print(f"Compressed file available at: {os.path.abspath(zip_path)}")

# Write the archive to the root of the mounted Google Drive.
output_directory = "/content/drive/MyDrive/"

# Export the de-duplicated results; the archive name is derived from
# "relevant_new" inside save_texts_to_zip.
print("Saving texts to files and compressing...")
save_texts_to_zip(df, "relevant_new", output_directory)