# Reina AL MASRI, Mahmoud El Kasaby

## Baseline Model: Bigram Bag-of-Words (PubMed RCT)

### 1. Install and Import Libraries

In [12]:
!git clone https://github.com/Franck-Dernoncourt/pubmed-rct

fatal: destination path 'pubmed-rct' already exists and is not an empty directory.


In [13]:
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz
import spacy

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz (119.1 MB)
  Preparing metadata (setup.py) ... [?25ldone

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/pyt

In [14]:
import re
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [15]:
# The tokenizer defined in this notebook reads only lexical flags (is_space, like_num,
# is_stop) and lemmas, so the dependency parser and the NER component are disabled to
# avoid running them on every sentence.
# NOTE(review): re-enable them if any other cell needs doc.sents, parses or entities.
nlp = spacy.load("en_core_sci_md", disable=["parser", "ner"])

### 2. Enhanced scispaCy Tokenizer

In [16]:
# Matches a token that consists solely of non-word characters and/or underscores.
_PUNCT_ONLY = re.compile(r"^[\W_]+$")
# Stop words that are kept even though the pipeline flags them as stop words.
_KEPT_STOP_WORDS = frozenset({"no", "not", "without"})


def scispacy_tokenizer(text):
    """Turn a raw sentence into a list of normalised tokens.

    The text is run through the module-level ``nlp`` pipeline and each
    token is handled as follows, in this order:

    1. whitespace tokens are dropped;
    2. number-like tokens are replaced by the placeholder ``"__NUM__"``;
    3. stop words are dropped, except "no", "not" and "without";
    4. tokens made only of punctuation/underscore characters are dropped;
    5. everything else is replaced by its lower-cased, stripped lemma
       (falling back to the surface text when the lemma is empty) and kept
       only if that lemma has at least two characters.

    Args:
        text: The string to tokenise.

    Returns:
        list[str]: The surviving tokens, in document order.
    """
    tokens = []
    for t in nlp(text):
        if t.is_space:
            continue
        # The number check runs before the stop-word check, so number-like
        # stop words are mapped to the placeholder instead of being removed.
        if t.like_num:
            tokens.append("__NUM__")
            continue
        if t.is_stop and t.text.lower() not in _KEPT_STOP_WORDS:
            continue
        if _PUNCT_ONLY.match(t.text):
            continue
        lemma = (t.lemma_ or t.text).lower().strip()
        if len(lemma) >= 2:
            tokens.append(lemma)
    return tokens

### 3. Load PubMed RCT Dataset

In [17]:
def load_rct_data(file_path):
    """Read a PubMed RCT split file into parallel lists of sentences and labels.

    Lines that start with ``###`` and lines that are blank after stripping
    are skipped. Every remaining line that contains a tab is split at the
    first tab into ``label`` and ``text``; lines without a tab are ignored.

    Args:
        file_path: Path of the text file to read.

    Returns:
        tuple[list[str], list[str]]: ``(texts, labels)`` where ``texts[i]``
        is the sentence that belongs to ``labels[i]``.
    """
    texts, labels = [], []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.rstrip('\n')
            if line.startswith('###') or not line.strip():
                continue
            label, tab, text = line.partition('\t')
            if not tab:
                # No label/text separator on this line: nothing to record.
                continue
            labels.append(label)
            texts.append(text)
    return texts, labels

In [18]:
# Location of the 20k split inside the cloned repository.
path_rct = "pubmed-rct/PubMed_20k_RCT/"
train_file = path_rct + 'train.txt'
test_file = path_rct + 'test.txt'

# Each split is loaded as parallel lists of sentences (X_*) and labels (y_*).
X_train, y_train = load_rct_data(train_file)
X_test, y_test = load_rct_data(test_file)

# Quick sanity summary: split sizes and the distinct training labels.
train_labels_sorted = sorted(set(y_train))
print(f"Train: {len(X_train)}, Test: {len(X_test)}")
print(f"Labels: {train_labels_sorted}")

Train: 180040, Test: 30135
Labels: ['BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS']


### 4. Baseline: TF-IDF Bigrams (Bag of Words)

In [None]:
# Bigram-only TF-IDF features built from the custom tokenizer, capped at 5000 features.
# token_pattern=None: the default regex pattern is unused once a custom tokenizer is
# supplied, and leaving it set makes scikit-learn emit a
# "token_pattern will not be used" warning.
tfidf = TfidfVectorizer(
    tokenizer=scispacy_tokenizer,
    token_pattern=None,
    max_features=5000,
    ngram_range=(2, 2),
)
# The vocabulary and IDF weights are learned on the training split only, then reused
# unchanged to transform the test split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)



In [None]:
# Fit a logistic-regression classifier on the training TF-IDF features
# (fit() returns the estimator itself, so the call can be chained).
model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_tfidf, y_train)

# Predict on the test features and report plain accuracy.
y_pred = model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_pred)
print("Bag of Words (Bigrams) accuracy:", test_accuracy)

In [None]:
# Per-class precision/recall/F1. The label list is taken from the fitted model rather
# than from y_test: names derived from y_test alone can disagree in length with the
# labels scikit-learn infers from y_test and y_pred together, which raises a ValueError.
# With string labels the class names are printed as-is, so target_names is not needed.
print(classification_report(y_test, y_pred, labels=model.classes_))