In [31]:
!pip install numpy pandas scikit-learn nltk torch jupyter
!pip install py7zr

Collecting py7zr
  Downloading py7zr-1.0.0-py3-none-any.whl.metadata (17 kB)
Collecting texttable (from py7zr)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting pycryptodomex>=3.20.0 (from py7zr)
  Downloading pycryptodomex-3.23.0-cp37-abi3-macosx_10_9_universal2.whl.metadata (3.4 kB)
Collecting brotli>=1.1.0 (from py7zr)
  Downloading Brotli-1.1.0-cp312-cp312-macosx_10_13_universal2.whl.metadata (5.5 kB)
Collecting pyzstd>=0.16.1 (from py7zr)
  Downloading pyzstd-0.17.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.5 kB)
Collecting pyppmd<1.3.0,>=1.1.0 (from py7zr)
  Downloading pyppmd-1.2.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (5.4 kB)
Collecting pybcj<1.1.0,>=1.0.0 (from py7zr)
  Downloading pybcj-1.0.6-cp312-cp312-macosx_11_0_arm64.whl.metadata (3.7 kB)
Collecting multivolumefile>=0.2.3 (from py7zr)
  Downloading multivolumefile-0.2.3-py3-none-any.whl.metadata (6.3 kB)
Collecting inflate64<1.1.0,>=1.0.0 (from py7zr)
  Downloading inflate64-1.0.3-

In [94]:
import py7zr
import os

# 20k dataset
data_dir_20k = "./pubmed-rct/PubMed_20k_RCT"
print("Files in 20k folder:", os.listdir(data_dir_20k))

# 20k dataset
data_dir_20k_replace = "./pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign"
print("Files in 20k folder:", os.listdir(data_dir_20k_replace))

# 200k dataset
data_dir_200k = "./pubmed-rct/PubMed_200k_RCT"
archive_path = os.path.join(data_dir_200k, "train.7z")
extracted_path = os.path.join(data_dir_200k, "train_extracted")

# Create folder to extract 200k dataset
os.makedirs(extracted_path, exist_ok=True)

# Extract .7z archive
with py7zr.SevenZipFile(archive_path, mode='r') as archive:
    archive.extractall(path=extracted_path)

train_200k_file = os.path.join(extracted_path, "train.txt")
print("Files extracted:", os.listdir(extracted_path))

train_200k_replace_file = os.path.join(extracted_path, "train.txt")
print("Files extracted:", os.listdir(extracted_path))

# Load train.txt for 20k, 20k_replace and 200k
with open(os.path.join(data_dir_20k, "train.txt"), "r", encoding="utf-8") as f:
    train_data_20k = f.readlines()

with open(os.path.join(data_dir_20k_replace, "train.txt"), "r", encoding="utf-8") as f:
    train_data_20k_replace = f.readlines()

with open(train_200k_file, "r", encoding="utf-8") as f:
    train_data_200k = f.readlines()

with open(train_200k_replace_file, "r", encoding="utf-8") as f:
    train_data_200k_replace = f.readlines()

# Preview first 10 lines
print(f"Training 20k sample:\n{train_data_20k[:10]}")
print(f"\nTraining 20k Replace Numbers with At Sign sample:\n{train_data_20k_replace[:10]}")
print(f"\nTraining 200k sample:\n{train_data_200k[:10]}")
print(f"\nTraining 200k Replace Numbers with At Sign sample:\n{train_data_200k[:10]}")


Files in 20k folder: ['dev.txt', 'train.txt', 'test.txt']
Files in 20k folder: ['dev.txt', 'train.txt', 'test.txt']
Files extracted: ['train.txt']
Files extracted: ['train.txt']
Training 20k sample:
['###24293578\n', 'OBJECTIVE\tTo investigate the efficacy of 6 weeks of daily low-dose oral prednisolone in improving pain , mobility , and systemic low-grade inflammation in the short term and whether the effect would be sustained at 12 weeks in older adults with moderate to severe knee osteoarthritis ( OA ) .\n', 'METHODS\tA total of 125 patients with primary knee OA were randomized 1:1 ; 63 received 7.5 mg/day of prednisolone and 62 received placebo for 6 weeks .\n', 'METHODS\tOutcome measures included pain reduction and improvement in function scores and systemic inflammation markers .\n', 'METHODS\tPain was assessed using the visual analog pain scale ( 0-100 mm ) .\n', 'METHODS\tSecondary outcome measures included the Western Ontario and McMaster Universities Osteoarthritis Index score

In [96]:
import pandas as pd

def parse_lines(lines):
    data = []
    for line in lines:
        if "\t" in line:
            label, sentence = line.split("\t")
            data.append((label, sentence.strip()))
    return pd.DataFrame(data, columns=["Label", "Sentence"])

df_20k = parse_lines(train_data_20k)
df_20k_replace = parse_lines(train_data_20k_replace)
df_200k = parse_lines(train_data_200k)
df_200k_replace = parse_lines(train_data_200k_replace)


In [97]:
# Clean the data
import string

def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

df_20k['CleanSentence'] = df_20k['Sentence'].apply(clean_text)
df_20k_replace['CleanSentence'] = df_20k_replace['Sentence'].apply(clean_text)
df_200k['CleanSentence'] = df_200k['Sentence'].apply(clean_text)
df_200k_replace['CleanSentence'] = df_200k_replace['Sentence'].apply(clean_text)

In [99]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_20k = vectorizer.fit_transform(df_20k['CleanSentence'])
y_20k = df_20k['Label']
X_20k_replace = vectorizer.fit_transform(df_20k_replace['CleanSentence'])
y_20k_replace = df_20k_replace['Label']
X_200k = vectorizer.fit_transform(df_200k['CleanSentence'])
y_200k = df_200k['Label']
X_200k_replace = vectorizer.fit_transform(df_200k_replace['CleanSentence'])
y_200k_replace = df_200k_replace['Label']

In [100]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_20k, y_20k, test_size=0.2, random_state=42)

# Train
clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

  BACKGROUND       0.65      0.66      0.65      4398
 CONCLUSIONS       0.73      0.71      0.72      5361
     METHODS       0.85      0.90      0.88     11906
   OBJECTIVE       0.69      0.54      0.61      2739
     RESULTS       0.87      0.87      0.87     11604

    accuracy                           0.80     36008
   macro avg       0.76      0.74      0.74     36008
weighted avg       0.80      0.80      0.80     36008



In [90]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_20k_replace, y_20k_replace, test_size=0.2, random_state=42)

# Train
clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

  BACKGROUND       0.65      0.64      0.65      4398
 CONCLUSIONS       0.72      0.70      0.71      5361
     METHODS       0.85      0.90      0.87     11906
   OBJECTIVE       0.69      0.54      0.61      2739
     RESULTS       0.85      0.86      0.85     11604

    accuracy                           0.80     36008
   macro avg       0.75      0.73      0.74     36008
weighted avg       0.79      0.80      0.80     36008



In [76]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_200k, y_200k, test_size=0.2, random_state=42)

# Train
clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

  BACKGROUND       0.61      0.57      0.58     39339
 CONCLUSIONS       0.75      0.75      0.75     68127
     METHODS       0.86      0.91      0.88    144462
   OBJECTIVE       0.71      0.60      0.65     37488
     RESULTS       0.87      0.88      0.87    152957

    accuracy                           0.82    442373
   macro avg       0.76      0.74      0.75    442373
weighted avg       0.81      0.82      0.81    442373



In [101]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_200k_replace, y_200k_replace, test_size=0.2, random_state=42)

# Train
clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

  BACKGROUND       0.61      0.57      0.58     39339
 CONCLUSIONS       0.75      0.75      0.75     68127
     METHODS       0.86      0.91      0.88    144462
   OBJECTIVE       0.71      0.60      0.65     37488
     RESULTS       0.87      0.88      0.87    152957

    accuracy                           0.82    442373
   macro avg       0.76      0.74      0.75    442373
weighted avg       0.81      0.82      0.81    442373



RESULTS: Classification of PubMed RCT abstracts shows that performance is generally strong for METHODS and RESULTS sentences, with F1-scores around 0.87–0.88 across datasets, while BACKGROUND and OBJECTIVE sentences are more challenging. Replacing numbers with a placeholder (@) in the 20k dataset has does not have significant impact, indicating that the model relies primarily on linguistic context rather than numeric content. Expanding to the 200k dataset improves the accuracy from 80% to 82%. It slightly enhances classification of less frequent labels like OBJECTIVE, demonstrating that larger datasets help the model generalize better. Overall, the results show that dataset size is important for capturing diverse sentence structures, while some categories remain more difficult to classify regardless of the dataset quantity or replacing the numeric values with an at sign.

Results seen here are demonstrated in the original paper: "PubMed 200k RCT: a dataset for sequential sentence classification in medical abstracts" where METHODS and RESULTS sentences were consistently the easiest to classify, while BACKGROUND and OBJECTIVE were more challenging as seen above. Also this method shows a normalization of numeric data which is not elaborated on in the text, but can be seen from re-running and retrieving results from all the datasets.
