In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import sys

# Load and clean the DataFrame
df = pd.read_csv("adh_scrape29jan25.csv")
df = df.drop(columns=['Unnamed: 0', 'Summary', 'PublicationDate'])

# Create a new column to store the article text
df['article_text'] = None

total = len(df.index)
successes = 0
fails = 0

def get_fulltext_link(url):
    """
    Fetches the full-text article for the given URL.
    Returns the article text if found; otherwise, returns None.
    """
    try:
        # First request: Get the page that contains the full-text link(s)
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        link_section = soup.find('div', class_='full-text-links-list')
        
        if link_section:
            fulltext_link = link_section.find('a', href=True)
            if fulltext_link:
                # Second request: Fetch the actual article content
                response = requests.get(fulltext_link['href'], headers={'User-Agent': 'Mozilla/5.0'})
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')
                paragraphs = soup.find_all('p')
                article_text = ' '.join([p.get_text() for p in paragraphs if p.get_text()])
                return article_text if article_text.strip() else None
        return None
    except Exception as e:
        return None

# Loop over each journal (row) in the DataFrame.
# Assumes that the URL is stored in a column named 'url'.
for idx, row in df.iterrows():
    url = row['PubMedURL']  # Update the column name if necessary
    article_text = get_fulltext_link(url)
    
    # Update counters based on the result.
    if article_text:
        successes += 1
    else:
        fails += 1
    
    # Save the result in the DataFrame (even if article_text is None)
    df.at[idx, 'article_text'] = article_text

    # Update the progress on a single line using carriage return.
    sys.stdout.write(f"\rSuccess: {successes}/{total} | Fail: {fails}/{total}")
    sys.stdout.flush()
    
    # Save progress every 100 rows by overwriting the same CSV file.
    if (idx + 1) % 100 == 0:
        df.to_csv("progress.csv", index=False)

# Final save after processing all rows.
df.to_csv("progress.csv", index=False)
print("\nDone.")


In [None]:
import os
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset

# 1. Choose the model and tokenizer
model_name = "gpt2"  # or choose a different model from Hugging Face
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# 2. Prepare your corpus
# Assuming you have a CSV file with a column 'full_text' containing your corpus
data_file = "bc_incidence_full_text_31jan25.csv"
df = pd.read_csv(data_file)

In [33]:
corpus = df.drop(['ExtractedStats'],axis=1).dropna().astype(str).agg(" ".join, axis=1)

In [None]:
corpus

In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset

# 1. Load your data with pandas
df = pd.read_csv("bc_incidence_full_text_31jan25.csv")
# Suppose you want to combine all text columns into one
df["combined_text"] = df.astype(str).agg(" ".join, axis=1)
corpus = df["combined_text"].fillna("").tolist()

# 2. Initialize your tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# 3. Tokenize the corpus
tokenized_output = tokenizer(corpus, truncation=True, max_length=512)

# 4. Convert tokenized output to a Dataset
train_dataset = Dataset.from_dict(tokenized_output)

# 5. Set up the data collator for language modeling (mlm=False for causal LM)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# 6. Configure training arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    eval_strategy="no",
    logging_steps=100,
)

# 7. Set up the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# 8. Fine-tune the model
trainer.train()

# 9. Save the fine-tuned model and tokenizer
trainer.save_model("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")




  0%|          | 0/5745 [00:00<?, ?it/s]

{'loss': 3.0523, 'grad_norm': 5.574575424194336, 'learning_rate': 4.912967798085292e-05, 'epoch': 0.05}
{'loss': 2.7712, 'grad_norm': 5.774946689605713, 'learning_rate': 4.825935596170583e-05, 'epoch': 0.1}
{'loss': 2.7284, 'grad_norm': 3.686461925506592, 'learning_rate': 4.738903394255875e-05, 'epoch': 0.16}
{'loss': 2.6675, 'grad_norm': 3.751136302947998, 'learning_rate': 4.651871192341166e-05, 'epoch': 0.21}
{'loss': 2.6737, 'grad_norm': 4.175460338592529, 'learning_rate': 4.564838990426458e-05, 'epoch': 0.26}
{'loss': 2.6099, 'grad_norm': 3.9453747272491455, 'learning_rate': 4.47780678851175e-05, 'epoch': 0.31}
{'loss': 2.5657, 'grad_norm': 3.237149715423584, 'learning_rate': 4.390774586597041e-05, 'epoch': 0.37}
{'loss': 2.5524, 'grad_norm': 2.8351993560791016, 'learning_rate': 4.303742384682333e-05, 'epoch': 0.42}
{'loss': 2.4988, 'grad_norm': 4.131146430969238, 'learning_rate': 4.216710182767624e-05, 'epoch': 0.47}
{'loss': 2.4994, 'grad_norm': 2.6252501010894775, 'learning_rate

('./fine_tuned_model\\tokenizer_config.json',
 './fine_tuned_model\\special_tokens_map.json',
 './fine_tuned_model\\vocab.json',
 './fine_tuned_model\\merges.txt',
 './fine_tuned_model\\added_tokens.json',
 './fine_tuned_model\\tokenizer.json')