In [7]:
 # LoRA adapter settings for the sequence-classification model.
 # Parameter meanings follow the PEFT LoraConfig documentation.
 lora_config = LoraConfig(
     task_type="SEQ_CLS",                # adapter is attached to a sequence classifier
     target_modules=["query", "value"],  # module names that receive LoRA weights
     r=8,                                # LoRA rank
     lora_alpha=16,                      # LoRA scaling factor
     lora_dropout=0.1,                   # dropout applied on the LoRA branch
     bias="none",                        # bias handling, passed through unchanged
 )

In [9]:

# --- Step 1: Load and Embed with T5 (Semantic Search) ---
def embed_with_t5(texts, model_name="t5-base"):
    """Embed texts with a T5 encoder using attention-mask-aware mean pooling.

    Args:
        texts: Text(s) to embed; passed unchanged to the tokenizer.
        model_name: Checkpoint name or path handed to ``from_pretrained``.

    Returns:
        The pooled encoder output converted to a NumPy array, one pooled
        vector per tokenized sequence.
    """
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5EncoderModel.from_pretrained(model_name)
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state

    # padding=True pads every sequence to the longest one in the batch. A plain
    # mean over the sequence axis would average those pad positions in, making
    # an embedding depend on which other texts share its batch. Weight by the
    # attention mask so only real tokens contribute.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    embeddings = (hidden * mask).sum(dim=1) / token_counts
    return embeddings.numpy()

# --- Step 2: Classify with SciBERT + LoRA ---
def classify_with_scibert_lora(texts, labels, model_name="allenai/scibert_scivocab_uncased", lora_config=None, num_labels=4):
    """Predict a class id for each text with a LoRA-wrapped sequence classifier.

    No training happens here: the classification head and the LoRA adapter are
    freshly initialised, so predictions are only meaningful once fine-tuned
    weights are loaded (e.g. via the Trainer API).

    Args:
        texts: Texts to classify; passed unchanged to the tokenizer.
        labels: Currently unused. Kept so existing callers keep working and
            for a future fine-tuning step.
        model_name: Checkpoint name or path handed to ``from_pretrained``.
        lora_config: PEFT ``LoraConfig``. When ``None`` a default matching the
            notebook's module-level ``lora_config`` values is built.
        num_labels: Number of output classes (default 4: P, I, C, O).

    Returns:
        NumPy array with the argmax class id of each text.
    """
    # Nothing to classify: skip loading the models and tokenizing an empty batch.
    if len(texts) == 0:
        return torch.empty(0, dtype=torch.long).numpy()

    # get_peft_model needs a real config; the previous default of None was
    # forwarded to it unchanged.
    if lora_config is None:
        lora_config = LoraConfig(
            r=8,
            lora_alpha=16,
            target_modules=["query", "value"],
            lora_dropout=0.1,
            bias="none",
            task_type="SEQ_CLS",
        )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    model = get_peft_model(model, lora_config)
    # Inference only: make sure dropout (including lora_dropout) is disabled.
    model.eval()

    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)

    with torch.no_grad():
        logits = model(**inputs).logits
    return torch.argmax(logits, dim=1).numpy()

# --- Step 3: Downstream Tasks ---
def sentiment_analysis(texts):
    """Predict a sentiment class id for each text.

    Uses the ``cardiffnlp/twitter-roberta-base-sentiment`` checkpoint.
    NOTE(review): per the model card the ids are presumably
    0/1/2 = negative/neutral/positive; confirm before interpreting results.

    Args:
        texts: Texts to score; may be empty.

    Returns:
        NumPy array with the argmax class id of each text (empty for empty
        input).
    """
    # Callers pass a label-filtered list that may be empty; return an empty
    # result without loading the model or tokenizing an empty batch.
    if len(texts) == 0:
        return torch.empty(0, dtype=torch.long).numpy()

    model_name = "cardiffnlp/twitter-roberta-base-sentiment"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    # Explicit max_length, matching the other helpers, so truncation does not
    # depend on what the tokenizer config declares.
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    return torch.argmax(logits, dim=1).numpy()

def ner_extraction(texts):
    """Run the ``dslim/bert-base-NER`` model over the texts (demo placeholder).

    The model's token logits are computed but not decoded yet, so for any
    non-empty input the return value is a fixed placeholder list.

    Args:
        texts: Texts to scan; may be empty.

    Returns:
        ``[]`` for empty input, otherwise the placeholder
        ``["Participant1", "Participant2"]``.
    """
    # Callers pass a label-filtered list that may be empty; there is nothing
    # to extract, so skip loading the model and tokenizing an empty batch.
    if len(texts) == 0:
        return []

    model_name = "dslim/bert-base-NER"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    # Explicit max_length, matching the other helpers in this notebook.
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    # TODO: decode `logits` into entity spans; until then they are unused.
    # (Simplified for demo)
    return ["Participant1", "Participant2"]  # Placeholder

# --- Main Pipeline ---
def process_paper(title, abstract, text):
    """Run the full demo pipeline on one paper.

    Steps: embed the three text fields, assign each a PICO class id, then run
    sentiment analysis on the Intervention/Outcome texts and NER on the
    Population texts.

    Args:
        title: Paper title.
        abstract: Paper abstract.
        text: Paper body text.

    Returns:
        Dict with keys ``"embeddings"``, ``"pico_labels"``, ``"sentiment"``
        and ``"participants"`` holding the outputs of the respective steps.
    """
    # Class ids follow the "P, I, C, O" order of the 4-class classifier.
    population_id, intervention_id, outcome_id = 0, 1, 3

    all_texts = [title, abstract, text]

    # Step 1: Embed with T5
    embeddings = embed_with_t5(all_texts)

    # Step 2: Classify with SciBERT + LoRA
    # Dummy labels for the demo, one per text (replace with actual labels).
    labels = list(range(len(all_texts)))
    pico_labels = classify_with_scibert_lora(all_texts, labels)

    # Step 3: Downstream tasks
    # "I/O" means Intervention and Outcome, so both ids are kept; previously
    # only id 1 was selected and Outcome texts were dropped.
    # NOTE: either list can be empty when no text receives the matching label;
    # the downstream helpers must tolerate that.
    io_texts = [
        t for t, label in zip(all_texts, pico_labels)
        if label in (intervention_id, outcome_id)
    ]
    population_texts = [
        t for t, label in zip(all_texts, pico_labels)
        if label == population_id
    ]
    sentiment = sentiment_analysis(io_texts)
    participants = ner_extraction(population_texts)

    return {
        "embeddings": embeddings,
        "pico_labels": pico_labels,
        "sentiment": sentiment,
        "participants": participants
    }

# --- Example Usage ---
# Sample paper fields used to smoke-test the pipeline end to end.
title = "Effect of X on Y: A Randomized Trial"
abstract = "This study investigates the effect of X on Y in patients with Z..."
text = "Methods: We randomized 100 patients to X or placebo..."

# Runs every pipeline step (each step loads its own pretrained model),
# then prints the resulting dict.
result = process_paper(title=title, abstract=abstract, text=text)
print(result)




pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

OSError: Can't load the model for 'allenai/scibert_scivocab_uncased'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'allenai/scibert_scivocab_uncased' is the correct path to a directory containing a file named pytorch_model.bin, tf_model.h5, model.ckpt or flax_model.msgpack.

In [None]:
# Load the sample papers table into a DataFrame.
# NOTE(review): this is a hard-coded absolute path on one Windows machine, so
# the cell fails anywhere else. Consider a configurable data directory with a
# relative path.
arxiv=pd.read_csv("C://Users//Lenovo//PycharmProjects//HRreview//review//data//papers_sample.csv")