In [None]:
from transformers import pipeline

# Initialize zero-shot classification pipeline
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli", device=0)

print(f"Framework in use: {classifier.framework}")

abstracts = [
    "This study investigates the transport of glucose by SGLT1 in human intestinal cells.",
    "ATP-sensitive potassium (K (ATP)) channels are multimeric protein complexes...",
]

labels = ["No Evidence (0)", "Minimal Evidence (1-3)", "Moderate Evidence (4-6)",
          "Strong Evidence (7-9)", "Conclusive Evidence (10)"]

# Classify abstracts
for abstract in abstracts:
    result = classifier(abstract, candidate_labels=labels, multi_label=False)
    print(f"Abstract: {abstract}")
    print(
        f"Classification: {result['labels'][0]} with score {result['scores'][0]}")

In [1]:
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"GPU available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")

PyTorch version: 2.5.1+cu124
GPU available: True
CUDA version: 12.4


In [2]:
import torch

if torch.cuda.is_available():
    print(f"GPU is available: {torch.cuda.get_device_name(0)}")
else:
    print("GPU is not available.")

GPU is available: NVIDIA A100 80GB PCIe


In [7]:
dataset = [
    {
        "abstract": "This study investigates the transport of glucose by SGLT1 in human intestinal cells.",
        "question": "Does this abstract provide laboratory or experimental evidence that substrate X is transported by transporter protein Y?",
        "confidence_score": 3,
        "justification": "The abstract mentions investigation of glucose transport by SGLT1 but does not provide explicit experimental evidence or outcomes."
    },
    {
        "abstract": "The effects of various inhibitors on non-specific diffusion were analyzed.",
        "question": "Does this abstract provide laboratory or experimental evidence that substrate X is transported by transporter protein Y?",
        "confidence_score": 0,
        "justification": "The abstract focuses solely on non-specific diffusion and the effects of inhibitors, without mentioning substrate transport by a specific protein."
    },
    {
        "abstract": "ATP-sensitive potassium (K (ATP) ) channels are multimeric protein complexes...",
        "question": "Does this abstract provide laboratory or experimental evidence that substrate X is transported by transporter protein Y?",
        "confidence_score": 4,
        "justification": "The abstract identifies transporter proteins and their roles, but does not provide direct evidence of substrate transport."
    },
    {
        "abstract": "Membrane transporters that use energy stored in sodium gradients to drive nutrients...",
        "question": "Does this abstract provide laboratory or experimental evidence that substrate X is transported by transporter protein Y?",
        "confidence_score": 7,
        "justification": "The abstract discusses structural insights and galactose binding to vSGLT, strongly implying substrate transport, but lacks direct experimental transport evidence."
    }
]

In [8]:
import json

# Save dataset to a JSON file
with open("/data/servilla/llama3/data/data.json", "w") as json_file:
    json.dump(dataset, json_file, indent=4)

In [9]:
import json

# Save as JSONL file
with open("/data/servilla/llama3/data/fine_tuning_data.jsonl", "w") as f:
    for entry in dataset:
        f.write(json.dumps(entry) + "\n")


In [7]:
import sentencepiece as spm

TOKENIZER_PATH = "/data_link/servilla/.llama/checkpoints/Llama3.1-8B-Instruct/tokenizer.model"

try:
    print("Loading tokenizer...")
    sp = spm.SentencePieceProcessor()
    sp.Load(TOKENIZER_PATH)
    print("Tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading tokenizer: {e}")

Loading tokenizer...
Error loading tokenizer: Internal: could not parse ModelProto from /data_link/servilla/.llama/checkpoints/Llama3.1-8B-Instruct/tokenizer.model


In [8]:
import torch

MODEL_PATH = "/data_link/servilla/.llama/checkpoints/Llama3.1-8B-Instruct/consolidated.00.pth"

checkpoint = torch.load(MODEL_PATH, map_location="cpu")
print(type(checkpoint))
print(checkpoint.keys())

  checkpoint = torch.load(MODEL_PATH, map_location="cpu")


<class 'dict'>
dict_keys(['tok_embeddings.weight', 'layers.0.attention.wq.weight', 'layers.0.attention.wk.weight', 'layers.0.attention.wv.weight', 'layers.0.attention.wo.weight', 'layers.0.feed_forward.w1.weight', 'layers.0.feed_forward.w3.weight', 'layers.0.feed_forward.w2.weight', 'layers.0.attention_norm.weight', 'layers.0.ffn_norm.weight', 'layers.1.attention.wq.weight', 'layers.1.attention.wk.weight', 'layers.1.attention.wv.weight', 'layers.1.attention.wo.weight', 'layers.1.feed_forward.w1.weight', 'layers.1.feed_forward.w3.weight', 'layers.1.feed_forward.w2.weight', 'layers.1.attention_norm.weight', 'layers.1.ffn_norm.weight', 'layers.2.attention.wq.weight', 'layers.2.attention.wk.weight', 'layers.2.attention.wv.weight', 'layers.2.attention.wo.weight', 'layers.2.feed_forward.w1.weight', 'layers.2.feed_forward.w3.weight', 'layers.2.feed_forward.w2.weight', 'layers.2.attention_norm.weight', 'layers.2.ffn_norm.weight', 'layers.3.attention.wq.weight', 'layers.3.attention.wk.weight', 

In [3]:
import sentencepiece as spm

TOKENIZER_PATH = "/data/servilla/.llama/checkpoints/Llama3.1-8B-Instruct/tokenizer.model"

sp = spm.SentencePieceProcessor()
try:
    result = sp.Load(TOKENIZER_PATH)
    print(f"Tokenizer loaded successfully: {result}")
except RuntimeError as e:
    print(f"Failed to load tokenizer: {e}")


Failed to load tokenizer: Internal: could not parse ModelProto from /data/servilla/.llama/checkpoints/Llama3.1-8B-Instruct/tokenizer.model
