In [1]:
import pandas as pd
import numpy as np

In [2]:
file_path = r"C:\Users\Mrityunjay\Desktop\lung_cancer_classification\classification_pipeline\app\Dataset\Cancer\30872385.txt"
with open(file_path, "r") as f:
    text = f.read()

In [3]:
lines = text.strip().split("\n")
lines

['<ID:30872385>',
 'Title: Comparison of methodologies for the detection of BRAF mutations in bone marrow trephine specimens.',
 "Abstract: AIMS: BRAF V600E detection assists in the diagnosis of hairy cell leukaemia (HCL); however, testing practices vary. We evaluated the clinical utility of 5 BRAF mutation testing strategies for use on bone marrow trephines (BMT). METHODS: 11 HCL, 5 HCL 'mimic', 2 treated HCL and 10 normal BMT specimens were tested for mutant BRAF, comparing Sanger sequencing, pyrosequencing, amplicon-based next generation sequencing (NGS), automated (Idylla) PCR and immunohistochemistry (IHC). RESULTS: PCR and IHC were cheaper and identified V600E in 100 % of HCL cases. Pyrosequencing detected the mutation in 91%, NGS in 55% of cases and Sanger sequencing in 27%. All assays gave wild-type BRAF results in HCL mimics and normal BMT samples. CONCLUSIONS: PCR and IHC were most sensitive and cost-effective, but these have limited scope for multiplexing and are likely to b

In [4]:
# Parse text into components
lines = text.strip().split('\n')

data = {
    'ID': lines[0].replace('<ID:', '').replace('>', '').strip(),
    'Title': lines[1].replace('Title:', '').strip(),
    'Abstract': lines[2].replace('Abstract:', '').strip()
}

# Create DataFrame
df = pd.DataFrame([data])
print(df)


         ID                                              Title  \
0  30872385  Comparison of methodologies for the detection ...   

                                            Abstract  
0  AIMS: BRAF V600E detection assists in the diag...  


In [5]:
import os
import pandas as pd

#base directory
base_dir = r"C:\Users\Mrityunjay\Desktop\lung_cancer_classification\classification_pipeline\app\Dataset"

#data colection
data = []

#using loop "cancer", "uncancer"
for label in ["Cancer", "Non-Cancer"]:
    folder_path = os.path.join(base_dir, label)

    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            #print(file_path)
            
            #read the text file 
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
                
            data.append({
                "file_path":file_path,
                "label": label,
                "content": content
            })
#create dataframe
df = pd.DataFrame(data)

#result
df.head()

Unnamed: 0,file_path,label,content
0,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,<ID:30872385>\nTitle: Comparison of methodolog...
1,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,<ID:30873683>\nTitle: Tumour biomarkers-Tracin...
2,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,"<ID:30874851>\nTitle: Pomalidomide, cyclophosp..."
3,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,<ID:30875581>\nTitle: Aggressive variants of p...
4,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,<ID:30875950>\nTitle: Circulating Tumour Cells...


In [6]:
#data parse from content
data1 = []
def data_parse(df):
    content_list = [i for i in df["content"]]
    for text in content_list:
        lines = text.strip().split("\n")
        id_ = lines[0].replace('<ID:', '').replace('>', '').strip()
        title = lines[1].replace('Title:', '').strip()
        abstract = lines[2].replace('Abstract:', '').strip()
        #print(id_, title, abstract)
        data1.append({
            # 'file_path':df["file_path"],
            # 'label': df["label"][0],
            "ids":id_,
            "title":title,
            "abstract":abstract
        })
data_parse(df)

df_ = pd.DataFrame(data1)
df["ids"] = df_["ids"]
df["title"] = df_["title"]
df["abstract"] =df_["abstract"]
df

Unnamed: 0,file_path,label,content,ids,title,abstract
0,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,<ID:30872385>\nTitle: Comparison of methodolog...,30872385,Comparison of methodologies for the detection ...,AIMS: BRAF V600E detection assists in the diag...
1,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,<ID:30873683>\nTitle: Tumour biomarkers-Tracin...,30873683,Tumour biomarkers-Tracing the molecular functi...,"In recent years, with the increase in cancer m..."
2,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,"<ID:30874851>\nTitle: Pomalidomide, cyclophosp...",30874851,"Pomalidomide, cyclophosphamide, and dexamethas...",Pomalidomide dexamethasone is a standard of ca...
3,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,<ID:30875581>\nTitle: Aggressive variants of p...,30875581,Aggressive variants of prostate cancer - Are w...,"Recently, adoption of novel drugs for systemic..."
4,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,<ID:30875950>\nTitle: Circulating Tumour Cells...,30875950,"Circulating Tumour Cells (CTC), Head and Neck ...",Head and neck cancer is the seventh most commo...
...,...,...,...,...,...,...
995,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Non-Cancer,<ID:38623902>\nTitle: [Not Available].\nAbstra...,38623902,[Not Available].,Effective longitudinal biomarkers that track d...
996,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Non-Cancer,<ID:38640937>\nTitle: Mechanisms and managemen...,38640937,Mechanisms and management of loss of response ...,We sought to report the effectiveness of infli...
997,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Non-Cancer,<ID:38642556>\nTitle: Modification of coronary...,38642556,Modification of coronary artery disease clinic...,The extent to which the relationships between ...
998,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Non-Cancer,<ID:38650020>\nTitle: Meta-analysis of the glo...,38650020,Meta-analysis of the global distribution of cl...,CYP2C8 is responsible for the metabolism of 5%...


In [7]:
import re
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

#lemmatizer
lemmatizer = WordNetLemmatizer()

#define cleaning function
def clean_text(text):
    #changing into Lowercase
    text = text.lower()
    
    #remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    #remove special characters & punctuation.
    #text = re.sub(rf'[{re.escape(string.punctuation)}]', '', text)

    #remove numbers
    #text = re.sub(r'\d+', '', text)

    #remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    #Tokenization
    tokens = word_tokenize(text)

    #Lemmatization (without removing stopwords)
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens]

    #Joining back to string
    return ' '.join(cleaned_tokens)


In [8]:
df["title"] = df["title"].apply(clean_text)
df["abstract"] = df["abstract"].apply(clean_text)

In [9]:
df

Unnamed: 0,file_path,label,content,ids,title,abstract
0,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,<ID:30872385>\nTitle: Comparison of methodolog...,30872385,comparison of methodology for the detection of...,aim : braf v600e detection assist in the diagn...
1,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,<ID:30873683>\nTitle: Tumour biomarkers-Tracin...,30873683,tumour biomarkers-tracing the molecular functi...,"in recent year , with the increase in cancer m..."
2,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,"<ID:30874851>\nTitle: Pomalidomide, cyclophosp...",30874851,"pomalidomide , cyclophosphamide , and dexameth...",pomalidomide dexamethasone is a standard of ca...
3,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,<ID:30875581>\nTitle: Aggressive variants of p...,30875581,aggressive variant of prostate cancer - are we...,"recently , adoption of novel drug for systemic..."
4,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,<ID:30875950>\nTitle: Circulating Tumour Cells...,30875950,"circulating tumour cell ( ctc ) , head and nec...",head and neck cancer is the seventh most commo...
...,...,...,...,...,...,...
995,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Non-Cancer,<ID:38623902>\nTitle: [Not Available].\nAbstra...,38623902,[ not available ] .,effective longitudinal biomarkers that track d...
996,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Non-Cancer,<ID:38640937>\nTitle: Mechanisms and managemen...,38640937,mechanism and management of loss of response t...,we sought to report the effectiveness of infli...
997,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Non-Cancer,<ID:38642556>\nTitle: Modification of coronary...,38642556,modification of coronary artery disease clinic...,the extent to which the relationship between c...
998,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Non-Cancer,<ID:38650020>\nTitle: Meta-analysis of the glo...,38650020,meta-analysis of the global distribution of cl...,cyp2c8 is responsible for the metabolism of 5 ...


In [None]:
# df.to_csv("data.csv", index = False)

In [19]:
# # data_loader.py
# import os
# import pandas as pd

# # Base directory
# BASE_DIR = r"C:\Users\Mrityunjay\Desktop\lung_cancer_classification\classification_pipeline\app\Dataset"

# # Initialize list for storing data
# data = []

# # Loop through Cancer and Non-Cancer folders
# for label in ["Cancer", "Non-Cancer"]:
#     folder_path = os.path.join(BASE_DIR, label)

#     for filename in os.listdir(folder_path):
#         if filename.endswith(".txt"):
#             file_path = os.path.join(folder_path, filename)

#             # Read the content
#             with open(file_path, "r", encoding="utf-8") as f:
#                 content = f.read().strip()

#             # Parse ID, Title, Abstract
#             try:
#                 lines = content.split("\n")
#                 id_ = lines[0].replace('<ID:', '').replace('>', '').strip()
#                 title = lines[1].replace('Title:', '').strip()
#                 abstract = lines[2].replace('Abstract:', '').strip()
#             except IndexError:
#                 print(f"Malformed file skipped: {file_path}")
#                 continue

#             # Append parsed data
#             data.append({
#                 "file_path": file_path,
#                 "label": label,
#                 "ids": id_,
#                 "title": title,
#                 "abstract": abstract,
#                 "content": content
#             })

# # Convert to DataFrame
# df = pd.DataFrame(data)

# # Optional: show a sample
# print(df.head())


In [23]:
import numpy as np

label= np.load(r"C:\Users\Mrityunjay\Desktop\lung_cancer_classification\classification_pipeline\app\labels.npy")

import pandas as pd 
pd.DataFrame(label)

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
995,0
996,0
997,0
998,0


In [None]:
embd = np.load(r"C:\Users\Mrityunjay\Desktop\lung_cancer_classification\classification_pipeline\app\embeddings.npy")
embd

array([[-0.05865531, -0.06116756, -0.04841614, ..., -0.13194865,
         0.00626522, -0.09212897],
       [-0.04267618, -0.02818019, -0.0572282 , ..., -0.01069069,
         0.01573799, -0.01285062],
       [ 0.00647902, -0.01608148, -0.03012745, ..., -0.07893753,
        -0.00581685, -0.09062544],
       ...,
       [ 0.02162201,  0.03952667,  0.00819065, ..., -0.02779166,
         0.09780525,  0.01256192],
       [-0.01750086, -0.00029801,  0.05761583, ...,  0.04505253,
         0.01330023, -0.06579966],
       [-0.09487092, -0.00049002, -0.04533917, ..., -0.04889944,
         0.01735684,  0.04944939]], dtype=float32)

In [None]:
embd[0], len(embd[0])

(array([-5.86553067e-02, -6.11675642e-02, -4.84161414e-02, -1.29827574e-01,
        -7.55321905e-02, -3.89920287e-02, -7.48163536e-02,  9.17119682e-02,
         2.15303637e-02,  3.15023996e-02, -2.02053525e-02, -2.61817481e-02,
         1.78138334e-02,  3.38815376e-02, -1.22108355e-01,  8.25391989e-03,
        -7.03520281e-03, -1.60418320e-02,  1.35447038e-02, -9.78858545e-02,
        -1.07826211e-01,  4.66004834e-02,  4.39636596e-02,  6.78262860e-02,
        -1.95260171e-03, -1.13929935e-01, -2.94170505e-03,  3.55189666e-02,
        -1.19355889e-02,  2.35840604e-02,  1.58432275e-02,  8.45380127e-02,
         8.95186290e-02, -2.03057062e-02,  1.68474652e-02,  4.90528010e-02,
        -5.40754981e-02,  1.16312327e-02,  3.99037525e-02, -1.80040039e-02,
         5.38776591e-02, -1.23859905e-01, -7.24078640e-02,  4.48571108e-02,
         2.87048910e-02, -6.24615848e-02, -5.72517747e-03,  3.26595060e-03,
        -4.53652767e-03,  1.38715804e-01,  2.08635256e-02, -1.18058771e-02,
        -1.4

In [10]:
df

Unnamed: 0,file_path,label,content,ids,title,abstract
0,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,<ID:30872385>\nTitle: Comparison of methodolog...,30872385,comparison of methodology for the detection of...,aim : braf v600e detection assist in the diagn...
1,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,<ID:30873683>\nTitle: Tumour biomarkers-Tracin...,30873683,tumour biomarkers-tracing the molecular functi...,"in recent year , with the increase in cancer m..."
2,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,"<ID:30874851>\nTitle: Pomalidomide, cyclophosp...",30874851,"pomalidomide , cyclophosphamide , and dexameth...",pomalidomide dexamethasone is a standard of ca...
3,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,<ID:30875581>\nTitle: Aggressive variants of p...,30875581,aggressive variant of prostate cancer - are we...,"recently , adoption of novel drug for systemic..."
4,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,<ID:30875950>\nTitle: Circulating Tumour Cells...,30875950,"circulating tumour cell ( ctc ) , head and nec...",head and neck cancer is the seventh most commo...
...,...,...,...,...,...,...
995,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Non-Cancer,<ID:38623902>\nTitle: [Not Available].\nAbstra...,38623902,[ not available ] .,effective longitudinal biomarkers that track d...
996,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Non-Cancer,<ID:38640937>\nTitle: Mechanisms and managemen...,38640937,mechanism and management of loss of response t...,we sought to report the effectiveness of infli...
997,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Non-Cancer,<ID:38642556>\nTitle: Modification of coronary...,38642556,modification of coronary artery disease clinic...,the extent to which the relationship between c...
998,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Non-Cancer,<ID:38650020>\nTitle: Meta-analysis of the glo...,38650020,meta-analysis of the global distribution of cl...,cyp2c8 is responsible for the metabolism of 5 ...


In [None]:
# import numpy as np
# import pandas as pd
# import torch
# # We will use SentenceTransformer, so no need for AutoModel, AutoModelForCausalLM directly
# # from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
# from sentence_transformers import SentenceTransformer
# import logging

# # Configure logging
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)

# # --- Model Name for Sentence Transformers ---
# # These are much smaller and faster on CPU.
# # 'all-MiniLM-L6-v2' is a good balance of size/performance.
# # 'all-mpnet-base-v2' is slightly larger but often more performant.
# SENTENCE_TRANSFORMER_MODEL_NAME = "all-MiniLM-L6-v2"
# # --------------------------------------------

# def load_embedding_model(model_name=SENTENCE_TRANSFORMER_MODEL_NAME):
#     """
#     Load a pre-trained sentence embedding model from sentence-transformers.

#     Args:
#         model_name (str): Name of the pre-trained model (e.g., 'all-MiniLM-L6-v2').

#     Returns:
#         SentenceTransformer: The loaded model.
#     """
#     try:
#         # SentenceTransformer handles its own tokenizer and device management internally
#         # It automatically uses CPU if CUDA is not available.
#         model = SentenceTransformer(model_name)
#         logger.info(f"Sentence embedding model '{model_name}' loaded successfully.")
        
#         # Log the actual device being used by the SentenceTransformer model
#         # SentenceTransformer wraps a Hugging Face model, so we can check its device
#         if hasattr(model, 'device'):
#             logger.info(f"SentenceTransformer model using device: {model.device}")
#         else:
#             # Fallback for older versions or different internal structures
#             logger.info("Could not determine specific device for SentenceTransformer model, assuming CPU.")

#         return model
#     except Exception as e:
#         logger.error(f"An error occurred while loading the embedding model: {e}")
#         raise

# def get_embedding(text, model):
#     """
#     Generate embeddings for a given text using a SentenceTransformer model.

#     Args:
#         text (str): The text to generate embeddings for.
#         model: The SentenceTransformer model to use.

#     Returns:
#         numpy.ndarray: The embeddings for the text.
#     """
#     try:
#         # SentenceTransformer's encode method handles tokenization, padding,
#         # and device placement internally.
#         # It returns a numpy array by default.
#         embeddings = model.encode(text, convert_to_numpy=True)

#         # Ensure the embedding is always a 2D array (e.g., (1, D) for a single text)
#         # model.encode can return 1D for single string or 2D for list of strings
#         if embeddings.ndim == 1:
#             embeddings = np.expand_dims(embeddings, axis=0)

#         return embeddings
#     except Exception as e:
#         logger.error(f"An error occurred while generating embeddings for text: '{text[:50]}...': {e}")
#         return None

# def generate_embeddings(df, model): # Removed tokenizer from args, not needed
#     """
#     Generate embeddings for texts in a DataFrame.

#     Args:
#         df (pd.DataFrame): The DataFrame containing the texts.
#         model: The embedding model to use.

#     Returns:
#         tuple: A tuple containing the embeddings and labels.
#     """
#     all_embeddings = []
#     labels = []
    
#     try:
#         # Combine title and abstract
#         df["text"] = df["title"].fillna("") + " " + df["abstract"].fillna("") # Handle potential NaN values

#         if "label" not in df.columns:
#             logger.error("DataFrame does not contain a 'label' column.")
#             return None, None
        
#         # --- Batch processing for efficiency (SentenceTransformer can do this easily) ---
#         # It's more efficient to encode a list of texts at once with SentenceTransformer
#         # Adjust batch_size based on your RAM. For 8GB, 32-64 might be a good starting point.
#         batch_size = 64
        
#         texts_to_process = df["text"].tolist()
#         df_labels = df["label"].tolist() # Get labels in the same order
        
#         for i in range(0, len(texts_to_process), batch_size):
#             batch_texts = texts_to_process[i:i + batch_size]
#             batch_labels = df_labels[i:i + batch_size] # Corresponding labels

#             current_batch_embeddings = model.encode(batch_texts, convert_to_numpy=True, show_progress_bar=False)
            
#             # Ensure embeddings are 2D (batch_size, embedding_dim)
#             if current_batch_embeddings.ndim == 1: # Should not happen with batch_size > 1
#                  current_batch_embeddings = np.expand_dims(current_batch_embeddings, axis=0)

#             for j in range(len(current_batch_embeddings)):
#                 all_embeddings.append(np.expand_dims(current_batch_embeddings[j], axis=0)) # Append as (1,D)
#                 labels.append(batch_labels[j])

#             # Optional: Print progress
#             if (i // batch_size + 1) % 10 == 0: # Print every 10 batches
#                 logger.info(f"Processed {i + len(batch_texts)}/{len(texts_to_process)} texts...")

#         if not all_embeddings:
#             logger.error("No embeddings were generated. Check for issues in get_embedding.")
#             return None, None

#         X = np.vstack(all_embeddings)
#         y = pd.Series(labels).map({'Cancer': 1, 'Non-Cancer': 0}).values

#         logger.info(f"Embeddings generated successfully. Shape: {X.shape}")
#         return X, y
#     except Exception as e:
#         logger.error(f"An error occurred while generating embeddings for the DataFrame: {e}")
#         raise

# if __name__ == "__main__":
#     # Load embedding model (no separate tokenizer needed for SentenceTransformer)
#     model = load_embedding_model()

#     try:
#         df = pd.read_csv(r"C:\Users\Mrityunjay\Desktop\lung_cancer_classification\classification_pipeline\app\cleaned_data.csv")
#         logger.info("cleaned_data.csv loaded successfully.")
        
#         required_cols = ["title", "abstract", "label"]
#         if not all(col in df.columns for col in required_cols):
#             logger.error(f"Missing one or more required columns ({required_cols}) in cleaned_data.csv.")
#             exit()

#     except FileNotFoundError:
#         logger.error("cleaned_data.csv not found. Please ensure it exists in the same directory.")
#         exit()
#     except Exception as e:
#         logger.error(f"An error occurred while loading cleaned_data.csv: {e}")
#         exit()

#     # Generate embeddings
#     X, y = generate_embeddings(df, model) 

#     if X is not None and y is not None:
#         np.save("embeddings.npy", X)
#         np.save("labels.npy", y)
#         logger.info("Embeddings and labels saved successfully.")
#     else:
#         logger.error("Failed to generate embeddings and labels, not saving.")

  from .autonotebook import tqdm as notebook_tqdm
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:__main__:Sentence embedding model 'all-MiniLM-L6-v2' loaded successfully.
INFO:__main__:SentenceTransformer model using device: cpu
ERROR:__main__:cleaned_data.csv not found. Please ensure it exists in the same directory.
INFO:__main__:Processed 640/1000 texts...
INFO:__main__:Embeddings generated successfully. Shape: (1000, 384)
INFO:__main__:Embeddings and labels saved successfully.


: 

In [11]:
df

Unnamed: 0,file_path,label,content,ids,title,abstract
0,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,<ID:30872385>\nTitle: Comparison of methodolog...,30872385,comparison of methodology for the detection of...,aim : braf v600e detection assist in the diagn...
1,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,<ID:30873683>\nTitle: Tumour biomarkers-Tracin...,30873683,tumour biomarkers-tracing the molecular functi...,"in recent year , with the increase in cancer m..."
2,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,"<ID:30874851>\nTitle: Pomalidomide, cyclophosp...",30874851,"pomalidomide , cyclophosphamide , and dexameth...",pomalidomide dexamethasone is a standard of ca...
3,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,<ID:30875581>\nTitle: Aggressive variants of p...,30875581,aggressive variant of prostate cancer - are we...,"recently , adoption of novel drug for systemic..."
4,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Cancer,<ID:30875950>\nTitle: Circulating Tumour Cells...,30875950,"circulating tumour cell ( ctc ) , head and nec...",head and neck cancer is the seventh most commo...
...,...,...,...,...,...,...
995,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Non-Cancer,<ID:38623902>\nTitle: [Not Available].\nAbstra...,38623902,[ not available ] .,effective longitudinal biomarkers that track d...
996,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Non-Cancer,<ID:38640937>\nTitle: Mechanisms and managemen...,38640937,mechanism and management of loss of response t...,we sought to report the effectiveness of infli...
997,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Non-Cancer,<ID:38642556>\nTitle: Modification of coronary...,38642556,modification of coronary artery disease clinic...,the extent to which the relationship between c...
998,C:\Users\Mrityunjay\Desktop\lung_cancer_classi...,Non-Cancer,<ID:38650020>\nTitle: Meta-analysis of the glo...,38650020,meta-analysis of the global distribution of cl...,cyp2c8 is responsible for the metabolism of 5 ...


In [16]:
example_text = df["title"] + " " + df["abstract"]

In [25]:
from transformers import pipeline
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def load_baseline_pipeline(model_name="facebook/bart-large-mnli"):
    """
    Load a zero-shot classification pipeline using a pre-trained model from Hugging Face.

    Args:
        model_name (str): Name of the pre-trained model to use for the pipeline.

    Returns:
        Pipeline: A Hugging Face pipeline object for zero-shot classification.
    """
    try:
        classifier = pipeline("zero-shot-classification", model=model_name)
        logger.info(f"Pipeline loaded successfully with model {model_name}.")
        return classifier
    except Exception as e:
        logger.error(f"An error occurred while loading the pipeline: {e}")
        raise

def predict_baseline(classifier, text, candidate_labels=["Cancer", "Non-Cancer"]):
    """
    Predict the class of a given text using a zero-shot classification pipeline.

    Args:
        classifier: The Hugging Face pipeline object for zero-shot classification.
        text (str): The text to classify.
        candidate_labels (list): List of candidate labels for classification.

    Returns:
        tuple: A tuple containing the predicted labels and their corresponding scores.
    """
    try:
        result = classifier(text, candidate_labels=candidate_labels)
        logger.info("Prediction completed successfully.")
        return result['labels'], result['scores']
    except Exception as e:
        logger.error(f"An error occurred during prediction: {e}")
        raise

# Example usage
if __name__ == "__main__":
    # Load the baseline pipeline
    classifier = load_baseline_pipeline()

    # from data_loader import load_and_parse_data
    # input_data = load_and_parse_data()
    # #print(input_data)
    # Example text to classify
    #example_text = [i for i in input_data["abstract"]]
    #print(example_text)
    #example_text = "This study explores the genetic markers associated with lung cancer."

    # Make a prediction
    labels, scores = predict_baseline(classifier, example_text)

    # Print the results
    print("Predicted Labels:", labels)
    print("Confidence Scores:", scores)


INFO:__main__:Pipeline loaded successfully with model facebook/bart-large-mnli.
INFO:__main__:Prediction completed successfully.


Predicted Labels: ['Cancer', 'Non-Cancer']
Confidence Scores: [0.5350133776664734, 0.4649865925312042]


In [20]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
from peft import get_peft_model, LoraConfig, TaskType
import logging
import numpy as np
from sklearn.metrics import f1_score

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def get_tokenizer(model_name="distilbert-base-uncased"):
    """
    Load a tokenizer for a given model name.

    Args:
        model_name (str): Name of the pre-trained model.

    Returns:
        AutoTokenizer: The tokenizer for the specified model.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        logger.info(f"Tokenizer loaded successfully for model {model_name}.")
        return tokenizer
    except Exception as e:
        logger.error(f"An error occurred while loading the tokenizer: {e}")
        raise

def tokenize_function(examples, tokenizer, max_len=512):
    """
    Tokenize the examples using the provided tokenizer.

    Args:
        examples (dict): Dictionary containing the text data.
        tokenizer: The tokenizer to use.
        max_len (int): Maximum length for tokenization.

    Returns:
        dict: Tokenized examples.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_len)

def prepare_dataset(X, y, tokenizer):
    """
    Prepare the dataset for training.

    Args:
        X (list): List of text samples.
        y (list): List of labels.
        tokenizer: The tokenizer to use.

    Returns:
        Dataset: Tokenized dataset.
    """
    try:
        dataset = Dataset.from_dict({"text": X, "label": y})
        tokenized_dataset = dataset.map(lambda examples: tokenize_function(examples, tokenizer), batched=True)
        logger.info("Dataset prepared and tokenized successfully.")
        return tokenized_dataset
    except Exception as e:
        logger.error(f"An error occurred while preparing the dataset: {e}")
        raise

def get_lora_model(model_name="distilbert-base-uncased"):
    """
    Load a model and apply LoRA configuration.

    Args:
        model_name (str): Name of the pre-trained model.

    Returns:
        PeftModel: The model with LoRA configuration.
    """
    try:
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

        config = LoraConfig(
            r=8,
            lora_alpha=32,
            lora_dropout=0.1,
            bias="none",
            task_type=TaskType.SEQ_CLS,
            target_modules=["q_lin", "v_lin"] if "distilbert" in model_name else ["query", "value"]
        )

        lora_model = get_peft_model(model, config)
        logger.info(f"LoRA model configured successfully for model {model_name}.")
        return lora_model
    except Exception as e:
        logger.error(f"An error occurred while configuring the LoRA model: {e}")
        raise

def compute_metrics(eval_pred):
    """
    Compute metrics for evaluation.

    Args:
        eval_pred (tuple): Tuple of predictions and label ids.

    Returns:
        dict: Dictionary containing accuracy and F1 score.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = np.mean(predictions == labels)
    f1 = f1_score(labels, predictions, average='weighted')
    return {'accuracy': accuracy, 'f1': f1}

def train_model(model, tokenized_dataset, output_dir="./results"):
    """
    Train the model using the provided dataset.

    Args:
        model: The model to train.
        tokenized_dataset (Dataset): The dataset to use for training.
        output_dir (str): Directory to save the model outputs.
    """
    try:
        training_args = TrainingArguments(
            output_dir=output_dir,
            evaluation_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=10,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset,
            eval_dataset=tokenized_dataset,
            compute_metrics=compute_metrics,
        )

        trainer.train()
        logger.info("Model training completed successfully.")
    except Exception as e:
        logger.error(f"An error occurred during model training: {e}")
        raise

# Example usage
if __name__ == "__main__":
    # Example data
    X = example_text
    y = df["label"]

    # Load tokenizer and prepare dataset
    tokenizer = get_tokenizer()
    tokenized_dataset = prepare_dataset(X, y, tokenizer)

    # Load and configure LoRA model
    lora_model = get_lora_model()

    # Train the model
    train_model(lora_model, tokenized_dataset)


INFO:__main__:Tokenizer loaded successfully for model distilbert-base-uncased.
Map: 100%|██████████| 1000/1000 [00:01<00:00, 557.32 examples/s]
INFO:__main__:Dataset prepared and tokenized successfully.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:__main__:LoRA model configured successfully for model distilbert-base-uncased.
  0%|          | 0/189 [00:00<?, ?it/s]ERROR:__main__:An error occurred during model training: too many dimensions 'str'


ValueError: too many dimensions 'str'

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
from peft import get_peft_model, LoraConfig, TaskType
import logging
import numpy as np
from sklearn.metrics import f1_score
import pandas as pd # Import pandas
import torch # Import torch

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def get_tokenizer(model_name="distilbert-base-uncased"):
    """
    Load a tokenizer for a given model name.

    Args:
        model_name (str): Name of the pre-trained model.

    Returns:
        AutoTokenizer: The tokenizer for the specified model.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        logger.info(f"Tokenizer loaded successfully for model {model_name}.")
        return tokenizer
    except Exception as e:
        logger.error(f"An error occurred while loading the tokenizer: {e}")
        raise

def tokenize_function(examples, tokenizer, max_len=512):
    """
    Tokenize the examples using the provided tokenizer.

    Args:
        examples (dict): Dictionary containing the text data.
        tokenizer: The tokenizer to use.
        max_len (int): Maximum length for tokenization.

    Returns:
        dict: Tokenized examples.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_len)

def prepare_dataset(X, y, tokenizer):
    """
    Prepare the dataset for training.

    Args:
        X (list): List of text samples.
        y (list): List of labels.
        tokenizer: The tokenizer to use.

    Returns:
        Dataset: Tokenized dataset.
    """
    try:
        if not isinstance(y, list):
            y = y.tolist()
        y = [int(label) for label in y]

        if not isinstance(X, list):
            X = X.tolist()

        dataset = Dataset.from_dict({"text": X, "label": y})
        tokenized_dataset = dataset.map(lambda examples: tokenize_function(examples, tokenizer), batched=True)
        tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
        logger.info("Dataset prepared and tokenized successfully.")
        return tokenized_dataset
    except Exception as e:
        logger.error(f"An error occurred while preparing the dataset: {e}")
        raise

def get_lora_model(model_name="distilbert-base-uncased"):
    """
    Load a model and apply LoRA configuration.

    Args:
        model_name (str): Name of the pre-trained model.

    Returns:
        PeftModel: The model with LoRA configuration.
    """
    try:
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

        config = LoraConfig(
            r=8,
            lora_alpha=32,
            lora_dropout=0.1,
            bias="none",
            task_type=TaskType.SEQ_CLS,
            target_modules=["q_lin", "v_lin"] if "distilbert" in model_name else ["query", "value"]
        )

        lora_model = get_peft_model(model, config)
        logger.info(f"LoRA model configured successfully for model {model_name}.")
        return lora_model
    except Exception as e:
        logger.error(f"An error occurred while configuring the LoRA model: {e}")
        raise

def compute_metrics(eval_pred):
    """
    Compute metrics for evaluation.

    Args:
        eval_pred (tuple): Tuple of predictions and label ids.

    Returns:
        dict: Dictionary containing accuracy and F1 score.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = np.mean(predictions == labels)
    f1 = f1_score(labels, predictions, average='weighted')
    return {'accuracy': accuracy, 'f1': f1}

def train_model(model, tokenized_dataset, output_dir="./results"):
    """
    Train the model using the provided dataset.

    Args:
        model: The model to train.
        tokenized_dataset (Dataset): The dataset to use for training.
        output_dir (str): Directory to save the model outputs.
    """
    try:
        train_val_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
        train_dataset = train_val_split["train"]
        eval_dataset = train_val_split["test"]

        training_args = TrainingArguments(
            output_dir=output_dir,
            evaluation_strategy="epoch",  # Evaluation per epoch
            save_strategy="epoch",        
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=10,
            save_total_limit=1,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
            # If you were saving by steps, you'd also set save_steps and eval_steps:
            # save_steps=500,
            # eval_steps=500,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
        )

        trainer.train()
        logger.info("Model training completed successfully.")

        logger.info("Evaluating the trained model...")
        eval_results = trainer.evaluate(eval_dataset=eval_dataset)
        logger.info(f"Evaluation results: {eval_results}")

    except Exception as e:
        logger.error(f"An error occurred during model training: {e}")
        raise

# Example usage (remains the same as previous correct version)
if __name__ == "__main__":
    data = {
        'text': ["This is a positive sentence.", "This is a negative sentence.", "Another positive.", "Another negative.", "Positive example five.", "Negative example six.",
                 "I love this product!", "This is terrible.", "Fantastic movie!", "Worst experience ever.", "So happy with the results.", "Disappointing outcome."],
        'label': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
    }
    long_data = {
        'text': data['text'] * 84,
        'label': data['label'] * 84
    }
    df = pd.DataFrame(long_data)
    print(f"DataFrame length: {len(df)}")

    X = df["text"]
    y = df["label"]

    tokenizer = get_tokenizer()
    tokenized_dataset = prepare_dataset(X, y, tokenizer)

    lora_model = get_lora_model()

    train_model(lora_model, tokenized_dataset)

DataFrame length: 1008


INFO:__main__:Tokenizer loaded successfully for model distilbert-base-uncased.
Map: 100%|██████████| 1008/1008 [00:00<00:00, 2969.51 examples/s]
INFO:__main__:Dataset prepared and tokenized successfully.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:__main__:LoRA model configured successfully for model distilbert-base-uncased.
  0%|          | 0/189 [16:45<?, ?it/s]
  7%|▋         | 10/153 [06:03<1:31:25, 38.36s/it]

{'loss': 0.6913, 'grad_norm': 1.7869631052017212, 'learning_rate': 1.869281045751634e-05, 'epoch': 0.2}


 13%|█▎        | 20/153 [12:05<1:37:35, 44.03s/it]

{'loss': 0.6824, 'grad_norm': 1.0087590217590332, 'learning_rate': 1.738562091503268e-05, 'epoch': 0.39}


 20%|█▉        | 30/153 [18:00<1:07:18, 32.83s/it]

{'loss': 0.6849, 'grad_norm': 0.8809709548950195, 'learning_rate': 1.607843137254902e-05, 'epoch': 0.59}


 26%|██▌       | 40/153 [23:28<1:02:11, 33.02s/it]

{'loss': 0.6843, 'grad_norm': 1.2216254472732544, 'learning_rate': 1.4771241830065362e-05, 'epoch': 0.78}


 33%|███▎      | 50/153 [29:38<1:03:34, 37.04s/it]

{'loss': 0.667, 'grad_norm': 0.9278779625892639, 'learning_rate': 1.3464052287581701e-05, 'epoch': 0.98}


                                                  
 33%|███▎      | 51/153 [31:52<49:15, 28.97s/it]

{'eval_loss': 0.6526228785514832, 'eval_accuracy': 0.8415841584158416, 'eval_f1': 0.835845517247001, 'eval_runtime': 123.779, 'eval_samples_per_second': 1.632, 'eval_steps_per_second': 0.105, 'epoch': 1.0}


 39%|███▉      | 60/153 [37:22<58:44, 37.90s/it]  

{'loss': 0.6604, 'grad_norm': 1.3572663068771362, 'learning_rate': 1.215686274509804e-05, 'epoch': 1.18}


 46%|████▌     | 70/153 [42:57<43:25, 31.39s/it]

{'loss': 0.656, 'grad_norm': 0.9571828246116638, 'learning_rate': 1.0849673202614379e-05, 'epoch': 1.37}


 52%|█████▏    | 80/153 [48:01<37:05, 30.49s/it]

{'loss': 0.6413, 'grad_norm': 1.3479958772659302, 'learning_rate': 9.54248366013072e-06, 'epoch': 1.57}


 59%|█████▉    | 90/153 [53:10<32:29, 30.95s/it]

{'loss': 0.6428, 'grad_norm': 1.1791025400161743, 'learning_rate': 8.23529411764706e-06, 'epoch': 1.76}


 65%|██████▌   | 100/153 [58:28<28:13, 31.95s/it]

{'loss': 0.6319, 'grad_norm': 1.5587539672851562, 'learning_rate': 6.928104575163399e-06, 'epoch': 1.96}


                                                 
 67%|██████▋   | 102/153 [1:00:53<21:02, 24.76s/it]

{'eval_loss': 0.6136587262153625, 'eval_accuracy': 0.9158415841584159, 'eval_f1': 0.9156661638355404, 'eval_runtime': 104.7999, 'eval_samples_per_second': 1.927, 'eval_steps_per_second': 0.124, 'epoch': 2.0}


 72%|███████▏  | 110/153 [1:05:26<27:11, 37.94s/it]

{'loss': 0.6286, 'grad_norm': 1.657580018043518, 'learning_rate': 5.620915032679739e-06, 'epoch': 2.16}


 78%|███████▊  | 120/153 [1:11:48<18:28, 33.59s/it]

{'loss': 0.6231, 'grad_norm': 1.4232587814331055, 'learning_rate': 4.313725490196079e-06, 'epoch': 2.35}


 85%|████████▍ | 130/153 [1:16:48<11:40, 30.48s/it]

{'loss': 0.6079, 'grad_norm': 1.3350950479507446, 'learning_rate': 3.0065359477124182e-06, 'epoch': 2.55}


 92%|█████████▏| 140/153 [1:21:49<06:30, 30.05s/it]

{'loss': 0.6053, 'grad_norm': 2.006324291229248, 'learning_rate': 1.6993464052287585e-06, 'epoch': 2.75}


 98%|█████████▊| 150/153 [1:26:55<01:30, 30.14s/it]

{'loss': 0.6087, 'grad_norm': 2.0472679138183594, 'learning_rate': 3.921568627450981e-07, 'epoch': 2.94}


                                                   
100%|██████████| 153/153 [1:29:44<00:00, 23.41s/it]

{'eval_loss': 0.5904716849327087, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 100.5648, 'eval_samples_per_second': 2.009, 'eval_steps_per_second': 0.129, 'epoch': 3.0}


100%|██████████| 153/153 [1:29:45<00:00, 35.20s/it]
INFO:__main__:Model training completed successfully.
INFO:__main__:Evaluating the trained model...


{'train_runtime': 5385.0817, 'train_samples_per_second': 0.449, 'train_steps_per_second': 0.028, 'train_loss': 0.646938322416318, 'epoch': 3.0}


100%|██████████| 13/13 [01:56<00:00,  8.98s/it]
INFO:__main__:Evaluation results: {'eval_loss': 0.5904716849327087, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 126.2084, 'eval_samples_per_second': 1.601, 'eval_steps_per_second': 0.103, 'epoch': 3.0}


In [29]:
# In your training script's train_model function, after trainer.train():
# To save LoRA adapters:
model.save_pretrained("my_lora_model_adapters") # This saves the LoRA parts
tokenizer.save_pretrained("my_lora_tokenizer") # Save the tokenizer too

('my_lora_tokenizer\\tokenizer_config.json',
 'my_lora_tokenizer\\special_tokens_map.json',
 'my_lora_tokenizer\\vocab.txt',
 'my_lora_tokenizer\\added_tokens.json',
 'my_lora_tokenizer\\tokenizer.json')

In [30]:
# predictor.py
import logging
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel # Import PeftModel for loading LoRA adapters
import pandas as pd # pandas is imported but not used in this specific script's functions directly

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_model_and_tokenizer(base_model_name, lora_adapter_path, tokenizer_path):
    """
    Load a trained base model and then apply LoRA adapters,
    along with the tokenizer.

    Args:
        base_model_name (str): Name of the original base model (e.g., "distilbert-base-uncased").
        lora_adapter_path (str): Path to the saved LoRA adapter weights.
        tokenizer_path (str): Path to the saved tokenizer.

    Returns:
        tuple: The loaded model (with LoRA applied) and tokenizer.
    """
    try:
        # 1. Load the base model
        model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=2) # Ensure num_labels matches your task
        logger.info(f"Base model '{base_model_name}' loaded successfully.")

        # 2. Load the LoRA adapters onto the base model
        model = PeftModel.from_pretrained(model, lora_adapter_path)
        logger.info(f"LoRA adapters loaded from '{lora_adapter_path}' and applied to the model.")

        # 3. Load the tokenizer
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        logger.info(f"Tokenizer loaded successfully from '{tokenizer_path}'.")

        # Set model to evaluation mode
        model.eval()
        return model, tokenizer
    except Exception as e:
        logger.error(f"An error occurred while loading the model or tokenizer: {e}")
        raise

def preprocess_data(texts, tokenizer, max_length=512):
    """
    Preprocess input texts by tokenizing them.

    Args:
        texts (list): List of texts to preprocess.
        tokenizer: The tokenizer to use.
        max_length (int): Maximum length for tokenization.

    Returns:
        dict: Tokenized inputs (PyTorch tensors).
    """
    try:
        # Ensure texts is always a list for consistent behavior
        if not isinstance(texts, list):
            texts = [texts]

        inputs = tokenizer(texts, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
        logger.info("Data preprocessing completed successfully.")
        return inputs
    except Exception as e:
        logger.error(f"An error occurred during data preprocessing: {e}")
        raise

def predict(model, inputs):
    """
    Make predictions using the loaded model.

    Args:
        model: The trained model to use for predictions.
        inputs (dict): Tokenized input data.

    Returns:
        torch.Tensor: Predicted logits.
    """
    try:
        # Ensure model is in evaluation mode (already done in load_model_and_tokenizer, but good practice)
        model.eval()
        with torch.no_grad():
            outputs = model(**inputs)
        logger.info("Predictions completed successfully.")
        return outputs.logits
    except Exception as e:
        logger.error(f"An error occurred during prediction: {e}")
        raise

def postprocess_predictions(logits):
    """
    Postprocess prediction logits to get predicted class labels.

    Args:
        logits (torch.Tensor): Prediction logits from the model.

    Returns:
        list: Predicted class labels (integers, e.g., 0 or 1).
    """
    try:
        predictions = torch.argmax(logits, dim=1).tolist()
        logger.info("Postprocessing of predictions completed successfully.")
        return predictions
    except Exception as e:
        logger.error(f"An error occurred during postprocessing: {e}")
        raise

# Example usage
if __name__ == "__main__":
    # Define paths where you saved your fine-tuned model's adapters and tokenizer
    # IMPORTANT: Replace these with the actual paths from your training script's output
    # For example, if you saved them to "./results/my_lora_model_adapters" and "./results/my_lora_tokenizer"
    BASE_MODEL_NAME = "distilbert-base-uncased"
    LORA_ADAPTERS_PATH = "./my_lora_model_adapters" # <--- IMPORTANT: This needs to be the path where you save your LoRA adapters!
    TOKENIZER_SAVE_PATH = "./my_lora_tokenizer"    # <--- IMPORTANT: This needs to be the path where you save your tokenizer!

    # Dummy save for demonstration - in real usage, these would be generated by your training script
    # This block is for testing this `predictor.py` script independently.
    # In a full workflow, your training script would save these.
    try:
        logger.info("Attempting to create dummy model and tokenizer files for testing...")
        # Create a dummy tokenizer in the specified path
        dummy_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
        dummy_tokenizer.save_pretrained(TOKENIZER_SAVE_PATH)

        # Create a dummy LoRA adapter path. In a real scenario, this would be from trainer.model.save_pretrained
        # We need to ensure a directory exists for `PeftModel.from_pretrained`
        import os
        os.makedirs(LORA_ADAPTERS_PATH, exist_ok=True)
        # Create a minimal config.json if not present, though PeftModel might not strictly require it for adapters
        # This is just to make the directory look like a saved PEFT model for a basic test
        with open(os.path.join(LORA_ADAPTERS_PATH, "adapter_config.json"), "w") as f:
            f.write('{"peft_type": "LORA", "task_type": "SEQ_CLS", "r": 8, "lora_alpha": 32, "target_modules": ["q_lin", "v_lin"]}')
        logger.info(f"Dummy tokenizer saved to {TOKENIZER_SAVE_PATH}")
        logger.info(f"Dummy LoRA adapter path created at {LORA_ADAPTERS_PATH}")
    except Exception as e:
        logger.warning(f"Could not create dummy model/tokenizer for testing: {e}. Please ensure you have trained and saved a model.")
        logger.warning("Continuing with the assumption that your paths are correctly set and models exist.")


    # Load model and tokenizer
    try:
        model, tokenizer = load_model_and_tokenizer(BASE_MODEL_NAME, LORA_ADAPTERS_PATH, TOKENIZER_SAVE_PATH)
    except Exception as e:
        logger.error("Failed to load model and tokenizer. Please ensure `BASE_MODEL_NAME`, `LORA_ADAPTERS_PATH`, and `TOKENIZER_SAVE_PATH` are correct and point to existing files/directories.")
        exit(1) # Exit if loading fails

    # Example texts to predict
    texts = ["This patient shows symptoms consistent with early-stage cancer.", "The report indicates no abnormalities, suggesting a healthy outcome."]

    # Preprocess data
    inputs = preprocess_data(texts, tokenizer)

    # Make predictions
    logits = predict(model, inputs)

    # Postprocess predictions
    predictions = postprocess_predictions(logits)

    # Map numerical predictions to human-readable labels if desired
    label_map = {0: "Not Cancer", 1: "Cancer"} # Adjust these based on your actual label mapping during training
    human_readable_predictions = [label_map[p] for p in predictions]

    # Print predictions
    print("Input Texts:", texts)
    print("Numerical Predictions:", predictions)
    print("Human-readable Predictions:", human_readable_predictions)

INFO:__main__:Attempting to create dummy model and tokenizer files for testing...
INFO:__main__:Dummy tokenizer saved to ./my_lora_tokenizer
INFO:__main__:Dummy LoRA adapter path created at ./my_lora_model_adapters
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:__main__:Base model 'distilbert-base-uncased' loaded successfully.
ERROR:__main__:An error occurred while loading the model or tokenizer: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: './my_lora_model_adapters'.
ERROR:__main__:Failed to load model and tokenizer. Please ensure `BASE_MODEL_NAME`, `LORA_ADAPTERS_PATH`, and `TOKENIZ

IndexError: index -1 is out of bounds for dimension 1 with size 0

In [31]:
# predictor.py
import logging
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel # Import PeftModel for loading LoRA adapters
import pandas as pd # pandas is imported but not used in this specific script's functions directly

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_model_and_tokenizer(base_model_name, lora_adapter_path, tokenizer_path):
    """
    Load a trained base model and then apply LoRA adapters,
    along with the tokenizer.

    Args:
        base_model_name (str): Name of the original base model (e.g., "distilbert-base-uncased").
        lora_adapter_path (str): Path to the saved LoRA adapter weights.
        tokenizer_path (str): Path to the saved tokenizer.

    Returns:
        tuple: The loaded model (with LoRA applied) and tokenizer.
    """
    try:
        # 1. Load the base model
        # Ensure num_labels matches your actual label range (0, 1 for binary classification)
        model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=2)
        logger.info(f"Base model '{base_model_name}' loaded successfully.")

        # 2. Load the LoRA adapters onto the base model
        model = PeftModel.from_pretrained(model, lora_adapter_path)
        logger.info(f"LoRA adapters loaded from '{lora_adapter_path}' and applied to the model.")

        # 3. Load the tokenizer
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        logger.info(f"Tokenizer loaded successfully from '{tokenizer_path}'.")

        # Set model to evaluation mode
        model.eval()
        return model, tokenizer
    except Exception as e:
        logger.error(f"An error occurred while loading the model or tokenizer: {e}")
        raise

def preprocess_data(texts, tokenizer, max_length=512):
    """
    Preprocess input texts by tokenizing them.
    Includes robustness checks for empty inputs.

    Args:
        texts (list): List of texts to preprocess.
        tokenizer: The tokenizer to use.
        max_length (int): Maximum length for tokenization.

    Returns:
        dict: Tokenized inputs (PyTorch tensors), or None if no valid inputs.
    """
    if not isinstance(texts, list):
        texts = [texts]

    # Filter out empty strings from the input texts
    non_empty_texts = [text for text in texts if text and text.strip()]
    if not non_empty_texts:
        logger.warning("All input texts are empty or contain only whitespace. Returning None for inputs.")
        return None # No valid texts to process

    try:
        inputs = tokenizer(non_empty_texts, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
        logger.info("Data preprocessing completed successfully.")

        # Crucial check: Ensure input_ids are not empty
        if inputs is None or "input_ids" not in inputs or inputs["input_ids"].numel() == 0:
            logger.error("Tokenization resulted in empty input_ids. This can happen with very short or empty texts after truncation/padding.")
            return None # Return None if inputs are effectively empty

        return inputs
    except Exception as e:
        logger.error(f"An error occurred during data preprocessing: {e}")
        raise

def predict(model, inputs):
    """
    Make predictions using the loaded model.

    Args:
        model: The trained model to use for predictions.
        inputs (dict): Tokenized input data.

    Returns:
        torch.Tensor: Predicted logits, or None if inputs are invalid.
    """
    if inputs is None or not inputs.get("input_ids").numel(): # Check if inputs are None or effectively empty
        logger.error("Prediction inputs are empty or invalid. Skipping prediction.")
        return None

    try:
        model.eval() # Set model to evaluation mode
        with torch.no_grad():
            outputs = model(**inputs)
        logger.info("Predictions completed successfully.")
        return outputs.logits
    except Exception as e:
        logger.error(f"An error occurred during prediction: {e}")
        raise

def postprocess_predictions(logits):
    """
    Postprocess prediction logits to get predicted class labels.

    Args:
        logits (torch.Tensor): Prediction logits from the model.

    Returns:
        list: Predicted class labels (integers, e.g., 0 or 1), or empty list if logits are None.
    """
    if logits is None:
        logger.warning("No logits provided for postprocessing. Returning empty list.")
        return []
    try:
        predictions = torch.argmax(logits, dim=1).tolist()
        logger.info("Postprocessing of predictions completed successfully.")
        return predictions
    except Exception as e:
        logger.error(f"An error occurred during postprocessing: {e}")
        raise

# Example usage
if __name__ == "__main__":
    # Define paths where you saved your fine-tuned model's adapters and tokenizer
    BASE_MODEL_NAME = "distilbert-base-uncased"
    # Placeholder paths - REPLACE WITH YOUR ACTUAL SAVED MODEL/TOKENIZER PATHS!
    LORA_ADAPTERS_PATH = "./my_lora_model_adapters"
    TOKENIZER_SAVE_PATH = "./my_lora_tokenizer"

    # --- Dummy save for demonstration (in a real scenario, your training script would do this) ---
    logger.info("Attempting to create dummy model and tokenizer files for testing...")
    try:
        # Create a dummy tokenizer in the specified path
        dummy_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
        dummy_tokenizer.save_pretrained(TOKENIZER_SAVE_PATH)

        # Create a minimal adapter_config.json to make the path loadable by PeftModel
        import os
        os.makedirs(LORA_ADAPTERS_PATH, exist_ok=True)
        adapter_config_content = {
            "peft_type": "LORA",
            "task_type": "SEQ_CLS",
            "r": 8,
            "lora_alpha": 32,
            "lora_dropout": 0.1,
            "bias": "none",
            "target_modules": ["q_lin", "v_lin"], # Must match what you used in LoraConfig
            "fan_in_feature_names": [],
            "fan_out_feature_names": [],
            "modules_to_save": None,
            "init_lora_weights": True,
            "revision": None,
            "with_prompt_embedding": False
        }
        import json
        with open(os.path.join(LORA_ADAPTERS_PATH, "adapter_config.json"), "w") as f:
            json.dump(adapter_config_content, f)

        # Also create a dummy pytorch_model.bin or adapter_model.bin to make PeftModel happy
        # A truly empty file might cause issues, so let's create a minimal valid tensor file
        dummy_state_dict = {"dummy_key": torch.empty(0)} # Needs at least one tensor
        torch.save(dummy_state_dict, os.path.join(LORA_ADAPTERS_PATH, "adapter_model.bin"))

        logger.info(f"Dummy tokenizer saved to {TOKENIZER_SAVE_PATH}")
        logger.info(f"Dummy LoRA adapter path created at {LORA_ADAPTERS_PATH}")
    except Exception as e:
        logger.warning(f"Could not create dummy model/tokenizer for testing: {e}. Please ensure you have trained and saved a model.")
        logger.warning("Continuing with the assumption that your paths are correctly set and models exist.")
    # --- End of dummy save block ---


    # Load model and tokenizer
    try:
        model, tokenizer = load_model_and_tokenizer(BASE_MODEL_NAME, LORA_ADAPTERS_PATH, TOKENIZER_SAVE_PATH)
    except Exception as e:
        logger.error(f"Failed to load model and tokenizer: {e}. Please ensure `BASE_MODEL_NAME`, `LORA_ADAPTERS_PATH`, and `TOKENIZER_SAVE_PATH` are correct and point to existing files/directories.")
        exit(1) # Exit if loading fails

    # Example texts to predict
    # Test with normal texts
    texts_normal = ["This patient shows symptoms consistent with early-stage cancer.", "The report indicates no abnormalities, suggesting a healthy outcome."]
    # Test with an empty string or string with only whitespace
    texts_with_empty = ["Valid sentence.", "", "   ", "Another valid one."]

    logger.info("\n--- Predicting with normal texts ---")
    inputs_normal = preprocess_data(texts_normal, tokenizer)
    if inputs_normal is not None:
        logits_normal = predict(model, inputs_normal)
        if logits_normal is not None:
            predictions_normal = postprocess_predictions(logits_normal)
            label_map = {0: "Not Cancer", 1: "Cancer"}
            human_readable_predictions_normal = [label_map[p] for p in predictions_normal]
            print("Input Texts (Normal):", texts_normal)
            print("Numerical Predictions (Normal):", predictions_normal)
            print("Human-readable Predictions (Normal):", human_readable_predictions_normal)
    else:
        print("Skipped prediction for normal texts due to invalid inputs.")

    logger.info("\n--- Predicting with texts containing empty/whitespace-only strings ---")
    inputs_empty = preprocess_data(texts_with_empty, tokenizer)
    if inputs_empty is not None:
        logits_empty = predict(model, inputs_empty)
        if logits_empty is not None:
            predictions_empty = postprocess_predictions(logits_empty)
            label_map = {0: "Not Cancer", 1: "Cancer"}
            human_readable_predictions_empty = [label_map[p] for p in predictions_empty]
            print("Input Texts (With Empty):", texts_with_empty)
            print("Numerical Predictions (With Empty):", predictions_empty)
            print("Human-readable Predictions (With Empty, only valid ones):", human_readable_predictions_empty)
    else:
        print("Skipped prediction for texts with empty strings due to invalid inputs.")

INFO:__main__:Attempting to create dummy model and tokenizer files for testing...
INFO:__main__:Dummy tokenizer saved to ./my_lora_tokenizer
INFO:__main__:Dummy LoRA adapter path created at ./my_lora_model_adapters
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:__main__:Base model 'distilbert-base-uncased' loaded successfully.
ERROR:__main__:An error occurred while loading the model or tokenizer: 'base_model.model.pre_classifier.weight'
ERROR:__main__:Failed to load model and tokenizer: 'base_model.model.pre_classifier.weight'. Please ensure `BASE_MODEL_NAME`, `LORA_ADAPTERS_PATH`, and `TOKENIZER_SAVE_PATH` are correct and point to existing files/directories.
INFO:__main__:
--- Pr

IndexError: index -1 is out of bounds for dimension 1 with size 0