In [31]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/pubmedabstracts/PubMedAbstracts_final.parquet
/kaggle/input/pubmedabstracts/P5_final_new.parquet
/kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_6.parquet
/kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_4.parquet
/kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_7.parquet
/kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_2.parquet
/kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_1.parquet
/kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_5.parquet
/kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_8.parquet
/kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_3.parquet
/kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_10.parquet
/kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_11.parquet
/kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_9.parquet


In [32]:
# Creation of folder labeled_chunks for labeled data
import os

output_dir = "/kaggle/working/labeled_chunks"

# Create the directory if it does not exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Directory created: {output_dir}")
else:
    print(f"Directory already exists: {output_dir}")

Directory already exists: /kaggle/working/labeled_chunks


In [33]:
# 1. Import Libraries
import os
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

In [34]:
# Input and output directories
input_dir = "/kaggle/input/pubmedabstracts/Chunks"  # Folder containing input chunks
output_dir = "/kaggle/working/labeled_chunks"  # Folder for labeled output

# Create the output directory if it does not exist
os.makedirs(output_dir, exist_ok=True)
print(f"Output directory: {output_dir}")

# File pattern for identifying chunk files
file_pattern = "sent_chunks_chunk"

# Pre-trained model for sentiment analysis
biomedbert_model = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract"

Output directory: /kaggle/working/labeled_chunks


In [35]:
import pandas as pd
import os

# Path to the chunks folder
chunks_dir = "/kaggle/input/pubmedabstracts/Chunks"

# List all Parquet files
chunk_files = sorted([os.path.join(chunks_dir, f) for f in os.listdir(chunks_dir) if f.endswith(".parquet")])

# Load one chunk to preview
for chunk_file in chunk_files:
    print(f"Previewing file: {chunk_file}")
    df_chunk = pd.read_parquet(chunk_file)
    print(df_chunk.head())  # Display first few rows
    break  # Only preview the first file

Previewing file: /kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_1.parquet
        uid                                           abstract
0  10186596  General: This article observes that, despite t...
1  10186588  General: Health promotion is a major component...
2  10186587  General: Health care reform in the United Stat...
3  10163501  General: The Cavitron Ultrasonic Surgical Aspi...
4  10157383  General: Previous work has documented large di...


In [36]:
# 2. Helper Function for Long Texts
def predict_long_text_biomed(
    text: str,
    pipe,
    max_length=512,
    stride=256
):
    """
    Handles long texts using a sliding window.
    """
    tokenizer = pipe.tokenizer
    input_ids = tokenizer(text, add_special_tokens=True)["input_ids"]
    total_tokens = len(input_ids)

    if total_tokens <= max_length:
        return pipe(text)[0]

    subchunk_sentiments = []
    start = 0
    while start < total_tokens:
        end = start + max_length
        sub_ids = input_ids[start:end]
        sub_text = tokenizer.decode(sub_ids, skip_special_tokens=True)

        sub_res = pipe(sub_text)
        subchunk_sentiments.append(sub_res[0])

        if end >= total_tokens:
            break
        start += max_length - stride

    label_counts = {}
    for chunk_res in subchunk_sentiments:
        lbl = chunk_res["label"]
        label_counts[lbl] = label_counts.get(lbl, 0) + 1

    overall_label = max(label_counts, key=label_counts.get)
    overall_score = label_counts[overall_label] / len(subchunk_sentiments)
    return {"label": overall_label, "score": overall_score}


In [37]:
# 3. Single Chunk Processing
def process_chunk(
    chunk_file: str,
    output_folder: str,
    biomedbert_model: str,
    max_length: int = 512,
    stride: int = 256,
    batch_size: int = 500,  # Process in batches
    text_column: str = "abstract",
    uid_column: str = "uid"
):
    """
    Processes a single chunk, predicts sentiment in batches, and saves the output.
    """
    output_file = os.path.join(output_folder, f"labeled_{os.path.basename(chunk_file)}")
    if os.path.exists(output_file):
        print(f"Chunk {chunk_file} already processed. Skipping.")
        return

    # Load chunk
    df_chunk = pd.read_parquet(chunk_file)
    print(f"Processing {len(df_chunk)} rows from {chunk_file}...")

    # Initialize pipeline
    tokenizer = AutoTokenizer.from_pretrained(biomedbert_model)
    model = AutoModelForSequenceClassification.from_pretrained(biomedbert_model)
    pipe = pipeline(
        "sentiment-analysis",
        model=model,
        tokenizer=tokenizer,
        clean_up_tokenization_spaces=True,  # warning avoidance #newly added line
        truncation=True,
        max_length=max_length,
        device=0  # Use GPU
    )

    results = []
    for start in tqdm(range(0, len(df_chunk), batch_size), desc="Batch Processing"):
        # Process batch
        sub_batch = df_chunk.iloc[start:start + batch_size]
        for _, row in sub_batch.iterrows():
            prediction = predict_long_text_biomed(
                text=row[text_column],
                pipe=pipe,
                max_length=max_length,
                stride=stride
            )
            results.append({
                "UID": row[uid_column],
                "label": prediction["label"],
                "score": prediction["score"]
            })

    # Save results
    output_df = pd.DataFrame(results)
    output_df.to_parquet(output_file, index=False)
    print(f"Saved labeled chunk to {output_file}")


In [38]:
# 4. Sequential Processing for All Chunks
def biomedbert_sequential_processing(
    input_folder: str,
    output_folder: str,
    file_pattern: str,
    biomedbert_model: str,
    max_length: int = 512,
    stride: int = 256,
    text_column: str = "abstract",
    uid_column: str = "uid",
    specific_chunk: str = None
):
    """
    Processes all chunks sequentially or a specific chunk.
    """
    os.makedirs(output_folder, exist_ok=True)

    chunk_files = [
        os.path.join(input_folder, f)
        for f in os.listdir(input_folder)
        if file_pattern in f
    ]
    chunk_files.sort()

    if specific_chunk:
        process_chunk(
            chunk_file=specific_chunk,
            output_folder=output_folder,
            biomedbert_model=biomedbert_model,
            max_length=max_length,
            stride=stride,
            text_column=text_column,
            uid_column=uid_column
        )
    else:
        for chunk_file in tqdm(chunk_files, desc="Processing Chunks"):
            process_chunk(
                chunk_file=chunk_file,
                output_folder=output_folder,
                biomedbert_model=biomedbert_model,
                max_length=max_length,
                stride=stride,
                text_column=text_column,
                uid_column=uid_column
            )


In [39]:
if __name__ == "__main__":
    input_dir = "/kaggle/input/pubmedabstracts/Chunks"
    output_dir = "/kaggle/working/labeled_chunks"
    file_pattern = "sent_chunks_chunk"
    biomedbert_model = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract"

    # Specify the chunk you want to process
    specific_chunk = None #"/kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_7.parquet" #None #"/kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_1.parquet"

    # Call the function
    biomedbert_sequential_processing(
        input_folder=input_dir,
        output_folder=output_dir,
        file_pattern=file_pattern,
        biomedbert_model=biomedbert_model,
        max_length=512,
        stride=256,
        text_column="abstract",
        uid_column="uid",
        specific_chunk=specific_chunk
    )


Processing Chunks:   0%|          | 0/11 [00:00<?, ?it/s]

Chunk /kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_1.parquet already processed. Skipping.
Chunk /kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_10.parquet already processed. Skipping.
Chunk /kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_11.parquet already processed. Skipping.
Chunk /kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_2.parquet already processed. Skipping.
Chunk /kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_3.parquet already processed. Skipping.
Chunk /kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_4.parquet already processed. Skipping.
Chunk /kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_5.parquet already processed. Skipping.
Chunk /kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_6.parquet already processed. Skipping.
Chunk /kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_7.parquet already processed. Skipping.
Processing 100000 rows from /kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_8.parquet...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Batch Processing:   0%|          | 0/200 [00:00<?, ?it/s][A
Batch Processing:   0%|          | 1/200 [00:10<35:10, 10.61s/it][A
Batch Processing:   1%|          | 2/200 [00:22<37:05, 11.24s/it][A
Batch Processing:   2%|▏         | 3/200 [00:33<36:37, 11.15s/it][A
Batch Processing:   2%|▏         | 4/200 [00:43<35:47, 10.95s/it][A
Batch Processing:   2%|▎         | 5/200 [00:53<34:20, 10.57s/it][A
Batch Processing:   3%|▎         | 6/200 [01:04<34:30, 10.67s/it][A
Batch Processing:   4%|▎         | 7/200 [01:15<34:28, 10.72s/it][A
Batch Processing:   4%|▍         | 8/200 [01:26<34:22, 10.74s/it][A
Batch Processing:   4%|▍         | 9/200 [01:38<35:50, 11.

Saved labeled chunk to /kaggle/working/labeled_chunks/labeled_sent_chunks_chunk_8.parquet
Chunk /kaggle/input/pubmedabstracts/Chunks/sent_chunks_chunk_9.parquet already processed. Skipping.





In [1]:
import shutil

# Zip the labeled_chunks folder
shutil.make_archive("/kaggle/working/labeled_chunks", 'zip', "/kaggle/working/labeled_chunks")

print("Folder zipped successfully!")

Folder zipped successfully!


In [1]:
import os

# List files in the labeled_chunks folder
output_dir = "/kaggle/working/labeled_chunks"
files = os.listdir(output_dir)
print("Files in labeled_chunks:", files)

Files in labeled_chunks: ['labeled_sent_chunks_chunk_3.parquet', 'labeled_sent_chunks_chunk_1.parquet', 'labeled_sent_chunks_chunk_9.parquet', 'labeled_sent_chunks_chunk_10.parquet', 'labeled_sent_chunks_chunk_7.parquet', 'labeled_sent_chunks_chunk_4.parquet', 'labeled_sent_chunks_chunk_11.parquet', 'labeled_sent_chunks_chunk_5.parquet', 'labeled_sent_chunks_chunk_2.parquet', 'labeled_sent_chunks_chunk_6.parquet']


In [29]:
# import os

# # Path to the file you want to delete
# file_to_delete = #"/kaggle/working/labeled_chunks.zip" #"/kaggle/working/merged_labeled_chunks.parquet"#"/kaggle/working/labeled_chunks/chunk_8.parquet"

# # Check if the file exists and delete it
# if os.path.exists(file_to_delete):
#     os.remove(file_to_delete)
#     print(f"Deleted: {file_to_delete}")
# else:
#     print(f"File not found: {file_to_delete}")


Deleted: /kaggle/working/labeled_chunks.zip


In [22]:
# import pandas as pd
# import os

# # Paths to the files and folders
# p5_file = "/kaggle/input/pubmedabstracts/P5_final_new.parquet"
# labeled_dir = "/kaggle/working/labeled_chunks"
# output_file = "/kaggle/working/chunk_8.parquet"

# # Step 1: Load P5 file and add an auto-incremented index
# p5_df = pd.read_parquet(p5_file)
# p5_df = p5_df.reset_index(drop=True).reset_index()
# p5_df.rename(columns={"index": "auto_index"}, inplace=True)
# print(f"Loaded P5 Parquet file with {len(p5_df)} rows and added auto-incremented index.")

# # Step 2: Load and combine all labeled chunks
# labeled_dfs = []
# for file_name in os.listdir(labeled_dir):
#     if file_name.endswith(".parquet"):
#         file_path = os.path.join(labeled_dir, file_name)
#         print(f"Reading labeled file: {file_path}")
#         labeled_dfs.append(pd.read_parquet(file_path))

# labeled_df = pd.concat(labeled_dfs, ignore_index=True)
# print(f"Combined labeled DataFrame has {len(labeled_df)} rows.")

# # Step 3: Identify rows in P5 that are not in the labeled data
# # Ensure the labeled data has 'UID' column for matching
# if "UID" not in labeled_df.columns:
#     raise ValueError("Labeled DataFrame must have a 'UID' column for matching.")

# labeled_uids = set(labeled_df["UID"])
# missing_rows = p5_df[~p5_df["uid"].isin(labeled_uids)]
# print(f"Found {len(missing_rows)} missing rows in P5 file.")

# # Step 4: Create a new DataFrame with 'uid' and 'abstract' for the missing rows
# remaining_rows = missing_rows[["uid", "abstract"]]
# print(f"Remaining rows DataFrame created with {len(remaining_rows)} rows.")

# # Step 5: Save the remaining rows as chunk_8.parquet
# remaining_rows.to_parquet(output_file, index=False)
# print(f"Remaining rows saved as {output_file}.")

Loaded P5 Parquet file with 1057871 rows and added auto-incremented index.
Reading labeled file: /kaggle/working/labeled_chunks/labeled_sent_chunks_chunk_3.parquet
Reading labeled file: /kaggle/working/labeled_chunks/labeled_sent_chunks_chunk_1.parquet
Reading labeled file: /kaggle/working/labeled_chunks/labeled_sent_chunks_chunk_9.parquet
Reading labeled file: /kaggle/working/labeled_chunks/labeled_sent_chunks_chunk_10.parquet
Reading labeled file: /kaggle/working/labeled_chunks/labeled_sent_chunks_chunk_7.parquet
Reading labeled file: /kaggle/working/labeled_chunks/labeled_sent_chunks_chunk_4.parquet
Reading labeled file: /kaggle/working/labeled_chunks/labeled_sent_chunks_chunk_11.parquet
Reading labeled file: /kaggle/working/labeled_chunks/labeled_sent_chunks_chunk_5.parquet
Reading labeled file: /kaggle/working/labeled_chunks/labeled_sent_chunks_chunk_2.parquet
Reading labeled file: /kaggle/working/labeled_chunks/labeled_sent_chunks_chunk_6.parquet
Combined labeled DataFrame has 95

In [None]:
# missing_rows[["auto_index", "uid", "abstract"]].head()a

Unnamed: 0,auto_index,uid,abstract
700000,700000,27377686,AIMS: Type 2 diabetes has grown to epidemic pr...
700001,700001,27377678,"SCOPE: Trimethylamine-N-oxide (TMAO), a metabo..."
700002,700002,27377577,General: A pilot study was conducted to assess...
700003,700003,27377169,OBJECTIVES/HYPOTHESIS: Low-grade myofibroblast...
700004,700004,27376900,General: Little is known about the relationshi...


In [24]:
# missing_rows[["auto_index", "uid", "abstract"]].tail()

Unnamed: 0,auto_index,uid,abstract
799995,799995,31548641,General: Genome-wide association studies (GWAS...
799996,799996,31548545,General: Breast cancer is the most frequent ma...
799997,799997,31547868,General: The field of regenerative medicine pr...
799998,799998,31547842,OBJECTIVE: Histopathological studies suggest t...
799999,799999,31547819,"BACKGROUND: Medulloblastoma (MB), the most com..."


In [25]:
# # Display the length of the DataFrame containing missing rows
# missing_rows_subset = missing_rows[["auto_index", "uid", "abstract"]]
# print(f"Length of missing rows (with auto_index, uid, and abstract): {len(missing_rows_subset)}")

# # Optional: Preview the DataFrame to ensure correctness
# print("Preview of missing rows (with auto_index, uid, abstract):")
# print(missing_rows_subset.head())


Length of missing rows (with auto_index, uid, and abstract): 100000
Preview of missing rows (with auto_index, uid, abstract):
        auto_index       uid  \
700000      700000  27377686   
700001      700001  27377678   
700002      700002  27377577   
700003      700003  27377169   
700004      700004  27376900   

                                                 abstract  
700000  AIMS: Type 2 diabetes has grown to epidemic pr...  
700001  SCOPE: Trimethylamine-N-oxide (TMAO), a metabo...  
700002  General: A pilot study was conducted to assess...  
700003  OBJECTIVES/HYPOTHESIS: Low-grade myofibroblast...  
700004  General: Little is known about the relationshi...  
