## FUNCTIONS TO USE

In [1]:
import pandas as pd
import pyarrow.parquet as pq
from tqdm.auto import tqdm

def read_parquet_in_batches_with_progress(file_path, batch_size):
    """
    Read a Parquet file in fixed-size row batches with a progress bar and per-chunk logging.

    Args:
        file_path (str): Path to the Parquet file.
        batch_size (int): Number of rows per batch.

    Returns:
        pd.DataFrame: Combined DataFrame after processing all batches.
    """
    # Open the Parquet file
    parquet_file = pq.ParquetFile(file_path)
    
    # Total number of rows in the file
    total_rows = parquet_file.metadata.num_rows
    
    # Initialize a list to store DataFrame chunks
    all_chunks = []
    
    # Initialize the progress bar
    with tqdm(total=total_rows, desc="Processing Batches", unit="rows") as pbar:
        # Enumerate batches for logging
        for batch_number, batch in enumerate(parquet_file.iter_batches(batch_size=batch_size), start=1):
            # Convert the batch to a Pandas DataFrame
            df_batch = batch.to_pandas()
            
            # Simulate processing (add your custom logic here)
            all_chunks.append(df_batch)
            
            # Update the progress bar
            pbar.update(len(df_batch))
            
            # Print per-chunk information
            print(f"Processed Chunk {batch_number}: {len(df_batch)} rows")
    
    # Combine all chunks into a single DataFrame
    combined_df = pd.concat(all_chunks, ignore_index=True)
    
    return combined_df

In [2]:
if __name__ == "__main__":
    file_path = "Data/2.Processed/ModellingData/P5_final_new.parquet"
    batch_size = 100_000  # Define your desired chunk size
    
    df = read_parquet_in_batches_with_progress(file_path, batch_size)
    
    print(f"\nFinal DataFrame with {len(df)} rows:")
    df.head()

Processing Batches:   0%|          | 0/1057871 [00:00<?, ?rows/s]

Processed Chunk 1: 100000 rows
Processed Chunk 2: 100000 rows
Processed Chunk 3: 100000 rows
Processed Chunk 4: 100000 rows
Processed Chunk 5: 100000 rows
Processed Chunk 6: 100000 rows
Processed Chunk 7: 100000 rows
Processed Chunk 8: 100000 rows
Processed Chunk 9: 100000 rows
Processed Chunk 10: 100000 rows
Processed Chunk 11: 57871 rows

Final DataFrame with 1057871 rows:


In [3]:
import os
import pandas as pd
from tqdm import tqdm

def save_and_merge_in_batches(
    df: pd.DataFrame,
    batch_size: int,
    output_folder: str,
    final_filename: str = "final_merged.parquet",
    temp_batch_prefix: str = "temp_batch_"
):
    """
    Splits 'df' into multiple batches (size = batch_size), writes each batch to a Parquet file,
    then merges them into one final Parquet, with a progress bar showing how many batches are done.

    Steps:
    ------
    1) Creates subfolder 'temp_batches' in output_folder for batch files.
    2) For each chunk of rows:
       - Writes it to 'temp_batch_X.parquet'
       - Increments a progress bar
    3) Reads & merges all batch files into 'final_filename', then removes them.

    Returns:
    --------
    str -> path to the final merged Parquet file.
    """

    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Subfolder for temporary batch files
    temp_folder = os.path.join(output_folder, "temp_batches")
    os.makedirs(temp_folder, exist_ok=True)

    total_rows = len(df)
    batch_count = (total_rows + batch_size - 1) // batch_size
    print(f"Splitting DataFrame of {total_rows} rows into {batch_count} batches (size={batch_size}).")

    temp_files = []
    current_row = 0
    batch_index = 1

    # -- 1) SAVE IN MULTIPLE BATCHES WITH A PROGRESS BAR FOR THE BATCHES --
    with tqdm(total=batch_count, desc="Saving Batches", unit="batch") as pbar:
        while current_row < total_rows:
            end_row = min(current_row + batch_size, total_rows)
            df_batch = df.iloc[current_row:end_row]

            temp_file_name = f"{temp_batch_prefix}{batch_index}.parquet"
            temp_file_path = os.path.join(temp_folder, temp_file_name)

            # Write the chunk (one shot for each batch)
            df_batch.to_parquet(temp_file_path, index=False, compression="snappy")

            temp_files.append(temp_file_path)

            # Update progress bar
            pbar.update(1)

            # Optional: Print log
            print(f"  -> Batch {batch_index} rows [{current_row}:{end_row}] saved to {temp_file_path}")

            current_row = end_row
            batch_index += 1

    # -- 2) MERGE ALL BATCH FILES INTO A SINGLE PARQUET --
    final_file_path = os.path.join(output_folder, final_filename)
    print(f"\nMerging {len(temp_files)} batch files into {final_file_path}...")

    merged_parts = []
    # Another progress bar for reading merges (optional)
    with tqdm(total=len(temp_files), desc="Merging Batches", unit="file") as pbar_merge:
        for file_path in temp_files:
            merged_parts.append(pd.read_parquet(file_path))
            pbar_merge.update(1)

    df_merged = pd.concat(merged_parts, ignore_index=True)
    df_merged.to_parquet(final_file_path, index=False, compression="snappy")
    print(f"Final merged DataFrame saved as: {final_file_path}\n")

    # -- 3) CLEAN UP TEMPORARY FILES --
    for path in temp_files:
        os.remove(path)
    os.rmdir(temp_folder)

    print("Temporary batch files removed. All done!")
    return final_file_path

In [4]:
df.head()

Unnamed: 0,uid,title,journal,abstract,authors,affiliations,mesh_terms,keywords,coi_statement,parsed_date,...,cleaned_title_tokens_hf,cleaned_abstract_tokens_simple,cleaned_abstract_tokens_hf,disease_title_tokens_simple,disease_title_tokens_hf,disease_abstract_tokens_simple,disease_abstract_tokens_hf,disease_abstract_spacy,disease_title_spacy,disease_mesh_terms_spacy
0,10186596,The potential impact of health care reform on ...,Journal of public health management and practi...,"General: This article observes that, despite t...",Auerbach J; McGuire J,"HIV/AIDS Bureau, Massachusetts Department of P...","Financing, Government; HIV Infections; Health ...",,,1995-01-01,...,"[[CLS], potential, impact, health, care, refor...","[general, article, observes, despite, clear, p...","[[CLS], general, article, observes, despite, c...",[hiv],[hiv],"[hiv, aids]","[hiv, aids]","[human immunodeficiency virus (HIV) disease, a...",[],[HIV Infections]
1,10186588,New Jersey health promotion and disease preven...,Journal of public health management and practi...,General: Health promotion is a major component...,Louria D B,Department of Preventive Medicine and Communit...,Female; Health Education; Health Promotion; Hu...,,,1995-01-01,...,"[[CLS], new, jersey, health, promotion, diseas...","[general, health, promotion, major, component,...","[[CLS], general, health, promotion, major, com...",[],[],[],[],[],[],[]
2,10186587,Who will provide preventive services? The chan...,Journal of public health management and practi...,General: Health care reform in the United Stat...,Pearson T A; Spencer M; Jenkins P,"Mary Imogene Bassett Research Institute, Coope...",Delivery of Health Care; Female; Health Care R...,,,1995-01-01,...,"[[CLS], provide, prevent, ##ive, services, ?, ...","[general, health, care, reform, united, states...","[[CLS], general, health, care, reform, united,...",[],[],[],[],[],[],[]
3,10163501,Cytoreduction of small intestine metastases us...,Journal of gynecologic surgery,General: The Cavitron Ultrasonic Surgical Aspi...,Adelson M D,"Department of Obstetrics and Gynecology, Crous...",Adenocarcinoma; Fallopian Tube Neoplasms; Fema...,,,1995-01-01,...,"[[CLS], cy, ##tore, ##duction, small, int, ##e...","[general, cavitron, ultrasonic, surgical, aspi...","[[CLS], general, ca, ##vi, ##tron, ultra, ##so...",[],[],[tumor],[tumor],"[carcinoma of the ovary, and one each had, tub...",[],"[Adenocarcinoma, Neoplasms, Ovarian Neoplasms]"
4,10157383,Racial differences in access to kidney transpl...,Health care financing review,General: Previous work has documented large di...,Eggers P W,"Office of Research and Demonstrations, Health ...",Adolescent; Adult; Black or African American; ...,Empirical Approach; End Stage Renal Disease Pr...,,1995-01-01,...,"[[CLS], racial, differences, access, kidney, t...","[general, previous, work, documented, large, d...","[[CLS], general, previous, work, documented, l...",[],[],[],[],"[renal failure, renal failure, end stage renal...",[],[American Kidney Failure]


In [5]:
df.columns

Index(['uid', 'title', 'journal', 'abstract', 'authors', 'affiliations',
       'mesh_terms', 'keywords', 'coi_statement', 'parsed_date',
       'cleaned_title_tokens_simple', 'cleaned_title_tokens_hf',
       'cleaned_abstract_tokens_simple', 'cleaned_abstract_tokens_hf',
       'disease_title_tokens_simple', 'disease_title_tokens_hf',
       'disease_abstract_tokens_simple', 'disease_abstract_tokens_hf',
       'disease_abstract_spacy', 'disease_title_spacy',
       'disease_mesh_terms_spacy'],
      dtype='object')

In [5]:
df = df[["uid","abstract"]].copy()

In [6]:
df.head()

Unnamed: 0,uid,abstract
0,10186596,"General: This article observes that, despite t..."
1,10186588,General: Health promotion is a major component...
2,10186587,General: Health care reform in the United Stat...
3,10163501,General: The Cavitron Ultrasonic Surgical Aspi...
4,10157383,General: Previous work has documented large di...


In [7]:
if __name__ == "__main__":
    file_path = "Data/2.Processed/ModellingData/P2_abstract.parquet"
    batch_size = 100_000  # Define your desired chunk size
    
    df = read_parquet_in_batches_with_progress(file_path, batch_size)
    
    print(f"\nFinal DataFrame with {len(df)} rows:")
    df.head()

Processing Batches:   9%|▉         | 100000/1057871 [00:21<03:29, 4578.60rows/s]

Processed Chunk 1: 100000 rows


Processing Batches:  19%|█▉        | 200000/1057871 [00:22<01:19, 10838.60rows/s]

Processed Chunk 2: 100000 rows


Processing Batches:  28%|██▊       | 300000/1057871 [00:22<00:39, 19344.51rows/s]

Processed Chunk 3: 100000 rows


Processing Batches:  38%|███▊      | 400000/1057871 [00:22<00:21, 30581.19rows/s]

Processed Chunk 4: 100000 rows


Processing Batches:  47%|████▋     | 500000/1057871 [00:23<00:12, 44734.72rows/s]

Processed Chunk 5: 100000 rows


Processing Batches:  57%|█████▋    | 600000/1057871 [00:23<00:07, 62032.36rows/s]

Processed Chunk 6: 100000 rows


Processing Batches:  66%|██████▌   | 700000/1057871 [00:24<00:04, 81757.41rows/s]

Processed Chunk 7: 100000 rows


Processing Batches:  76%|███████▌  | 800000/1057871 [00:24<00:02, 100706.17rows/s]

Processed Chunk 8: 100000 rows


Processing Batches:  85%|████████▌ | 900000/1057871 [00:25<00:01, 121595.46rows/s]

Processed Chunk 9: 100000 rows


Processing Batches:  95%|█████████▍| 1000000/1057871 [00:25<00:00, 140618.83rows/s]

Processed Chunk 10: 100000 rows


Processing Batches: 100%|██████████| 1057871/1057871 [00:26<00:00, 40521.84rows/s] 

Processed Chunk 11: 57871 rows

Final DataFrame with 1057871 rows:





In [8]:
df.head()

Unnamed: 0,uid,abstract
0,10186596,"General: This article observes that, despite t..."
1,10186588,General: Health promotion is a major component...
2,10186587,General: Health care reform in the United Stat...
3,10163501,General: The Cavitron Ultrasonic Surgical Aspi...
4,10157383,General: Previous work has documented large di...


## TESTING MODELLING APPROACH

### Processing in batches 

In [8]:
###############################################################################
# SCRIPT 1: split_into_parquet.py
###############################################################################
import os
import pandas as pd
from tqdm import tqdm

def split_df_into_parquet_batches(
    df: pd.DataFrame,
    batch_size: int,
    output_folder: str,
    file_prefix: str = "chunk_"
):
    """
    Splits a DataFrame into multiple Parquet files (batch_size rows each),
    storing them in 'output_folder'. Each file is named like 'chunk_1.parquet',
    'chunk_2.parquet', etc.

    A progress bar shows how many batches are being saved.

    This script does NOT merge them back into a single file. The idea is to keep
    each chunk separate so you can process them individually later.
    """
    os.makedirs(output_folder, exist_ok=True)

    total_rows = len(df)
    batch_count = (total_rows + batch_size - 1) // batch_size
    print(f"Splitting DF with {total_rows} rows into {batch_count} batches of size {batch_size}.")

    current_row = 0
    batch_idx = 1

    with tqdm(total=batch_count, desc="Saving Batches", unit="batch") as pbar:
        while current_row < total_rows:
            end_row = min(current_row + batch_size, total_rows)
            df_batch = df.iloc[current_row:end_row]

            chunk_filename = f"{file_prefix}{batch_idx}.parquet"
            chunk_path = os.path.join(output_folder, chunk_filename)

            df_batch.to_parquet(chunk_path, index=False)
            
            pbar.update(1)
            print(f"  -> Saved batch {batch_idx} rows [{current_row}:{end_row}] to {chunk_path}")

            current_row = end_row
            batch_idx += 1

    print("\nAll done. Each batch is in its own .parquet file in:", output_folder)


if __name__ == "__main__":

    # Decide where to store chunked Parquet files
    out_folder = "Data/2.Processed/SentimentAnalysis"
    prefix = "chunk_"
    b_size = 10_000

    split_df_into_parquet_batches(
        df=df,
        batch_size=b_size,
        output_folder=out_folder,
        file_prefix=prefix
    )

    print("Done splitting!")

Splitting DF with 1057871 rows into 106 batches of size 10000.


Saving Batches:   1%|          | 1/106 [00:00<00:24,  4.20batch/s]

  -> Saved batch 1 rows [0:10000] to Data/2.Processed/SentimentAnalysis\chunk_1.parquet


Saving Batches:   2%|▏         | 2/106 [00:00<00:25,  4.12batch/s]

  -> Saved batch 2 rows [10000:20000] to Data/2.Processed/SentimentAnalysis\chunk_2.parquet


Saving Batches:   3%|▎         | 3/106 [00:00<00:25,  4.04batch/s]

  -> Saved batch 3 rows [20000:30000] to Data/2.Processed/SentimentAnalysis\chunk_3.parquet


Saving Batches:   4%|▍         | 4/106 [00:00<00:24,  4.19batch/s]

  -> Saved batch 4 rows [30000:40000] to Data/2.Processed/SentimentAnalysis\chunk_4.parquet


Saving Batches:   5%|▍         | 5/106 [00:01<00:24,  4.17batch/s]

  -> Saved batch 5 rows [40000:50000] to Data/2.Processed/SentimentAnalysis\chunk_5.parquet


Saving Batches:   6%|▌         | 6/106 [00:01<00:24,  4.02batch/s]

  -> Saved batch 6 rows [50000:60000] to Data/2.Processed/SentimentAnalysis\chunk_6.parquet


Saving Batches:   7%|▋         | 7/106 [00:01<00:23,  4.13batch/s]

  -> Saved batch 7 rows [60000:70000] to Data/2.Processed/SentimentAnalysis\chunk_7.parquet


Saving Batches:   8%|▊         | 8/106 [00:01<00:23,  4.17batch/s]

  -> Saved batch 8 rows [70000:80000] to Data/2.Processed/SentimentAnalysis\chunk_8.parquet


Saving Batches:   8%|▊         | 9/106 [00:02<00:23,  4.12batch/s]

  -> Saved batch 9 rows [80000:90000] to Data/2.Processed/SentimentAnalysis\chunk_9.parquet


Saving Batches:   9%|▉         | 10/106 [00:02<00:23,  4.06batch/s]

  -> Saved batch 10 rows [90000:100000] to Data/2.Processed/SentimentAnalysis\chunk_10.parquet


Saving Batches:  10%|█         | 11/106 [00:02<00:23,  4.12batch/s]

  -> Saved batch 11 rows [100000:110000] to Data/2.Processed/SentimentAnalysis\chunk_11.parquet


Saving Batches:  11%|█▏        | 12/106 [00:02<00:22,  4.17batch/s]

  -> Saved batch 12 rows [110000:120000] to Data/2.Processed/SentimentAnalysis\chunk_12.parquet


Saving Batches:  12%|█▏        | 13/106 [00:03<00:22,  4.12batch/s]

  -> Saved batch 13 rows [120000:130000] to Data/2.Processed/SentimentAnalysis\chunk_13.parquet


Saving Batches:  13%|█▎        | 14/106 [00:03<00:22,  4.17batch/s]

  -> Saved batch 14 rows [130000:140000] to Data/2.Processed/SentimentAnalysis\chunk_14.parquet


Saving Batches:  14%|█▍        | 15/106 [00:03<00:21,  4.17batch/s]

  -> Saved batch 15 rows [140000:150000] to Data/2.Processed/SentimentAnalysis\chunk_15.parquet


Saving Batches:  15%|█▌        | 16/106 [00:03<00:21,  4.12batch/s]

  -> Saved batch 16 rows [150000:160000] to Data/2.Processed/SentimentAnalysis\chunk_16.parquet


Saving Batches:  16%|█▌        | 17/106 [00:04<00:22,  3.98batch/s]

  -> Saved batch 17 rows [160000:170000] to Data/2.Processed/SentimentAnalysis\chunk_17.parquet


Saving Batches:  17%|█▋        | 18/106 [00:04<00:21,  4.06batch/s]

  -> Saved batch 18 rows [170000:180000] to Data/2.Processed/SentimentAnalysis\chunk_18.parquet


Saving Batches:  18%|█▊        | 19/106 [00:04<00:21,  4.01batch/s]

  -> Saved batch 19 rows [180000:190000] to Data/2.Processed/SentimentAnalysis\chunk_19.parquet


Saving Batches:  19%|█▉        | 20/106 [00:04<00:21,  4.08batch/s]

  -> Saved batch 20 rows [190000:200000] to Data/2.Processed/SentimentAnalysis\chunk_20.parquet


Saving Batches:  20%|█▉        | 21/106 [00:05<00:21,  3.93batch/s]

  -> Saved batch 21 rows [200000:210000] to Data/2.Processed/SentimentAnalysis\chunk_21.parquet


Saving Batches:  21%|██        | 22/106 [00:05<00:21,  3.90batch/s]

  -> Saved batch 22 rows [210000:220000] to Data/2.Processed/SentimentAnalysis\chunk_22.parquet


Saving Batches:  22%|██▏       | 23/106 [00:05<00:21,  3.90batch/s]

  -> Saved batch 23 rows [220000:230000] to Data/2.Processed/SentimentAnalysis\chunk_23.parquet


Saving Batches:  23%|██▎       | 24/106 [00:05<00:20,  4.01batch/s]

  -> Saved batch 24 rows [230000:240000] to Data/2.Processed/SentimentAnalysis\chunk_24.parquet


Saving Batches:  24%|██▎       | 25/106 [00:06<00:19,  4.08batch/s]

  -> Saved batch 25 rows [240000:250000] to Data/2.Processed/SentimentAnalysis\chunk_25.parquet


Saving Batches:  25%|██▍       | 26/106 [00:06<00:19,  4.15batch/s]

  -> Saved batch 26 rows [250000:260000] to Data/2.Processed/SentimentAnalysis\chunk_26.parquet


Saving Batches:  25%|██▌       | 27/106 [00:06<00:18,  4.18batch/s]

  -> Saved batch 27 rows [260000:270000] to Data/2.Processed/SentimentAnalysis\chunk_27.parquet


Saving Batches:  26%|██▋       | 28/106 [00:06<00:18,  4.19batch/s]

  -> Saved batch 28 rows [270000:280000] to Data/2.Processed/SentimentAnalysis\chunk_28.parquet


Saving Batches:  27%|██▋       | 29/106 [00:07<00:18,  4.11batch/s]

  -> Saved batch 29 rows [280000:290000] to Data/2.Processed/SentimentAnalysis\chunk_29.parquet


Saving Batches:  28%|██▊       | 30/106 [00:07<00:19,  3.97batch/s]

  -> Saved batch 30 rows [290000:300000] to Data/2.Processed/SentimentAnalysis\chunk_30.parquet


Saving Batches:  29%|██▉       | 31/106 [00:07<00:18,  4.03batch/s]

  -> Saved batch 31 rows [300000:310000] to Data/2.Processed/SentimentAnalysis\chunk_31.parquet


Saving Batches:  30%|███       | 32/106 [00:07<00:18,  4.09batch/s]

  -> Saved batch 32 rows [310000:320000] to Data/2.Processed/SentimentAnalysis\chunk_32.parquet


Saving Batches:  31%|███       | 33/106 [00:08<00:17,  4.15batch/s]

  -> Saved batch 33 rows [320000:330000] to Data/2.Processed/SentimentAnalysis\chunk_33.parquet


Saving Batches:  32%|███▏      | 34/106 [00:08<00:17,  4.14batch/s]

  -> Saved batch 34 rows [330000:340000] to Data/2.Processed/SentimentAnalysis\chunk_34.parquet


Saving Batches:  33%|███▎      | 35/106 [00:08<00:17,  4.10batch/s]

  -> Saved batch 35 rows [340000:350000] to Data/2.Processed/SentimentAnalysis\chunk_35.parquet


Saving Batches:  34%|███▍      | 36/106 [00:08<00:17,  4.01batch/s]

  -> Saved batch 36 rows [350000:360000] to Data/2.Processed/SentimentAnalysis\chunk_36.parquet


Saving Batches:  35%|███▍      | 37/106 [00:09<00:17,  3.93batch/s]

  -> Saved batch 37 rows [360000:370000] to Data/2.Processed/SentimentAnalysis\chunk_37.parquet


Saving Batches:  36%|███▌      | 38/106 [00:09<00:17,  3.99batch/s]

  -> Saved batch 38 rows [370000:380000] to Data/2.Processed/SentimentAnalysis\chunk_38.parquet


Saving Batches:  37%|███▋      | 39/106 [00:09<00:16,  4.04batch/s]

  -> Saved batch 39 rows [380000:390000] to Data/2.Processed/SentimentAnalysis\chunk_39.parquet


Saving Batches:  38%|███▊      | 40/106 [00:09<00:16,  4.09batch/s]

  -> Saved batch 40 rows [390000:400000] to Data/2.Processed/SentimentAnalysis\chunk_40.parquet


Saving Batches:  39%|███▊      | 41/106 [00:10<00:15,  4.11batch/s]

  -> Saved batch 41 rows [400000:410000] to Data/2.Processed/SentimentAnalysis\chunk_41.parquet


Saving Batches:  40%|███▉      | 42/106 [00:10<00:15,  4.12batch/s]

  -> Saved batch 42 rows [410000:420000] to Data/2.Processed/SentimentAnalysis\chunk_42.parquet


Saving Batches:  41%|████      | 43/106 [00:10<00:16,  3.92batch/s]

  -> Saved batch 43 rows [420000:430000] to Data/2.Processed/SentimentAnalysis\chunk_43.parquet


Saving Batches:  42%|████▏     | 44/106 [00:10<00:16,  3.83batch/s]

  -> Saved batch 44 rows [430000:440000] to Data/2.Processed/SentimentAnalysis\chunk_44.parquet


Saving Batches:  42%|████▏     | 45/106 [00:11<00:15,  3.92batch/s]

  -> Saved batch 45 rows [440000:450000] to Data/2.Processed/SentimentAnalysis\chunk_45.parquet


Saving Batches:  43%|████▎     | 46/106 [00:11<00:15,  3.98batch/s]

  -> Saved batch 46 rows [450000:460000] to Data/2.Processed/SentimentAnalysis\chunk_46.parquet


Saving Batches:  44%|████▍     | 47/106 [00:11<00:14,  4.00batch/s]

  -> Saved batch 47 rows [460000:470000] to Data/2.Processed/SentimentAnalysis\chunk_47.parquet


Saving Batches:  45%|████▌     | 48/106 [00:11<00:14,  4.00batch/s]

  -> Saved batch 48 rows [470000:480000] to Data/2.Processed/SentimentAnalysis\chunk_48.parquet


Saving Batches:  46%|████▌     | 49/106 [00:12<00:14,  3.87batch/s]

  -> Saved batch 49 rows [480000:490000] to Data/2.Processed/SentimentAnalysis\chunk_49.parquet


Saving Batches:  47%|████▋     | 50/106 [00:12<00:14,  3.76batch/s]

  -> Saved batch 50 rows [490000:500000] to Data/2.Processed/SentimentAnalysis\chunk_50.parquet


Saving Batches:  48%|████▊     | 51/106 [00:12<00:14,  3.81batch/s]

  -> Saved batch 51 rows [500000:510000] to Data/2.Processed/SentimentAnalysis\chunk_51.parquet


Saving Batches:  49%|████▉     | 52/106 [00:12<00:13,  3.86batch/s]

  -> Saved batch 52 rows [510000:520000] to Data/2.Processed/SentimentAnalysis\chunk_52.parquet


Saving Batches:  50%|█████     | 53/106 [00:13<00:13,  3.91batch/s]

  -> Saved batch 53 rows [520000:530000] to Data/2.Processed/SentimentAnalysis\chunk_53.parquet


Saving Batches:  51%|█████     | 54/106 [00:13<00:13,  3.86batch/s]

  -> Saved batch 54 rows [530000:540000] to Data/2.Processed/SentimentAnalysis\chunk_54.parquet


Saving Batches:  52%|█████▏    | 55/106 [00:13<00:13,  3.67batch/s]

  -> Saved batch 55 rows [540000:550000] to Data/2.Processed/SentimentAnalysis\chunk_55.parquet


Saving Batches:  53%|█████▎    | 56/106 [00:14<00:14,  3.47batch/s]

  -> Saved batch 56 rows [550000:560000] to Data/2.Processed/SentimentAnalysis\chunk_56.parquet


Saving Batches:  54%|█████▍    | 57/106 [00:14<00:13,  3.60batch/s]

  -> Saved batch 57 rows [560000:570000] to Data/2.Processed/SentimentAnalysis\chunk_57.parquet


Saving Batches:  55%|█████▍    | 58/106 [00:14<00:13,  3.68batch/s]

  -> Saved batch 58 rows [570000:580000] to Data/2.Processed/SentimentAnalysis\chunk_58.parquet


Saving Batches:  56%|█████▌    | 59/106 [00:14<00:12,  3.72batch/s]

  -> Saved batch 59 rows [580000:590000] to Data/2.Processed/SentimentAnalysis\chunk_59.parquet


Saving Batches:  57%|█████▋    | 60/106 [00:15<00:12,  3.80batch/s]

  -> Saved batch 60 rows [590000:600000] to Data/2.Processed/SentimentAnalysis\chunk_60.parquet


Saving Batches:  58%|█████▊    | 61/106 [00:15<00:12,  3.65batch/s]

  -> Saved batch 61 rows [600000:610000] to Data/2.Processed/SentimentAnalysis\chunk_61.parquet


Saving Batches:  58%|█████▊    | 62/106 [00:15<00:12,  3.50batch/s]

  -> Saved batch 62 rows [610000:620000] to Data/2.Processed/SentimentAnalysis\chunk_62.parquet


Saving Batches:  59%|█████▉    | 63/106 [00:15<00:11,  3.64batch/s]

  -> Saved batch 63 rows [620000:630000] to Data/2.Processed/SentimentAnalysis\chunk_63.parquet


Saving Batches:  60%|██████    | 64/106 [00:16<00:11,  3.67batch/s]

  -> Saved batch 64 rows [630000:640000] to Data/2.Processed/SentimentAnalysis\chunk_64.parquet


Saving Batches:  61%|██████▏   | 65/106 [00:16<00:11,  3.68batch/s]

  -> Saved batch 65 rows [640000:650000] to Data/2.Processed/SentimentAnalysis\chunk_65.parquet


Saving Batches:  62%|██████▏   | 66/106 [00:16<00:10,  3.67batch/s]

  -> Saved batch 66 rows [650000:660000] to Data/2.Processed/SentimentAnalysis\chunk_66.parquet


Saving Batches:  63%|██████▎   | 67/106 [00:17<00:11,  3.53batch/s]

  -> Saved batch 67 rows [660000:670000] to Data/2.Processed/SentimentAnalysis\chunk_67.parquet


Saving Batches:  64%|██████▍   | 68/106 [00:17<00:11,  3.36batch/s]

  -> Saved batch 68 rows [670000:680000] to Data/2.Processed/SentimentAnalysis\chunk_68.parquet


Saving Batches:  65%|██████▌   | 69/106 [00:17<00:10,  3.44batch/s]

  -> Saved batch 69 rows [680000:690000] to Data/2.Processed/SentimentAnalysis\chunk_69.parquet


Saving Batches:  66%|██████▌   | 70/106 [00:17<00:10,  3.54batch/s]

  -> Saved batch 70 rows [690000:700000] to Data/2.Processed/SentimentAnalysis\chunk_70.parquet


Saving Batches:  67%|██████▋   | 71/106 [00:18<00:09,  3.53batch/s]

  -> Saved batch 71 rows [700000:710000] to Data/2.Processed/SentimentAnalysis\chunk_71.parquet


Saving Batches:  68%|██████▊   | 72/106 [00:18<00:09,  3.56batch/s]

  -> Saved batch 72 rows [710000:720000] to Data/2.Processed/SentimentAnalysis\chunk_72.parquet


Saving Batches:  69%|██████▉   | 73/106 [00:18<00:09,  3.34batch/s]

  -> Saved batch 73 rows [720000:730000] to Data/2.Processed/SentimentAnalysis\chunk_73.parquet


Saving Batches:  70%|██████▉   | 74/106 [00:19<00:10,  3.19batch/s]

  -> Saved batch 74 rows [730000:740000] to Data/2.Processed/SentimentAnalysis\chunk_74.parquet


Saving Batches:  71%|███████   | 75/106 [00:19<00:09,  3.26batch/s]

  -> Saved batch 75 rows [740000:750000] to Data/2.Processed/SentimentAnalysis\chunk_75.parquet


Saving Batches:  72%|███████▏  | 76/106 [00:19<00:09,  3.32batch/s]

  -> Saved batch 76 rows [750000:760000] to Data/2.Processed/SentimentAnalysis\chunk_76.parquet


Saving Batches:  73%|███████▎  | 77/106 [00:20<00:08,  3.38batch/s]

  -> Saved batch 77 rows [760000:770000] to Data/2.Processed/SentimentAnalysis\chunk_77.parquet


Saving Batches:  74%|███████▎  | 78/106 [00:20<00:08,  3.46batch/s]

  -> Saved batch 78 rows [770000:780000] to Data/2.Processed/SentimentAnalysis\chunk_78.parquet


Saving Batches:  75%|███████▍  | 79/106 [00:20<00:08,  3.28batch/s]

  -> Saved batch 79 rows [780000:790000] to Data/2.Processed/SentimentAnalysis\chunk_79.parquet


Saving Batches:  75%|███████▌  | 80/106 [00:20<00:07,  3.30batch/s]

  -> Saved batch 80 rows [790000:800000] to Data/2.Processed/SentimentAnalysis\chunk_80.parquet


Saving Batches:  76%|███████▋  | 81/106 [00:21<00:07,  3.35batch/s]

  -> Saved batch 81 rows [800000:810000] to Data/2.Processed/SentimentAnalysis\chunk_81.parquet


Saving Batches:  77%|███████▋  | 82/106 [00:21<00:07,  3.35batch/s]

  -> Saved batch 82 rows [810000:820000] to Data/2.Processed/SentimentAnalysis\chunk_82.parquet


Saving Batches:  78%|███████▊  | 83/106 [00:21<00:06,  3.40batch/s]

  -> Saved batch 83 rows [820000:830000] to Data/2.Processed/SentimentAnalysis\chunk_83.parquet


Saving Batches:  79%|███████▉  | 84/106 [00:22<00:06,  3.47batch/s]

  -> Saved batch 84 rows [830000:840000] to Data/2.Processed/SentimentAnalysis\chunk_84.parquet


Saving Batches:  80%|████████  | 85/106 [00:22<00:06,  3.41batch/s]

  -> Saved batch 85 rows [840000:850000] to Data/2.Processed/SentimentAnalysis\chunk_85.parquet


Saving Batches:  81%|████████  | 86/106 [00:22<00:05,  3.35batch/s]

  -> Saved batch 86 rows [850000:860000] to Data/2.Processed/SentimentAnalysis\chunk_86.parquet


Saving Batches:  82%|████████▏ | 87/106 [00:22<00:05,  3.42batch/s]

  -> Saved batch 87 rows [860000:870000] to Data/2.Processed/SentimentAnalysis\chunk_87.parquet


Saving Batches:  83%|████████▎ | 88/106 [00:23<00:05,  3.42batch/s]

  -> Saved batch 88 rows [870000:880000] to Data/2.Processed/SentimentAnalysis\chunk_88.parquet


Saving Batches:  84%|████████▍ | 89/106 [00:23<00:04,  3.45batch/s]

  -> Saved batch 89 rows [880000:890000] to Data/2.Processed/SentimentAnalysis\chunk_89.parquet


Saving Batches:  85%|████████▍ | 90/106 [00:23<00:04,  3.49batch/s]

  -> Saved batch 90 rows [890000:900000] to Data/2.Processed/SentimentAnalysis\chunk_90.parquet


Saving Batches:  86%|████████▌ | 91/106 [00:24<00:04,  3.33batch/s]

  -> Saved batch 91 rows [900000:910000] to Data/2.Processed/SentimentAnalysis\chunk_91.parquet


Saving Batches:  87%|████████▋ | 92/106 [00:24<00:04,  3.30batch/s]

  -> Saved batch 92 rows [910000:920000] to Data/2.Processed/SentimentAnalysis\chunk_92.parquet


Saving Batches:  88%|████████▊ | 93/106 [00:24<00:03,  3.33batch/s]

  -> Saved batch 93 rows [920000:930000] to Data/2.Processed/SentimentAnalysis\chunk_93.parquet


Saving Batches:  89%|████████▊ | 94/106 [00:25<00:03,  3.37batch/s]

  -> Saved batch 94 rows [930000:940000] to Data/2.Processed/SentimentAnalysis\chunk_94.parquet


Saving Batches:  90%|████████▉ | 95/106 [00:25<00:03,  3.32batch/s]

  -> Saved batch 95 rows [940000:950000] to Data/2.Processed/SentimentAnalysis\chunk_95.parquet


Saving Batches:  91%|█████████ | 96/106 [00:25<00:02,  3.37batch/s]

  -> Saved batch 96 rows [950000:960000] to Data/2.Processed/SentimentAnalysis\chunk_96.parquet


Saving Batches:  92%|█████████▏| 97/106 [00:25<00:02,  3.24batch/s]

  -> Saved batch 97 rows [960000:970000] to Data/2.Processed/SentimentAnalysis\chunk_97.parquet


Saving Batches:  92%|█████████▏| 98/106 [00:26<00:02,  3.19batch/s]

  -> Saved batch 98 rows [970000:980000] to Data/2.Processed/SentimentAnalysis\chunk_98.parquet


Saving Batches:  93%|█████████▎| 99/106 [00:26<00:02,  3.24batch/s]

  -> Saved batch 99 rows [980000:990000] to Data/2.Processed/SentimentAnalysis\chunk_99.parquet


Saving Batches:  94%|█████████▍| 100/106 [00:26<00:01,  3.27batch/s]

  -> Saved batch 100 rows [990000:1000000] to Data/2.Processed/SentimentAnalysis\chunk_100.parquet


Saving Batches:  95%|█████████▌| 101/106 [00:27<00:01,  3.32batch/s]

  -> Saved batch 101 rows [1000000:1010000] to Data/2.Processed/SentimentAnalysis\chunk_101.parquet


Saving Batches:  96%|█████████▌| 102/106 [00:27<00:01,  3.32batch/s]

  -> Saved batch 102 rows [1010000:1020000] to Data/2.Processed/SentimentAnalysis\chunk_102.parquet


Saving Batches:  97%|█████████▋| 103/106 [00:27<00:00,  3.23batch/s]

  -> Saved batch 103 rows [1020000:1030000] to Data/2.Processed/SentimentAnalysis\chunk_103.parquet


Saving Batches:  98%|█████████▊| 104/106 [00:28<00:00,  3.21batch/s]

  -> Saved batch 104 rows [1030000:1040000] to Data/2.Processed/SentimentAnalysis\chunk_104.parquet


Saving Batches:  99%|█████████▉| 105/106 [00:28<00:00,  3.24batch/s]

  -> Saved batch 105 rows [1040000:1050000] to Data/2.Processed/SentimentAnalysis\chunk_105.parquet


Saving Batches: 100%|██████████| 106/106 [00:28<00:00,  3.69batch/s]

  -> Saved batch 106 rows [1050000:1057871] to Data/2.Processed/SentimentAnalysis\chunk_106.parquet

All done. Each batch is in its own .parquet file in: Data/2.Processed/SentimentAnalysis
Done splitting!





### FURTHER MODELLING

In [9]:
# ###############################################################################
# # SCRIPT: domain_sentiment.py
# ###############################################################################
# import os
# import glob
# import pandas as pd
# from tqdm import tqdm

# # Hugging Face Transformers
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# def domain_sentiment_on_parquet_chunks(
#     input_folder: str,
#     output_folder: str,
#     file_pattern: str = "chunk_*.parquet",
#     out_prefix: str = "sentiment_",
#     model_name: str = "nlptown/bert-base-multilingual-uncased-sentiment",
#     text_column: str = "abstract"
# ):
#     """
#     1) Finds Parquet files (e.g., chunk_1.parquet, chunk_2.parquet...) in `input_folder`.
#     2) Loads a domain-specific (or generic) sentiment model from HF Transformers.
#     3) For each chunk:
#        - Read the chunk into a DataFrame
#        - Predict sentiment on 'text_column' (abstract)
#        - Save as new file in `output_folder`, e.g. 'sentiment_chunk_1.parquet'
#     """

#     os.makedirs(output_folder, exist_ok=True)

#     # 1) Initialize the pipeline with your domain-specific model
#     print(f"Loading model: {model_name}")
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     model = AutoModelForSequenceClassification.from_pretrained(model_name)
#     sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

#     # 2) Gather chunk files
#     chunk_files = glob.glob(os.path.join(input_folder, file_pattern))
#     chunk_files.sort()  # optional, to process in numeric order

#     print(f"Found {len(chunk_files)} chunk files in '{input_folder}' with pattern '{file_pattern}'")
#     if not chunk_files:
#         print("No files found. Exiting.")
#         return

#     # 3) Process each chunk
#     with tqdm(total=len(chunk_files), desc="Sentiment Chunks", unit="file") as pbar:
#         for chunk_file in chunk_files:
#             df_chunk = pd.read_parquet(chunk_file)

#             # We'll gather texts
#             texts = df_chunk[text_column].fillna("").astype(str).tolist()

#             # Predict in a batch manner (pipeline automatically does internal batching)
#             # If memory is still an issue, we can do smaller sub-batches. For now, we do the entire chunk.
#             sentiments = sentiment_pipeline(texts)

#             # sentiments is a list of dicts like [{"label": "POSITIVE", "score": 0.998}, ...]
#             # We'll store them in new columns
#             df_chunk["sent_label"] = [s["label"] for s in sentiments]
#             df_chunk["sent_score"] = [s["score"] for s in sentiments]

#             # 4) Save chunk
#             base_name = os.path.basename(chunk_file)  # e.g. 'chunk_1.parquet'
#             out_name = out_prefix + base_name         # e.g. 'sentiment_chunk_1.parquet'
#             out_path = os.path.join(output_folder, out_name)
#             df_chunk.to_parquet(out_path, index=False)

#             print(f"  -> Labeled sentiment for {chunk_file} -> {out_path}")
#             pbar.update(1)

#     print("\nAll done! Labeled chunk files are in:", output_folder)


# if __name__ == "__main__":
#     input_dir = "Data/2.Processed/SentimentAnalysis"    # where your chunk_*.parquet are
#     output_dir = "Data/2.Processed/SentimentAnalysis/Snorkel" # where you want to save labeled files
#     model_name = "nlptown/bert-base-multilingual-uncased-sentiment" # microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract # nlptown/bert-base-multilingual-uncased-sentiment
#     # ^ Replace with your domain model, e.g. "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
#     #   if it has a sentiment head. Otherwise you may do zero-shot classification.

#     domain_sentiment_on_parquet_chunks(
#         input_folder=input_dir,
#         output_folder=output_dir,
#         file_pattern="chunk_*.parquet",
#         out_prefix="sentiment_",
#         model_name=model_name,
#         text_column="abstract"
#     )
#     print("Sentiment analysis complete on chunked data.")


In [10]:
# # Load model directly
# from transformers import AutoTokenizer, AutoModelForMaskedLM

# tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract")
# model = AutoModelForMaskedLM.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract")

In [None]:
###############################################################################
# SCRIPT: domain_sentiment_sliding.py
###############################################################################
import os
import glob
import pandas as pd
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

import torch
import math

def predict_long_text(
    text: str,
    pipe,
    max_length=512,
    stride=256,
    aggregation="average"
):
    """
    Use a sliding-window approach to handle text longer than `max_length`.
    - `pipe` is a Hugging Face pipeline for sentiment or sequence classification.
    - We chunk the text into overlapping windows of size `max_length`, with `stride`.
    - For each chunk, we get a sentiment label distribution.
    - Then we combine them (e.g. by averaging "positive" probability).

    Returns a dict with overall "label" and "score", e.g. {"label": "POSITIVE", "score": 0.87}.
    """

    # The pipeline may do its own tokenization, but we want more control here:
    # We'll manually tokenize with the same tokenizer used in the pipeline.
    tokenizer = pipe.tokenizer  # must match pipe's tokenizer

    # Tokenize the entire text, but don't do immediate truncation
    # We rely on the sliding window approach below
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        return_tensors="pt",
        truncation=False,  # We'll handle it ourselves
        return_overflowing_tokens=True,
        max_length=max_length,
        stride=stride
    )

    # The above call might produce multiple "overflowing" item sets in encoding
    # if text is bigger than max_length. But actually, the HF approach is a bit different.
    # We can do a simpler approach: manually chunk the text ourselves. Let's do so.

    # We'll do an alternative approach: break the text into sub-chunks ourselves,
    # each up to `max_length` tokens, with overlap `stride`.
    # Then run pipe on each sub-chunk of tokens -> gather results.

    # We'll define a manual chunking:
    input_ids = tokenizer(text, add_special_tokens=True)["input_ids"]
    # The length in tokens
    total_tokens = len(input_ids)
    # If it fits in max_length, just do one pass
    if total_tokens <= max_length:
        results = pipe(text)
        return results[0]  # pipeline returns a list of dicts, so we take the first

    # Otherwise, define sub-chunks
    # We'll store probabilities per chunk, then average or combine
    # pipeline returns label + score. Typically "POSITIVE"/"NEGATIVE" w/ a single score, or multiple classes if >2

    subchunk_sentiments = []

    # We'll do a loop over start positions:
    start = 0
    while start < total_tokens:
        end = start + max_length
        sub_ids = input_ids[start:end]
        sub_text = tokenizer.decode(sub_ids, skip_special_tokens=True)
        # run pipeline on that sub_text
        r = pipe(sub_text)
        # r is typically e.g. [{"label": "POSITIVE", "score": 0.98}]
        subchunk_sentiments.append(r[0])
        if end >= total_tokens:
            break
        start += max_length - stride  # move by stride

    # Now combine subchunk_sentiments. We can do many approaches:
    # 1) If it's a binary classifier (POS/NEG), we can average the "positive" score, etc.
    # 2) If it's multi-class, we might do pipe.model.config.num_labels to handle multiple classes

    # We'll assume a typical BERT sentiment with "POSITIVE"/"NEGATIVE"/"NEUTRAL" labels.
    # We'll convert each chunk's "label" into a numeric distribution if possible.
    # But the pipeline by default might just give top label. That's ambiguous.
    # For better approach, we might define pipe as "text-classification" with return_all_scores=True.

    # For demonstration, let's do the simplest approach:
    # We'll count how many sub-chunks are "POSITIVE", "NEGATIVE", "NEUTRAL", etc.
    # Then pick the majority or pick whichever is most frequent.

    label_counts = {}
    for chunk_res in subchunk_sentiments:
        lbl = chunk_res["label"]
        label_counts[lbl] = label_counts.get(lbl, 0) + 1

    # pick the label that appears most frequently
    overall_label = max(label_counts, key=label_counts.get)
    # the "score" we can set to the fraction of sub-chunks with that label
    overall_score = label_counts[overall_label] / len(subchunk_sentiments)

    return {"label": overall_label, "score": overall_score}

def domain_sentiment_on_parquet_chunks(
    input_folder: str,
    output_folder: str,
    file_pattern: str = "chunk_*.parquet",
    out_prefix: str = "sentiment_",
    model_name: str = "nlptown/bert-base-multilingual-uncased-sentiment",
    text_column: str = "abstract",
    max_length=512,
    stride=256
):
    """
    1) Finds Parquet chunk files in `input_folder`.
    2) Loads the specified model from HF Transformers.
    3) For each chunk, read => for each row's text => do sliding-window sentiment => store label + score => save.
    """

    os.makedirs(output_folder, exist_ok=True)

    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # We define the pipeline, but we'll do sub-chunk logic ourselves in predict_long_text
    # The pipeline will be used for classification of sub-chunks
    sentiment_pipeline = pipeline(
        "sentiment-analysis",
        model=model,
        tokenizer=tokenizer,
        # We'll still pass truncation=False here because we're handling sub-chunk truncation manually
        truncation=False
    )

    chunk_files = glob.glob(os.path.join(input_folder, file_pattern))
    chunk_files.sort()
    print(f"Found {len(chunk_files)} chunk files in '{input_folder}' matching '{file_pattern}'.")
    if not chunk_files:
        print("No chunk files found. Exiting.")
        return

    with tqdm(total=len(chunk_files), desc="Sentiment Chunks", unit="file") as pbar:
        for chunk_file in chunk_files:
            df_chunk = pd.read_parquet(chunk_file)
            # We'll do row-by-row if the chunk size is not huge. If chunk is big, consider sub-batching.

            all_labels = []
            all_scores = []

            # row-based loop
            for text in tqdm(df_chunk[text_column].fillna("").astype(str), desc="Rows", leave=False):
                # do sub-chunk approach
                res = predict_long_text(
                    text=text,
                    pipe=sentiment_pipeline,
                    max_length=max_length,
                    stride=stride
                )
                all_labels.append(res["label"])
                all_scores.append(res["score"])

            df_chunk["sent_label"] = all_labels
            df_chunk["sent_score"] = all_scores

            base_name = os.path.basename(chunk_file)
            out_name = out_prefix + base_name
            out_path = os.path.join(output_folder, out_name)
            df_chunk.to_parquet(out_path, index=False)

            print(f"  -> Labeled {chunk_file} -> {out_path}")
            pbar.update(1)

    print("\nAll done! Labeled chunk files are in:", output_folder)


if __name__ == "__main__":
    input_dir = "Data/2.Processed/SentimentAnalysis"    # where your chunk_*.parquet are
    output_dir = "Data/2.Processed/SentimentAnalysis/Snorkel"
    model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract" #microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract
    # If you have a domain model w/ a sentiment classification head, put it here.
    # If not, consider zero-shot approach or see if the text is short enough to not exceed 512 tokens

    domain_sentiment_on_parquet_chunks(
        input_folder=input_dir,
        output_folder=output_dir,
        file_pattern="chunk_*.parquet",
        out_prefix="sentiment_",
        model_name=model_name,
        text_column="abstract",
        max_length=496,
        stride=256
    )
    print("Sentiment analysis complete on chunked data (sliding window).")


Loading model: microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


Found 106 chunk files in 'Data/2.Processed/SentimentAnalysis' matching 'chunk_*.parquet'.


Sentiment Chunks:   0%|          | 0/106 [00:00<?, ?file/s]

In [None]:
####  Cutting df

In [17]:
###############################################################################
# SCRIPT: domain_sentiment_sliding.py
###############################################################################
import os
import glob
import pandas as pd
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

import torch
import math

def predict_long_text(
    text: str,
    pipe,
    max_length=512,
    stride=256,
    aggregation="average"
):
    """
    Use a sliding-window approach to handle text longer than `max_length`.
    - `pipe` is a Hugging Face pipeline for sentiment or sequence classification.
    - We chunk the text into overlapping windows of size `max_length`, with `stride`.
    - For each chunk, we get a sentiment label distribution.
    - Then we combine them (e.g. by averaging "positive" probability).

    Returns a dict with overall "label" and "score", e.g. {"label": "POSITIVE", "score": 0.87}.
    """

    # The pipeline may do its own tokenization, but we want more control here:
    # We'll manually tokenize with the same tokenizer used in the pipeline.
    tokenizer = pipe.tokenizer  # must match pipe's tokenizer

    # Tokenize the entire text, but don't do immediate truncation
    # We rely on the sliding window approach below
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        return_tensors="pt",
        truncation=False,  # We'll handle it ourselves
        return_overflowing_tokens=True,
        max_length=max_length,
        stride=stride
    )

    # The above call might produce multiple "overflowing" item sets in encoding
    # if text is bigger than max_length. But actually, the HF approach is a bit different.
    # We can do a simpler approach: manually chunk the text ourselves. Let's do so.

    # We'll do an alternative approach: break the text into sub-chunks ourselves,
    # each up to `max_length` tokens, with overlap `stride`.
    # Then run pipe on each sub-chunk of tokens -> gather results.

    # We'll define a manual chunking:
    input_ids = tokenizer(text, add_special_tokens=True)["input_ids"]
    # The length in tokens
    total_tokens = len(input_ids)
    # If it fits in max_length, just do one pass
    if total_tokens <= max_length:
        results = pipe(text)
        return results[0]  # pipeline returns a list of dicts, so we take the first

    # Otherwise, define sub-chunks
    # We'll store probabilities per chunk, then average or combine
    # pipeline returns label + score. Typically "POSITIVE"/"NEGATIVE" w/ a single score, or multiple classes if >2

    subchunk_sentiments = []

    # We'll do a loop over start positions:
    start = 0
    while start < total_tokens:
        end = start + max_length
        sub_ids = input_ids[start:end]
        sub_text = tokenizer.decode(sub_ids, skip_special_tokens=True)
        # run pipeline on that sub_text
        r = pipe(sub_text)
        # r is typically e.g. [{"label": "POSITIVE", "score": 0.98}]
        subchunk_sentiments.append(r[0])
        if end >= total_tokens:
            break
        start += max_length - stride  # move by stride

    # Now combine subchunk_sentiments. We can do many approaches:
    # 1) If it's a binary classifier (POS/NEG), we can average the "positive" score, etc.
    # 2) If it's multi-class, we might do pipe.model.config.num_labels to handle multiple classes

    # We'll assume a typical BERT sentiment with "POSITIVE"/"NEGATIVE"/"NEUTRAL" labels.
    # We'll convert each chunk's "label" into a numeric distribution if possible.
    # But the pipeline by default might just give top label. That's ambiguous.
    # For better approach, we might define pipe as "text-classification" with return_all_scores=True.

    # For demonstration, let's do the simplest approach:
    # We'll count how many sub-chunks are "POSITIVE", "NEGATIVE", "NEUTRAL", etc.
    # Then pick the majority or pick whichever is most frequent.

    label_counts = {}
    for chunk_res in subchunk_sentiments:
        lbl = chunk_res["label"]
        label_counts[lbl] = label_counts.get(lbl, 0) + 1

    # pick the label that appears most frequently
    overall_label = max(label_counts, key=label_counts.get)
    # the "score" we can set to the fraction of sub-chunks with that label
    overall_score = label_counts[overall_label] / len(subchunk_sentiments)

    return {"label": overall_label, "score": overall_score}

def domain_sentiment_on_parquet_chunks(
    input_folder: str,
    output_folder: str,
    file_pattern: str = "chunk_*.parquet",
    out_prefix: str = "sentiment_",
    model_name: str = "nlptown/bert-base-multilingual-uncased-sentiment",
    text_column: str = "abstract",
    max_length=512,
    stride=256
):
    """
    1) Finds Parquet chunk files in `input_folder`.
    2) Loads the specified model from HF Transformers.
    3) For each chunk, read => for each row's text => do sliding-window sentiment => store label + score => save.
    """

    os.makedirs(output_folder, exist_ok=True)

    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # We define the pipeline, but we'll do sub-chunk logic ourselves in predict_long_text
    # The pipeline will be used for classification of sub-chunks
    sentiment_pipeline = pipeline(
        "sentiment-analysis",
        model=model,
        tokenizer=tokenizer,
        # We'll still pass truncation=False here because we're handling sub-chunk truncation manually
        truncation=False
    )

    chunk_files = glob.glob(os.path.join(input_folder, file_pattern))
    chunk_files.sort()
    print(f"Found {len(chunk_files)} chunk files in '{input_folder}' matching '{file_pattern}'.")
    if not chunk_files:
        print("No chunk files found. Exiting.")
        return

    with tqdm(total=len(chunk_files), desc="Sentiment Chunks", unit="file") as pbar:
        for chunk_file in chunk_files:
            df_chunk = pd.read_parquet(chunk_file)
            # We'll do row-by-row if the chunk size is not huge. If chunk is big, consider sub-batching.

            all_labels = []
            all_scores = []

            # row-based loop
            for text in tqdm(df_chunk[text_column].fillna("").astype(str), desc="Rows", leave=False):
                # do sub-chunk approach
                res = predict_long_text(
                    text=text,
                    pipe=sentiment_pipeline,
                    max_length=max_length,
                    stride=stride
                )
                all_labels.append(res["label"])
                all_scores.append(res["score"])

            df_chunk["sent_label"] = all_labels
            df_chunk["sent_score"] = all_scores

            base_name = os.path.basename(chunk_file)
            out_name = out_prefix + base_name
            out_path = os.path.join(output_folder, out_name)
            df_chunk.to_parquet(out_path, index=False)

            print(f"  -> Labeled {chunk_file} -> {out_path}")
            pbar.update(1)

    print("\nAll done! Labeled chunk files are in:", output_folder)


if __name__ == "__main__":
    input_dir = "Data/2.Processed/SentimentAnalysis"    # where your chunk_*.parquet are
    output_dir = "Data/2.Processed/SentimentAnalysis/Snorkel"
    model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    # If you have a domain model w/ a sentiment classification head, put it here.
    # If not, consider zero-shot approach or see if the text is short enough to not exceed 512 tokens

    domain_sentiment_on_parquet_chunks(
        input_folder=input_dir,
        output_folder=output_dir,
        file_pattern="chunk_*.parquet",
        out_prefix="sentiment_",
        model_name=model_name,
        text_column="abstract",
        max_length=510,
        stride=256
    )
    print("Sentiment analysis complete on chunked data (sliding window).")


Loading model: nlptown/bert-base-multilingual-uncased-sentiment


Device set to use cpu


Found 11 chunk files in 'Data/2.Processed/SentimentAnalysis' matching 'chunk_*.parquet'.


Sentiment Chunks:   0%|          | 0/11 [00:00<?, ?file/s]Token indices sequence length is longer than the specified maximum sequence length for this model (700 > 512). Running this sequence through the model will result in indexing errors
Sentiment Chunks:   0%|          | 0/11 [00:21<?, ?file/s]


KeyboardInterrupt: 

In [5]:
pip show snorkel


Name: snorkel
Version: 0.10.0
Summary: A system for quickly generating training data with weak supervision
Home-page: https://github.com/snorkel-team/snorkel
Author: 
Author-email: 
License: Apache License 2.0
Location: c:\Users\macie\AppData\Local\Programs\Python\Python312\Lib\site-packages
Requires: munkres, networkx, numpy, pandas, protobuf, scikit-learn, scipy, tensorboard, torch, tqdm
Required-by: 
Note: you may need to restart the kernel to use updated packages.


# TESTING 1-5

In [None]:
from snorkel.labeling import labeling_function, PandasLFApplier
import pandas as pd

@labeling_function()
def lf_positive_terms(x):
    return 1 if any(term in x.text.lower() for term in ["effective", "promising", "successful"]) else -1

@labeling_function()
def lf_negative_terms(x):
    return 0 if any(term in x.text.lower() for term in ["adverse", "ineffective", "failure"]) else -1

# Combine and apply labeling functions
lfs = [lf_positive_terms, lf_negative_terms]
df = pd.DataFrame({"text": ["Effective treatment.", "Adverse effects noted.", "No clear results."]})
applier = PandasLFApplier(lfs=lfs)
label_matrix = applier.apply(df)
print(label_matrix)

go for batches beacause this model won't work with this much data also maybe create new dataset without most columns

## SENTIMENT ANALYSIS

1. **Lexicon-Based Approach**

1.1 Why a Lexicon-Based Method?
No training data required. It uses a dictionary (lexicon) of words mapped to sentiment scores (positive, negative, neutral).

Quick to implement, can provide a baseline or unsupervised vantage.

1.2 Commonly used library: 
`VADER` (suitable for short social media–style text but can be adapted) or 
`SentiWordNet` (a more general WordNet-based approach).

`VADER` 
Works decently on short, informal text (abstracts, titles).
If your text is more scientific (like PubMed titles/abstracts), you may find many neutral or domain words not recognized by VADER.

`SentiWordNet`
If your text is more formal or domain-based, you might want to explore SentiWordNet or a domain-specific lexicon. The logic is similar, but you’d look up each word’s positivity/negativity in the SentiWordNet dictionary, summing or averaging them.

`Justification`
Lexicon-based methods are quick for an unsupervised sentiment estimate.
They can fail in domain-specific contexts (e.g., biomedical text might mention “cancer” or “infection,” which are negative in a lay sense but may be neutral from a purely scientific standpoint).

This is why a **supervised** approach may be more accurate if you have labeled data.

In [4]:
###############################################################################
# 1) LEXICON-BASED (VADER) EXAMPLE
###############################################################################
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# 1) Download the VADER lexicon if not installed
nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()

def lexicon_based_vader(text):
    """
    Return sentiment scores for the given text using VADER.
    """
    # If text is None or not a string, convert to empty string
    if not isinstance(text, str):
        text = ""
    scores = sid.polarity_scores(text)
    return scores

def get_vader_label(scores):
    compound = scores["compound"]
    if compound >= 0.05:
        return "positive"
    elif compound <= -0.05:
        return "negative"
    else:
        return "neutral"

# 2) Suppose your DataFrame is 'df' with column "abstract" or "title"
#    We'll do it on "abstract"
df["vader_scores"] = df["abstract"].apply(lexicon_based_vader)
df["vader_label"] = df["vader_scores"].apply(get_vader_label)

df[["title", "abstract", "vader_scores", "vader_label"]].head(10)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\macie\AppData\Roaming\nltk_data...


: 

1.2 Alternative: SentiWordNet

If your text is more formal or domain-based, you might want to explore SentiWordNet or a domain-specific lexicon. The logic is similar, but you’d look up each word’s positivity/negativity in the SentiWordNet dictionary, summing or averaging them.

Justification
Lexicon-based methods are quick for an unsupervised sentiment estimate.
They can fail in domain-specific contexts (e.g., biomedical text might mention “cancer” or “infection,” which are negative in a lay sense but may be neutral from a purely scientific standpoint).
This is why a supervised approach may be more accurate if you have labeled data.

In [None]:
#CODE

**2. Supervised Machine Learning Approach**

**2.1 Why a Supervised Method?**

You’ll train a model on labeled examples of text → sentiment (pos/neg/neu).

This typically yields better results than lexicon-based if you have enough labeled data.

Common supervised classifiers: Logistic Regression, SVM, Naive Bayes, or even a fine-tuned BERT.

**2.2 Example with Logistic Regression**

Steps:

1. Gather labeled data: 

You need text + a sentiment label. Perhaps you label 1000+ random samples as positive/negative/neutral.
2. Feature extraction:

A simple approach uses TF-IDF or CountVectorizer on the text.

3. More advanced: 

use pretrained embeddings (e.g., BERT) as features.
Train a scikit-learn classifier (LogReg or SVM).
Predict on unseen text.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Suppose df_labeled has columns: "abstract", "label"
# label in {pos, neg, neu}, curated or partially annotated

train_df, test_df = train_test_split(df_labeled, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_df["abstract"])
y_train = train_df["label"]

X_test = vectorizer.transform(test_df["abstract"])
y_test = test_df["label"]

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

Justification:

Logistic Regression is a common baseline supervised approach for sentiment analysis.
TF-IDF is quick to implement and typically effective for textual classification if domain-specific embeddings aren’t available.
2.3 Possible Upgrades
SVM or RandomForest: Slight variations in performance.
Neural approach with BERT-based embeddings: If you have enough data/time.
If your text is domain-specific, consider a SciBERT or BioBERT for embeddings or fine-tuning.
3. Combining or Reporting
You’re “obliged to perform sentiment analysis using at least one lexicon-based approach and at least one supervised technique.” So in your final report, you can:

Implement VADER for the lexicon-based method.
Implement Logistic Regression (or SVM) for the supervised method.
Compare performance on the same test set (requires labeled data for the supervised approach).
Justify choices:
VADER is quick, well-known, but might not handle domain terms well.
Logistic Regression is a standard baseline for text classification, interpretable, typically robust.
If your text is heavily domain-specific (scientific, biomedical), mention that both methods might have limitations:

Lexicon: might incorrectly classify domain words.
Logistic Regression: requires a domain-labeled dataset.
Still, this approach meets the stated requirement: one lexicon-based, one supervised approach, plus a reasoned justification.

4. Considering Neutral vs. Non-Neutral
If your data is primarily neutral (like many scientific abstracts), the distribution of sentiment might be heavily skewed. You can either:

Keep a 3-class system (pos/neg/neu).
Merge pos/neg into non-neutral vs. neutral.
Provide an analysis of how many are likely to be neutral, if that’s the main interest.
Either approach is valid, but note that heavily neutral data can reduce your classifier’s performance if you have few positive/negative examples.

5. Potential Models Summarized
Lexicon-based:

VADER if text is general or social media–like.
SentiWordNet or other dictionary for more formal text.
Bio domain: Possibly no major out-of-the-box lexicon for sentiment, so VADER is a fallback.
Supervised:

Logistic Regression or SVM with TF-IDF → easy to implement, relatively fast.
If large labeled data + domain complexity → fine-tuned BERT (e.g., SciBERT or BioBERT). But that’s more advanced in setup.

Conclusion
Yes, you can approach your data with two methods:
Lexicon-based (like VADER) for an unsupervised baseline,
Supervised (like Logistic Regression) for better domain accuracy if you have labeled data.
Keep in mind domain-specific challenges if your text is specialized.
If your data is mostly neutral, analyzing pos/neg signals might require a large labeled set or more sophisticated domain lexicons.
This satisfies the requirement to use “at least one lexicon-based approach and at least one supervised machine learning technique”, plus justification for each choice.
With these code blocks and the rationale above, you can implement both methods, compare them, and then summarize in your final report.

Key Takeaways
Use raw text (not tokenized) for VADER, because it relies on punctuation, capitalization, etc.
Expect mostly neutral results for biomedical articles.
For your supervised approach, a standard logistic regression or SVM with TF-IDF is straightforward. Or you can do advanced domain-based or neural embeddings.
You’ll include both approaches in your final deliverable, explaining their rationale and limitations

CHATT GO GO GO

1. Data Acquisition & Initial Storage
Gather the Articles

You likely have data from PubMed or a similar source, with columns like uid, title, abstract, parsed_date, etc.
If you have over a million records, store them in a batch-oriented or chunk-based format (e.g., multiple Parquet files). This ensures you don’t exceed memory constraints.
Ensure You Have Enough Labeled Data (for the supervised part)

If sentiment labels do not exist, you must create at least a small labeled set (e.g., 1,000–2,000 randomly sampled abstracts/titles).
Alternatively, if there is no direct sentiment labeling, consider a small manual annotation or partially auto-labeled approach to bootstrap your supervised classifier.
Because biomedical text is typically neutral, you’ll need to carefully define “positive” vs. “negative” in a domain context (e.g., describing successful treatments, hopeful outcomes → positive; describing severe complications or mortality → negative).
2. Data Cleaning & Preprocessing
Check for Missing / Incomplete Rows

Titles or abstracts that are empty → handle them (they might default to “neutral” in lexicon-based, or be dropped for supervised training).
Normalize (Optional)

Typically, for lexicon-based approaches (like VADER), punctuation and case matter. So keep them.
For the supervised approach (TF-IDF or embeddings), you might do normal lowercasing/punctuation removal.
You can keep a separate column for raw text if your lexicon-based approach benefits from punctuation, capitalization, etc.
Tokenization (for supervised approach)

If you plan to do a simpler TF-IDF approach, you can use standard tokenization (like nltk.word_tokenize) plus lowercasing.
If you plan advanced embeddings (e.g., BioBERT), you feed raw text into the model’s tokenizer.
Optional Stopword Removal

For typical sentiment tasks, removing standard English stopwords can help in a supervised approach. But in the medical domain, certain “stop” words might have sentiment implications, so proceed with caution.
Chunking (Memory Management)

If your DataFrame is huge (1M+ rows), you might do a chunk-based approach for certain tasks (reading/writing). For lexicon-based and supervised inference, you can process in smaller chunks and then store results.
3. Lexicon-Based Sentiment Analysis
3.1. Why Lexicon?
Requirement: At least one lexicon-based approach.
Pros: No training data needed, quick to set up.
Cons: Domain mismatch likely → many biomedical terms not in the default sentiment dictionary.
3.2. Implementation Steps
Choose a Lexicon

VADER (if text is somewhat plain English).
SentiWordNet or another approach if you prefer more general coverage.
For domain-specific expansions: Possibly incorporate medical synonyms with negative connotations (e.g., “adverse event,” “fatal,” etc.) if you have a custom dictionary.
Apply Lexicon

For VADER, install nltk → download 'vader_lexicon' → SentimentIntensityAnalyzer().
For each row’s text (preferably the raw title or abstract, not tokenized), get the polarity scores.
Decide a threshold: e.g., compound >= 0.05 → positive, compound <= -0.05 → negative, else neutral.
Store & Summarize

Create columns vader_scores (dictionary) and vader_label (pos/neg/neu).
Expect many “neutral” labels. This is normal for scientific text.
Review

Possibly analyze a small sample to see if domain words are missed.
In your final report, note the likely mismatch and the expected neutral skew.
4. Supervised Machine Learning Approach
4.1. Why Supervised?
Typically yields better results if you have labeled data, because the classifier learns domain context.
You meet the requirement: “at least one supervised technique.”
4.2. Steps
Obtain / Create Labeled Data

You need a set of text samples with sentiment labels (pos/neg/neu).
If you have no pre-labeled set, you might do a small manual annotation or an auto-labeled heuristic.
Keep class distributions in mind: if your domain is heavily neutral, consider more examples for positive/negative to balance.
Feature Extraction

Basic: Use TfidfVectorizer on the text (title/abstract).
Intermediate: Use word embeddings (e.g., GloVe) or domain embeddings (like scispaCy vectors).
Advanced: Fine-tune a BioBERT or SciBERT model for sentiment classification. This is more complex but can handle domain terms better.
Model Choice

Logistic Regression or SVM: common baseline.
Possibly a RandomForest or XGBoost if you suspect non-linear relationships.
Or a transformer-based approach (BioBERT) if you have enough labeled data/time/GPU resources.
Training & Evaluation

Split data into train/dev/test sets (e.g., 80/10/10).
Train on train set, tune hyperparameters on dev set, evaluate final on test set.
Use metrics: accuracy, F1, or a confusion matrix to see how well each class is predicted.
Because you might have a mostly neutral dataset, track class distribution carefully.
Interpretation

If using Logistic Regression, you can examine top coefficients for each class.
If using advanced embeddings, you might do some error analysis to see if certain domain terms always lead to “negative.”
5. Final Assembly & Reporting
Implementation

Code each pipeline (lexicon-based & supervised) in a modular way.
Possibly store sentiment results in new columns: vader_label, supervised_label.
Compare or Correlate

You can see how often your lexicon-based label matches your supervised label. Probably a large portion is “neutral” from both.
The difference might be in detecting minor positivity or negativity that VADER missed.
Summarize

Provide a short table: e.g., how many articles each approach labeled as pos/neg/neu.
If you have test labels for the supervised approach, show confusion matrix & F1-scores.
Justify

Lexicon-based: easy to set up, no training data. Good for a quick baseline or if no labeling is possible.
Supervised: required labeled data but typically more accurate for domain-specific text.
Conclusions

Probably you’ll find the text is predominantly neutral. This is your real-world domain outcome.
If “positive” or “negative” emerges, it likely correlates with language describing strong beneficial outcomes or severe adverse events.
6. Additional Options for Medical Field
Domain-Specific Lexicon

Some resources provide medical synonyms for “risk,” “complication,” “fatal,” etc. If integrated into your lexicon-based approach, you might capture more domain negativity.
scispaCy

For biomedical text processing: tokenization, NER. Not specifically sentiment, but can help with domain text.
BioBERT / SciBERT

If you have a large labeled dataset for sentiment, you can fine-tune a domain BERT model. This can yield state-of-the-art results for domain text. But it’s more advanced in terms of GPU usage and setup.
Error Analysis

Because sentiment in medical text is subtle, examine misclassifications carefully. Terms like “cancer” might appear negative to a typical lexicon, but the article might be neutral if it’s just describing a procedure.
7. Proposed Pipeline (Summary)
Putting it all in a step-by-step bullet:

Data Preparation

Load your ~1M articles (title, abstract, etc.). Possibly store in chunked Parquet if memory is an issue.
Data Cleaning

Remove or handle missing abstracts. Keep punctuation/case for VADER.
Lexicon-Based Approach (VADER)

For each row’s raw text (abstract or title), get VADER scores → finalize vader_label (pos/neg/neu).
Expect mostly neutral. Store the results.
Supervised Approach

a) Obtain labeled data (pos/neg/neu). Possibly label a random subset.
b) Split train/test.
c) Feature Extraction (TF-IDF) or advanced embeddings.
d) Train a classifier (LogReg, SVM, or a BERT-based approach).
e) Evaluate on test set → get accuracy, F1, confusion matrix.
Merge & Compare

Optionally compare vader_label vs. supervised_label to see alignment. Summarize stats.
Analysis & Report

Summarize method, limitations, domain mismatch issues.
Conclude that domain text is largely neutral, but show some examples where a small subset might appear positive or negative (e.g., “successful outcome,” “mortality,” etc.).
8. Final Notes & Best Practices
Because your domain is heavily neutral, your supervised model might need a class re-balancing technique (e.g. oversampling the minority classes).
If you do find that 90%+ of your text is “neutral” from both methods, report that as a finding. The domain might truly not exhibit strong sentiment.
For large-scale data (~1 million + rows), efficiency matters:
Use chunk-based reading/writing to avoid memory overflows.
For lexicon-based scoring, you can chunk the DataFrame (e.g., 50k at a time), apply VADER, store results.
For the supervised approach, ensure you have enough labeled examples. If you only label 100 or 200 items in a domain with 1 million rows, it may not generalize well.
This pipeline addresses the entire journey:

At least one lexicon-based approach: VADER.
At least one supervised technique: Logistic Regression or SVM, with justification.
Careful about domain mismatch and the high neutrality of the medical field.
You’ll produce two sets of labels (vader_label, model_label) and a discussion of their alignment, limitations, and next steps (like domain expansions or advanced embeddings). This thoroughly satisfies your assignment.

CHAT 2

1. Why You Need Labeled Data
A supervised approach (like Logistic Regression, SVM, or BERT-based classification) requires labeled examples of your text mapped to sentiment labels (positive/negative/neutral, or some variant). If you have zero labeled data, you cannot directly train or evaluate a supervised model.

However, there are ways to generate or approximate labeled data:

Manual annotation of a small subset (ideal but might be time-consuming).
Heuristic or “weak supervision” methods that auto-label some data with certain rules.
Use an external, relevant corpus that is already labeled for sentiment.
Crowdsourcing or domain-expert partial labeling.
Below are some approaches in more detail.

2. Small Manual Annotation (A Minimal, “Gold” Sample)
If you have even a small capacity to label, you might:

Sample ~500–2,000 articles’ titles or short abstracts randomly.
Spend time (or ask a small group of domain experts) to label them as “positive,” “negative,” or “neutral.”
Positive = text describing beneficial outcomes, improvements, success, etc.
Negative = describing severe risk, complications, mortality, adverse events.
Neutral = purely descriptive or objective with no implied positivity/negativity.
This small labeled set can train a baseline classifier. You can’t handle the entire million-row dataset manually, but a small subset is often enough to get started.
Pros:

Real domain data, labeled by humans.
Even 500–1,000 labeled examples can allow a basic supervised model.
Cons:

Takes time (especially if domain experts are needed).
Still might be skewed “neutral.”
3. Heuristic / “Weak Supervision” Approaches
If no direct labeling is possible, consider:

Keyword-based heuristics:

Mark documents containing specific terms (“success,” “improved,” “effective”) as “positive.”
Mark those mentioning severe complications, “fatal,” “mortality,” “adverse event” as “negative.”
Mark everything else “neutral.”
Domain dictionary expansions:

If you can gather a small set of “negative” medical synonyms (e.g., “death,” “severe,” “complication,” “failure,” “risk,” “hazard,” “worsen,” etc.), plus “positive” synonyms (“success,” “improve,” “beneficial,” “safe,” etc.), you can label text that clearly uses these words.
Auto-labeled data from external cues:

If your text has meta-data that indicates outcomes or conclusions (like “Conclusion: The approach was successful…”), you might parse that as a positive label.
Then:

All docs that match your “positive” rules are labeled “positive,” those matching “negative” rules are labeled “negative,” and everything else is “neutral.”
This becomes a “silver standard” or “weakly supervised” dataset.
You can then train a supervised classifier on these auto-labeled examples.
Caveat:

The classifier will learn from potentially noisy labels. But it’s better than no supervision at all, and you can refine the rules if you see major errors.
4. Use an External Labeled Corpus
In some cases, you might:

Find a public medical-sentiment dataset (though these are rare).
Pre-train or fine-tune your model on that external dataset.
Optionally adapt it to your domain using a small subset of your data labeled via heuristics or manual sampling.
Example: If there’s a small medical tweet dataset with sentiment labeled, you might train or at least partially adapt that model. The domain mismatch might not be ideal, but it’s a start.

5. Crowdsourcing or Domain-Expert Partial Labeling
If you have a small group of domain experts (e.g., med students, colleagues with subject knowledge), you can do a labeling sprint:

Each person labels 100–200 examples.
Combine them into a minimal but “gold quality” dataset.
If the text is short enough, you might use a crowdsourcing platform (like Amazon Mechanical Turk). But for deeply biomedical text, crowd workers might not parse the domain-specific content properly—expert labeling is usually more accurate.

6. Summarizing a Feasible Path
Given you said: “I do not have a way to manually label,” you might choose:

Heuristic-based labeling:
Construct or gather a domain dictionary indicating “positive” vs. “negative” words.
Auto-label ~10,000 articles.
Accept that some fraction might be mislabeled (noise).
Train a logistic regression or SVM on these auto-labeled (weakly supervised) examples.
Evaluate it qualitatively or on a smaller manually-labeled sample if possible.
This method fulfills the “supervised approach” requirement, even if your labels aren’t perfect. You can highlight that in your report:

“Due to lack of manual labeling resources, we used a heuristic-based approach to label texts with certain keywords. The resulting dataset was used to train a logistic regression classifier. We acknowledge the potential noise in these labels but it demonstrates how a supervised pipeline can be employed with minimal human annotation.”
7. In Conclusion
Yes, you do need labeled data for a supervised approach.
If you absolutely can’t label, consider a weak supervision scheme or minimal manual labeling.
This will let you train a model and meet the requirement for a supervised method.
In your final project, you can highlight the limitations of auto-labeling vs. a carefully curated gold standard.
Key: The “point” is to show you can build a supervised pipeline. Even if your data is mostly neutral and you lack direct labels, you can approximate or gather a minimal labeled set. That’s typically how professional data scientists handle sentiment analysis in a domain with little or no labeled data: creative heuristics, partial annotation, or domain expansions.

Below is a detailed explanation of how to implement weak supervision (auto-labeling) for sentiment in biomedical text, plus whether scispaCy is usable for sentiment analysis in this domain. I’ll outline practical steps and tools you can use, especially if you have no labeled data, want to create a “silver standard,” or want to incorporate domain knowledge.

1. What Is Weak Supervision (Auto-Labeling)?
Weak supervision (sometimes called distant supervision or auto-labeling) is a technique to generate approximate or noisy labels for your data without manually labeling each example. You create a set of rules or heuristics that label each text as positive, negative, or neutral, effectively bootstrapping a training set for your supervised classifier. It’s not perfect, but it can be enough to train an initial model if you have no direct labeled data.

1.1. Simple Heuristic Example
Positive keywords: “successful,” “improved,” “beneficial,” “effective,” “safe,” “significant improvement,” etc.
Negative keywords: “fatal,” “adverse,” “complication,” “increased risk,” “worse outcome,” “failure,” “severe,” “mortality,” etc.
If the text contains ≥1 strongly negative phrase → label “negative.”
If the text contains ≥1 strongly positive phrase → label “positive.”
Otherwise → label “neutral.”
You can refine the rules, e.g., if the text matches both negative and positive keywords, maybe neutral or whichever is “stronger.” This is your “weak supervision” approach.

Implementation:

In [None]:
import re

positive_terms = [
    r"successful", r"improved", r"beneficial", r"effective", r"safe", r"significant improvement"
]
negative_terms = [
    r"fatal", r"adverse", r"complication", r"failure", r"severe", r"mortality",
    r"increased risk", r"worse outcome"
]

def weak_label_text(text):
    text_lower = text.lower()  # simple approach
    # Search for positive or negative keywords
    found_pos = any(re.search(term, text_lower) for term in positive_terms)
    found_neg = any(re.search(term, text_lower) for term in negative_terms)
    
    if found_pos and not found_neg:
        return "positive"
    elif found_neg and not found_pos:
        return "negative"
    else:
        return "neutral"

df["weak_label"] = df["abstract"].fillna("").apply(weak_label_text)


This will produce labels. They’ll be noisy because domain text might have complicated language. But it’s better than zero labeled data.

1.2. More Advanced Approaches (Snorkel, etc.)
Snorkel is a framework from Stanford for building more sophisticated label functions that can overlap or conflict, then it uses a label model to produce a more consistent final label. This is recommended if you want to scale up rule-based labeling with advanced conflict resolution.

Snorkel can combine multiple heuristics:
E.g., “If text mentions improve AND does not mention risk, label positive.”
Weighted or conflict rules.
The label model estimates the accuracy/overlap of each rule function.
But at a high level, the concept is the same: you end up with “silver standard” labeled data to train a classifier.

2. Training a Supervised Classifier from Weak Labels
Once you have these auto-labeled (weak-labeled) examples, you can:

Split them into train/dev/test or do cross-validation.
Use a standard approach like:
TfidfVectorizer + Logistic Regression / SVM, or
BERT-based approach (if you can handle the domain complexity and have enough examples).
Your model will learn from these imperfect labels. The final classifier might generalize somewhat better than your original rules alone, because the classifier can pick up additional patterns.

3. Using scispaCy for Sentiment?
scispaCy provides:

Domain tokenization
POS tagging
NER models for biomedical text (like en_ner_bc5cdr_md for diseases and chemicals).
It does NOT provide an out-of-the-box sentiment pipeline. So you can’t just do nlp(text)._.sentiment in scispaCy. Instead, you can use scispaCy to get domain-aware tokens, lemma, etc., then feed those tokens or embeddings into your own sentiment model. Some suggestions:

Domain Tokenization: Instead of standard spaCy English, load en_core_sci_sm or en_core_sci_md, which handle biomedical text better. Then your doc object might have more accurate token boundaries for domain terms.
NER from scispaCy**: You can identify diseases, chemicals, genes, etc. If you see certain negative or positive disease mentions, that might feed into your labeling rules or supervised features.
Vectors from en_core_sci_md**: This model has ~50k word vectors specialized for biomedical text. You can convert tokens to these vectors for your supervised classifier if you prefer that to TF-IDF.
But scispaCy does not do sentiment classification on its own. You still must define or train a sentiment approach.

4. Putting It All Together: Proposed Pipeline
Data

You have 1M+ biomedical articles (title, abstract).
Possibly store them in chunked Parquet for memory reasons.
Weak Labeling

a) Define keyword-based rules for “positive,” “negative,” “neutral.”
b) Label the entire dataset or a big chunk.
c) Accept it’s noisy but large scale.
Train a Classifier

a) Use scispaCy to tokenize if you want domain tokenization.
b) Extract features:
Option A: TF-IDF on the text.
Option B: scispaCy “md” or “lg” vectors as embeddings for each token. Possibly average them.
c) Fit an SVM, Logistic Regression, or a simple neural net.
d) Evaluate on a small manually-labeled set if possible, or do cross-validation on your weak-labeled data.
Analysis

See how many get labeled positive/negative. Possibly everything is mostly neutral.
If you do find partial positives or negatives, that might mean your domain text specifically mentions improvements or severe complications.
5. Additional Considerations
5.1. Domain-Specific Dictionary
You can improve your weak supervision rules by including medical synonyms:

Negative: “adverse event,” “complications,” “mortality,” “fatal,” “toxic,” …
Positive: “improved survival,” “beneficial,” “safe,” “well-tolerated,” “positive outcome,” …
5.2. Combine Lexicon + scispaCy Entities
If you see that an article mentions “severe disease progression,” you might treat that as negative. If it mentions “successful therapy,” treat as positive. scispaCy NER can help you identify DISEASE or CHEMICAL mentions, but you still need your own rules or classifier for sentiment.

5.3. If You Obtain Some Labeled Data
Even 100–200 manual labels can help you refine or check your heuristic approach:

Compare how many times your auto-labeled data matches the small gold standard.
Possibly reweight or re-engineer your rules to reduce false positives.
6. Example Code Skeleton for Weak Labeling + scispaCy (High-Level)
python
Copy code
import re
import spacy
from tqdm import tqdm

# 1) Load scispaCy model for domain tokenization (optional)
nlp = spacy.load("en_core_sci_md")

# 2) Define your negative/positive patterns
pos_patterns = [r"successful", r"improved", r"beneficial", r"safe", r"positive outcome", ...]
neg_patterns = [r"fatal", r"adverse", r"complication", r"failure", r"severe", r"mortality", ...]

def auto_label_biomedical(text):
    text_lower = text.lower()
    found_pos = any(re.search(p, text_lower) for p in pos_patterns)
    found_neg = any(re.search(p, text_lower) for p in neg_patterns)
    if found_pos and not found_neg:
        return "positive"
    elif found_neg and not found_pos:
        return "negative"
    else:
        return "neutral"

# 3) Suppose you read your data chunk by chunk
# for chunk_df in read_in_chunks(...):
#     chunk_df["weak_label"] = chunk_df["abstract"].apply(auto_label_biomedical)
#     # Save or combine
Then train a classifier:

python
Copy code
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

df_weak = df[ df["weak_label"] != "neutral" ]  # or keep all, depends on distribution
X_train, X_test, y_train, y_test = train_test_split(df_weak["abstract"], df_weak["weak_label"], ...)
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = SVC()
clf.fit(X_train_vec, y_train)

print("Accuracy on test:", clf.score(X_test_vec, y_test))
Summary
Weak supervision (auto-labeling) can be done with keyword-based or more advanced rule-based approaches.
This yields a noisy but large labeled dataset, letting you train a supervised sentiment classifier.
scispaCy doesn’t have a built-in sentiment pipeline—it’s for tokenization, domain NER, or embeddings in the biomedical domain. You can use it to create domain tokens, but you still must define or train your own sentiment model.
For best results, combine a small set of actual manual labels (if possible) with your heuristic approach to refine or evaluate the auto-labeled data.
Thus, you can handle no-labeled-data scenarios by:

Writing heuristics or dictionary rules → produce “weak labels.”
Using scispaCy for domain tokenization or NER if desired.
Training a supervised method on your auto-labeled “silver standard.”
That is how you achieve a practical sentiment classification pipeline in biomedical text without having to manually label large volumes from scratch.

In [None]:
df

In [None]:
###############################################################################
# SCRIPT 1: split_into_parquet.py
###############################################################################
import os
import pandas as pd
from tqdm import tqdm

def split_df_into_parquet_batches(
    df: pd.DataFrame,
    batch_size: int,
    output_folder: str,
    file_prefix: str = "chunk_"
):
    """
    Splits a DataFrame into multiple Parquet files (batch_size rows each),
    storing them in 'output_folder'. Each file is named like 'chunk_1.parquet',
    'chunk_2.parquet', etc.

    A progress bar shows how many batches are being saved.

    This script does NOT merge them back into a single file. The idea is to keep
    each chunk separate so you can process them individually later.
    """
    os.makedirs(output_folder, exist_ok=True)

    total_rows = len(df)
    batch_count = (total_rows + batch_size - 1) // batch_size
    print(f"Splitting DF with {total_rows} rows into {batch_count} batches of size {batch_size}.")

    current_row = 0
    batch_idx = 1

    with tqdm(total=batch_count, desc="Saving Batches", unit="batch") as pbar:
        while current_row < total_rows:
            end_row = min(current_row + batch_size, total_rows)
            df_batch = df.iloc[current_row:end_row]

            chunk_filename = f"{file_prefix}{batch_idx}.parquet"
            chunk_path = os.path.join(output_folder, chunk_filename)

            df_batch.to_parquet(chunk_path, index=False)
            
            pbar.update(1)
            print(f"  -> Saved batch {batch_idx} rows [{current_row}:{end_row}] to {chunk_path}")

            current_row = end_row
            batch_idx += 1

    print("\nAll done. Each batch is in its own .parquet file in:", output_folder)


if __name__ == "__main__":
    # EXAMPLE USAGE:
    # Suppose you have a big DataFrame 'df_final' already in memory
    # or you read it from somewhere.

    # For demonstration, let's just create a small DataFrame:
    import numpy as np

    df_final = pd.DataFrame({
        "colA": np.random.randint(0, 1000, 350_000),
        "colB": [f"Row {i}" for i in range(350_000)]
    })

    # Decide where to store chunked Parquet files
    out_folder = "Data/2.Processed/ParquetChunks"
    prefix = "chunk_"
    b_size = 100_000

    split_df_into_parquet_batches(
        df=df_final,
        batch_size=b_size,
        output_folder=out_folder,
        file_prefix=prefix
    )

    print("Done splitting!")


In [None]:
SNORKEL