In [None]:
import os
import pandas as pd
from transformers import pipeline
from datetime import datetime
import time

def _classify_text(classifier, text, categories, threshold, max_chars):
    """
    Classify a single text and return ``(label_string, combined_confidence)``.

    Non-string or empty inputs (e.g. NaN cells) are not sent to the model and
    yield ``("unknown", 0.0)``. Otherwise the text is cut to ``max_chars``
    characters, the two highest-ranked labels are kept, and a third label is
    added when the summed score of the top two is below ``threshold``.
    Exceptions raised by the classifier propagate to the caller.
    """
    truncated_text = text[:max_chars] if isinstance(text, str) else ""
    if not truncated_text:
        return "unknown", 0.0

    result = classifier(truncated_text, candidate_labels=categories)
    labels = result['labels']
    combined_confidence = sum(result['scores'][:2])

    # Slicing (rather than indexing labels[2]) keeps this safe when fewer than
    # three candidate categories were supplied.
    keep = 3 if combined_confidence < threshold else 2
    return ", ".join(labels[:keep]), round(combined_confidence, 3)


def process_dataset_in_chunks(df, categories, batch_size=25, chunk_size=10000, threshold=0.5, results_dir="results", max_chars=80):
    """
    Process a large dataset in chunks, saving results after each chunk and resuming from the last completed chunk.

    Each chunk is written to ``<results_dir>/results_<end_idx>.csv`` with two
    extra columns, 'predicted_category' and 'confidence'. A chunk whose output
    file already exists is skipped, so re-running the function resumes where a
    previous run stopped. Files are written to a temporary name and renamed
    only once complete, so an interrupted run cannot leave a truncated file
    that would later be mistaken for a finished chunk.

    Parameters:
    df (pd.DataFrame): DataFrame containing 'context' column
    categories (list): List of categories to classify into
    batch_size (int): Number of texts grouped per inner loop pass. Texts are
        still sent to the classifier one at a time, so this does not change
        the results.
    chunk_size (int): Number of rows to process per chunk
    threshold (float): If the summed score of the top 2 labels is below this
        value, the third-ranked label is reported as well
    results_dir (str): Directory to save results
    max_chars (int or None): Each text is cut to this many characters before
        classification; None disables truncation

    Returns:
    None

    Raises:
    ValueError: If batch_size or chunk_size is not positive
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    os.makedirs(results_dir, exist_ok=True)

    # Loaded lazily: when every chunk is already on disk there is no reason to
    # pay for loading the model.
    classifier = None

    total_rows = len(df)
    num_chunks = (total_rows + chunk_size - 1) // chunk_size  # ceiling division

    for chunk_idx in range(num_chunks):
        start_idx = chunk_idx * chunk_size
        end_idx = min((chunk_idx + 1) * chunk_size, total_rows)

        # skip if chunk has already been processed
        output_file = os.path.join(results_dir, f"results_{end_idx}.csv")
        if os.path.exists(output_file):
            print(f"Chunk {chunk_idx + 1}/{num_chunks} already processed. Skipping rows {start_idx} to {end_idx}.")
            continue

        if classifier is None:
            classifier = pipeline(
                "zero-shot-classification",
                model="cross-encoder/nli-distilroberta-base",
                device=-1
            )

        print(f"Processing chunk {chunk_idx + 1}/{num_chunks}, rows {start_idx} to {end_idx} at {datetime.now().strftime('%H:%M:%S')}")
        chunk_df = df.iloc[start_idx:end_idx].copy()  # work on a copy of the data

        all_predictions = []
        all_confidences = []
        error_count = 0

        # process in batches
        for i in range(0, len(chunk_df), batch_size):
            batch_texts = chunk_df['context'].iloc[i:i+batch_size]

            for text in batch_texts:
                try:
                    prediction, confidence = _classify_text(
                        classifier, text, categories, threshold, max_chars
                    )
                except Exception as e:
                    # Best effort: one bad row must not abort a long-running
                    # chunk, so record it and carry on.
                    print(f"Error processing text: {str(e)}")
                    prediction, confidence = "error", 0.0
                    error_count += 1

                all_predictions.append(prediction)
                all_confidences.append(confidence)

        # Save chunk results
        chunk_df['predicted_category'] = all_predictions
        chunk_df['confidence'] = all_confidences

        # Write to a temporary name, then rename: the final file only appears
        # once it is complete, which is what the resume check above relies on.
        temp_file = output_file + ".tmp"
        chunk_df.to_csv(temp_file, index=False)
        os.replace(temp_file, output_file)
        print(f"Chunk saved to '{output_file}' with {len(chunk_df)} rows.")
        if error_count:
            print(f"{error_count} rows in this chunk failed and were labelled 'error'.")

    print("All chunks processed and saved.")

# Candidate topic labels offered to the zero-shot classifier
categories = [
    "politics",
    "ethics",
    "epistemology",
    "logic",
    "art",
    "metaphysics",
    "science",
    "language",
]

# Read the passages to classify; the function reads their text from the 'context' column
df = pd.read_csv('references.csv')

# Run the chunked, resumable classification. Finished chunks found in the
# results directory are skipped, so this cell can safely be re-run.
process_dataset_in_chunks(
    df,
    categories,
    results_dir="results",  # one CSV per chunk is written here
    chunk_size=10000,  # rows per saved chunk
    batch_size=25,  # texts grouped per inner loop pass
    threshold=0.5,  # below this combined score a third label is reported
)

  return self.fget.__get__(instance, owner)()


Chunk 1/11 already processed. Skipping rows 0 to 10000.
Chunk 2/11 already processed. Skipping rows 10000 to 20000.
Chunk 3/11 already processed. Skipping rows 20000 to 30000.
Chunk 4/11 already processed. Skipping rows 30000 to 40000.
Chunk 5/11 already processed. Skipping rows 40000 to 50000.
Chunk 6/11 already processed. Skipping rows 50000 to 60000.
Chunk 7/11 already processed. Skipping rows 60000 to 70000.
Chunk 8/11 already processed. Skipping rows 70000 to 80000.
Processing chunk 9/11, rows 80000 to 90000 at 10:29:10
Chunk saved to 'results/results_90000.csv' with 10000 rows.
Processing chunk 10/11, rows 90000 to 100000 at 11:03:40
Chunk saved to 'results/results_100000.csv' with 10000 rows.
Processing chunk 11/11, rows 100000 to 109691 at 12:11:56
Chunk saved to 'results/results_109691.csv' with 9691 rows.
All chunks processed and saved.


In [None]:
import pandas as pd
import os

# Combine chunks
results_dir = "results"
dataframes = []


def _chunk_sort_key(file_name):
    """
    Order chunk files by the integer after the last underscore in their name.

    Files named like 'results_<end_idx>.csv' sort numerically by end_idx, so
    'results_20000.csv' comes before 'results_100000.csv'. Any other CSV file
    is still included and sorts after them, alphabetically.
    """
    stem = os.path.splitext(file_name)[0]
    try:
        return (0, int(stem.rpartition("_")[2]), file_name)
    except ValueError:
        return (1, 0, file_name)


# os.listdir returns entries in arbitrary order; sort explicitly so the
# combined file keeps the rows in their original chunk order.
chunk_files = sorted(
    (name for name in os.listdir(results_dir) if name.endswith(".csv")),
    key=_chunk_sort_key,
)

for file in chunk_files:
    file_path = os.path.join(results_dir, file)
    chunk_df = pd.read_csv(file_path)
    dataframes.append(chunk_df)

if not dataframes:
    # pd.concat would raise a generic "No objects to concatenate" here.
    raise ValueError(f"No chunk CSV files found in '{results_dir}'")

combined_df = pd.concat(dataframes, ignore_index=True)

output_file = "combined_results.csv"
combined_df.to_csv(output_file, index=False)
print(f"All chunks have been combined and saved to {output_file}")

All chunks have been combined and saved to combined_results.csv
