This notebook classifies every reference in the references .csv file into topics. To do this, we:

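* Used a zero-shot classification model (`cross-encoder/nli-distilroberta-base`) to give each reference a confidence score for each topic. The scored rows are saved to a separate .csv file for every chunk of 10,000 rows, so an interrupted run can resume without redoing finished chunks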

* All .csv files are combined and saved as 'combined_results.csv'

* For each topic, the top 35% of confidence scores are classified as 1 and the rest as 0. Any reference that ends up with no topic at all is assigned its single highest-scoring topic.

In [None]:
import os
import pandas as pd
from transformers import pipeline
from datetime import datetime

def _score_texts(classifier, texts, categories, batch_size):
    """Score one batch of texts, returning a {category: score} dict per text.

    Entries that are not non-empty strings are never sent to the model and get
    0.0 for every category. The remaining texts are classified in a single
    pipeline call; if that call fails, each text is retried on its own so one
    bad input only zeroes its own row instead of the whole batch.
    """
    rows = [{cat: 0.0 for cat in categories} for _ in texts]
    valid_positions = [pos for pos, text in enumerate(texts) if isinstance(text, str) and text]
    if not valid_positions:
        return rows

    valid_texts = [texts[pos] for pos in valid_positions]
    try:
        results = classifier(valid_texts, candidate_labels=categories, batch_size=batch_size)
        # Some pipeline versions unwrap a one-element list into a bare dict.
        if isinstance(results, dict):
            results = [results]
        if len(results) != len(valid_texts):
            raise RuntimeError(
                f"expected {len(valid_texts)} results, got {len(results)}"
            )
    except Exception as batch_err:
        print(f"Batch inference failed ({batch_err}); retrying texts one at a time.")
        results = []
        for text in valid_texts:
            try:
                results.append(classifier(text, candidate_labels=categories))
            except Exception as text_err:
                # Best effort: keep going and leave this row at 0.0.
                print(f"Error processing text: {str(text_err)}")
                results.append(None)

    for pos, result in zip(valid_positions, results):
        if result is None:
            continue
        scores_dict = dict(zip(result['labels'], result['scores']))
        rows[pos] = {cat: round(scores_dict.get(cat, 0.0), 3) for cat in categories}
    return rows


def process_dataset_in_chunks(df, categories, batch_size=25, chunk_size=10000, results_dir="results"):
    """Score every row of ``df`` against ``categories`` and save the results chunk by chunk.

    The frame is split into consecutive chunks of ``chunk_size`` rows. For each
    chunk, the text in the ``context`` column is scored with a zero-shot
    classification pipeline and one ``confidence_<category>`` column is added
    per category (scores rounded to 3 decimals; 0.0 for missing/empty text or
    when classification fails). Each chunk is written to
    ``<results_dir>/results_<end_row>.csv``. Chunks whose file already exists
    are skipped, so an interrupted run can simply be started again.

    Args:
        df: DataFrame with a ``context`` column holding the text to classify.
        categories: Candidate topic labels passed to the classifier.
        batch_size: Number of texts handed to the pipeline per call; also
            forwarded to the pipeline as its ``batch_size`` argument.
        chunk_size: Number of rows per output file.
        results_dir: Directory for the per-chunk CSV files (created if absent).

    Raises:
        ValueError: If ``batch_size`` or ``chunk_size`` is smaller than 1.
        KeyError: If a chunk needs processing and ``df`` has no ``context`` column.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    os.makedirs(results_dir, exist_ok=True)

    # Loaded on first use so a run where every chunk is already on disk does
    # not pay the model-loading cost.
    classifier = None

    total_rows = len(df)
    num_chunks = (total_rows + chunk_size - 1) // chunk_size  # ceiling division

    for chunk_idx in range(num_chunks):
        start_idx = chunk_idx * chunk_size
        end_idx = min((chunk_idx + 1) * chunk_size, total_rows)

        output_file = os.path.join(results_dir, f"results_{end_idx}.csv")
        if os.path.exists(output_file):
            print(f"Chunk {chunk_idx + 1}/{num_chunks} already processed. Skipping rows {start_idx} to {end_idx}.")
            continue

        if classifier is None:
            if 'context' not in df.columns:
                raise KeyError("df must contain a 'context' column with the text to classify")
            classifier = pipeline(
                "zero-shot-classification",
                model="cross-encoder/nli-distilroberta-base",
                device=-1
            )

        print(f"Processing chunk {chunk_idx + 1}/{num_chunks}, rows {start_idx} to {end_idx} at {datetime.now().strftime('%H:%M:%S')}")
        chunk_df = df.iloc[start_idx:end_idx].copy()
        texts = chunk_df['context'].tolist()

        category_scores = {cat: [] for cat in categories}

        for i in range(0, len(texts), batch_size):
            for row in _score_texts(classifier, texts[i:i + batch_size], categories, batch_size):
                for cat in categories:
                    category_scores[cat].append(row[cat])

        for cat in categories:
            chunk_df[f'confidence_{cat}'] = category_scores[cat]

        # Write to a temporary name and rename afterwards: a crash mid-write
        # must not leave a truncated file that a later run would treat as done.
        tmp_file = output_file + ".tmp"
        chunk_df.to_csv(tmp_file, index=False)
        os.replace(tmp_file, output_file)
        print(f"Chunk saved to '{output_file}' with {len(chunk_df)} rows.")

    print("All chunks processed and saved.")

# Candidate topic labels; each one becomes a 'confidence_<label>' column.
categories = [
    "politics", "ethics", "epistemology", "logic",
    "metaphysics", "science", "religion",
]

# Full reference table; the text to classify is read from its 'context' column.
df = pd.read_csv('references.csv')

# Score every reference and write one CSV per 10,000-row chunk into 'results/'.
process_dataset_in_chunks(
    df,
    categories,
    batch_size=25,
    chunk_size=10000,
    results_dir="results"
)
  return self.fget.__get__(instance, owner)()


Chunk 1/11 already processed. Skipping rows 0 to 10000.
Chunk 2/11 already processed. Skipping rows 10000 to 20000.
Chunk 3/11 already processed. Skipping rows 20000 to 30000.
Chunk 4/11 already processed. Skipping rows 30000 to 40000.
Chunk 5/11 already processed. Skipping rows 40000 to 50000.
Chunk 6/11 already processed. Skipping rows 50000 to 60000.
Chunk 7/11 already processed. Skipping rows 60000 to 70000.
Chunk 8/11 already processed. Skipping rows 70000 to 80000.
Processing chunk 9/11, rows 80000 to 90000 at 10:29:10
Chunk saved to 'results/results_90000.csv' with 10000 rows.
Processing chunk 10/11, rows 90000 to 100000 at 11:03:40
Chunk saved to 'results/results_100000.csv' with 10000 rows.
Processing chunk 11/11, rows 100000 to 109691 at 12:11:56
Chunk saved to 'results/results_109691.csv' with 9691 rows.
All chunks processed and saved.


In [None]:
import pandas as pd
import os

# Combine the per-chunk CSV files into a single file, in original row order.
results_dir = "results"


def _chunk_order(file_name):
    """Sort key placing 'results_<N>.csv' files in ascending N, other names last."""
    suffix = file_name[:-len(".csv")].rpartition("_")[2]
    if suffix.isdigit():
        return (0, int(suffix), file_name)
    return (1, 0, file_name)


# os.listdir returns names in arbitrary order, and a plain string sort would
# put 'results_100000.csv' before 'results_20000.csv', so sort numerically.
chunk_files = sorted(
    (file for file in os.listdir(results_dir) if file.endswith(".csv")),
    key=_chunk_order,
)
if not chunk_files:
    raise FileNotFoundError(f"No .csv chunk files found in '{results_dir}'")

dataframes = [pd.read_csv(os.path.join(results_dir, file)) for file in chunk_files]

combined_df = pd.concat(dataframes, ignore_index=True)

output_file = "combined_results.csv"
combined_df.to_csv(output_file, index=False)
print(f"All chunks have been combined and saved to {output_file}")

All chunks have been combined and saved to combined_results.csv


In [None]:
import pandas as pd

# Rows at or above this per-topic quantile are labelled 1 (i.e. the top 35%).
QUANTILE_CUTOFF = 0.65

df = pd.read_csv('combined_results.csv')

confidence_cols = [col for col in df.columns if col.startswith('confidence_')]
if not confidence_cols:
    raise ValueError("combined_results.csv contains no 'confidence_*' columns")

binary_df = df.copy()

# Each topic gets its own threshold, computed from the raw scores in `df`.
for col in confidence_cols:
    threshold = df[col].quantile(QUANTILE_CUTOFF)
    binary_df[col] = (binary_df[col] >= threshold).astype(int)

# Rows that missed the cutoff for every topic are assigned their single
# highest-scoring topic (first column wins on ties) so no row is left unlabelled.
all_zeros = binary_df[confidence_cols].sum(axis=1) == 0
if all_zeros.any():
    best_topics = df.loc[all_zeros, confidence_cols].idxmax(axis=1)
    for col in confidence_cols:
        binary_df.loc[best_topics.index[best_topics == col], col] = 1

print("\nBinary Classification Results:")
print(f"Total rows: {len(binary_df)}")
print(f"Rows with all zeros before fixing: {all_zeros.sum()}")
print("\nDistribution for each topic:")
for col in confidence_cols:
    ones = binary_df[col].sum()
    print(f"\n{col}:")
    print(f"Ones: {ones} ({(ones/len(binary_df))*100:.1f}%)")
    print(f"Zeros: {len(binary_df)-ones} ({((len(binary_df)-ones)/len(binary_df))*100:.1f}%)")

binary_df.to_csv('binary_results.csv', index=False)
print("\nBinary results saved to 'binary_results.csv'")