This notebook classifies references into topics on a small portion of our dataset. To do this, we:

* Used a zero-shot classification model (`cross-encoder/nli-distilroberta-base`) to give each reference a confidence score for each topic.

* Normalized the confidence scores per topic using z-standardization, clipping the z-scores to the range [-3, 3].

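* Converted the confidence scores to binary: a score becomes 1 if it is at or above the 65th percentile for its topic (i.e., in the top 35%), and 0 otherwise. Any reference left with no topic is assigned its single highest-scoring topic.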



In [13]:
import pandas as pd
from transformers import pipeline

# Candidate labels handed to the zero-shot classifier; one confidence
# column per topic is added to the DataFrame below.
categories = [
    "politics", "ethics", "epistemology", "logic",
    "metaphysics", "science", "religion",
]

# Number of rows to read from the dataset. Kept in one place so the load
# and the progress message below cannot drift apart.
N_ROWS = 50

df_test = pd.read_csv('references.csv', nrows=N_ROWS)

classifier = pipeline(
    "zero-shot-classification",
    model="cross-encoder/nli-distilroberta-base",
    device=-1  # -1 selects the CPU in the transformers pipeline API
)

# topic -> list of scores, one entry per row of df_test (same order).
category_scores = {cat: [] for cat in categories}

for text in df_test['context']:
    # Missing (non-string), empty, or whitespace-only contexts cannot be
    # classified; they get a score of 0.0 for every topic.
    if not isinstance(text, str) or not text.strip():
        row_scores = [0.0] * len(categories)
    else:
        try:
            result = classifier(text, candidate_labels=categories)

            # Map each returned label to its score so scores can be looked
            # up in the fixed order of `categories`.
            scores_dict = dict(zip(result['labels'], result['scores']))
            row_scores = [round(scores_dict.get(cat, 0.0), 3) for cat in categories]
        except Exception as e:
            # Best-effort: report the failure and fall back to all zeros
            # for this row rather than aborting the whole run.
            print(f"Error processing text: {str(e)}")
            row_scores = [0.0] * len(categories)

    # Append exactly once per row, after the scores are fully computed, so
    # every topic list always has the same length even if an error occurred.
    for cat, score in zip(categories, row_scores):
        category_scores[cat].append(score)

for cat in categories:
    df_test[f'confidence_{cat}'] = category_scores[cat]

print(f"\nFirst {len(df_test)} rows with confidence scores:")
print(df_test)

df_test.to_csv('test_results.csv', index=False)


First 120 rows with confidence scores:
     book_filename author_of_book  birth_death  reference  \
0   books/1223.txt       Buchanan  1804 - 1870  aristotle   
1   books/1223.txt       Buchanan  1804 - 1870  augustine   
2   books/1223.txt       Buchanan  1804 - 1870   berkeley   
3   books/1223.txt       Buchanan  1804 - 1870   berkeley   
4   books/1223.txt       Buchanan  1804 - 1870   berkeley   
..             ...            ...          ...        ...   
60  books/1223.txt       Buchanan  1804 - 1870   holyoake   
61  books/1223.txt       Buchanan  1804 - 1870   holyoake   
62  books/1223.txt       Buchanan  1804 - 1870   holyoake   
63  books/1223.txt       Buchanan  1804 - 1870   holyoake   
64  books/1223.txt       Buchanan  1804 - 1870   holyoake   

   full_author_referenced                                            context  \
0               Aristotle  l its laws and processes, its tribes and races...   
1               Augustine  oth, 25 cts. philip doddridge. his life 

In [15]:
import numpy as np

# Z-standardize each topic's confidence column (per-topic mean 0, std 1
# before clipping), then clip extreme z-scores and round to 3 decimals.
confidence_cols = [col for col in df_test.columns if col.startswith('confidence_')]
normalized_df = df_test.copy()

# z-scores are clipped to the range [-Z_SCORE_CAP, +Z_SCORE_CAP].
Z_SCORE_CAP = 3

# Standard deviations at or below this are treated as "no spread"; scores
# are rounded to 3 decimals upstream, so any real spread is far larger.
MIN_STD = 1e-9

for col in confidence_cols:
    topic_mean = normalized_df[col].mean()
    topic_std = normalized_df[col].std()

    if topic_std > MIN_STD:
        z_scores = (normalized_df[col] - topic_mean) / topic_std
    else:
        # Constant column (std == 0) or a single row (pandas' sample std is
        # NaN, and NaN > MIN_STD is False): dividing would produce NaN/inf,
        # so define every z-score as 0.0 instead. Multiplying by 0.0 keeps
        # the index and leaves any missing values missing.
        z_scores = normalized_df[col] * 0.0

    normalized_df[col] = z_scores.clip(lower=-Z_SCORE_CAP, upper=Z_SCORE_CAP).round(3)

# Before/after summary per topic. Clipping can pull the normalized mean
# slightly away from 0 and the std below 1.
for col in confidence_cols:
    print(f"\nStats for {col}:")
    print(f"Original mean: {df_test[col].mean():.3f}")
    print(f"Original std: {df_test[col].std():.3f}")
    print(f"Normalized mean: {normalized_df[col].mean():.3f}")
    print(f"Normalized std: {normalized_df[col].std():.3f}")
    print(f"Normalized min: {normalized_df[col].min():.3f}")
    print(f"Normalized max: {normalized_df[col].max():.3f}")

# NOTE(review): the file name mentions 120 rows, but the number of rows
# written is whatever df_test holds — confirm the intended name.
normalized_df.to_csv('normalized_120.csv', index=False)


Stats for confidence_politics:
Original mean: 0.073
Original std: 0.049
Normalized mean: -0.048
Normalized std: 0.746
Normalized min: -1.131
Normalized max: 3.000

Stats for confidence_ethics:
Original mean: 0.127
Original std: 0.043
Normalized mean: -0.033
Normalized std: 0.853
Normalized min: -2.231
Normalized max: 3.000

Stats for confidence_epistemology:
Original mean: 0.210
Original std: 0.057
Normalized mean: -0.000
Normalized std: 1.000
Normalized min: -2.968
Normalized max: 2.507

Stats for confidence_logic:
Original mean: 0.174
Original std: 0.063
Normalized mean: -0.022
Normalized std: 0.911
Normalized min: -2.065
Normalized max: 3.000

Stats for confidence_metaphysics:
Original mean: 0.208
Original std: 0.052
Normalized mean: -0.000
Normalized std: 1.000
Normalized min: -2.753
Normalized max: 2.779

Stats for confidence_science:
Original mean: 0.103
Original std: 0.082
Normalized mean: -0.050
Normalized std: 0.784
Normalized min: -0.998
Normalized max: 3.000

Stats for conf

In [17]:
import numpy as np

# Turn each topic's raw confidence score (from df_test) into a 0/1 flag:
# 1 when the score is at or above that topic's 65th percentile, else 0.
confidence_cols = [name for name in df_test.columns if name.startswith('confidence_')]
binary_df = df_test.copy()

for topic_col in confidence_cols:
    topic_scores = binary_df[topic_col]
    cutoff = topic_scores.quantile(0.65)
    binary_df[topic_col] = (topic_scores >= cutoff).astype(int)

# Rows that cleared no topic's cutoff are assigned their single
# highest-scoring topic (by original score) so every row has a label.
all_zeros = binary_df[confidence_cols].sum(axis=1) == 0
if all_zeros.any():
    fallback_topics = df_test.loc[all_zeros, confidence_cols].idxmax(axis=1)
    for row_label, best_topic in fallback_topics.items():
        binary_df.loc[row_label, best_topic] = 1

# Sanity check: after the fallback no row should be left without a topic.
print("\nVerification:")
all_zeros_after = binary_df[confidence_cols].sum(axis=1) == 0
print(f"Rows with all zeros: {all_zeros_after.sum()} (should be 0)")

# Per-topic counts and shares of 1s and 0s.
for topic_col in confidence_cols:
    flags = binary_df[topic_col]
    total = len(flags)
    num_ones = flags.sum()
    num_zeros = total - num_ones
    print(f"\nStats for {topic_col}:")
    print(f"Number of 1s: {num_ones} ({(num_ones/total)*100:.1f}%)")
    print(f"Number of 0s: {num_zeros} ({(num_zeros/total)*100:.1f}%)")

binary_df.to_csv('test_results_binary3.csv', index=False)


Verification:
Rows with all zeros: 0 (should be 0)

Stats for confidence_politics:
Number of 1s: 23 (35.4%)
Number of 0s: 42 (64.6%)

Stats for confidence_ethics:
Number of 1s: 24 (36.9%)
Number of 0s: 41 (63.1%)

Stats for confidence_epistemology:
Number of 1s: 23 (35.4%)
Number of 0s: 42 (64.6%)

Stats for confidence_logic:
Number of 1s: 23 (35.4%)
Number of 0s: 42 (64.6%)

Stats for confidence_metaphysics:
Number of 1s: 23 (35.4%)
Number of 0s: 42 (64.6%)

Stats for confidence_science:
Number of 1s: 23 (35.4%)
Number of 0s: 42 (64.6%)

Stats for confidence_religion:
Number of 1s: 23 (35.4%)
Number of 0s: 42 (64.6%)
