In [1]:
import csv

def sort_csv_by_author(input_csv, output_csv):
    with open(input_csv, mode='r', newline='', encoding='utf-8') as csv_file:
        csv_reader = csv.reader(csv_file)
        header = next(csv_reader)
        rows = list(csv_reader)
        sorted_rows = sorted(rows, key=lambda row: row[1])

    with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(header)
        csv_writer.writerows(sorted_rows)

    print(f"Sorted CSV saved as: {output_csv}")

if __name__ == "__main__":
    input_csv = 'books_metadata.csv'
    output_csv = 'books_metadata_sorted.csv'
    sort_csv_by_author(input_csv, output_csv)

Sorted CSV saved as: books_metadata_sorted.csv


In [2]:
import csv

def remove_empty_authors(input_csv, output_csv):
    cleaned_rows = []

    with open(input_csv, mode='r', newline='', encoding='utf-8') as csv_file:
        csv_reader = csv.reader(csv_file)
        header = next(csv_reader)
        cleaned_rows.append(header)

        for row in csv_reader:
            if row[1].strip():
                cleaned_rows.append(row)

    with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerows(cleaned_rows)

    print(f"Rows with empty 'Author' values have been removed. Cleaned CSV saved as: {output_csv}")

if __name__ == "__main__":
    input_csv = 'cleared.csv'
    output_csv = 'new.csv'
    remove_empty_authors(input_csv, output_csv)

Rows with empty 'Author' values have been removed. Cleaned CSV saved as: new.csv


In [4]:
import csv

def count_unique_authors(csv_filename):
    unique_authors = set()

    with open(csv_filename, mode='r', newline='', encoding='utf-8') as csv_file:
        csv_reader = csv.reader(csv_file)
        header = next(csv_reader)

        for row in csv_reader:
            author_name = row[1].strip()
            if author_name:
                unique_authors.add(author_name)

    print(f"Number of unique authors: {len(unique_authors)}")

if __name__ == "__main__":
    csv_filename = "new.csv"
    count_unique_authors(csv_filename)

Number of unique authors: 488


In [2]:
import csv

def add_reference_column(input_csv, output_csv):
    updated_rows = []

    with open(input_csv, mode='r', newline='', encoding='utf-8') as csv_file:
        csv_reader = csv.reader(csv_file)
        header = next(csv_reader)
        header.append('Reference')
        updated_rows.append(header)

        for row in csv_reader:
            author_name = row[1].strip()

            if author_name:
                reference = author_name.split()[0]
                reference = reference.replace(',', '')
            else:
                reference = 'Unknown'

            row.append(reference)
            updated_rows.append(row)

    with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerows(updated_rows)

    print(f"Added 'Reference' column and removed all commas from it. Updated CSV saved as: {output_csv}")


if __name__ == "__main__":
    input_csv = 'reference2.csv'
    output_csv = 'final.csv'
    add_reference_column(input_csv, output_csv)

Added 'Reference' column and removed all commas from it. Updated CSV saved as: final.csv


In [1]:
import pandas as pd

def remove_self_references(input_csv, output_csv):
    df = pd.read_csv(input_csv)
    cleaned_df = df[df['author_of_book'] != df['full_author_referenced']]
    cleaned_df.to_csv(output_csv, index=False)
    print(f"Removed self-references. Cleaned data saved to {output_csv}")

if __name__ == "__main__":
    input_csv = 'references.csv'
    output_csv = 'references_cleaned.csv'
    remove_self_references(input_csv, output_csv)

Removed self-references. Cleaned data saved to references_cleaned.csv


In [4]:
import csv
import os
import re
import pandas as pd

def load_book_metadata(csv_file):
    book_metadata = {}
    with open(csv_file, mode='r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            index = row['Index']
            book_metadata[index] = {
                'filename': row['Filename'],
                'author_of_book': row['Author'],
                'birth_death': row['Birth - Death']
            }
    return book_metadata

def clean_context(text):
    return re.sub(r'\s+', ' ', text).strip()

def find_marx_references_with_context(book_text, book_filename, author_of_book, birth_death, context_size=100):
    snippets = []
    pattern = re.compile(r'\bmarx\b', re.IGNORECASE)

    for match in pattern.finditer(book_text):
        start, end = match.start(), match.end()

        before = clean_context(book_text[max(0, start - context_size):start])
        after = clean_context(book_text[end:end + context_size])

        snippet = before + book_text[start:end] + after

        snippets.append({
            'book_filename': book_filename,
            'author_of_book': author_of_book,
            'birth_death': birth_death,
            'reference': 'Marx',
            'context': snippet
        })

    return snippets

def process_books_for_marx(book_metadata, output_file, context_size=100):
    all_snippets = []

    for index, book_info in book_metadata.items():
        book_path = book_info['filename']

        if os.path.exists(book_path):
            print(f"Processing {book_path}...")

            with open(book_path, 'r', encoding='utf-8', errors='ignore') as f:
                book_text = f.read().lower()

                snippets = find_marx_references_with_context(
                    book_text, 
                    book_path, 
                    book_info['author_of_book'], 
                    book_info['birth_death'], 
                    context_size
                )

                if snippets:
                    print(f"Found {len(snippets)} Marx references in {book_path}")

                all_snippets.extend(snippets)

    if all_snippets:
        df = pd.DataFrame(all_snippets)
        with open(output_file, 'a', encoding='utf-8', newline='') as f:
            df.to_csv(f, header=f.tell() == 0, index=False)
        print(f"Appended Marx references to {output_file}")
    else:
        print("No Marx references found in any of the books.")

def main():
    csv_file = 'newest.csv'
    output_file = 'references_cleaned.csv'
    context_size = 100

    book_metadata = load_book_metadata(csv_file)

    process_books_for_marx(book_metadata, output_file, context_size)

if __name__ == "__main__":
    main()


Processing books/10.txt...
Processing books/30.txt...
Processing books/54.txt...
Processing books/135.txt...
Processing books/217.txt...
Processing books/351.txt...
Processing books/5.txt...
Processing books/31.txt...
Processing books/104.txt...
Processing books/1758.txt...
Processing books/2264.txt...
Processing books/2326.txt...
Processing books/2142.txt...
Processing books/2307.txt...
Processing books/571.txt...
Processing books/214.txt...
Processing books/289.txt...
Processing books/315.txt...
Processing books/412.txt...
Processing books/1488.txt...
Processing books/1199.txt...
Processing books/2229.txt...
Processing books/188.txt...
Processing books/308.txt...
Processing books/327.txt...
Processing books/438.txt...
Processing books/475.txt...
Processing books/518.txt...
Processing books/549.txt...
Processing books/800.txt...
Processing books/20.txt...
Processing books/324.txt...
Processing books/353.txt...
Processing books/988.txt...
Processing books/999.txt...
Processing books/10

In [6]:
import csv
import re
import pandas as pd

def load_author_list(csv_file):
    authors_to_reference = []
    with open(csv_file, mode='r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            author = row['Author']
            authors_to_reference.append(author)
    return list(set(authors_to_reference))

def clean_context(text):
    return re.sub(r'\s+', ' ', text).strip()

def find_references_in_manifesto(book_text, authors, context_size=100):
    snippets = []
    pattern = re.compile(r'\b(' + '|'.join(map(re.escape, authors)) + r')\b', re.IGNORECASE)

    for match in pattern.finditer(book_text):
        start, end = match.start(), match.end()
        referenced_author = match.group()

        before = clean_context(book_text[max(0, start - context_size):start])
        after = clean_context(book_text[end:end + context_size])

        snippet = before + book_text[start:end] + after

        snippets.append({
            'book_filename': '27.txt',
            'author_of_book': 'Marx',
            'birth_death': '1818 - 1883',
            'reference': referenced_author,
            'context': snippet
        })

    return snippets

def process_manifesto_references(csv_file, book_path, output_file, context_size=100):
    authors_to_reference = load_author_list(csv_file)

    if not os.path.exists(book_path):
        print(f"{book_path} not found.")
        return

    with open(book_path, 'r', encoding='utf-8', errors='ignore') as f:
        book_text = f.read().lower()

        snippets = find_references_in_manifesto(book_text, authors_to_reference, context_size)

        if snippets:
            df = pd.DataFrame(snippets)
            with open(output_file, 'a', encoding='utf-8', newline='') as f:
                df.to_csv(f, header=f.tell() == 0, index=False)
            print(f"Appended references from Marx to other authors in {output_file}")
        else:
            print("No references from Marx to other authors found in 27.txt.")

def main():
    csv_file = 'newest.csv'
    book_path = 'books/27.txt'
    output_file = 'references_cleaned.csv'
    context_size = 100

    process_manifesto_references(csv_file, book_path, output_file, context_size)

if __name__ == "__main__":
    main()


Appended references from Marx to other authors in references_cleaned.csv


In [1]:
import pandas as pd

def remove_specific_author(input_csv, output_csv, author_to_remove):
    df = pd.read_csv(input_csv)
    cleaned_df = df[(df['author_of_book'] != author_to_remove) & (df['full_author_referenced'] != author_to_remove)]

    cleaned_df.to_csv(output_csv, index=False)
    print(f"Removed rows where '{author_to_remove}' is the author or the referenced author. Cleaned data saved to {output_csv}")

if __name__ == "__main__":
    input_csv = 'references.csv'
    output_csv = 'reference2.csv'
    author_to_remove = 'Plato (spurious and doubtful works)'
    remove_specific_author(input_csv, output_csv, author_to_remove)

Removed rows where 'Plato (spurious and doubtful works)' is the author or the referenced author. Cleaned data saved to reference2.csv


In [3]:
import pandas as pd

input_file = 'reference2.csv'
df = pd.read_csv(input_file)
df_no_self_references = df[df['author_of_book'] != df['full_author_referenced']]
output_file = 'updated_references.csv'  # Path for the updated CSV file
df_no_self_references.to_csv(output_file, index=False)

print(f"Updated CSV file without self-references saved as '{output_file}'")

Updated CSV file without self-references saved as 'updated_references.csv'


In [1]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer

def analyze_topics(input_file, sample_size=5000):
    df = pd.read_csv(input_file)
    df_sample = df.head(sample_size)
    
    embedding_model = SentenceTransformer('all-mpnet-base-v2')
    
    vectorizer_model = CountVectorizer(
        stop_words="english", 
        min_df=2,
        ngram_range=(1, 2)
    )
    
    topic_model = BERTopic(
        embedding_model=embedding_model,
        vectorizer_model=vectorizer_model,
        min_topic_size=5,
        verbose=True
    )
    
    topics, probs = topic_model.fit_transform(df_sample['context'].tolist())
    
    topic_info = topic_model.get_document_info(df_sample['context'].tolist())
    
    df_with_topics = df_sample.copy()
    df_with_topics['Topic'] = topics
    df_with_topics['Topic_Name'] = topic_info['Name']
    df_with_topics['Topic_Probability'] = probs
    
    topic_labels = {}
    for topic in set(topics):
        if topic != -1:
            topic_terms = topic_model.get_topic(topic)
            label = ', '.join([term[0] for term in topic_terms[:3]])
            topic_labels[topic] = label
    
    df_with_topics['Topic_Terms'] = df_with_topics['Topic'].map(
        lambda x: topic_labels.get(x, 'Outlier'))
    
    output_file = 'output_with_topics_4.csv'
    df_with_topics.to_csv(output_file, index=False)
    
    print(f"\nProcessed {len(df_sample)} documents")
    print(f"Found {len(set(topics))} topics (excluding outliers)")
    print(f"Results saved to {output_file}")
    
    return df_with_topics

input_file = "references.csv"
results = analyze_topics(input_file, sample_size=5000)

display(results.head())

2024-10-27 15:19:54,858 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2024-10-27 15:22:36,757 - BERTopic - Embedding - Completed ✓
2024-10-27 15:22:36,761 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2024-10-27 15:22:48,306 - BERTopic - Dimensionality - Completed ✓
2024-10-27 15:22:48,308 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-10-27 15:22:48,425 - BERTopic - Cluster - Completed ✓
2024-10-27 15:22:48,436 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-10-27 15:22:48,785 - BERTopic - Representation - Completed ✓



Processed 5000 documents
Found 229 topics (excluding outliers)
Results saved to output_with_topics_4.csv


Unnamed: 0,book_filename,author_of_book,birth_death,reference,full_author_referenced,context,Topic,Topic_Name,Topic_Probability,Topic_Terms
0,books/1223.txt,Buchanan,1804 - 1870,aristotle,Aristotle,"l its laws and processes, its tribes and races...",48,48_eternal_existent_self existent_existent ete...,0.369623,"eternal, existent, self existent"
1,books/1223.txt,Buchanan,1804 - 1870,augustine,Augustine,"oth, 25 cts. philip doddridge. his life and la...",205,205_cloth_tweedie_works_cents,1.0,"cloth, tweedie, works"
2,books/1223.txt,Buchanan,1804 - 1870,berkeley,Berkeley,"every man's reason, therefore, is really god; ...",163,163_personality_mere_self consciousness_self,0.93281,"personality, mere, self consciousness"
3,books/1223.txt,Buchanan,1804 - 1870,berkeley,Berkeley,"nd ""object"" of thought are the same? or, wheth...",-1,-1_footnote_god_philosophy_life,0.0,Outlier
4,books/1223.txt,Buchanan,1804 - 1870,berkeley,Berkeley,existence of unthinking things without any rel...,-1,-1_footnote_god_philosophy_life,0.0,Outlier


In [4]:
from transformers import pipeline
import pandas as pd
from tqdm.notebook import tqdm
from datetime import datetime
import time

def process_large_dataset(df, categories, batch_size=10, total_samples=None):
    if total_samples is None:
        total_samples = len(df)

    print(f"Starting classification of {total_samples} samples at {datetime.now().strftime('%H:%M:%S')}")

    all_predictions = []
    all_confidences = []
    error_count = 0
    start_time = time.time()

    df_subset = df.head(total_samples).copy()
    
    for i in tqdm(range(0, len(df_subset), batch_size), desc="Processing batches"):
        batch_start_time = time.time()

        batch_texts = df_subset['context'].iloc[i:i+batch_size]

        try:
            classifier = pipeline(
                "zero-shot-classification",
                model="cross-encoder/nli-distilroberta-base",
                device=-1
            )

            batch_predictions = []
            batch_confidences = []

            for text in batch_texts:
                try:
                    truncated_text = text[:128] if isinstance(text, str) else ""

                    if not truncated_text:
                        batch_predictions.append("unknown")
                        batch_confidences.append(0.0)
                        continue

                    result = classifier(
                        truncated_text,
                        candidate_labels=categories
                    )
                    batch_predictions.append(result['labels'][0])
                    batch_confidences.append(round(result['scores'][0], 3))

                except Exception as e:
                    print(f"Error processing individual text: {str(e)}")
                    batch_predictions.append("error")
                    batch_confidences.append(0.0)
                    error_count += 1

            all_predictions.extend(batch_predictions)
            all_confidences.extend(batch_confidences)

            del classifier

            batch_time = time.time() - batch_start_time
            if (i / batch_size) % 5 == 0:
                print(f"\nBatch {i // batch_size + 1} processed in {batch_time:.1f} seconds")
                print(f"Current error rate: {(error_count / len(all_predictions)) * 100:.1f}%")

            if (i + batch_size) % 1000 == 0 or i + batch_size >= len(df_subset):
                temp_df = df_subset.iloc[:len(all_predictions)].copy()
                temp_df['predicted_category'] = all_predictions
                temp_df['confidence'] = all_confidences
                temp_df.to_csv(f"intermediate_results_{len(all_predictions)}.csv", index=False)
                print(f"Intermediate results saved at {len(all_predictions)} samples")

        except Exception as e:
            print(f"\nError processing batch starting at index {i}: {str(e)}")
            all_predictions.extend(["error"] * len(batch_texts))
            all_confidences.extend([0.0] * len(batch_texts))
            error_count += len(batch_texts)

    df_results = df_subset.iloc[:len(all_predictions)].copy()
    df_results['predicted_category'] = all_predictions
    df_results['confidence'] = all_confidences

    total_time = time.time() - start_time
    print("\n=== Classification Complete ===")
    print(f"Total processing time: {total_time / 60:.1f} minutes")
    print(f"Average time per sample: {total_time / len(df_results):.2f} seconds")
    print(f"Total errors: {error_count} ({(error_count / len(df_results)) * 100:.1f}%)")
    print("\nCategory Distribution:")
    print(df_results['predicted_category'].value_counts())
    print("\nConfidence Statistics:")
    print(df_results['confidence'].describe())

    return df_results

df = pd.read_csv('final.csv')

results_df = process_large_dataset(
    df, 
    categories,
    batch_size=10,
    total_samples=len(df)
)

results_df.to_csv('philosophy_classifications_full.csv', index=False)

print("All samples processed and results saved to 'philosophy_classifications_full.csv'.")

Starting classification of 109691 samples at 21:57:51


Processing batches:   0%|          | 0/10970 [00:00<?, ?it/s]


Batch 1 processed in 2.5 seconds
Current error rate: 0.0%

Batch 6 processed in 1.5 seconds
Current error rate: 0.0%

Batch 11 processed in 1.9 seconds
Current error rate: 0.0%

Batch 16 processed in 1.5 seconds
Current error rate: 0.0%

Batch 21 processed in 1.5 seconds
Current error rate: 0.0%

Batch 26 processed in 1.5 seconds
Current error rate: 0.0%

Batch 31 processed in 1.6 seconds
Current error rate: 0.0%

Batch 36 processed in 1.5 seconds
Current error rate: 0.0%

Batch 41 processed in 1.6 seconds
Current error rate: 0.0%

Batch 46 processed in 1.5 seconds
Current error rate: 0.0%

Batch 51 processed in 1.6 seconds
Current error rate: 0.0%

Batch 56 processed in 1.5 seconds
Current error rate: 0.0%

Batch 61 processed in 1.5 seconds
Current error rate: 0.0%

Batch 66 processed in 1.5 seconds
Current error rate: 0.0%

Batch 71 processed in 1.8 seconds
Current error rate: 0.0%

Batch 76 processed in 1.6 seconds
Current error rate: 0.0%

Batch 81 processed in 1.6 seconds
Current

ValueError: Length of values (109752) does not match length of index (109691)