# Data Processing

## Initial Data Cleaning

Run after 'download.py' but before 'reference_fetcher.py'

### 1.1: Combine all the csv files

In [4]:
import csv
import glob

csv_files = [
    'books_metadata_literature.csv',
    'books_metadata_mathematics.csv',
    'books_metadata_philosophy.csv',
    'books_metadata_physics.csv',
    'books_metadata_politics.csv',
    'books_metadata_religion.csv',
    'books_metadata_science.csv'
]

output_file = 'books.csv'

written_headers = set()

with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    for filename in csv_files:
        with open(filename, 'r', encoding='utf-8') as infile:
            reader = csv.reader(infile)
            headers = next(reader, None)
            if headers and tuple(headers) not in written_headers:
                writer.writerow(headers)
                written_headers.add(tuple(headers))
            for row in reader:
                writer.writerow(row)

print("Combined CSV created:", output_file)


Combined CSV created: books.csv


### 1.2: Remove Duplicate Books

In [5]:
import csv

csv_files = [
    'books_metadata_literature.csv',
    'books_metadata_mathematics.csv',
    'books_metadata_philosophy.csv',
    'books_metadata_physics.csv',
    'books_metadata_politics.csv',
    'books_metadata_religion.csv',
    'books_metadata_science.csv'
]

output_file = 'books_combined.csv'

written_headers = set()
seen_books = set()

with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    for filename in csv_files:
        with open(filename, 'r', encoding='utf-8') as infile:
            reader = csv.reader(infile)
            headers = next(reader, None)
            if headers and tuple(headers) not in written_headers:
                writer.writerow(headers)
                written_headers.add(tuple(headers))

            author_idx = headers.index("Author")
            title_idx = headers.index("Title_50_Chars")

            for row in reader:
                author = row[author_idx]
                title_50 = row[title_idx]
                book_key = (author, title_50)

                if book_key not in seen_books:
                    seen_books.add(book_key)
                    writer.writerow(row)

print("Combined CSV created without duplicates:", output_file)


Combined CSV created without duplicates: books_metadata_combined.csv


### 1.3: Create CSV Files of Each Unique Author

In [6]:
import csv
from collections import defaultdict

input_file = 'books_metadata_combined.csv'
output_file = 'authors_aggregated.csv'

authors = {}

with open(input_file, 'r', encoding='utf-8') as infile:
    reader = csv.reader(infile)
    headers = next(reader)
    author_idx = headers.index("Author")
    title_idx = headers.index("Title_50_Chars")
    birth_idx = headers.index("Birth Year")
    death_idx = headers.index("Death Year")

    for row in reader:
        author = row[author_idx]
        title = row[title_idx]
        birth_year = row[birth_idx]
        death_year = row[death_idx]

        if author not in authors:
            authors[author] = {
                "birth_year": birth_year,
                "death_year": death_year,
                "titles": set()
            }

        if birth_year and not authors[author]["birth_year"]:
            authors[author]["birth_year"] = birth_year
        if death_year and not authors[author]["death_year"]:
            authors[author]["death_year"] = death_year

        authors[author]["titles"].add(title)

with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(["Author", "Birth Year", "Death Year", "Titles"])

    for author, data in authors.items():
        titles_str = ", ".join(sorted(data["titles"]))
        writer.writerow([
            author,
            data["birth_year"],
            data["death_year"],
            titles_str
        ])

print("Aggregated author CSV created:", output_file)


Aggregated author CSV created: authors_aggregated.csv


## Initial Data Cleaning

Run after 'download.py' but before 'reference_fetcher.py'

### 2.1: Find Authors with Duplicate Reference Names

In [4]:
import pandas as pd
df = pd.read_csv('books_filtered.csv', encoding='utf-8')

def extract_last_name(author):
    if pd.isnull(author):
        return None
    author = author.strip()
    if ',' in author:
        last_name = author.split(',')[0].strip()
    else:
        parts = author.split()
        last_name = parts[-1] if parts else None
    return last_name

df['ReferenceName'] = df['Author'].apply(extract_last_name)

reference_groups = df.groupby('ReferenceName')['Author'].nunique()

duplicate_references = reference_groups[reference_groups > 1].index.tolist()

conflicting_authors_df = df[df['ReferenceName'].isin(duplicate_references)]

conflicting_authors = conflicting_authors_df['Author'].drop_duplicates()

conflicting_authors = conflicting_authors.sort_values()

print("Authors with Duplicate Reference Names:")
print(conflicting_authors.to_string(index=False))


Authors with Duplicate Reference Names:
Series([], )


### 2.2: Remove Selected Authors

In [None]:
def remove_author_from_csv(author_list, input_file='books_filtered.csv', output_file='books_filtered.csv'):
    df = pd.read_csv(input_file)
    initial_count = len(df)
    df_filtered = df[~df['Author'].isin(author_list)]
    rows_removed = initial_count - len(df_filtered)
    df_filtered.to_csv(output_file, index=False)
    print(f"Removed {rows_removed} rows with authors: {', '.join(author_list)}")
    print(f"Filtered data saved to: {output_file}")
    return rows_removed

remove_author_from_csv([
"Penn, William",
"Angell, Norman",
"Hearn, Lafcadio",
])


KeyError: 'Author'

### 2.3: Print Total Unique Authors Remaining

In [2]:
import pandas as pd

df = pd.read_csv('books_filtered.csv')

num_unique_authors = df['Author'].nunique()

print(f"Total number of unique authors: {num_unique_authors}")


Total number of unique authors: 1117


## Post-References Data Cleaning

After running "reference_fetcher.py"

### 3.1: Combine all batches

In [7]:
import os
import pandas as pd

batch_files = [f for f in os.listdir('batches') if f.startswith('batch_') and f.endswith('.csv')]


batch_nums = [int(f.split('_')[1].split('.')[0]) for f in batch_files]
num_batches = max(batch_nums) + 1


combined_df = pd.concat([pd.read_csv(os.path.join('batches', f'batch_{i}.csv')) 
                        for i in range(num_batches) 
                        if os.path.exists(os.path.join('batches', f'batch_{i}.csv'))])

combined_df.to_csv('combined.csv', index=False)
print("Combined all batches into combined.csv")

Combined all batches into combined.csv


### 3.2: Improve Inaccurate References

For Francis Bacon, William James, etc.

In [43]:
df = pd.read_csv('v8.csv')

initial_count = len(df)

mask = ~(
    (df['referenced_author'] == 'Rand, Ayn') & 
    (~df['context'].str.contains('Ayn', case=False, na=False))
)

df_filtered = df[mask]

df_filtered.to_csv('v8.csv', index=False)

removed_count = initial_count - len(df_filtered)
print(f"Removed {removed_count}")
print(f"Remaining references: {len(df_filtered)}")

Removed 222
Remaining references: 98107


### 3.3: Remove Duplicates

In [15]:
df = pd.read_csv('v5.csv')

total_rows_before = len(df)

df_unique = df.drop_duplicates()

df_unique.to_csv('v6.csv', index=False)

total_rows_after = len(df_unique)
rows_removed = total_rows_before - total_rows_after

print(f"Total rows before: {total_rows_before}")
print(f"Total rows after: {total_rows_after}")
print(f"Duplicate rows removed: {rows_removed}")

Total rows before: 1480148
Total rows after: 319593
Duplicate rows removed: 1160555


### 3.4: Select Authors to Keep

Create a selective list of authors for the visualization tool.

In [13]:
import pandas as pd

# Create list of authors to keep
authors_to_keep = [
    "Plato", "Aristotle", "Kant, Immanuel", "Homer",
    "Augustine, Saint, Bishop of Hippo", "Darwin, Charles", "Newton, Isaac",
    "Locke, John", "Shakespeare, William", "Cicero, Marcus Tullius",
    "Hegel, Georg Wilhelm Friedrich", "Goethe, Johann Wolfgang von",
    "Napoleon I, Emperor of the French", "Bahá'u'lláh", "Descartes, René",
    "Milton, John", "Lincoln, Abraham", "Wesley, John", "Plutarch",
    "Spinoza, Benedictus de", "Orr, Charles Ebert", "Hume, David",
    "Origen", "Schopenhauer, Arthur", "Mach, Ernst", "Virgil",
    "Voltaire", "Dionysius, of Alexandria, Saint", "Rousseau, Jean-Jacques",
    "Dante Alighieri", "Nietzsche, Friedrich Wilhelm",
    "Seneca, Lucius Annaeus", "Euclid", "Xenophon", "Hobbes, Thomas",
    "Berkeley, George", "Paine, Thomas",
    "Diogenes Laertius", "Emerson, Ralph Waldo", "Strauss, David Friedrich",
    "Melanchthon, Philipp", "Bergson, Henri", "Euripides",
    "Moore, G. E. (George Edward)", "Schiller, Friedrich", "Whitefield, George",
    "Huxley, Thomas Henry", "Pascal, Blaise", "Plotinus", "Franklin, Benjamin",
    "Marx, Karl", "Leibniz, Gottfried Wilhelm, Freiherr von", "Carlyle, Thomas",
    "Confucius", "Frazer, James George", "Russell, Bertrand", "Rand, Ayn",
    "Ambrose, Saint, Bishop of Milan", "Grimm, Jacob", "Freud, Sigmund", "Turing",
    
    "Lucretius Carus, Titus", "Wilde, Oscar", "Erasmus, Desiderius",
    "Montaigne, Michel de", "Cole, Lawrence Thomas", "Ptolemy",
    "Einstein, Albert", "Sophocles", "Bentham, Jeremy", "Hippolytus, Antipope",
    "Aristophanes", "Maimonides, Moses", "Lucian, of Samosata", "Ferri, Enrico",
    "Twain, Mark", "Jefferson, Thomas",
    "Tylor, Edward B. (Edward Burnett)",
    "Coleridge, Samuel Taylor", "Aurelius, Emperor of Rome", "Maimon, Solomon",
    "Whewell, William",
    "James, William", "Wundt, Wilhelm Max", "Aquinas, Saint",
    "Herschel, John F. W. (John Frederick William)", "Swedenborg, Emanuel",
    "Fletcher, Horace", "Theophrastus", "Faraday, Michael", "Epictetus",
    "Dewey, John", "Mill, John Stuart", "Focault",
    "Ober, Charles K. (Charles Kellogg)", "Laplace, Pierre Simon, marquis de",
    "Barrow, Isaac", "Tyndall, John", "Grote, George",
    "Herbart, Johann Friedrich", "Hervey, Walter Lowrie",
    "Giovanni, da Parma", "Jowett, John Henry", "Haeckel, Ernst", "Wittgenstein",
    
    "Gladden, Washington", "Smith, Adam", "Döllinger, Johann Joseph Ignaz von",
    "Boyle, Robert", "Croce, Benedetto", "Camus, Jean-Pierre",
    "Buchanan, James", "Tolstoy, Leo, graf",
    "Harnack, Adolf von", "Luther", "Bosanquet, Bernard",
    "Bunyan, John", "Livingstone, W. P. (William Pringle)",
    "Melville, Herman", "Sidgwick, Henry", "Priestley, Joseph",
    "Hesse, Hermann", "Dickens, Charles", "Gandhi, Mahatma",
    "Whitman, Walt", "Machiavelli, Niccolò", "Roosevelt, Theodore",
    "Churchill, Winston", "Jung, C. G. (Carl Gustav)",
    "Taft, William H. (William Howard)", "Schlegel, Friedrich von", "Boethius",
    "Fitzgerald, F. Scott (Francis Scott)", "Bacon, Francis",
    "Whitehead, Alfred North", "Avicenna",
    "Liebig, Justus, Freiherr von", "Atkinson, William Walker",
    "Michelet, Jules", "Agricola, Georg",
    "Holbach, Paul Henri Thiry, baron d'", "Blavatsky, H. P. (Helena Petrovna)",
    "Venn, John", "Iamblichus", "Keller, Helen", "Frege",
    "Tocqueville, Alexis de", "Toland, John", "Harrington, James",
    "Thoreau, Henry David", "Stoughton, John",
    "Spurgeon, Caroline F. E. (Caroline Frances Eleanor)", "Nassau, Robert Hamill",
    "Carnegie, Andrew", "Husserl",
    "Scheele, Carl Wilhelm", "Douglass", "Brentano, Franz",
    "Boole, Mary Everest", "Sartre", "Halévy, Daniel",
    "Nostradamus", "Frothingham, Octavius Brooks", "Oxonian", "Kafka, Franz",
    "Taine, Hippolyte", "Hügel, Friedrich, Freiherr von", "Mussolini, Benito",
    "Beauvoir", "Dostoyevsky, Fyodor", "Gödel", "Austen, Jane",
    "Schwegler, Albert", "Heidegger", "Guevara", 
]

# Read the original CSV
df = pd.read_csv('v7.csv')

# Filter to keep only rows where both authors are in our list
filtered_df = df[
    (df['referencing_author'].isin(authors_to_keep)) & 
    (df['referenced_author'].isin(authors_to_keep))
]

# Save filtered dataset
filtered_df.to_csv('v10.csv', index=False)

# Print statistics
print(f"Original dataset size: {len(df)}")
print(f"Filtered dataset size: {len(filtered_df)}")
print(f"References removed: {len(df) - len(filtered_df)}")

Original dataset size: 294970
Filtered dataset size: 97762
References removed: 197208


### 3.5: View Current Authors

In [1]:
import pandas as pd

references_df = pd.read_csv('v7.csv')

incoming_refs = references_df['referenced_author'].value_counts()

outgoing_refs = references_df['referencing_author'].value_counts()


print("\nTop most referenced authors (incoming references):")
print("------------------------------------------------")
for author, count in incoming_refs.head(1000).items():
    print(f"{author}: {count} references")

print("\nTop authors by outgoing references:")
print("----------------------------------")
for author, count in outgoing_refs.head(1000).items():
    print(f"{author}: {count} references")



Top most referenced authors (incoming references):
------------------------------------------------
Plato: 18825 references
Aristotle: 12534 references
Kant, Immanuel: 8448 references
Homer: 6474 references
Augustine, Saint, Bishop of Hippo: 5624 references
Darwin, Charles: 4930 references
Newton, Isaac: 4232 references
Locke, John: 3718 references
Shakespeare, William: 3646 references
Cicero, Marcus Tullius: 3637 references
Hegel, Georg Wilhelm Friedrich: 3528 references
Bab, ʻAli Muhammad Shirazi: 3438 references
Goethe, Johann Wolfgang von: 3314 references
Napoleon I, Emperor of the French: 3250 references
Bahá'u'lláh: 3129 references
Milton, John: 3105 references
Descartes, René: 3105 references
Lincoln, Abraham: 3050 references
Wesley, John: 2998 references
Plutarch: 2969 references
Spinoza, Benedictus de: 2792 references
Orr, Charles Ebert: 2734 references
Hume, David: 2718 references
Origen: 2257 references
Schopenhauer, Arthur: 2225 references
Mach, Ernst: 2213 references
Virg

### 3.6: Remove Authors by Number of Incoming / Outgoing References

In [44]:
df = pd.read_csv('test.csv')

incoming_refs = df['referenced_author'].value_counts()
outgoing_refs = df['referencing_author'].value_counts()

print("Initial dataset size:", len(df))
print("Initial unique referenced authors:", len(incoming_refs))

authors_to_remove = [
    author for author in incoming_refs.index 
    if incoming_refs[author] < 500
]

print(f"Found {len(authors_to_remove)} authors with <5 incoming references")

filtered_df = df[
    (~df['referencing_author'].isin(authors_to_remove)) & 
    (~df['referenced_author'].isin(authors_to_remove))
]

new_incoming_refs = filtered_df['referenced_author'].value_counts()
print("\nAfter filtering:")
print("Dataset size:", len(filtered_df))
print("Minimum incoming references:", new_incoming_refs.min())
print("Authors with <5 references:", sum(new_incoming_refs < 5))

filtered_df.to_csv('test2.csv', index=False)

Initial dataset size: 277901
Initial unique referenced authors: 972
Found 125 authors with <5 incoming references

After filtering:
Dataset size: 263307
Minimum incoming references: 1
Authors with <5 references: 14


### 3.7: More Duplicate Cleaning

In [10]:
import pandas as pd

csv_file_path = 'combined.csv'
df = pd.read_csv(csv_file_path)

print("Initial DataFrame:")
print(df.head())


total_rows_before = df.shape[0]
duplicate_rows = df.duplicated()
num_duplicates = duplicate_rows.sum()

print(f"\nTotal rows before removing duplicates: {total_rows_before}")
print(f"Number of duplicate rows: {num_duplicates}")

df_unique = df.drop_duplicates()
total_rows_after = df_unique.shape[0]

print(f"\nTotal rows after removing duplicates: {total_rows_after}")
print(f"Number of rows removed: {total_rows_before - total_rows_after}")

cleaned_csv_file_path = 'combined_filtered_unique.csv'
df_unique.to_csv(cleaned_csv_file_path, index=False)
print(f"\nCleaned data saved to '{cleaned_csv_file_path}'.")

df_verified = pd.read_csv(cleaned_csv_file_path)
num_duplicates_after = df_verified.duplicated().sum()

print(f"\nNumber of duplicate rows in cleaned CSV: {num_duplicates_after}")

if num_duplicates_after == 0:
    print("All duplicate rows have been successfully removed.")
else:
    print("There are still duplicate rows present.")


Initial DataFrame:
  referencing_author  referenced_author match_word     book_filename  \
0           Douglass  Mill, John Stuart       Mill  additional/1.txt   
1           Douglass  Mill, John Stuart       Mill  additional/1.txt   
2           Douglass  Mill, John Stuart       Mill  additional/1.txt   
3           Douglass  Mill, John Stuart       Mill  additional/1.txt   
4           Douglass  Mill, John Stuart       Mill  additional/1.txt   

                                             context  
0  ord. Fortunate, most fortunate occurrence ! — ...  
1  about seven years old, on one of my master's f...  
2  used to steal a bag which was used for carryin...  
3  fully upon us, — its robes already crimsoned w...  
4  slave system, and hastening the glad day of de...  

Total rows before removing duplicates: 1658565
Number of duplicate rows: 1302672

Total rows after removing duplicates: 355893
Number of rows removed: 1302672

Cleaned data saved to 'combined_filtered_unique.csv'.

Nu

## Creating Matrixes

For easier data analysis & visualization

### 4.1: Create Matrix

In [1]:
import pandas as pd
import numpy as np

# Read the data
references_df = pd.read_csv('v7.csv')

# Create the cross-tabulation matrix
reference_matrix = pd.crosstab(
    references_df['referencing_author'],
    references_df['referenced_author']
)

# Fill any NaN values with 0
reference_matrix = reference_matrix.fillna(0)

# Save the matrix to a CSV
reference_matrix.to_csv('expanded.csv')

### 4.2: Create Normalized Matrix

In [2]:
import pandas as pd
import numpy as np
import unicodedata

references_df = pd.read_csv('v7.csv')

def normalize_name(name):
    if pd.isna(name):
        return name
    
    return unicodedata.normalize('NFKD', str(name)).encode('ASCII', 'ignore').decode('ASCII')

references_df['referencing_author'] = references_df['referencing_author'].apply(normalize_name)
references_df['referenced_author'] = references_df['referenced_author'].apply(normalize_name)

reference_matrix = pd.crosstab(
    references_df['referencing_author'],
    references_df['referenced_author']
)

reference_matrix = reference_matrix.fillna(0)
reference_matrix.to_csv('expanded.csv')

In [3]:
# Optional: Display some basic stats about the matrix
print(f"Matrix shape: {reference_matrix.shape}")
print("\nSample of the matrix (top 5x5):")
print(reference_matrix.iloc[:20, :20])

Matrix shape: (1087, 995)

Sample of the matrix (top 5x5):
referenced_author                                Aaberg, J. C. (Jens Christian)  \
referencing_author                                                                
Aaberg, J. C. (Jens Christian)                                                0   
Abdu'l-Baha                                                                   0   
Abercrombie, John                                                             0   
Abhedananda, Swami                                                            0   
Abrahams, Israel                                                              0   
Accum, Friedrich Christian                                                    0   
Ackland, T. S. (Thomas Suter)                                                 0   
Acton, John Emerich Edward Dalberg Acton, Baron                               0   
Addams, Jane                                                                  0   
Adeney, Walter F. (Walter Fr

### 4.3: Test Matrixes

In [4]:
matrix_df = pd.read_csv('classified/ethics_filtered.csv', index_col=0)

references_received = matrix_df.sum()
top_referenced = references_received.sort_values(ascending=False).head(500)

references_given = matrix_df.sum(axis=1)
top_referencers = references_given.sort_values(ascending=False).head(500)

print("Top 10 Most Referenced Authors:")
print("-" * 10)
for author, count in top_referenced.items():
    print(f"{author}: {int(count)} references")

print("\nTop 10 Authors Who Reference Others Most:")
print("-" * 10)
for author, count in top_referencers.items():
    print(f"{author}: {int(count)} references")

Top 10 Most Referenced Authors:
----------
Plato: 4438 references
Aristotle: 3185 references
Kant, Immanuel: 2590 references
Augustine, Saint, Bishop of Hippo: 1398 references
Locke, John: 979 references
Homer: 967 references
Hegel, Georg Wilhelm Friedrich: 739 references
Dionysius, of Alexandria, Saint: 684 references
Cicero, Marcus Tullius: 651 references
Hume, David: 581 references
Descartes, Rene: 579 references
Darwin, Charles: 573 references
Plutarch: 549 references
Spinoza, Benedictus de: 543 references
Leibniz, Gottfried Wilhelm, Freiherr von: 472 references
Xenophon: 336 references
Hobbes, Thomas: 320 references
Seneca, Lucius Annaeus: 318 references
Bentham, Jeremy: 306 references
Milton, John: 302 references
Rousseau, Jean-Jacques: 302 references
Berkeley, George: 295 references
Euclid: 292 references
Montaigne, Michel de: 286 references
Origen: 282 references
Grimm, Jacob: 277 references
Schopenhauer, Arthur: 267 references
Virgil: 256 references
Goethe, Johann Wolfgang von

### 4.4: Filter Author List

In [4]:
import pandas as pd
import unicodedata

def normalize_name(name):
    if pd.isna(name):
        return name
    return unicodedata.normalize('NFKD', str(name)).encode('ASCII', 'ignore').decode('ASCII')

authors_df = pd.read_csv('authors2.csv')
matrix_df = pd.read_csv('expanded.csv', index_col=0)

matrix_authors = set(matrix_df.index) | set(matrix_df.columns)

authors_df['Normalized_Author'] = authors_df['Author'].apply(normalize_name)
matrix_authors = {normalize_name(author) for author in matrix_authors}

filtered_authors = authors_df[authors_df['Normalized_Author'].isin(matrix_authors)]
filtered_authors = filtered_authors.drop('Normalized_Author', axis=1)
filtered_authors.to_csv('expanded_authors.csv', index=False)

print(f"Original number of authors: {len(authors_df)}")
print(f"Filtered number of authors: {len(filtered_authors)}")

Original number of authors: 1131
Filtered number of authors: 1124


In [3]:

import pandas as pd
# Read the matrix
matrix_df = pd.read_csv('matrix.csv', index_col=0)

# Get the top 50 most referenced authors
top_50_authors = matrix_df.sum().sort_values(ascending=False).head(150).index

# Filter matrix to only include top 50 authors in both rows and columns
filtered_matrix = matrix_df.loc[top_50_authors, top_50_authors]

# Save filtered matrix
filtered_matrix.to_csv('top_150_matrix.csv')

print(f"Created test_matrix.csv with shape: {filtered_matrix.shape}")


Created test_matrix.csv with shape: (50, 50)


In [16]:
import pandas as pd

df = pd.read_csv('books_filtered3.csv')

unique_authors = df.groupby('Author').agg({
    'Birth Year': 'first',
    'Death Year': 'first'
}).reset_index()


unique_authors = unique_authors.sort_values('Author')
unique_authors.to_csv('unique_authors_years.csv', index=False)
print(unique_authors.head())
print(f"\nTotal unique authors: {len(unique_authors)}")

                           Author  Birth Year  Death Year
0  Aaberg, J. C. (Jens Christian)      1877.0      1970.0
1               Abercrombie, John      1780.0      1844.0
2              Abhedananda, Swami      1866.0      1939.0
3                Abrahams, Israel      1858.0      1925.0
4      Accum, Friedrich Christian      1769.0      1838.0

Total unique authors: 1130
