In [1]:
import re
import pandas as pd
from openai import OpenAI

In [2]:
# Read `_toc.yml` (presumably the book's table of contents -- confirm) as raw
# text. The encoding is pinned so the read does not depend on the platform's
# default locale encoding.
with open('_toc.yml', 'r', encoding='utf-8') as infile:
    text = infile.read()

In [3]:
# Collect every substring that starts at "Volumes" and runs to the end of its
# line, with surrounding whitespace removed.
# `.` never matches a newline, so `Volumes.*` captures the rest of the line
# without requiring a trailing "\n" -- an entry on the last line is still
# found when the file does not end with a newline.
md_fns = [fn.strip() for fn in re.findall(r'Volumes.*', text)]

In [4]:
# Build one record per markdown file: the "key: value" pairs found in the
# file's leading HTML comment (<!-- ... -->) plus the article body under 'text'.
meta = []
for md_fn in md_fns:
    # Pin the encoding so parsing does not depend on the platform default.
    with open(md_fn, 'r', encoding='utf-8') as infile:
        md = infile.read()

    # Everything before the first '-->' is the header; the rest is the body.
    header, closer, body = md.partition('-->')
    if not closer:
        # No metadata comment at all: keep the whole file as body text rather
        # than mining the article itself for "key: value" lines.
        header, body = '', md

    # Only what follows the last '<!--' in the header is treated as metadata.
    meta_text = header.rpartition('<!--')[2].splitlines()

    # Defaults are seeded up front so they apply even when the header has no
    # rows at all; a matching key in the header overrides them below.
    meta_d = {'fn': md_fn, 'author': 'W.E.B. Du Bois', 'journal': 'The Crisis'}
    for row in meta_text:
        if ':' not in row:
            continue
        # Split on the first colon only; the value may itself contain colons.
        k, sep, v = row.partition(':')
        meta_d[k.strip()] = v.strip()

    # Keep the body up to the first '*Citation:*' marker. The body is taken
    # verbatim, so any later '-->' inside the article is preserved.
    meta_d['text'] = body.split('*Citation:*')[0]

    meta.append(meta_d)

In [5]:
# Tabulate the per-file records collected in `meta`.
df = pd.DataFrame(data=meta)

# Show three randomly chosen rows as a quick look at the parsed result.
df.sample(n=3)

Unnamed: 0,fn,author,journal,title,year,volume,issue,pages,text,index
307,Volumes/23/04/africa_for_the_africans.md,"Du Bois, W.E.B.",The Crisis,Africa for the Africans,1921,23,4,154-155,\n# Africa for the Africans (1921)\n\nThe Asso...,
434,Volumes/04/05/garrison_and_womans_suffrage.md,"Garrison Villard, Fanny",The Crisis,Garrison and Woman's Suffrage,1912,4,5,240-242,\n# Garrison and Woman's Suffrage (1912)\n\n*B...,
168,Volumes/06/01/womans_suffrage.md,"Du Bois, W.E.B.",The Crisis,Woman's Suffrage,1913,6,1,29,\n\n# Woman's Suffrage (1913)\n\nThere seems t...,


In [27]:
# Fixed-seed 50-row subsample, so repeated runs pick the same rows.
# NOTE(review): `dfs` is not referenced by any cell visible here -- the corpus
# below is built from the full frame. Confirm whether it is still needed.
dfs = df.sample(50, random_state=0)

# Series.to_numpy() is what pandas recommends over .values: it always yields a
# NumPy array, whatever dtype backs the 'text' column.
corpus = df['text'].to_numpy()

In [28]:
"""
This is a simple application for sentence embeddings: clustering

Sentences are mapped to sentence embeddings and then agglomerative clustering with a threshold is applied.
"""
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Load the 'all-MiniLM-L6-v2' sentence-embedding model.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Embed every entry of `corpus`, then divide each row by its own L2 norm so
# that all embedding vectors have unit length.
raw_embeddings = embedder.encode(corpus)
row_norms = np.linalg.norm(raw_embeddings, axis=1, keepdims=True)
corpus_embeddings = raw_embeddings / row_norms



In [11]:
# Agglomerative clustering: the number of clusters is not fixed in advance
# (n_clusters=None); merging is governed by distance_threshold instead.
# No linkage or metric is passed, so scikit-learn's defaults apply.
clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.2)
cluster_assignment = clustering_model.fit(corpus_embeddings).labels_

# Group the corpus entries by the cluster label each one was assigned.
clustered_sentences = {}
for cluster_id, sentence in zip(cluster_assignment, corpus):
    clustered_sentences.setdefault(cluster_id, []).append(sentence)

# Report each cluster's (1-based) number followed by its size.
for label, members in clustered_sentences.items():
    print("Cluster ", label + 1)
    print(len(members))
    print("")

Cluster  13
4

Cluster  14
4

Cluster  7
5

Cluster  12
1

Cluster  15
2

Cluster  6
5

Cluster  5
4

Cluster  16
1

Cluster  4
5

Cluster  3
5

Cluster  2
2

Cluster  1
5

Cluster  10
3

Cluster  8
2

Cluster  11
1

Cluster  9
1



In [29]:
from sklearn.cluster import KMeans

In [32]:
# Perform k-means clustering
num_clusters = 25
# random_state pins the centroid initialisation so that cluster membership is
# the same on every run.
clustering_model = KMeans(n_clusters=num_clusters, n_init='auto', random_state=0)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

# Bucket the corpus entries by their assigned cluster label.
clustered_sentences = [[] for _ in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus[sentence_id])

# For each cluster, print the first line of every document that is longer than
# one character (for the documents shown in the output, the "# Title (year)"
# heading).
for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    for a in cluster:
        # next() with a default replaces indexing [0], which raised IndexError
        # for a document with no such line (e.g. an empty text).
        print(next((l for l in a.splitlines() if len(l) > 1), "(empty document)"))
    print("")

Cluster  1
# Intentions (1923)
# The South in the Saddle (1914)
# The Fourteenth Amendment (1915)
# Disenfranchisement (1925)
# The Possibility of Democracy (1928)
# Young Voters (1932)
# The Strategy of the Negro Voter (1933)
# Voting (1910)
# The Last Word in Politics (1912)
# The Republicans (1915)
# The Oath of the Negro Voter (1918)
# How Shall We Vote (1920)
# The Unreal Campaign (1920)
# Political Straws (1923)
# Vote (1924)
# The N.A.A.C.P. and Parties (1924)
# How Shall We Vote (1924)
# A Third Party (1928)
# The Campaign of 1928 (1928)
# Third Party (1929)
# Herbert Hoover and the South (1929)
# A New Party (1930)
# The Blair Bill (1911)

Cluster  2
# Hail Columbia (1913)
# Hail, Columbia! (1911)
# Coffeeville, Kanasas (1927)
# Roosevelt (1917)
# The Massacre in East St. Louis (1917)
# Houston (1917)
# Thirteen (1918)
# Houston and East St. Louis (1918)
# The Challenge of Detroit (1925)
# Libelous Film (1921)
# Violence (1934)
# Ireland (1916)
# The Flood, the Red Cross and t

# Russia, 1926 (1926)
# The Princess of the Hither Isles (1913)
