In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import numpy as np
import altair as alt
import pandas as pd

from scipy.sparse import lil_matrix
from annoy import AnnoyIndex
from collections import defaultdict
from titlecase import titlecase
from umap import UMAP

from sklearn.decomposition import TruncatedSVD
from sklearn import manifold
from sklearn.decomposition import PCA

from sqlalchemy.dialects.postgresql import array_agg
from sqlalchemy.sql import functions as func

from osp_graphs.v1_db import session, Text, Field, Subfield, SubfieldDocument, Citation, Document

In [3]:
count = func.count(Citation.text_id)

titles, surnames, text_doc_ids = zip(*session
    .query(Text.title, Text.surname, array_agg(Citation.document_id))
    .join(Citation)
    .filter(Text.valid==True)
    .filter(Text.display==True)
    .group_by(Text.id)
    .order_by(count.desc())
    .limit(2000)
    .all())

In [4]:
titles = [titlecase(t.strip(' /.,;')) for t in titles]

In [5]:
surnames = [s.strip(' /.,;').capitalize() for s in surnames]

In [6]:
doc_ids = list(set([did for doc_ids in text_doc_ids for did in doc_ids]))
doc_id_to_idx = {did: i for i, did in enumerate(doc_ids)}

In [7]:
M = lil_matrix((len(doc_ids), len(surnames)))

for sidx, doc_ids in enumerate(text_doc_ids):
    for did in doc_ids:
        didx = doc_id_to_idx[did]
        M[didx, sidx] += 1

In [8]:
M.sum()

787921.0

In [9]:
svd = TruncatedSVD(n_components=20)

In [10]:
svd.fit(M)

TruncatedSVD(algorithm='randomized', n_components=20, n_iter=5,
       random_state=None, tol=0.0)

In [11]:
embedding = svd.components_.T

In [12]:
embedding.shape

(2000, 20)

In [14]:
idx = AnnoyIndex(20)
for i, v in enumerate(embedding):
    idx.add_item(i, v)
    
idx.build(10)

True

In [15]:
df = pd.DataFrame(list(zip(titles, surnames)), columns=('title', 'surname'))

In [16]:
df

Unnamed: 0,title,surname
0,The Elements of Style,Strunk
1,Republic,Plato
2,The Communist Manifesto,Marx
3,Biology,Campbell
4,Frankenstein,Shelley
5,Ethics,Aristotle
6,Leviathan,Hobbes
7,The Prince,Machiavelli
8,Oedipus,Sophocles
9,Hamlet,Shakespeare


In [17]:
df[df.surname.str.contains('Morrison')]

Unnamed: 0,title,surname
42,Beloved,Morrison
139,The Bluest Eye,Morrison
314,Song of Solomon,Morrison
339,Sula,Morrison


In [18]:
df[df.surname.str.contains('Hemingway')]

Unnamed: 0,title,surname
285,The Sun Also Rises,Hemingway
612,A Farewell to Arms,Hemingway
764,Short Stories,Hemingway
809,In Our Time,Hemingway
1014,The Old Man and the Sea,Hemingway
1636,For Whom the Bell Tolls,Hemingway
1839,The Snows of Kilimanjaro,Hemingway


In [19]:
df[df.surname.str.contains('Shakespeare')]

Unnamed: 0,title,surname
9,Hamlet,Shakespeare
44,The Tempest,Shakespeare
60,Henry V,Shakespeare
74,Macbeth,Shakespeare
76,Othello,Shakespeare
99,King Lear,Shakespeare
137,Midsummer N. Dream,Shakespeare
138,Romeo and Juliet,Shakespeare
143,The Complete Works,Shakespeare
216,Twelfth Night,Shakespeare


In [20]:
df[df.surname.str.contains('Marx')]

Unnamed: 0,title,surname
2,The Communist Manifesto,Marx
43,Capital,Marx
157,The German Ideology,Marx
232,Manifesto of the Communist Party,Marx
461,Selected Writings,Marx
1180,Economic and Philosophic Manuscripts of 1844,Marx
1335,The Machine in the Garden,Marx
1455,A Contribution to the Critique of Political Ec...,Marx
1469,Kapital,Marx
1681,The Grundrisse,Marx


In [21]:
def nns_by_tid(tid, n=20):
    for tidx in idx.get_nns_by_item(tid, n):
        print(titles[tidx])

In [22]:
def nns_by_vector(v, n=20):
    for tidx in idx.get_nns_by_vector(v, n):
        print(titles[tidx])

In [23]:
nns_by_tid(43)

Capital
The Grundrisse
A Contribution to the Critique of Political Economy
Suicide
On Marx
The Protestant Ethic and the Spirit of Capitalism
Who Rules America?
Economic and Philosophical Manuscripts
Early Writings
Manifesto of the Communist Party
The Division of Labor in Society
Economic and Philosophic Manuscripts of 1844
The German Ideology
State and Revolution
Lenin
The Russian Revolution
On Being Sane in Insane Places
Unequal Childhoods : Class, Race, and Family Life
Body Ritual Among the Nacirema
Karl Marx


In [24]:
nns_by_tid(26)

Origin of Species
On the Origin of Species
The Descent of Man
The Voyage of the Beagle
An Essay on the Principle of Population
Evolution and Natural Selection
An Essay on Population
The Economic Consequences of the Peace
On Population
Mein Kampf
The Blind Watchmaker
Stalin
The Queen of Spades
Confessions
Lenin
Natural Theology
Civilization and Its Discontents
A Vindication of the Rights of Women
Fairy Tales
The Principles of Psychology


In [28]:
nns_by_vector(np.mean([embedding[42], embedding[285]], 0))

Beloved
Sula
Light in August
House Made of Dawn
The Sound and the Fury
Song of Solomon
Love Medicine
Their Eyes Were Watching God
As I Lay Dying
The Sun Also Rises
Invisible Man
Native Son
Bastard Out of Carolina
The Crying of Lot 49
The Bluest Eye
In Our Time
Ceremony
The Color Purple
The Autobiography of an Ex-Colored Man
Quicksand


In [30]:
nns_by_vector(np.mean([embedding[2], embedding[10]], 0))

The Communist Manifesto
The Revolution of 1905
What Is to Be Done?
Manifesto of the Communist Party
State and Revolution
The Russian Revolution
Lenin
On Marx
Mein Kampf
Economic and Philosophical Manuscripts
Kapital
The Condition of the Working Class in England
The General Theory of Employment
Thus Spoke Zarathustra
Civilization and Its Discontents
The General Theory of Employment, Interest and Money
On Population
An Essay on Population
Body Ritual Among the Nacirema
On Being Sane in Insane Places
