In [9]:
import matplotlib.pyplot as plt
%matplotlib inline

In [10]:
import numpy as np
import altair as alt
import pandas as pd

from scipy.sparse import lil_matrix
from annoy import AnnoyIndex
from collections import defaultdict
from titlecase import titlecase
from umap import UMAP

from sklearn.decomposition import TruncatedSVD
from sklearn import manifold
from sklearn.decomposition import PCA

from sqlalchemy.dialects.postgresql import array_agg
from sqlalchemy.sql import functions as func

from osp_graphs.v1_db import session, Text, Field, Subfield, SubfieldDocument, Citation, Document

In [11]:
count = func.count(Citation.text_id)

titles, surnames, text_doc_ids = zip(*session
    .query(Text.title, Text.surname, array_agg(Citation.document_id))
    .join(Citation)
    .filter(Text.valid==True)
    .filter(Text.display==True)
    .group_by(Text.id)
    .order_by(count.desc())
    .limit(2000)
    .all())

In [12]:
titles = [titlecase(t.strip(' /.,;')) for t in titles]

In [13]:
surnames = [s.strip(' /.,;').capitalize() for s in surnames]

In [14]:
doc_ids = list(set([did for doc_ids in text_doc_ids for did in doc_ids]))
doc_id_to_idx = {did: i for i, did in enumerate(doc_ids)}

In [15]:
M = lil_matrix((len(doc_ids), len(surnames)))

for sidx, doc_ids in enumerate(text_doc_ids):
    for did in doc_ids:
        didx = doc_id_to_idx[did]
        M[didx, sidx] += 1

In [16]:
M.sum()

787921.0

In [17]:
svd = TruncatedSVD(n_components=10)

In [18]:
svd.fit(M)

TruncatedSVD(algorithm='randomized', n_components=10, n_iter=5,
       random_state=None, tol=0.0)

In [36]:
embedding = svd.components_.T

In [37]:
embedding.shape

(2000, 10)

In [38]:
idx = AnnoyIndex(10)
for i, v in enumerate(embedding):
    idx.add_item(i, v)
    
idx.build(10)

True

In [39]:
df = pd.DataFrame(list(zip(titles, surnames)), columns=('title', 'surname'))

In [61]:
df

Unnamed: 0,title,surname
0,The Elements of Style,Strunk
1,Republic,Plato
2,The Communist Manifesto,Marx
3,Biology,Campbell
4,Frankenstein,Shelley
5,Ethics,Aristotle
6,Leviathan,Hobbes
7,The Prince,Machiavelli
8,Oedipus,Sophocles
9,Hamlet,Shakespeare


In [40]:
df[df.surname.str.contains('Marx')]

Unnamed: 0,title,surname
2,The Communist Manifesto,Marx
43,Capital,Marx
157,The German Ideology,Marx
232,Manifesto of the Communist Party,Marx
461,Selected Writings,Marx
1180,Economic and Philosophic Manuscripts of 1844,Marx
1335,The Machine in the Garden,Marx
1455,A Contribution to the Critique of Political Ec...,Marx
1469,Kapital,Marx
1681,The Grundrisse,Marx


In [57]:
df[df.surname.str.contains('Milton')]

Unnamed: 0,title,surname
20,Paradise Lost,Milton
437,Paradise Lost : Book I,Milton
797,Areopagitica,Milton
976,Paradise Lost. Books 1-2,Milton
1563,Lycidas,Milton


In [43]:
def nns_by_tid(tid, n=20):
    for tidx in idx.get_nns_by_item(tid, n):
        print(titles[tidx])

In [47]:
def nns_by_vector(v, n=20):
    for tidx in idx.get_nns_by_vector(v, n):
        print(titles[tidx])

In [45]:
nns_by_tid(43)

Capital
The Grundrisse
A Contribution to the Critique of Political Economy
The German Ideology
The Russian Revolution
Suicide
The Protestant Ethic and the Spirit of Capitalism
Durkheim
The Division of Labor in Society
Unequal Childhoods : Class, Race, and Family Life
The Elementary Forms of Religious Life
Prison Notebooks
From the 'Prison Notebooks'
Economic and Philosophic Manuscripts of 1844
Manifesto of the Communist Party
The Russian Revolution
Who Rules America?
The Origin of the Family, Private Property and the State
Karl Marx
The Rules of Sociological Method


In [64]:
nns_by_tid(26)

Origin of Species
The Descent of Man
On the Origin of Species
The Condition of the Working Class in England
The Subjection of Women
We Have Never Been Modern
A Vindication of the Rights of Women
The Rites of Passage
Tristan and Isolde
The Unconscious
The Voyage of the Beagle
The Interpretation of Dreams
Britons : Forging the Nation, 1707-1837
An Essay on the Principle of Population
The Marriage of Figaro
Kapital
Traviata
La Traviata
Rights of Man
An Essay on Population


In [71]:
nns_by_vector(np.mean([embedding[2], embedding[20]], 0))

On Population
An Essay on Population
An Essay on the Principle of Population
Mein Kampf
The Road to Wigan Pier
Reflections on the Revolution in France
The Economic Consequences of the Peace
An Inquiry Into the Nature and Causes of the Wealth of Nations
Wealth of Nations
What Is to Be Done?
The Revolution of 1905
Stalin
Imperialism
A History of Western Society
The Condition of the Working Class in England
Kapital
Lenin
State and Revolution
Principles of Political Economy
The General Theory of Employment, Interest and Money


In [69]:
np.mean([embedding[24], embedding[26]], 0)

array([ 0.00700565,  0.07868603, -0.1062342 ,  0.00669924,  0.0181963 ,
       -0.00912411, -0.01045703, -0.02161802,  0.08305466, -0.02000347])