In [104]:
import matplotlib.pyplot as plt
%matplotlib inline

In [105]:
import numpy as np
import altair as alt
import pandas as pd

import umap

from scipy.sparse import lil_matrix
from annoy import AnnoyIndex
from collections import defaultdict
from titlecase import titlecase

from sklearn.decomposition import TruncatedSVD
from sklearn import manifold
from sklearn.decomposition import PCA

from sqlalchemy.dialects.postgresql import array_agg
from sqlalchemy.sql import functions as func

from osp_graphs.v1_db import session, Text, Field, Subfield, SubfieldDocument, Citation, Document

In [106]:
# Rank texts by their number of citation rows and keep the 1,000 most cited.
# Each result row carries the text's surname, its title, and the aggregated
# ids of the documents that cite it.
count = func.count(Citation.text_id)

top_texts_query = (
    session
    .query(Text.surname, Text.title, array_agg(Citation.document_id))
    .join(Citation)
    .filter(Text.valid==True, Text.display==True)
    .group_by(Text.id)
    .order_by(count.desc())
    .limit(1000)
)

res = top_texts_query.all()

In [107]:
# Merge texts by author surname. A surname is normalised by trimming leading
# and trailing '.' / ',' characters and lower-casing it; every normalised
# surname then accumulates the ids of its citing documents and its titles.
surname_docs = defaultdict(set)
surname_titles = defaultdict(set)

for raw_surname, text_title, citing_doc_ids in res:
    author_key = raw_surname.strip('.,').lower()
    surname_docs[author_key] |= set(citing_doc_ids)
    surname_titles[author_key].add(text_title)

In [108]:
# Fix an alphabetical ordering of the normalised surnames and record each
# surname's position in that ordering.
surnames = sorted(surname_docs)
surname_to_idx = dict(zip(surnames, range(len(surnames))))

In [109]:
# Collect every distinct document id cited by the selected texts and assign
# each one an index. The ids are sorted so the index assignment is stable
# across runs instead of depending on set iteration order.
doc_ids = sorted({did for _, _, text_doc_ids in res for did in text_doc_ids})
doc_id_to_idx = {did: i for i, did in enumerate(doc_ids)}

In [110]:
# Dense incidence matrix: one row per document, one column per surname.
# An entry is incremented for every (document, surname) pair in surname_docs.
sdm = np.zeros((len(doc_ids), len(surnames)))

for surname in surnames:
    col = surname_to_idx[surname]
    # surname_docs[surname] is a set, so the row indices are distinct and a
    # single fancy-indexed increment touches each cell exactly once.
    doc_rows = [doc_id_to_idx[doc_id] for doc_id in surname_docs[surname]]
    sdm[doc_rows, col] += 1

In [111]:
# Convert the dense document-by-surname matrix to a SciPy LIL sparse matrix.
sdm_sparse = lil_matrix(sdm)

In [112]:
# Sanity check: the sum of all entries in the sparse matrix.
sdm_sparse.sum()

495788.0

In [113]:
# Truncated SVD configured to keep 10 components.
svd = TruncatedSVD(n_components=10)

In [114]:
# Fit the SVD on the sparse document-by-surname matrix.
svd.fit(sdm_sparse)

TruncatedSVD(algorithm='randomized', n_components=10, n_iter=5,
       random_state=None, tol=0.0)

In [115]:
# Shape of the transposed component matrix: one row per input column
# (surname), one column per SVD component.
svd.components_.T.shape

(642, 10)

In [None]:
# Project each surname into 2-D with UMAP.
# `sdm` is laid out as (documents x surnames), and the embedding rows are later
# zipped with `surnames`, so UMAP must receive one sample per surname: pass the
# transpose. Fitting on `sdm` itself embeds documents, and the zip then
# silently pairs each surname with an unrelated document's coordinates.
embedding = umap.UMAP(n_neighbors=10, min_dist=0.2).fit_transform(sdm.T)

In [None]:
# One record per surname: the normalised surname, the set of its titles, and
# the 2-D coordinates from the positionally paired row of `embedding`.
rows = [
    {'surname': name, 'titles': surname_titles[name], 'x': x_pos, 'y': y_pos}
    for name, (x_pos, y_pos) in zip(surnames, embedding)
]

In [None]:
# Build a DataFrame with one row per record in `rows`.
df = pd.DataFrame(rows)

In [None]:
# Display form of the surname: first character upper-cased, the rest lower-cased.
df['surname'] = df['surname'].map(str.capitalize)

In [None]:
# Turn each author's set of raw titles into one display string for the tooltip.
# Titles are cleaned (surrounding ' ', '/', ';', '.' removed) and title-cased,
# then de-duplicated *after* cleaning and sorted. Variants that normalise to
# the same string collapse into one entry, and the joined text no longer
# depends on set iteration order, which can differ between interpreter runs.
df.titles = df.titles.apply(
    lambda ts: ', '.join(sorted({titlecase(t.strip(' /;.')) for t in ts})))

In [None]:
# Text-label scatter plot: each surname is drawn at its (x, y) embedding
# position, with the author's titles as the hover tooltip.
# The previous chain called .mark_circle() and then .mark_text(). Each mark_*
# call replaces the chart's mark, so only the text mark was ever rendered.
# The overridden call is dropped and the encodings merged; the resulting chart
# is the same. To actually show circles under the labels, layer two charts.
chart = (alt
    .Chart(df, width=1000, height=700)
    .mark_text()
    .encode(x='x', y='y', tooltip='titles', text='surname'))

In [None]:
# Display the chart with Altair's interactive (pan/zoom) scales enabled.
chart.interactive()