In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

In [65]:
import numpy as np
import pandas as pd

from scipy.sparse import lil_matrix
from sklearn.decomposition import TruncatedSVD
from annoy import AnnoyIndex

from sqlalchemy.dialects.postgresql import array_agg
from sqlalchemy.sql import functions as func

from osp_graphs.v1_db import session, Text, Field, Subfield, SubfieldDocument, Citation, Document

In [3]:
count = func.count(Citation.text_id)

texts = (session
    .query(Text.id, Text.title, Text.authors, array_agg(Citation.document_id))
    .join(Citation)
    .filter(Text.valid==True)
    .filter(Text.display==True)
    .group_by(Text.id)
    .order_by(count.desc())
    .limit(1000)
    .all())

In [71]:
rows = [
    (t[0], t[1].strip('/ '), t[2][0])
    for t in texts
]

df = pd.DataFrame(rows, columns=('id', 'title', 'author'))

In [81]:
df[df.author.str.contains('Hemingway')]

Unnamed: 0,id,title,author
285,203595,The sun also rises.,"Hemingway, Ernest, 1899-1961."
611,203533,A farewell to arms.,"Hemingway, Ernest, 1899-1961."
763,203507,Short stories.,"Hemingway, Ernest, 1899-1961"
811,1409303,In our time,"Hemingway, Ernest, 1899-1961."


In [83]:
df[df.author.str.contains('Fitzgerald')]

Unnamed: 0,id,title,author
35,853902,The great Gatsby,"Fitzgerald, F. Scott (Francis Scott), 1896-1940."


In [91]:
df[df.author.str.contains('Shakespeare')]

Unnamed: 0,id,title,author
9,607094,Hamlet,"Shakespeare, William, 1564-1616."
44,424130,The tempest,"Shakespeare, William, 1564-1616."
60,967789,Henry V,"Shakespeare, William, 1564-1616."
74,426985,Macbeth,"Shakespeare, William, 1564-1616."
76,957708,Othello.,"Shakespeare, William, 1564-1616."
99,734648,King Lear.,"Shakespeare, William, 1564-1616."
136,3917541,Midsummer N. Dream,"Shakespeare, William, 1564-1616."
138,564864,Romeo and Juliet,"Shakespeare, William, 1564-1616."
143,1015949,The complete works,"Shakespeare, William, 1564-1616."
216,612357,Twelfth night,"Shakespeare, William, 1564-1616."


In [97]:
df[df.author.str.contains('Marx')]

Unnamed: 0,id,title,author
2,810087,The Communist manifesto,"Marx, Karl, 1818-1883."
43,5953255,"Capital,","Marx, Karl, 1818-1883."
158,1397176,The German ideology,"Marx, Karl, 1818-1883."
232,148644,"Manifesto of the Communist party,","Marx, Karl, 1818-1883."
461,547464,Selected writings,"Marx, Karl, 1818-1883."


In [96]:
df[df.author.str.contains('Jameson')]

Unnamed: 0,id,title,author
336,2149839,"Postmodernism, or, The cultural logic of late ...","Jameson, Fredric."


In [102]:
df[df.author.str.contains('Conrad')]

Unnamed: 0,id,title,author
14,2593372,Heart of darkness,"Conrad, Joseph, 1857-1924."
873,3608293,Typhoon,"Conrad, Joseph, 1857-1924."


In [106]:
df[df.author.str.contains('Deitel')]

Unnamed: 0,id,title,author
32,2595909,C : how to program,"Deitel, Harvey M., 1945-"
316,3014534,Java : how to program,"Deitel, Paul J."
371,9643576,Java.,"Deitel, Paul J."


In [10]:
text_ids = [t[0] for t in texts]
text_id_to_idx = {tid: i for i, tid in enumerate(text_ids)}

In [11]:
doc_ids = list(set([i for t in texts for i in t[-1]]))
doc_id_to_idx = {did: i for i, did in enumerate(doc_ids)}

In [13]:
len(text_ids)

1000

In [41]:
len(doc_ids)

225711

In [42]:
tdm = np.zeros((len(doc_ids), len(text_ids)))

In [43]:
for t in texts:
    for did in t[-1]:
        didx = doc_id_to_idx[did]
        tidx = text_id_to_idx[t[0]]
        tdm[didx][tidx] += 1

In [51]:
tdm.shape

(225711, 1000)

In [46]:
tdm.sum()

570610.0

In [47]:
svd = TruncatedSVD(n_components=10)

In [48]:
svd.fit(tdm)

TruncatedSVD(algorithm='randomized', n_components=10, n_iter=5,
       random_state=None, tol=0.0)

In [52]:
svd.components_.T.shape

(1000, 10)

In [53]:
svd.components_.T[0]

array([ 4.33040780e-02, -1.54850059e-02, -7.26197304e-04, -3.26942832e-02,
        6.38129505e-02,  8.95972922e-01, -1.52567413e-01, -3.79251349e-01,
        9.01681994e-02, -2.16820014e-02])

In [55]:
idx = AnnoyIndex(10)
for i, v in enumerate(svd.components_.T):
    idx.add_item(i, v)
    
idx.build(10)

True

In [101]:
for tidx in idx.get_nns_by_item(336, 20):
    print(texts[tidx][1])

Postmodernism, or, The cultural logic of late capitalism /
The black Atlantic : modernity and double consciousness /
Masculinities /
Subculture, the meaning of style /
Simulations /
The location of culture /
The history of sexuality /
The practice of everyday life /
Black skin, white masks /
Gender trouble : feminism and the subversion of identity /
Society of the spectacle /
Simulacra and simulation /
Discipline and punish : the birth of the prison /
Dialectic of enlightenment /
The postmodern condition : a report on knowledge /
Course in general linguistics /
THE CULTURE INDUSTRY
Life on the screen : identity in the age of the Internet /
Fundamental methods of mathematical economics /
Compulsory heterosexuality and lesbian existence /


In [85]:
svd.components_.T[285] - svd.components_.T[35]

array([-0.05839442,  0.01371266, -0.06545626,  0.02957595,  0.04176697,
       -0.01438533, -0.09723236, -0.00922089, -0.02935313, -0.01031321])

In [89]:
for tidx in idx.get_nns_by_vector(svd.components_.T[285] - svd.components_.T[35], 10):
    print(texts[tidx][1])

An essay on criticism /
Preface to Lyrical ballads. /
Ode to a nightingale /
The rights of woman /
A vindication of the rights of woman /
Lyrical ballads 1805 /
Songs of experience.
Kubla Khan
Songs of innocence and experience.
Songs of innocence.


In [90]:
for tidx in idx.get_nns_by_vector(svd.components_.T[35] - svd.components_.T[285], 10):
    print(texts[tidx][1])

The Lone Ranger and Tonto fistfight in heaven /
The awakening /
The great Gatsby /
The red badge of courage,
I know why the caged bird sings.
Huckleberry Finn /
Invisible man /
Adventures of Huckleberry Finn /
Sister Carrie /
Winesburg, Ohio /


In [93]:
for tidx in idx.get_nns_by_vector(svd.components_.T[138] - svd.components_.T[9], 10):
    print(texts[tidx][1])

Macroeconomics /
Introductory econometrics : a modern approach /
Florida State University
THE CULTURE INDUSTRY
Data and computer communications /
Doing Gender
Selected writings, 1877-1930.
Microeconomic theory /
Power /
Discipline and punish : the birth of the prison /


In [100]:
for tidx in idx.get_nns_by_vector(svd.components_.T[2] + svd.components_.T[336], 20):
    print(texts[tidx][1])

The Communist manifesto /
Manifesto of the Communist party,
The German ideology /
The great transformation /
On Marx /
The division of labor in society.
Karl Marx /
The Protestant Ethic and the spirit of capitalism /
Capital,
The Russian revolution,
Marx /
The three worlds of capitalism /
The three worlds of welfare capitalism /
Economic growth /
Suicide.
The rules of sociological method.
Power : a radical view /
Institutional and economic change /
The power elite.
The modern world-system /


In [107]:
for tidx in idx.get_nns_by_item(32, 20):
    print(texts[tidx][1])

C : how to program /
Java : how to program /
Java.
Starting out with C++ : from control structures through objects /
The C++ programming language /
Charlotte Temple /
Project Evaluation
The Gettysburg address
The way to rainy mountain
Autobiography.
Civil disobedience
America : a narrative history /
Billy Budd;
Walden /
GEORGE WASHINGTON
Leaves of grass /
The scarlet letter.
Of Plymouth Plantation, 1620-1647 /
The Declaration of Independence /
Sinners in the hands of an angry God.
