# Author Topic Modeling
### Source: https://nbviewer.jupyter.org/github/rare-technologies/gensim/blob/develop/docs/notebooks/atmodel_tutorial.ipynb

## 1. Process Text and Vectorize the Author List

In [41]:
import nltk
import json
# Load the whole corpus from a single JSON file. The cells below iterate over
# `content` and read the 'Sender', 'Receiver', 'Title' and 'Content' keys of
# every entry.
# The encoding is given explicitly so that non-ASCII author names decode the
# same way regardless of the platform's default encoding.
with open('total.json', encoding='utf-8') as json_data:
    content = json.load(json_data)

print(len(content))


5676


In [42]:
# Map every author name to the IDs of the documents it appears on.
# A document ID is the entry's position in `content`.
author2doc = dict()

# Two full passes (all senders first, then all receivers) so that both the
# insertion order of the names and the order of IDs inside each list stay
# exactly as they were when the passes were written out as two separate loops.
# NOTE(review): an entry whose Sender and Receiver are the same name gets its
# document ID recorded twice for that name — confirm whether this is intended.
for field in ('Sender', 'Receiver'):
    for doc_id, entry in enumerate(content):
        name = entry[field].replace('\n', ' ')
        # First sighting creates the list; later sightings append to it.
        author2doc.setdefault(name, []).append(doc_id)
    
    
#print(author2doc)


In [43]:
import spacy
# Load the English spaCy pipeline once; the preprocessing cell below streams
# every document through it via `nlp.pipe`.
# NOTE(review): 'en' is presumably a shortcut name for an installed English
# model — newer spaCy releases may need the full model package name; confirm
# against the installed version.
nlp = spacy.load('en')

In [44]:
# One raw text per document: title and body, each with newlines flattened to
# spaces, joined by a single space. Order matches `content`.
abstract = [
    ' '.join(entry[field].replace('\n', ' ') for field in ('Title', 'Content'))
    for entry in content
]

In [45]:
from nltk.corpus import stopwords
# NOTE(review): `d` is not referenced by any other visible cell; kept in case
# something outside this view relies on it.
d = {}
# English stopwords from NLTK. Stored as a frozenset because the preprocessing
# loop tests membership once per token, and a set lookup is O(1) where a list
# scan is linear; `token not in stopword` gives the same answer either way.
stopword = frozenset(stopwords.words('english'))

In [50]:
%%time
# Tokenized documents, in the same order as `abstract`.
processed_docs = []
# Stream the raw texts through the spaCy pipeline in batches of 100.
for parsed in nlp.pipe(abstract, n_threads=4, batch_size=100):
    # Keep the lemma of every purely alphabetic token that is neither a spaCy
    # stop word nor in the NLTK stopword list (drops numbers and punctuation).
    lemmas = [
        tok.lemma_
        for tok in parsed
        if tok.is_alpha and not tok.is_stop and tok.lemma_ not in stopword
    ]

    # Add named entities, but only those made up of more than one token.
    lemmas.extend(str(span) for span in parsed.ents if len(span) > 1)

    processed_docs.append(lemmas)

CPU times: user 6min 22s, sys: 1min 34s, total: 7min 57s
Wall time: 5min 49s


In [51]:
# Rebind the preprocessed token lists to `abstract_all`, the name used by all
# following cells, and drop the old name.
# Because of the `del`, re-running this cell alone raises NameError; the
# preprocessing cell has to be re-run first.
abstract_all = processed_docs
del processed_docs

In [52]:
from gensim.models import Phrases
# Detect bigrams over the whole corpus (min_count=20) and append every detected
# bigram to its document; the original single tokens are kept as well.
# This extends the lists in `abstract_all` in place, so running the cell a
# second time appends the bigrams again.
bigram = Phrases(abstract_all, min_count=20)
for doc_tokens in abstract_all:
    # A token containing '_' is a bigram produced by the Phrases model.
    doc_tokens.extend([tok for tok in bigram[doc_tokens] if '_' in tok])



In [53]:
from gensim.corpora import Dictionary
# Build the token <-> id mapping from the tokenized documents (including the
# bigrams and named entities appended above).
dictionary = Dictionary(abstract_all)

# Remove rare and common tokens.
# `min_wordcount` is passed as `no_below` and `max_freq` as `no_above`.
# NOTE(review): in gensim both thresholds presumably refer to the number /
# fraction of documents containing a token, not to total occurrences — confirm.
max_freq = 0.5
min_wordcount = 20
dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)

# Accessing any item populates dictionary.id2token, which is passed to the
# model as `id2word` in the cells below.
_ = dictionary[0]  # This sort of "initializes" dictionary.id2token.

In [54]:
# Convert every tokenized document with `dictionary.doc2bow`; `corpus` is what
# the author-topic models below are trained on. Order matches `abstract_all`,
# so list positions line up with the document IDs stored in `author2doc`.
corpus = [dictionary.doc2bow(doc) for doc in abstract_all]


In [55]:
# Print the size of each of the three model inputs, one line per input.
for template, count in (
    ('Number of authors: %d', len(author2doc)),
    ('Number of unique tokens: %d', len(dictionary)),
    ('Number of documents: %d', len(corpus)),
):
    print(template % count)

Number of authors: 10345
Number of unique tokens: 3897
Number of documents: 5676


In [56]:
# Sanity check: after the `dictionary[0]` access above, id2token should hold
# one entry per dictionary token (compare with "Number of unique tokens").
print(len(dictionary.id2token))

3897


In [57]:
from gensim.models import AuthorTopicModel
# Timed trial fit with a single pass and a single iteration — presumably just
# to check that the inputs are accepted and to gauge the cost of one pass.
# `model` is overwritten by the model-selection cells that follow.
%time model = AuthorTopicModel(corpus=corpus, num_topics=10, id2word=dictionary.id2token, \
                author2doc=author2doc, chunksize=2000, passes=1, eval_every=0, \
                iterations=1, random_state=1)

CPU times: user 8.2 s, sys: 107 ms, total: 8.31 s
Wall time: 8.5 s


## 2. Select the Best-Performing Model (Highest Topic Coherence)

In [58]:
%%time
# Train five models that differ only in their random seed and store each one
# together with the sum of its per-topic coherence scores.
model_list = []
for seed in range(5):
    model = AuthorTopicModel(
        corpus=corpus,
        num_topics=10,
        id2word=dictionary.id2token,
        author2doc=author2doc,
        chunksize=2000,
        passes=100,
        gamma_threshold=1e-10,
        eval_every=0,
        iterations=1,
        random_state=seed,
    )
    # Each element of top_topics() carries its coherence score at index 1.
    coherence_total = sum(scored_topic[1] for scored_topic in model.top_topics(corpus))
    model_list.append((model, coherence_total))

CPU times: user 10min 29s, sys: 6.95 s, total: 10min 36s
Wall time: 11min 11s


In [59]:
def _summed_coherence(candidate):
    """Return the coherence half of a (model, coherence) pair."""
    return candidate[1]

# Keep the candidate with the largest summed coherence; `model` is the model
# used by every cell from here on.
model, tc = max(model_list, key=_summed_coherence)
print('Topic coherence: %.3e' % tc)

Topic coherence: -1.855e+01


In [60]:
# Persist the selected model so the training cells need not be re-run.
# NOTE(review): hard-coded path under /tmp — not portable and typically not
# preserved across reboots; consider a configurable location.
model.save('/tmp/model.atmodel')

In [61]:
# Reload the model from the file written by the previous cell; all later cells
# use this `model`.
model = AuthorTopicModel.load('/tmp/model.atmodel')

In [62]:
# Display the topics as (topic id, weighted word list) pairs.
model.show_topics(num_topics=10)

[(0,
  '0.025*"datum" + 0.019*"method" + 0.017*"base" + 0.016*"analysis" + 0.016*"model" + 0.011*"result" + 0.008*"system" + 0.008*"time" + 0.007*"image" + 0.007*"approach"'),
 (1,
  '0.014*"research" + 0.010*"development" + 0.009*"provide" + 0.008*"technology" + 0.008*"support" + 0.007*"use" + 0.007*"approach" + 0.007*"include" + 0.007*"design" + 0.007*"review"'),
 (2,
  '0.024*"case" + 0.020*"finding" + 0.015*"report" + 0.014*"patient" + 0.011*"present" + 0.011*"old" + 0.010*"age" + 0.009*"child" + 0.009*"group" + 0.008*"diagnosis"'),
 (3,
  '0.017*"high" + 0.011*"result" + 0.009*"low" + 0.008*"increase" + 0.008*"phase" + 0.007*"temperature" + 0.007*"water" + 0.006*"material" + 0.006*"system" + 0.006*"rate"'),
 (4,
  '0.023*"effect" + 0.020*"experiment" + 0.011*"task" + 0.011*"participant" + 0.010*"visual" + 0.009*"result" + 0.007*"response" + 0.007*"condition" + 0.007*"suggest" + 0.006*"stimulus"'),
 (5,
  '0.036*"patient" + 0.023*"analysis" + 0.018*"p" + 0.015*"95%" + 0.014*"risk" 

In [98]:
# Give every topic a short label made of its two highest-weighted words.
# `topics` is indexed by topic id: later cells translate the ids returned by
# `model[author]` into labels via `topics[topic_id]`.
# Iterating over range(model.num_topics) guarantees that list position equals
# topic id, independent of how many topics the model has and of the order in
# which show_topics() would list them.
topics = []
for topic_id in range(model.num_topics):
    # Words of this topic, in the order show_topic returns them.
    words = [word for word, prob in model.show_topic(topic_id)]
    # Topics are numbered from 1 in the printed listing only.
    print('Topic ' + str(topic_id + 1) + ': ')
    print(words[0:2])
    print(*words)
    print()
    topics.append(words[0:2])

Topic 1: 
['datum', 'method']
datum method base analysis model result system time image approach

Topic 2: 
['research', 'development']
research development provide technology support use approach include design review

Topic 3: 
['case', 'finding']
case finding report patient present old age child group diagnosis

Topic 4: 
['high', 'result']
high result low increase phase temperature water material system rate

Topic 5: 
['effect', 'experiment']
effect experiment task participant visual result response condition suggest stimulus

Topic 6: 
['patient', 'analysis']
patient analysis p 95% risk ci meta group meta_analysis review

Topic 7: 
['reaction', 'c']
reaction c metal complex structure state electron bond formation molecular

Topic 8: 
['cell', 'protein']
cell protein gene expression plant pathway specie target vitro activity

Topic 9: 
['patient', 'health']
patient health care treatment test hiv testing trial clinical intervention

Topic 10: 
['cancer', 'analysis']
cancer analysis

In [152]:
from pprint import pprint

def show_author(name):
    """Print an author's document IDs and labelled topic weights.

    `name` must be a key of `model.author2doc`. Each topic id returned by
    `model[name]` is replaced by its two-word label from `topics`.
    """
    print('\n%s' % name)
    print('Docs:', model.author2doc[name])
    print('Topics:')
    labelled = []
    for topic_id, weight in model[name]:
        labelled.append((topics[topic_id], weight))
    pprint(labelled)

In [153]:
# Example: inspect one author's documents and topic mix.
show_author('Jonathan L. Sessler')


Jonathan L. Sessler
Docs: [4005, 4139, 4308, 4706, 4809, 4811]
Topics:
[(['reaction', 'c'], 0.9973405260356156)]


## 3. Plotting the Authors

In [105]:
%%time
from sklearn.manifold import TSNE
# Project the authors' rows of model.state.gamma into two dimensions for plotting.
tsne = TSNE(n_components=2, random_state=0)

# Authors with fewer documents than this are left out (0 keeps everyone).
smallest_author = 0
authors = [
    author_id
    for author_name, author_id in model.author2id.items()
    if len(model.author2doc[author_name]) >= smallest_author
]

# The return value is discarded; the coordinates are read from tsne.embedding_.
_ = tsne.fit_transform(model.state.gamma[authors, :])

CPU times: user 7min 2s, sys: 32.7 s, total: 7min 35s
Wall time: 5min 25s


In [106]:
# Tell Bokeh to display plots inside the notebook.
from bokeh.io import output_notebook
output_notebook()

In [121]:
from bokeh.models import HoverTool
from bokeh.plotting import figure, show, ColumnDataSource

# Coordinates of every selected author in the 2-D t-SNE embedding.
x, y = tsne.embedding_[:, 0], tsne.embedding_[:, 1]

# Names in the same order as the rows of the embedding.
author_names = [model.id2author[author_id] for author_id in authors]

# Marker radius grows linearly with the number of documents attributed to the author.
scale = 0.4
author_sizes = [len(model.author2doc[name]) for name in author_names]
radii = [n_docs * scale for n_docs in author_sizes]

plot_data = dict(
    x=x,
    y=y,
    author_names=author_names,
    author_sizes=author_sizes,
    radii=radii,
)
source = ColumnDataSource(data=plot_data)

# Mouse-over shows the author's name and document count.
hover = HoverTool(tooltips=[("author", "@author_names"), ("size", "@author_sizes")])

p = figure(tools=[hover, 'crosshair,pan,wheel_zoom,box_zoom,reset,save,lasso_select'])
p.scatter('x', 'y', radius='radii', source=source, fill_alpha=0.6, line_color=None)
show(p)

## 4. Similarity Queries

In [122]:
from gensim.similarities import MatrixSimilarity

# Build a similarity index over the topic vectors of all authors.
all_author_names = list(model.id2author.values())
index = MatrixSimilarity(model[all_author_names])

# Query the index with one author's topic vector; `sims` holds that author's
# similarity to the indexed authors.
author_name = 'Yadong Li'
sims = index[model[author_name]]

  if np.issubdtype(vec.dtype, np.int):


In [123]:
# Make a function that returns similarities based on the Hellinger distance.

from gensim import matutils
import pandas as pd

# Make a list of all the author-topic distributions, ordered by author id.
# get_table() looks authors up with `model.id2author[position]`, so position i
# of this list must hold the vector of author id i. Building the list by id
# makes that explicit instead of relying on the iteration order of the
# id2author dict.
author_vecs = [
    model.get_author_topics(model.id2author[author_id])
    for author_id in range(len(model.id2author))
]

def similarity(vec1, vec2):
    '''Return 1 / (1 + Hellinger distance) for two sparse topic vectors.

    Both vectors are expanded to dense vectors of length `model.num_topics`
    before the distance is computed; identical vectors score 1.0.
    '''
    dense1 = matutils.sparse2full(vec1, model.num_topics)
    dense2 = matutils.sparse2full(vec2, model.num_topics)
    distance = matutils.hellinger(dense1, dense2)
    return 1.0 / (1.0 + distance)

def get_sims(vec):
    '''Return the similarity of `vec` to every vector in `author_vecs`, in that order.'''
    scores = []
    for other_vec in author_vecs:
        scores.append(similarity(vec, other_vec))
    return scores

def get_table(name, top_n=10, smallest_author=1):
    '''
    Rank all authors by topic similarity to the author `name`.

    Authors with fewer than `smallest_author` documents are left out.
    Returns a dataframe with the columns 'Author', 'Score' and 'Size',
    sorted by descending score and cut to the first `top_n` rows.
    '''
    # Similarity of the query author to every author; position i is read as
    # the score of author id i.
    scores = get_sims(model.get_author_topics(name))

    rows = []
    for author_id, score in enumerate(scores):
        other_name = model.id2author[author_id]
        n_docs = len(model.author2doc[other_name])
        if n_docs < smallest_author:
            continue
        rows.append((other_name, score, n_docs))

    # Highest scores first, then keep the leading rows.
    ranked = pd.DataFrame(rows, columns=['Author', 'Score', 'Size'])
    ranked = ranked.sort_values('Score', ascending=False)
    return ranked[:top_n]

In [128]:
# Ten authors most similar to the query author (the author itself ranks first
# with a score of 1.0).
get_table('Jonathan L. Sessler')

Unnamed: 0,Author,Score,Size
4273,Jonathan L. Sessler,1.0,6
4215,John F. Hartwig,0.999689,5
7002,Peter J. Stang,0.999241,5
8401,Stephen L. Buchwald,0.999199,9
9593,Yadong Li,0.999119,4
8571,Tae-Lim Choi,0.998891,3
2725,Garret M. Miyake,0.99877,2
3824,Jean-Marie Lehn,0.998614,4
10209,Zheng Huang,0.998586,2
7899,Scott E. Denmark,0.998514,3


In [129]:
# Same query, restricted to authors with at least three documents.
get_table('Jonathan L. Sessler', smallest_author=3)

Unnamed: 0,Author,Score,Size
46,Jonathan L. Sessler,1.0,6
45,John F. Hartwig,0.999689,5
76,Peter J. Stang,0.999241,5
92,Stephen L. Buchwald,0.999199,9
118,Yadong Li,0.999119,4
96,Tae-Lim Choi,0.998891,3
36,Jean-Marie Lehn,0.998614,4
86,Scott E. Denmark,0.998514,3
1,Abigail G. Doyle,0.997972,5
70,Niels H. Damrauer,0.997898,3


In [154]:
# For every author, the two-word labels of the topics returned by `model[author]`.
author_dict = {
    author: [topics[topic_id] for topic_id, _weight in model[author]]
    for author in author2doc
}

In [158]:
# One row per author, one column per topic slot; authors with fewer topics
# leave the remaining columns empty.
# NOTE(review): this renders a row for every author — consider `.head()` when sharing.
pd.DataFrame.from_dict(author_dict, orient='index')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Thomas R Vetter,"[datum, method]","[research, development]","[patient, analysis]","[patient, health]",,,,,,
Markus Müller-Trapet,"[datum, method]","[research, development]","[effect, experiment]","[patient, health]",,,,,,
Bashir Mohammed Abubakar,"[datum, method]","[research, development]","[cell, protein]",,,,,,,
Ye Han,"[datum, method]","[research, development]","[high, result]","[patient, analysis]","[cell, protein]","[cancer, analysis]",,,,
Chang Wang,"[datum, method]","[high, result]","[patient, analysis]",,,,,,,
Matthias Steinfath,"[datum, method]","[research, development]","[case, finding]","[effect, experiment]","[patient, analysis]","[patient, health]",,,,
Irene Ma,"[datum, method]","[case, finding]","[patient, analysis]","[patient, health]","[cancer, analysis]",,,,,
Mustapha Raïssouli,"[datum, method]","[reaction, c]",,,,,,,,
Sebastián Cea-Echenique,"[datum, method]","[research, development]","[reaction, c]",,,,,,,
Peter Gärdenfors,"[datum, method]","[research, development]","[effect, experiment]",,,,,,,
