In [1]:
#!pip install gensim
from gensim import models, corpora
import pandas as pd
import pyLDAvis.gensim

In [2]:
def prep_corpus(fname):
    "reads in file, makes a gensim-usable corpus out of it"

    with open(fname, 'r') as f:
        docs_full = f.readlines()

    docs = [x.split('\t')[1].split() for x in docs_full]
    for doc in docs:
        while '<NUMBER>' in doc:
            doc.remove('<NUMBER>')
        while 'patient' in doc:
            doc.remove('patient')
        while 'cell' in doc:
            doc.remove('cell')

    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return corpus, dictionary, docs


def prep_corpus_csv(fname):
    "reads in file, makes a gensim-usable corpus out of it"

    d = pd.read_csv(fname)
        
    docs = []
    for i in range(len(d.index)):
        docs.append(d.iloc[i,2].split())
    
    for doc in docs:
        while '<NUMBER>' in doc:
            doc.remove('<NUMBER>')
        while 'patient' in doc:
            doc.remove('patient')
        while 'cell' in doc:
            doc.remove('cell')

    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return corpus, dictionary, docs


In [34]:
corp, dictionary, docs = prep_corpus_csv('sampledata.csv')
corp2, dictionary2, docs2 = prep_corpus('pubmed_subset_13.tsv')


In [22]:

lda_model = models.LdaModel(corpus=corp, 
 num_topics=6, 
 id2word=dictionary, 
 distributed=False, 
 chunksize=2000, 
 passes=5, 
 update_every=1, 
 alpha='auto', 
 eta=None, 
 decay=0.5, 
 offset=1.0, 
 eval_every=10, 
 iterations=50, 
 gamma_threshold=0.001, 
 minimum_probability=0.01, 
 random_state=1, 
 ns_conf=None, 
 minimum_phi_value=0.01, 
 per_word_topics=False, 
 callbacks=None)

In [23]:
lda_visualization = pyLDAvis.gensim.prepare(lda_model, corp, dictionary, sort_topics=False)
pyLDAvis.display(lda_visualization)

In [35]:
lda_model_same = lda_model = models.LdaModel(corpus=corp, 
 num_topics=6, 
 id2word=dictionary, 
 distributed=False, 
 chunksize=2000, 
 passes=5, 
 update_every=1, 
 alpha='auto', 
 eta=None, 
 decay=0.5, 
 offset=1.0, 
 eval_every=10, 
 iterations=50, 
 gamma_threshold=0.001, 
 minimum_probability=0.01, 
 random_state=2, 
 ns_conf=None, 
 minimum_phi_value=0.01, 
 per_word_topics=False, 
 callbacks=None)

In [36]:
df, wordlist = lda_model.diff(lda_model_same)

In [29]:
df = pd.DataFrame(df)
df

Unnamed: 0,0,1,2,3,4,5
0,0.0,0.964302,0.955672,0.940176,0.954978,1.0
1,0.8041,0.0,0.836121,0.831595,0.817461,0.847809
2,0.830991,0.837529,0.0,0.747356,0.800718,0.795875
3,0.84453,0.851702,0.780103,0.0,0.836585,0.825798
4,0.790973,0.805307,0.774175,0.769585,0.0,0.79684
5,0.82111,0.807799,0.767472,0.74347,0.80848,0.0


In [21]:
wordlist = pd.DataFrame(wordlist)
wordlist.iloc[1,0]

[['expression',
  'health',
  'provide',
  'response',
  'TBI',
  'development',
  'year',
  'system',
  'mechanism',
  'injury'],
 ['acid',
  'CI',
  'viral',
  'range',
  'case',
  'receptor',
  'HCV',
  'scan',
  'characteristic',
  'deliver']]

In [46]:
lda_model2 = lda_model = models.LdaModel(corpus=corp2, 
 num_topics=6, 
 id2word=dictionary2, 
 distributed=False, 
 chunksize=2000, 
 passes=5, 
 update_every=1, 
 alpha='auto', 
 eta=None, 
 decay=0.5, 
 offset=1.0, 
 eval_every=10, 
 iterations=50, 
 gamma_threshold=0.001, 
 minimum_probability=0.01, 
 random_state=2, 
 ns_conf=None, 
 minimum_phi_value=0.01, 
 per_word_topics=False, 
 callbacks=None)

In [39]:
lda_visualization = pyLDAvis.gensim.prepare(lda_model2, corp2, dictionary2, sort_topics=False)
pyLDAvis.display(lda_visualization)

In [47]:
df2, wordlist2 = lda_model.diff(lda_model2)

In [48]:
df2 = pd.DataFrame(df2)
df2

Unnamed: 0,0,1,2,3,4,5
0,0.0,0.606706,1.0,0.764545,0.811079,0.912881
1,0.640657,0.0,0.822754,0.488847,0.584167,0.878897
2,0.951692,0.851102,0.0,0.850408,0.631381,0.6023
3,0.817281,0.59619,0.906516,0.0,0.755259,0.962877
4,0.7531,0.569496,0.657047,0.628122,0.0,0.779699
5,0.869571,0.853,0.589819,0.828014,0.757033,0.0


In [45]:
wordlist2 = pd.DataFrame(wordlist2)
wordlist2.iloc[0,0]

[['acid',
  'treat',
  'RNA',
  'express',
  'tissue',
  'target',
  'resistance',
  'model',
  'promote',
  'host'],
 []]