<a href="https://colab.research.google.com/github/maccallumr/Neural-Net-Projects/blob/main/LDA_with_genism_and_NLTK_with_visualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pylab inline
import pandas as pd
import pickle as pk
from scipy import sparse as sp
import nltk

Populating the interactive namespace from numpy and matplotlib


In [None]:
##Read in data frame and use header of docs section to get array of docs

# Load the papers table; the raw text of each paper is in the 'PaperText' column.
p_df = pd.read_csv('Papers_test.csv')
# `array` is numpy's, pulled into the namespace by %pylab. With its default
# arguments it builds a separate array from the column, so nothing done to
# `docs` downstream alters p_df['PaperText'], which is passed (still raw) to
# the TF-IDF vectorizer later in the notebook.
docs = array(p_df['PaperText'])

In [None]:
##Preprocessing: converting to lowercase, tokenizing, lemmatizing etc

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
nltk.download('wordnet')
nltk.download('omw-1.4')

def docs_preprocessor(docs, min_token_len=4):
    """Tokenize, filter and lemmatize a collection of raw text documents.

    Every document is lower-cased, split into runs of word characters,
    stripped of purely numeric tokens and of tokens shorter than
    ``min_token_len`` characters, and finally lemmatized with WordNet.

    The input collection is only read, never modified, so the function can be
    called again on the same raw documents and yields the same result.

    Parameters
    ----------
    docs : iterable of str
        Raw document texts.
    min_token_len : int, optional
        Minimum number of characters a token needs in order to be kept.
        The default of 4 reproduces the original ``len(token) > 3`` filter.

    Returns
    -------
    list of list of str
        One list of lemmatized tokens per input document, in input order.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()

    processed = []
    for doc in docs:
        tokens = tokenizer.tokenize(doc.lower())
        processed.append([
            lemmatizer.lemmatize(token)
            for token in tokens
            # Drop pure numbers (words that merely contain digits are kept)
            # and tokens that are too short to be informative.
            if not token.isdigit() and len(token) >= min_token_len
        ])

    return processed

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
##Preprocessing docs 

docs = docs_preprocessor(docs)

In [None]:
##Gensim: bigram and trigram phrase detection; words that frequently occur together are joined into multi-word tokens (e.g. matrix_completion), which are more distinctive than the individual, overlapping words

from gensim.models import Phrases
# Learn the phrase models. `bigram` joins word pairs that occur together at
# least 10 times into single `a_b` tokens; `trigram` is trained on the
# bigram-merged stream so it can extend those pairs into longer phrases.
bigram = Phrases(docs, min_count=10)
trigram = Phrases(bigram[docs])

for idx in range(len(docs)):
    # Transform the original tokens once, before anything is appended.
    # The trigram model has to see the same kind of stream it was trained on
    # (the bigram-merged one). Running it over the document *after* the
    # bigrams had been appended, as before, passed every appended bigram
    # through unchanged and appended it a second time.
    bigram_tokens = bigram[docs[idx]]
    trigram_tokens = trigram[bigram_tokens]

    # Tokens containing '_' in the bigram stream are treated as bigrams.
    phrases = [token for token in bigram_tokens if '_' in token]

    # From the second pass keep only phrases that were newly formed there,
    # so phrases already collected above are not counted twice.
    already_present = set(bigram_tokens)
    phrases += [token for token in trigram_tokens
                if '_' in token and token not in already_present]

    # Phrases are added alongside the single words, not instead of them.
    docs[idx].extend(phrases)



In [None]:
##Remove rare and common tokens, leaving only unique tokens

from gensim.corpora import Dictionary

# Create a dictionary representation of the documents (token <-> integer id).
dictionary = Dictionary(docs)
print('Number of unique words in initital documents:', len(dictionary))

# Drop tokens that appear in fewer than 10 documents or in more than 20% of
# all documents; what remains is the vocabulary used for the bag-of-words corpus.
dictionary.filter_extremes(no_below=10, no_above=0.2)
print('Number of unique words after removing rare and common words:', len(dictionary))

Number of unique words in initital documents: 39534
Number of unique words after removing rare and common words: 6001


In [None]:
##Convert list to bag-of-words format, tokenized and normalized

corpus = [dictionary.doc2bow(doc) for doc in docs]

In [None]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 6001
Number of documents: 403


In [None]:
##Import the LDA model

from gensim.models import LdaModel

In [None]:
# Set training hyperparameters that define the LDA
num_topics = 4
chunksize = 500 # size of the doc looked at every pass
passes = 20 # number of passes through documents
iterations = 400
eval_every = 1  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

##Defining the actual model, input is corpus, dictionary)

%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

CPU times: user 37.1 s, sys: 371 ms, total: 37.5 s
Wall time: 37.2 s


In [None]:
##Visualizing LDAs with interactive GUI

!pip install pyLDAvis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting funcy
  Downloading funcy-1.17-py2.py3-none-any.whl (33 kB)
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (PEP 517) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-3.3.1-py2.py3-none-any.whl size=136898 sha256=476cd9a9d3f8fa4095e4331bda4784fb37232778219ddcee314e7d674cba0d61
  Stored in directory: /root/.cache/pip/wheels/c9/21/f6/17bcf2667e8a68532ba2fbf6d5c72fdf4c7f7d9abfa4852d2f
Successfully built pyLDAvis
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-1.

In [None]:
# Gensim, specialized Topic Modeling packages
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel

In [None]:
##Specific distribution that works with google colab

import pyLDAvis.gensim_models

  from collections import Iterable


In [None]:

pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 


In [None]:
##Visualization uses model, corpus, and dictionary

pyLDAvis.gensim_models.prepare(model, corpus, dictionary)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [None]:
##In the left panel, labeled Intertopic Distance Map, circles represent the different topics and the distances between them. Similar topics appear closer together and dissimilar topics farther apart. The relative size of a topic's circle corresponds to the relative frequency of that topic in the corpus. An individual topic may be selected for closer scrutiny by clicking on its circle, or by entering its number in the "selected topic" box in the upper-left.
##The right panel contains a bar chart of the top 30 terms. When no topic is selected in the plot on the left, the bar chart shows the 30 most "salient" terms in the corpus. A term's saliency is a measure of both how frequent the term is in the corpus and how "distinctive" it is in distinguishing between different topics. Selecting a topic on the left changes the bar chart to show the most "relevant" terms for the selected topic. Relevance is defined as in footer 2 and can be tuned with the parameter λ: a smaller λ gives more weight to a term's distinctiveness, while a larger λ corresponds more closely to the probability of the term occurring in the topic.
##Therefore, to get a better sense of the terms per topic we'll use λ = 0.

##Evaluate model
## 1) Split each doc into two halves and check whether the topics assigned to the two halves are similar; more similar is better
## 2) Compare randomly selected docs with one another; less similar is better
from sklearn.metrics.pairwise import cosine_similarity

# Keep the preprocessed token lists next to the raw text so that every paper
# can be split into a first and a second half.
p_df['tokenz'] = docs

# Integer floor division gives the midpoint directly. The previous
# int0(len(l)/2) depended on the `int0` alias leaked in by the %pylab star
# import, which NumPy has deprecated and no longer ships in 2.0.
docs1 = p_df['tokenz'].apply(lambda l: l[:len(l) // 2])
docs2 = p_df['tokenz'].apply(lambda l: l[len(l) // 2:])

In [None]:
##Transform data

corpus1 = [dictionary.doc2bow(doc) for doc in docs1]
corpus2 = [dictionary.doc2bow(doc) for doc in docs2]

# Using the corpus LDA model tranformation
lda_corpus1 = model[corpus1]
lda_corpus2 = model[corpus2]

In [None]:
from collections import OrderedDict
def get_doc_topic_dist(model, corpus, kwords=False):
    '''
    Project bag-of-words documents into the topic space of `model`.

    The LDA transformation ``model[bow]`` only lists some of the topics for a
    document; topics it does not list are given weight 0 here, so that every
    document ends up with one dense vector of length ``model.num_topics``.

    Parameters
    ----------
    model : topic model
        Must support ``model[bow]`` returning ``(topic_id, weight)`` pairs and
        expose ``num_topics`` (the model's own value is used, not a
        notebook-level global).
    corpus : iterable
        Bag-of-words documents. The model is applied inside this function, so
        pass the raw bag-of-words corpus, not the result of ``model[corpus]``.
    kwords : bool, optional
        When true, also collect the index of the highest-weighted topic of
        every document.

    Returns
    -------
    (numpy.ndarray, list)
        A float array of shape ``(n_docs, num_topics)`` and the list of
        dominant topic indices (empty when `kwords` is false).
    '''
    n_topics = model.num_topics
    top_dist = []
    keys = []

    for bow in corpus:
        # Start from all zeros so unlisted topics are represented explicitly
        # and every row has the same dtype.
        weights = np.zeros(n_topics)
        for topic_id, weight in model[bow]:
            weights[topic_id] = weight
        top_dist.append(weights)
        if kwords:
            keys.append(weights.argmax())

    # reshape keeps the (n_docs, num_topics) shape even for an empty corpus.
    return np.array(top_dist, dtype=float).reshape(-1, n_topics), keys

In [None]:
##Modeling metrics stated above regarding document similarity and disimilarity

# Topic distributions of the first and the second half of every paper.
# get_doc_topic_dist applies the model itself, so it has to be given the
# bag-of-words corpora. Passing the already-transformed lda_corpus1/2 (as
# before) ran LDA a second time, on (topic_id, weight) pairs treated as if
# they were word counts.
top_dist1, _ = get_doc_topic_dist(model, corpus1)
top_dist2, _ = get_doc_topic_dist(model, corpus2)

print("Intra similarity: cosine similarity for corresponding parts of a doc(higher is better):")
print(np.mean([cosine_similarity(c1.reshape(1, -1), c2.reshape(1, -1))[0][0] for c1,c2 in zip(top_dist1, top_dist2)]))

# 400 random (first-half, second-half) index pairs; seeded so the reported
# number is reproducible.
random_pairs = np.random.RandomState(42).randint(0, len(top_dist1), size=(400, 2))

print("Inter similarity: cosine similarity between random parts (lower is better):")
print(np.mean([cosine_similarity(top_dist1[i[0]].reshape(1, -1), top_dist2[i[1]].reshape(1, -1))[0][0] for i in random_pairs]))


Intra similarity: cosine similarity for corresponding parts of a doc(higher is better):
0.94125724
Inter similarity: cosine similarity between random parts (lower is better):
0.38875213


In [None]:
##Looking at top n terms for a given topic

def explore_topic(lda_model, topic_number, topn, output=True):
    """
    Collect the `topn` top terms of one topic of `lda_model`.

    The terms are taken from ``lda_model.show_topic(topic_number, topn=topn)``.
    When `output` is true, each term is also printed in a 20-character
    column followed by its weight with three decimals.

    Returns the list of terms, in the order the model reports them.
    """
    ranked = list(lda_model.show_topic(topic_number, topn=topn))

    if output:
        for word, weight in ranked:
            print(u'{:20} {:.3f}'.format(word, round(weight, 3)))

    return [word for word, _ in ranked]

In [None]:
##Visualize top n terms for all topics
# Print the 10 top terms of every topic and keep the first 5 of each as a
# short summary of that topic.
topic_summaries = []
print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
for i in range(num_topics):
    print('Topic '+str(i)+' |---------------------\n')
    top_terms = explore_topic(model, topic_number=i, topn=10, output=True)
    topic_summaries += [top_terms[:5]]
    # Blank line between topics. The bare `print` that stood here is a
    # Python 2 statement; in Python 3 it evaluates to the function object
    # and prints nothing.
    print()

term                 frequency

Topic 0 |---------------------

matrix_completion    0.006
rank_matrix          0.006
sample_complexity    0.005
tensor               0.005
singular_value       0.005
recovery             0.004
active_learning      0.004
covariance_matrix    0.004
data_set             0.003
probability_least    0.003
Topic 1 |---------------------

convolutional        0.006
recurrent            0.005
fully_connected      0.005
deep_learning        0.005
generative_model     0.005
recurrent_neural     0.005
hidden_unit          0.005
hidden_layer         0.005
lstm                 0.005
neuron               0.004
Topic 2 |---------------------

regret               0.011
convergence_rate     0.007
bandit               0.006
step_size            0.006
policy               0.005
reward               0.005
regret_bound         0.004
game                 0.004
online_learning      0.004
line_search          0.004
Topic 3 |---------------------

gaussian_process     0.007
mar

In [None]:
##Manually assign labels to the topics (topic id -> human-readable name)
##NOTE(review): topic ids are only meaningful for the training run they were read from. In the
##output printed above, topic 1 is dominated by convolutional / recurrent / deep_learning terms
##yet is labelled 'Numerical Analysis' here -- re-check this mapping after every re-train.

top_labels = {0: 'Statistics', 1:'Numerical Analysis', 2:'Online Learning', 3:'Deep Learning'}


In [None]:
##Preprocessing data for T-sne

import re
import nltk
nltk.download('stopwords')

##Get standard stop words
from nltk.corpus import stopwords

##Select language
stops = set(stopwords.words('english'))

##Preprocess docs as simplified list of words
def paper_to_wordlist( paper, remove_stopwords=True ):
    '''
        Convert the raw text of a paper into a list of cleaned words.

        Non-letters are replaced by spaces, the text is lower-cased and split
        on whitespace, stop words (the module-level `stops` set) are removed
        when `remove_stopwords` is true, words of one or two characters are
        dropped, and the remaining words are lemmatized with WordNet.

        paper            -- raw text of one document (str)
        remove_stopwords -- whether to filter out the words in `stops`
                            (default True; the flag was previously ignored
                            and stop words were always removed)

        Returns a list of words.
    '''
    # One lemmatizer for the whole document instead of a new one per word.
    lemmatizer = WordNetLemmatizer()
    # 1. Remove non-letters
    paper_text = re.sub("[^a-zA-Z]"," ", paper)
    # 2. Convert words to lower case and split them
    words = paper_text.lower().split()
    # 3. Remove stop words (only if requested)
    if remove_stopwords:
        words = [w for w in words if w not in stops]
    # 4. Remove short words
    words = [t for t in words if len(t) > 2]
    # 5. Lemmatize
    words = [lemmatizer.lemmatize(t) for t in words]

    return words

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
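##TF-IDF-vectorize our docs into a dense array; it is used below to pick the most characteristic terms of each paper (the t-SNE itself is fit on the topic distributions)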

from sklearn.feature_extraction.text import TfidfVectorizer
##Probably unecessary to redownload 
nltk.download('wordnet')
nltk.download('omw-1.4')

##Converts raw documents to a TF-IDF matrix (term frequency-inverse document frequency, a measure of how "important" a term is to a document)
##Tokenization is delegated to paper_to_wordlist; 1- to 3-grams are kept only if they occur in at least 40 documents and in at most 20% of them
##NOTE(review): paper_to_wordlist already strips the NLTK stop words, so stop_words='english' looks redundant and is
##presumably what triggers the stop-word inconsistency warning printed below -- confirm before removing it
tvectorizer = TfidfVectorizer(input='content', analyzer = 'word', lowercase=True, stop_words='english',\
                                  tokenizer=paper_to_wordlist, ngram_range=(1, 3), min_df=40, max_df=0.20,\
                                  norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True)

##Fit on the raw paper texts and turn the result into a dense array with one row per paper
dtm = tvectorizer.fit_transform(p_df['PaperText']).toarray()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  % sorted(inconsistent)


In [None]:
##Per-document topic distributions for the full corpus, computed with the
##get_doc_topic_dist helper defined above instead of a copy of its body
top_dist, _ = get_doc_topic_dist(model, corpus)

##Preview the first few rows only; printing one vector per document floods the output
print(top_dist[:5])

[array([0.        , 0.15355919, 0.        , 0.84634811]), array([0.83035231, 0.        , 0.16955094, 0.        ]), array([0.        , 0.        , 0.62798071, 0.37192503]), array([0.09080809, 0.        , 0.        , 0.90909809]), array([0.        , 0.05722365, 0.        , 0.94267696]), array([0.99986076, 0.        , 0.        , 0.        ]), array([0.67880607, 0.13359219, 0.14151393, 0.04608782], dtype=float32), array([0.06961618, 0.17219949, 0.326398  , 0.43178633], dtype=float32), array([0.        , 0.99988312, 0.        , 0.        ]), array([0.14976016, 0.        , 0.85015899, 0.        ]), array([0.        , 0.99988359, 0.        , 0.        ]), array([0.59564143, 0.16367395, 0.        , 0.23294686]), array([0.87670553, 0.        , 0.12321318, 0.        ]), array([0.        , 0.        , 0.11374485, 0.88617295]), array([0.        , 0.79151851, 0.        , 0.20837881]), array([0.35387406, 0.        , 0.64602411, 0.        ]), array([0.        , 0.50629753, 0.        , 0.4936308 ]), 

In [None]:
##Get topic distribution and features (terms)
##lda_keys is the topic number that we later use to assign clusters and colors

top_dist, lda_keys= get_doc_topic_dist(model, corpus, True)

# get_feature_names() is deprecated since scikit-learn 1.0 and removed in 1.2
# (see the warning below); prefer get_feature_names_out() and fall back to the
# old name only on releases that predate it.
if hasattr(tvectorizer, 'get_feature_names_out'):
    features = list(tvectorizer.get_feature_names_out())
else:
    features = list(tvectorizer.get_feature_names())

##Top dist is an array where each entry is a vector showing the probabilities of each topic for that specific doc

##Check below to see how this works
##print(top_dist[0])
##print(lda_keys[0])


[0.         0.15355889 0.         0.8463484 ]
3



Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



In [None]:

##Top defining terms (features) of every document: the 4 highest TF-IDF weights in its row
top_ws = []
for n in range(len(dtm)):
    # argsort already yields integer indices, so no int0(...) cast is needed
    # (int0 is a deprecated NumPy alias).
    inds = np.argsort(dtm[n])[::-1][:4]
    tmp = [features[i] for i in inds]

    top_ws += [' '.join(tmp)]

##Make column to include representative terms (aligned to p_df by row index)
p_df['Text_Rep'] = pd.Series(top_ws)
##Make column to show topic number
p_df['clusters'] = pd.Series(lda_keys)
##Fill empty entries with 10, which is associated with the gray color.
##Assigned back instead of fillna(inplace=True): an in-place call on a selected
##column is not guaranteed to reach the frame.
p_df['clusters'] = p_df['clusters'].fillna(10)

cluster_colors = {0: 'blue', 1: 'green', 2: 'yellow', 3: 'red', 4: 'skyblue', 5:'salmon', 6:'orange', 7:'maroon', 8:'crimson', 9:'black', 10:'gray'}
##Make color column from the cluster (i.e. topic number) using the key above
p_df['colors'] = p_df['clusters'].apply(lambda l: cluster_colors[l])

##Show data frame with line below
##p_df

Unnamed: 0,Id,Title,EventType,PdfName,Abstract,PaperText,tokenz,Text_Rep,clusters,colors,X_tsne,Y_tsne,Z_tsne
0,5677,Double or Nothing: Multiplicative Incentive Me...,Poster,5677-double-or-nothing-multiplicative-incentiv...,Crowdsourcing has gained immense popularity in...,Double or Nothing: Multiplicative\nIncentive M...,"[double, nothing, multiplicative, incentive, m...",mechanism answer confidence multiplicative,3,red,-66.470879,76.462151,-1.693241
1,5941,Learning with Symmetric Label Noise: The Impor...,Spotlight,5941-learning-with-symmetric-label-noise-the-i...,Convex potential minimisation is the de facto ...,Learning with Symmetric Label Noise: The\nImpo...,"[learning, with, symmetric, label, noise, impo...",robustness learner svm equivalence,0,blue,-12.480157,21.149488,80.180473
2,6019,Algorithmic Stability and Uniform Generalization,Poster,6019-algorithmic-stability-and-uniform-general...,One of the central questions in statistical le...,Algorithmic Stability and Uniform Generalizati...,"[algorithmic, stability, uniform, generalizati...",stability vol training example parametric,2,yellow,12.586023,5.593279,-80.323257
3,6035,Adaptive Low-Complexity Sequential Inference f...,Poster,6035-adaptive-low-complexity-sequential-infere...,We develop a sequential low-complexity inferen...,Adaptive Low-Complexity Sequential Inference f...,"[adaptive, complexity, sequential, inference, ...",dirichlet mixture model concentration hyperpar...,3,red,-15.464137,116.217377,4.060631
4,5978,Covariance-Controlled Adaptive Langevin Thermo...,Poster,5978-covariance-controlled-adaptive-langevin-t...,Monte Carlo sampling for Bayesian posterior in...,Covariance-Controlled Adaptive Langevin\nTherm...,"[covariance, controlled, adaptive, langevin, t...",covariance matrix article maintaining letter,3,red,-55.759094,95.894974,-40.058178
...,...,...,...,...,...,...,...,...,...,...,...,...,...
398,5792,Efficient Learning of Continuous-Time Hidden M...,Poster,5792-efficient-learning-of-continuous-time-hid...,The Continuous-Time Hidden Markov Model (CT-HM...,Efficient Learning of Continuous-Time Hidden\n...,"[efficient, learning, continuous, time, hidden...",vol soft structural functional,3,red,8.855574,101.817635,-57.286678
399,5674,Expectation Particle Belief Propagation,Poster,5674-expectation-particle-belief-propagation.pdf,We propose an original particle-based implemen...,Expectation Particle Belief Propagation\n\nThi...,"[expectation, particle, belief, propagation, t...",proposal belief exponential family propagation,3,red,-14.737282,63.156094,-71.552116
400,5756,Latent Bayesian melding for integrating indivi...,Spotlight,5756-latent-bayesian-melding-for-integrating-i...,"In many statistical problems, a more coarse-gr...",Latent Bayesian melding for integrating indivi...,"[latent, bayesian, melding, integrating, indiv...",energy population modelling summary,3,red,31.937590,87.898689,-21.068617
401,5745,Distributionally Robust Logistic Regression,Spotlight,5745-distributionally-robust-logistic-regressi...,This paper proposes a distributionally robust ...,Distributionally Robust Logistic Regression\n\...,"[distributionally, robust, logistic, regressio...",logistic regression radius ball confidence,2,yellow,22.945602,-29.644821,18.223076


In [None]:
##Import T-sne packages

from sklearn.manifold import TSNE
##Define a 3-dimensional t-SNE.
##init and learning_rate are pinned to the values that were the defaults when this notebook
##was run (the warnings below announce that both defaults change in scikit-learn 1.2), and
##random_state makes the embedding reproducible between runs.
tsne = TSNE(n_components=3, init='random', learning_rate=200.0, random_state=42)
##Fit the t-SNE on the per-document topic distributions from above
X_tsne = tsne.fit_transform(top_dist)


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



In [None]:
##Alternative and not-as-cool data visualization 
'''
##Define data from dims of t-sne

p_df['X_tsne'] =X_tsne[:, 0]
p_df['Y_tsne'] =X_tsne[:, 1]
p_df['Z_tsne'] =X_tsne[:, 2] ##Me
'''

In [None]:
##Alternative and not-as-cool data visualization 
'''
from bokeh.plotting import figure, show, output_notebook, save#, output_file
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
output_notebook()
'''

In [None]:
##Alternative and not-as-cool data visualization 
'''
source = ColumnDataSource(dict(
    x=p_df['X_tsne'],
    y=p_df['Y_tsne'],
    z=p_df['Z_tsne'], ##Me
    color=p_df['colors'],
    label=p_df['clusters'].apply(lambda l: top_labels[l]),
#     msize= p_df['marker_size'],
    topic_key= p_df['clusters'],
    title= p_df[u'Title'],
    content = p_df['Text_Rep']
))
'''

In [None]:
##Alternative and not-as-cool data visualization 
'''
title = 'T-SNE visualization of topics'

plot_lda = figure(plot_width=1000, plot_height=600,
                     title=title, tools="pan,wheel_zoom,box_zoom,reset,hover",
                     x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x='x', y='y', legend='label', source=source,
                 color='color', alpha=0.8, size=10)#'msize', )

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "Title: @title, KeyWords: @content - Topic: @topic_key "}
plot_lda.legend.location = "top_left"

show(plot_lda)

#save the plot
# save(plot_lda, '{}.html'.format(title))
'''


'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead



In [None]:
##Required configuration for using plotly in google colab 

def configure_plotly_browser_state():
  """Render an HTML snippet that loads require.js and registers the
  `base` and `plotly` module paths in the output of the current cell."""
  import IPython
  requirejs_setup = '''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
            },
          });
        </script>
        '''
  display(IPython.core.display.HTML(requirejs_setup))

In [None]:
##Testing data

#print(X_tsne[:, 0])
#X_tsne[:, 1]
#X_tsne[:, 2] ##Me

In [None]:
##Importing plotly

import plotly.graph_objects as go

In [None]:
##Defining array where each doc has a color (based on topic) assigned to it from the color column in the df defined earlier

color_t=p_df['colors']
print(color_t)
print(color_t.dtype)

0         red
1        blue
2      yellow
3         red
4         red
        ...  
398       red
399       red
400       red
401    yellow
402     green
Name: colors, Length: 403, dtype: object
object


In [None]:
##3d visualization of t-sne
##Plotting t-sne with the color key array defined above

# One marker per document, placed at its 3-D t-SNE coordinates and coloured
# with the per-document colour name held in `color_t`.
data = [
    go.Scatter3d(
        x=X_tsne[:, 0],
        y=X_tsne[:, 1],
        z=X_tsne[:, 2],
        mode='markers',
        marker={
            'size': 12,
            'color': color_t,
            'colorscale': 'Viridis',   # choose a colorscale
            'opacity': 0.8,
        },
    )
]

# Square 800x800 canvas without left/right margins.
layout = go.Layout(margin={'l': 0, 'r': 0}, height=800, width=800)
fig = go.Figure(data=data, layout=layout)
fig.show(renderer="colab")