In [1]:
# Usual imports
import numpy as np
import pandas as pd
from tqdm import tqdm
import string
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import os
# print(os.listdir("../input"))

# Plotly based imports for visualization
from plotly import tools
import chart_studio.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.1.0/en_core_web_lg-3.1.0-py3-none-any.whl (777.1 MB)
[K     |████████████████████████████████| 777.1 MB 14 kB/s  eta 0:00:01    |▉                               | 21.1 MB 2.3 MB/s eta 0:05:24     |██▊                             | 66.9 MB 2.7 MB/s eta 0:04:21     |████████▋                       | 207.8 MB 2.5 MB/s eta 0:03:46     |████████▊                       | 211.8 MB 2.6 MB/s eta 0:03:41     |████████▉                       | 213.4 MB 2.6 MB/s eta 0:03:39     |████████████████████████▉       | 602.1 MB 3.0 MB/s eta 0:00:59     |██████████████████████████▊     | 650.1 MB 2.5 MB/s eta 0:00:51
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [2]:
# Load the ACM papers table (title, type, download/citation counts, authors, link, abstract) from Excel
papers= pd.read_excel('ACM_output.xlsx')
# Preview the first rows to confirm the columns loaded as expected
papers.head()

Unnamed: 0,Title,type,total_downloads,total_citations,date,author_1,author_2,link,abstract
0,Abenteuer informatik: hands-on exhibits for le...,RESEARCH-ARTICLE,258,4,2012-11-01,Jens Gallenbacher,,https://doi.org/10.1145/2481449.2481487,Computational thinking is one of the pillars o...
1,Reflections on outreach programs in CS classes...,RESEARCH-ARTICLE,510,11,2012-02-01,Renate Thies,Jan Vahrenhold,https://doi.org/10.1145/2157136.2157281,To provide a unified view of any scientific fi...
2,Internationalization of computer science educa...,RESEARCH-ARTICLE,341,6,2010-03-01,Sarah Douglas,Art Farley,https://doi.org/10.1145/1734263.1734404,Internationalization of computer science educa...
3,Exploring the K-12 computer science curriculum...,RESEARCH-ARTICLE,36,0,2020-10-01,Meize Guo,Anne Ottenbreit-Leftwich,https://doi.org/10.1145/3421590.3421594,In order to create early exposure and to guide...
4,A music context for teaching introductory comp...,RESEARCH-ARTICLE,346,18,2009-07-01,Ananya Misra,Douglas Blank,https://doi.org/10.1145/1562877.1562955,"We describe myro.chuck, a Python module for co..."


In [3]:
# Load the large English spaCy pipeline (en_core_web_lg) downloaded in the first cell;
# `nlp` is reused below for entity rendering, lemmatization and POS tagging.
nlp = spacy.load('en_core_web_lg')

spaCy also comes with a built-in named entity visualizer that lets you check your model's predictions in your browser. You can pass in one or more Doc objects and start a web server, export HTML files or view the visualization directly from a Jupyter Notebook.

# Named Entity Recognition

Named Entity Recognition is an information extraction task in which named entities in unstructured text are located and classified into pre-defined categories such as person names, organizations, locations, medical codes, time expressions, quantities, monetary values, percentages, etc.

In [4]:
# Run the pipeline on one sample abstract (index label 4) and highlight its named entities inline
doc = nlp(papers["abstract"][4])
spacy.displacy.render(doc, style='ent',jupyter=True)

In [5]:
# ASCII punctuation characters, kept as one string; membership tests against it
# are therefore substring tests.
punctuations = string.punctuation
# Stop words are stored in a set so that `word in stopwords` is a constant-time
# lookup instead of a linear scan over a list for every token.
stopwords = set(STOP_WORDS)

# Lemmatization

It is the process of grouping together the inflected forms of a word so they can be analysed as a single item, identified by the word's lemma, or dictionary form. Words like "ran" and "running" are converted to "run" to avoid having words with similar meanings in our data.

In [6]:
# Rebuild the sample abstract from the lemma of every token, joined by single spaces
review = " ".join(token.lemma_ for token in doc)
print(review)

we describe myro.chuck , a Python module for control music synthesis , and its application to teach introductory computer science . the module be build within the Myro framework use the ChucK programming language , and be use in an introductory computer science course combine robot , graphic and music . the result support the value of music in engage student and broaden their view of computer science .


In [7]:
# Re-run the pipeline on the lemmatized text and render its entities.
# Note: this rebinds `doc`, which previously held the parse of the raw abstract.
doc = nlp(review)
spacy.displacy.render(doc, style='ent',jupyter=True)

The sentence looks much different now that it is lemmatized.

# Parts of Speech tagging


This is the process of marking up a word in a text (corpus) as corresponding to a particular part of speech, based on both its definition and its context—i.e., its relationship with adjacent and related words in a phrase, sentence, or paragraph. A simplified form of this is commonly taught to school-age children, in the identification of words as nouns, verbs, adjectives, adverbs, etc.

In [8]:
# POS tagging: show the coarse part-of-speech tag assigned to each token
# of the lemmatized text
for token in nlp(review):
    print(token, "=>", token.pos_)

we => PRON
describe => VERB
myro.chuck => NUM
, => PUNCT
a => DET
Python => PROPN
module => NOUN
for => ADP
control => NOUN
music => NOUN
synthesis => NOUN
, => PUNCT
and => CCONJ
its => PRON
application => NOUN
to => PART
teach => VERB
introductory => ADJ
computer => NOUN
science => NOUN
. => PUNCT
the => DET
module => NOUN
be => AUX
build => VERB
within => ADP
the => DET
Myro => PROPN
framework => NOUN
use => VERB
the => DET
ChucK => PROPN
programming => NOUN
language => NOUN
, => PUNCT
and => CCONJ
be => VERB
use => NOUN
in => ADP
an => DET
introductory => ADJ
computer => NOUN
science => NOUN
course => NOUN
combine => VERB
robot => NOUN
, => PUNCT
graphic => ADJ
and => CCONJ
music => NOUN
. => PUNCT
the => DET
result => NOUN
support => VERB
the => DET
value => NOUN
of => ADP
music => NOUN
in => ADP
engage => NOUN
student => NOUN
and => CCONJ
broaden => VERB
their => PRON
view => NOUN
of => ADP
computer => NOUN
science => NOUN
. => PUNCT


In [9]:
# Parser for reviews
# Blank English pipeline; kept because spacy_bigram_tokenizer further down references it.
parser = English()
def spacy_tokenizer(sentence):
    """Lemmatize, lowercase and filter one text, returning a space-joined string.

    Args:
        sentence: raw text to clean (passed straight to the module-level ``nlp``).

    Returns:
        A single string of the remaining lemmas separated by single spaces.
    """
    # One pass through `nlp` is all that is needed. The original additionally
    # ran the blank `parser` over the text and immediately discarded the result,
    # tokenizing every document twice.
    lemmas = [
        # "-PRON-" is the pronoun placeholder lemma of older spaCy releases; for
        # such tokens the lowercased surface form is used instead.
        token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_
        for token in nlp(sentence)
    ]
    # `punctuations` is a plain string, so `word not in punctuations` is a
    # substring test: it removes punctuation characters and also empty tokens.
    kept = [word for word in lemmas if word not in stopwords and word not in punctuations]
    return " ".join(kept)

In [10]:
# Register `progress_apply` on pandas objects so the spaCy pass reports progress
tqdm.pandas()
# Clean every abstract with spacy_tokenizer and store the result in a new column,
# leaving the raw "abstract" column untouched
papers["processed_abstract"] = papers["abstract"].progress_apply(spacy_tokenizer)

100%|██████████| 1000/1000 [00:24<00:00, 40.50it/s]


In [11]:
# Compare the raw and cleaned abstracts side by side for the first six rows
papers[['abstract','processed_abstract']].head(6)

Unnamed: 0,abstract,processed_abstract
0,Computational thinking is one of the pillars o...,computational thinking pillar acm csta standar...
1,To provide a unified view of any scientific fi...,provide unified view scientific field outreach...
2,Internationalization of computer science educa...,internationalization computer science educatio...
3,In order to create early exposure and to guide...,order create early exposure guide talent compu...
4,"We describe myro.chuck, a Python module for co...",describe myro.chuck python module control musi...
5,This work presents an approach how student-cen...,work present approach student center computer ...


# What is topic-modelling?


In machine learning and natural language processing, a topic model is a type of statistical model for discovering the abstract "topics" that occur in a collection of documents. Topic modeling is a frequently used text-mining tool for discovery of hidden semantic structures in a text body. Intuitively, given that a document is about a particular topic, one would expect particular words to appear in the document more or less frequently: "dog" and "bone" will appear more often in documents about dogs, "cat" and "meow" will appear in documents about cats, and "the" and "is" will appear equally in both. A document typically concerns multiple topics in different proportions; thus, in a document that is 10% about cats and 90% about dogs, there would probably be about 9 times more dog words than cat words.

The "topics" produced by topic modeling techniques are clusters of similar words. A topic model captures this intuition in a mathematical framework, which allows examining a set of documents and discovering, based on the statistics of the words in each, what the topics might be and what each document's balance of topics is. It involves various techniques of dimensionality reduction (mostly non-linear) and unsupervised learning, such as LDA, SVD, and autoencoders.

Source: Wikipedia

In [12]:
# Creating a vectorizer
# Bag-of-words counts over the cleaned abstracts. Terms must occur in at least
# 5 documents and in at most 90% of them; a token is 3+ characters drawn from
# letters and hyphens. The pattern is a raw string so that `\-` reaches the
# regex engine verbatim instead of being an invalid string escape.
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')

data_vectorized = vectorizer.fit_transform(papers["processed_abstract"])

In [13]:
# Number of topics/components shared by the LDA, NMF and LSI models fitted below
NUM_TOPICS = 10

In [14]:
# Latent Dirichlet Allocation Model
# Online variational LDA is stochastic; a fixed random_state makes the topics
# (and every output derived from them) reproducible across kernel restarts.
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',
                                verbose=True, random_state=42)
# Document-topic matrix: one row per abstract, one column per topic
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [15]:
# Non-Negative Matrix Factorization Model
# random_state is fixed so that the factorization is reproducible across runs.
nmf = NMF(n_components=NUM_TOPICS, random_state=42)
data_nmf = nmf.fit_transform(data_vectorized)

In [16]:
# Latent Semantic Indexing Model using Truncated SVD
# The default solver is randomized; random_state is fixed for reproducible components.
lsi = TruncatedSVD(n_components=NUM_TOPICS, random_state=42)
data_lsi = lsi.fit_transform(data_vectorized)

In [17]:
# Functions for printing keywords for each topic
def selected_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    Args:
        model: fitted decomposition exposing ``components_``, iterated as one
            row of per-term weights per topic.
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()`` or
            the legacy ``get_feature_names()``; its vocabulary order must match
            the columns of ``model.components_``.
        top_n: number of terms printed per topic.

    Prints, for each topic, a ``Topic <idx>:`` line followed by a list of
    ``(term, weight)`` pairs in descending weight order.
    """
    # Resolve the vocabulary once. The original rebuilt it for every printed
    # term, and the legacy accessor no longer exists in newer scikit-learn.
    get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    # str() keeps the printed terms as plain Python strings whichever accessor is used.
    feature_names = [str(name) for name in get_names()]

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending, so walk it backwards to take the top_n largest weights
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])

In [18]:
# Keywords for topics clustered by Latent Dirichlet Allocation
# (each printed pair is a term and its weight in that topic's row of lda.components_)
print("LDA Model:")
selected_topics(lda, vectorizer)

LDA Model:
Topic 0:
[('student', 127.60166474621336), ('software', 82.94175018402183), ('program', 63.381017228829684), ('college', 58.131162668724485), ('exam', 55.79049028127986), ('physical', 55.211495441031985), ('group', 50.08499833536321), ('class', 43.828956908818064), ('project', 40.53338059016386), ('year', 39.66207403224869)]
Topic 1:
[('student', 216.33738008970852), ('study', 140.03033567353123), ('woman', 118.35091621850496), ('research', 98.37244219084423), ('gender', 94.184439290401), ('female', 84.78334976418998), ('social', 73.75610610210438), ('result', 72.67751980451604), ('programming', 64.17908143268087), ('factor', 62.941064172368854)]
Topic 2:
[('robot', 55.54249324383043), ('quantum', 52.800489422297495), ('innovation', 30.7700802260983), ('database', 20.291979754413724), ('robotic', 17.956293201791556), ('network', 16.932448914786452), ('new', 16.06744025755558), ('physics', 15.497326064150677), ('internet', 14.87763215191303), ('mining', 13.31988589446136)]
To

In [19]:
# Keywords for topics clustered by Non-Negative Matrix Factorization
print("NMF Model:")
selected_topics(nmf, vectorizer)

NMF Model:
Topic 0:
[('student', 11.841895146329492), ('study', 1.0271850030775798), ('experience', 0.8077371170222261), ('work', 0.5436873653377242), ('learning', 0.5392731643878721), ('learn', 0.5204636474102546), ('result', 0.48941845647818644), ('increase', 0.4757433693037556), ('engagement', 0.46340113234417396), ('high', 0.4568145565985025)]
Topic 1:
[('research', 5.565407625747055), ('study', 2.657836029371493), ('education', 2.3985254047450755), ('paper', 1.1059938829218106), ('result', 1.0113098129512474), ('area', 0.7813743045354075), ('compute', 0.7276118177218086), ('field', 0.6530065332551099), ('present', 0.6493080574909873), ('information', 0.6447187364032616)]
Topic 2:
[('course', 8.05015037353611), ('teach', 1.1541407425682302), ('major', 0.6902568622041483), ('student', 0.627848799704771), ('university', 0.5650839200192734), ('offer', 0.5310541286963466), ('topic', 0.451658844104543), ('level', 0.4047265702847715), ('new', 0.37479777947483034), ('include', 0.365586114

In [20]:
# Keywords for topics clustered by Latent Semantic Indexing (Truncated SVD)
print("LSI Model:")
selected_topics(lsi, vectorizer)

LSI Model:
Topic 0:
[('student', 0.6341386097693504), ('course', 0.2839540226934436), ('use', 0.20494908252945304), ('programming', 0.14121829872831598), ('study', 0.13261625386605308), ('paper', 0.1314224662179994), ('school', 0.12602802770463414), ('program', 0.12027289916341224), ('computing', 0.11464499514411107), ('education', 0.11084735221253801)]
Topic 1:
[('computing', 0.25289817603289005), ('teacher', 0.23317461861727068), ('research', 0.21475245930782236), ('education', 0.20504321042309223), ('use', 0.18442550769120192), ('paper', 0.14608648409893318), ('program', 0.1201078332925205), ('design', 0.11103657161469974), ('school', 0.1091427802303546), ('curriculum', 0.0992089031081561)]
Topic 2:
[('course', 0.8556219527490541), ('teach', 0.12461923626804097), ('computing', 0.07626673441918316), ('cloud', 0.06786383270748213), ('major', 0.06439911518017653), ('data', 0.06298811600291482), ('offer', 0.055169185462211445), ('mobile', 0.05080463544582148), ('new', 0.0500885907041106

In [21]:
# Transforming an individual sentence
# The new text goes through the same cleaning function and the already-fitted
# vectorizer before the fitted LDA model infers its topic weights; `x` holds
# one weight per topic for this single text.
text = spacy_tokenizer("spaCy also comes with a built-in named entity visualizer that lets you check your model's predictions in your browser. You can pass in one or more Doc objects and start a web server, export HTML files or view the visualization directly from a Jupyter Notebook.")
x = lda.transform(vectorizer.transform([text]))[0]
print(x)

[0.00714552 0.16014486 0.00714542 0.37911129 0.00714343 0.4107318
 0.00714588 0.00714348 0.00714511 0.00714321]


The index in the above list with the largest value represents the most dominant topic for the given text.

# How to interpret this graph?

1- Topics on the left while their respective keywords are on the right.

2- Larger topics are more frequent, and the closer two topics are, the more similar they are.

3- Keywords are selected based on their frequency and how well they discriminate between topics.

Hover over the topics on the left to get information about their keywords on the right.

In [22]:
# Enable inline rendering of pyLDAvis output in the notebook
pyLDAvis.enable_notebook()
# Prepare the interactive dashboard for the unigram LDA model from the fitted model,
# the document-term matrix and the vectorizer; mds='tsne' requests t-SNE for the topic layout
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
dash

# Visualizing LSI(SVD) scatterplot

We project the data onto the first two LSI components so that the similarity between keywords can be read from the distance between their markers.

In [23]:
# Project every vocabulary term onto the top two LSI components.
# The scatter plots below label the points with the vectorizer's feature names,
# so the rows of `data_2d` must be terms. Fitting on the transposed
# document-term matrix yields one 2-D point per term; fitting on
# `data_vectorized` itself yields one point per document, which does not line
# up with the term labels.
# random_state is fixed because the default SVD solver is randomized.
svd_2d = TruncatedSVD(n_components=2, random_state=42)
data_2d = svd_2d.fit_transform(data_vectorized.T)

In [24]:
# Resolve the vocabulary once (it was previously rebuilt for both `text` and
# `hovertext`), preferring get_feature_names_out() and falling back to the
# legacy accessor on older scikit-learn releases.
_get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
feature_names = list(_get_names())

# Marker-only scatter of the 2-D projection; hovering a point shows its label
trace = go.Scattergl(
    x = data_2d[:,0],
    y = data_2d[:,1],
    mode = 'markers',
    marker = dict(
        color = '#FFBAD2',
        line = dict(width = 1)
    ),
    text = feature_names,
    hovertext = feature_names,
    hoverinfo = 'text'
)
data = [trace]
iplot(data, filename='scatter-mode')

# The text version of the scatter plot looks cluttered, but you can zoom in to read individual labels

In [25]:
# Resolve the vocabulary once, preferring get_feature_names_out() and falling
# back to the legacy accessor on older scikit-learn releases.
_get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
feature_names = list(_get_names())

# Same 2-D projection as the marker plot, drawn as text labels instead of dots
trace = go.Scattergl(
    x = data_2d[:,0],
    y = data_2d[:,1],
    mode = 'text',
    marker = dict(
        color = '#FFBAD2',
        line = dict(width = 1)
    ),
    text = feature_names
)
data = [trace]
iplot(data, filename='text-scatter-mode')

# Let's see what happens when we use a spaCy based bigram tokenizer for topic modelling

In [26]:
def spacy_bigram_tokenizer(phrase, pipeline=None):
    """Pair the non-noun tokens of ``phrase`` with the most recent noun.

    Args:
        phrase: raw text to process.
        pipeline: callable that turns text into an iterable of tokens exposing
            ``text`` and ``pos_``. Defaults to the module-level ``nlp``. The
            original used the blank ``English()`` parser, which has no tagger,
            so ``pos_`` was always empty and the NOUN branch never fired.

    Returns:
        All generated "<non-noun> <noun>" pairs joined by single spaces.
    """
    if pipeline is None:
        pipeline = nlp  # tagged pipeline, so that pos_ is actually populated
    doc = pipeline(phrase)

    non_nouns = []
    pairs = []
    noun = ""  # most recent noun seen so far; empty until the first noun

    for item in doc:
        if item.pos_ == "NOUN":
            noun = item.text
        else:
            non_nouns.append(item.text)

        # NOTE(review): after every token, each non-noun collected so far is
        # paired again with the current noun, so the output grows quadratically
        # and contains repeats. Kept as in the original because the intended
        # pairing is not evident from this notebook - confirm before relying on it.
        pairs.extend(word + " " + noun for word in non_nouns)

    return " ".join(pairs)

In [27]:
# Count unigrams and bigrams (ngram_range=(1,2)) over the cleaned abstracts.
# NOTE(review): spacy_bigram_tokenizer defined in the previous cell is not passed
# to this vectorizer; the bigrams here come from CountVectorizer itself.
bivectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, ngram_range=(1,2))
bigram_vectorized = bivectorizer.fit_transform(papers["processed_abstract"])

# LDA for bigram data

In [28]:
# LDA on the unigram+bigram counts. Online variational LDA is stochastic, so
# random_state is fixed to make the resulting topics reproducible.
bi_lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',
                                   verbose=True, random_state=42)
# Document-topic matrix for the bigram model
data_bi_lda = bi_lda.fit_transform(bigram_vectorized)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


# Topics for bigram model

In [29]:
# Top terms (unigrams and bigrams) per topic for the LDA model fitted on bigram_vectorized
print("Bi-LDA Model:")
selected_topics(bi_lda, bivectorizer)

Bi-LDA Model:
Topic 0:
[('student', 1453.3901759257847), ('course', 842.7610266659372), ('use', 411.4356737775034), ('cs', 374.6398885341862), ('study', 289.5795315021002), ('programming', 287.2363331154939), ('experience', 235.02203409119406), ('result', 215.71483825564349), ('teach', 208.5358604408345), ('paper', 206.7140709289856)]
Topic 1:
[('research', 315.6586275743142), ('information', 202.14933682472906), ('technology', 153.91321143623145), ('datum', 139.10303230387072), ('study', 137.57896999785197), ('education', 129.24258499293603), ('computing', 119.47768720687371), ('field', 114.8881086633722), ('paper', 109.17551987025614), ('cs', 108.27870906505997)]
Topic 2:
[('student', 477.5255159941592), ('school', 378.89232829667066), ('use', 252.99882347103474), ('teacher', 238.54636940487902), ('learn', 184.88031375289054), ('concept', 167.84057723661684), ('paper', 166.13113802603402), ('cs', 161.7943209190052), ('education', 161.02829826825229), ('curriculum', 160.76757399280154

In [30]:
# Interactive dashboard for the bigram LDA model; inline display relies on
# pyLDAvis.enable_notebook() having been called in the earlier pyLDAvis cell.
bi_dash = pyLDAvis.sklearn.prepare(bi_lda, bigram_vectorized, bivectorizer, mds='tsne')
bi_dash