In [1]:
!pip install --upgrade gensim

Requirement already up-to-date: gensim in c:\users\sydney\appdata\local\programs\python\python38-32\lib\site-packages (3.8.3)


In [2]:
import pandas as pd
import numpy as np
import altair as alt
import seaborn as sns
import nltk
import re

In [3]:
from sklearn.cluster import KMeans
import gensim.corpora as corpora
from gensim.sklearn_api import TfIdfTransformer

In [4]:
df = pd.read_csv('./IEEE VIS papers 1990-2018 - Main dataset.csv')

In [5]:
df = df.dropna(subset=['Abstract'])

In [6]:
stopwords = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()

In [7]:
df['Abstract'] = [abstract.lower() for abstract in df['Abstract']]
df.iloc[0]['Abstract']

'the use of critical point analysis to generate representations of the vector field topology of numerical flow data sets is discussed. critical points are located and characterized in a two-dimensional domain, which may be either a two-dimensional flow field or the tangential velocity field near a three-dimensional body. tangent curves are then integrated out along the principal directions of certain classes of critical points. the points and curves are linked to form a skeleton representing the two-dimensional vector field topology. when generated from the tangential velocity field near a body in a three-dimensional flow, the skeleton includes the critical points and curves which provide a basis for analyzing the three-dimensional structure of the flow separation.&lt;&lt;etx&gt;&gt;'

In [8]:
df['Abstract'] = [abstract[:-19] for abstract in df['Abstract']]
df.iloc[0]['Abstract']

'the use of critical point analysis to generate representations of the vector field topology of numerical flow data sets is discussed. critical points are located and characterized in a two-dimensional domain, which may be either a two-dimensional flow field or the tangential velocity field near a three-dimensional body. tangent curves are then integrated out along the principal directions of certain classes of critical points. the points and curves are linked to form a skeleton representing the two-dimensional vector field topology. when generated from the tangential velocity field near a body in a three-dimensional flow, the skeleton includes the critical points and curves which provide a basis for analyzing the three-dimensional structure of the flow separation.'

In [9]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

df['Abstract'] = df['Abstract'].apply(lambda x: remove_punct(x))

df.iloc[0]['Abstract']

'the use of critical point analysis to generate representations of the vector field topology of numerical flow data sets is discussed critical points are located and characterized in a twodimensional domain which may be either a twodimensional flow field or the tangential velocity field near a threedimensional body tangent curves are then integrated out along the principal directions of certain classes of critical points the points and curves are linked to form a skeleton representing the twodimensional vector field topology when generated from the tangential velocity field near a body in a threedimensional flow the skeleton includes the critical points and curves which provide a basis for analyzing the threedimensional structure of the flow separation'

In [11]:
def tokenize(text):
    tokens = re.split("\W+", text)
    return tokens

df['Abstract'] = df['Abstract'].apply(lambda x: tokenize(x))

df.iloc[0]['Abstract']

['the',
 'use',
 'of',
 'critical',
 'point',
 'analysis',
 'to',
 'generate',
 'representations',
 'of',
 'the',
 'vector',
 'field',
 'topology',
 'of',
 'numerical',
 'flow',
 'data',
 'sets',
 'is',
 'discussed',
 'critical',
 'points',
 'are',
 'located',
 'and',
 'characterized',
 'in',
 'a',
 'twodimensional',
 'domain',
 'which',
 'may',
 'be',
 'either',
 'a',
 'twodimensional',
 'flow',
 'field',
 'or',
 'the',
 'tangential',
 'velocity',
 'field',
 'near',
 'a',
 'threedimensional',
 'body',
 'tangent',
 'curves',
 'are',
 'then',
 'integrated',
 'out',
 'along',
 'the',
 'principal',
 'directions',
 'of',
 'certain',
 'classes',
 'of',
 'critical',
 'points',
 'the',
 'points',
 'and',
 'curves',
 'are',
 'linked',
 'to',
 'form',
 'a',
 'skeleton',
 'representing',
 'the',
 'twodimensional',
 'vector',
 'field',
 'topology',
 'when',
 'generated',
 'from',
 'the',
 'tangential',
 'velocity',
 'field',
 'near',
 'a',
 'body',
 'in',
 'a',
 'threedimensional',
 'flow',
 'the

In [12]:
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopwords]
    return text

df['Abstract'] = df['Abstract'].apply(lambda x: remove_stopwords(x))

df.iloc[0]['Abstract']

['use',
 'critical',
 'point',
 'analysis',
 'generate',
 'representations',
 'vector',
 'field',
 'topology',
 'numerical',
 'flow',
 'data',
 'sets',
 'discussed',
 'critical',
 'points',
 'located',
 'characterized',
 'twodimensional',
 'domain',
 'may',
 'either',
 'twodimensional',
 'flow',
 'field',
 'tangential',
 'velocity',
 'field',
 'near',
 'threedimensional',
 'body',
 'tangent',
 'curves',
 'integrated',
 'along',
 'principal',
 'directions',
 'certain',
 'classes',
 'critical',
 'points',
 'points',
 'curves',
 'linked',
 'form',
 'skeleton',
 'representing',
 'twodimensional',
 'vector',
 'field',
 'topology',
 'generated',
 'tangential',
 'velocity',
 'field',
 'near',
 'body',
 'threedimensional',
 'flow',
 'skeleton',
 'includes',
 'critical',
 'points',
 'curves',
 'provide',
 'basis',
 'analyzing',
 'threedimensional',
 'structure',
 'flow',
 'separation']

In [13]:
def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

df['Abstract'] = df['Abstract'].apply(lambda x: lemmatizing(x))

df.iloc[0]['Abstract']

['use',
 'critical',
 'point',
 'analysis',
 'generate',
 'representation',
 'vector',
 'field',
 'topology',
 'numerical',
 'flow',
 'data',
 'set',
 'discussed',
 'critical',
 'point',
 'located',
 'characterized',
 'twodimensional',
 'domain',
 'may',
 'either',
 'twodimensional',
 'flow',
 'field',
 'tangential',
 'velocity',
 'field',
 'near',
 'threedimensional',
 'body',
 'tangent',
 'curve',
 'integrated',
 'along',
 'principal',
 'direction',
 'certain',
 'class',
 'critical',
 'point',
 'point',
 'curve',
 'linked',
 'form',
 'skeleton',
 'representing',
 'twodimensional',
 'vector',
 'field',
 'topology',
 'generated',
 'tangential',
 'velocity',
 'field',
 'near',
 'body',
 'threedimensional',
 'flow',
 'skeleton',
 'includes',
 'critical',
 'point',
 'curve',
 'provide',
 'basis',
 'analyzing',
 'threedimensional',
 'structure',
 'flow',
 'separation']

In [14]:
df['Abstract']

1       [use, critical, point, analysis, generate, rep...
2       [author, discus, fast, flow, analysis, softwar...
3       [vis5d, system, provides, highly, interactive,...
4       [author, present, simple, procedural, interfac...
5       [idea, technique, visualizing, volumetric, dat...
                              ...                        
3103    [completing, text, analysis, task, continuous,...
3104    [social, data, chart, visual, presentation, qu...
3105    [constructing, latent, vector, representation,...
3106    [present, smartexplore, novel, visual, analyti...
3107    [deep, neural, network, dnns, vulnerable, mali...
Name: Abstract, Length: 3035, dtype: object

In [15]:
# Create Dictionary
id2word = corpora.Dictionary(df['Abstract'])

model = TfIdfTransformer(dictionary=id2word)

In [16]:
id2word.doc2bow(df['Abstract'].iloc[9])

[(9, 1),
 (24, 1),
 (49, 1),
 (56, 1),
 (119, 3),
 (125, 2),
 (127, 1),
 (128, 1),
 (152, 1),
 (168, 2),
 (181, 2),
 (201, 1),
 (209, 1),
 (334, 2),
 (341, 1),
 (342, 1),
 (343, 1),
 (344, 1),
 (345, 1),
 (346, 1),
 (347, 1),
 (348, 1),
 (349, 1),
 (350, 1),
 (351, 2),
 (352, 1),
 (353, 1),
 (354, 1),
 (355, 1),
 (356, 1),
 (357, 1),
 (358, 1),
 (359, 1),
 (360, 1),
 (361, 1),
 (362, 1),
 (363, 1),
 (364, 1),
 (365, 1),
 (366, 1),
 (367, 1),
 (368, 1),
 (369, 1),
 (370, 1),
 (371, 1),
 (372, 1),
 (373, 1),
 (374, 2),
 (375, 3),
 (376, 1),
 (377, 2),
 (378, 1),
 (379, 3),
 (380, 1),
 (381, 2),
 (382, 1),
 (383, 1),
 (384, 1),
 (385, 1)]

In [17]:
# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in df['Abstract']]

num_docs = id2word.num_docs
num_terms = len(id2word.keys())

In [18]:
for doc in corpus[:1]:
    print([[id, id2word[id], freq] for id, freq in doc])

[[0, 'along', 1], [1, 'analysis', 1], [2, 'analyzing', 1], [3, 'basis', 1], [4, 'body', 2], [5, 'certain', 1], [6, 'characterized', 1], [7, 'class', 1], [8, 'critical', 4], [9, 'curve', 3], [10, 'data', 1], [11, 'direction', 1], [12, 'discussed', 1], [13, 'domain', 1], [14, 'either', 1], [15, 'field', 5], [16, 'flow', 4], [17, 'form', 1], [18, 'generate', 1], [19, 'generated', 1], [20, 'includes', 1], [21, 'integrated', 1], [22, 'linked', 1], [23, 'located', 1], [24, 'may', 1], [25, 'near', 2], [26, 'numerical', 1], [27, 'point', 5], [28, 'principal', 1], [29, 'provide', 1], [30, 'representation', 1], [31, 'representing', 1], [32, 'separation', 1], [33, 'set', 1], [34, 'skeleton', 2], [35, 'structure', 1], [36, 'tangent', 1], [37, 'tangential', 2], [38, 'threedimensional', 3], [39, 'topology', 2], [40, 'twodimensional', 3], [41, 'use', 1], [42, 'vector', 2], [43, 'velocity', 2]]


In [19]:
tfidf_corpus = model.fit_transform(corpus)

In [20]:
# construct an array of tf-idf vectors
from gensim.matutils import corpus2dense, corpus2csc

corpus_tfidf_dense = corpus2dense(tfidf_corpus, num_terms, num_docs)

In [21]:
corpus_tfidf_dense.shape

(16319, 3035)

In [22]:
x = corpus_tfidf_dense.T

In [23]:
words = []
for i in range(x.shape[1]):
    words.append(id2word[i])

In [24]:
mat = pd.DataFrame(data=corpus_tfidf_dense, index=words)

In [25]:
mat = mat[mat.max(axis=1) > 0.2]
wordtfidf = pd.DataFrame(data=mat.values.T, columns=mat.index)

In [26]:
wordtfidf

Unnamed: 0,along,analysis,analyzing,basis,body,certain,class,critical,curve,direction,...,olfactory,viscent,cvo,pae,skewness,nonvisualization,smartexplore,datapath,datapaths,dnns
0,0.074446,0.029241,0.070333,0.092756,0.217708,0.087174,0.088603,0.315138,0.257473,0.078983,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.147708,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3030,0.000000,0.037524,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
3031,0.000000,0.012650,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.51407,0.000000,0.000000,0.000000,0.000000,0.000000
3032,0.000000,0.017536,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
3033,0.000000,0.088468,0.070929,0.000000,0.000000,0.000000,0.000000,0.000000,0.086552,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.399447,0.399447,0.000000,0.000000,0.000000


In [27]:
type(x)

numpy.ndarray

In [28]:
tfidf_df = pd.DataFrame(wordtfidf)

In [29]:
tfidf_df

Unnamed: 0,along,analysis,analyzing,basis,body,certain,class,critical,curve,direction,...,olfactory,viscent,cvo,pae,skewness,nonvisualization,smartexplore,datapath,datapaths,dnns
0,0.074446,0.029241,0.070333,0.092756,0.217708,0.087174,0.088603,0.315138,0.257473,0.078983,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.147708,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3030,0.000000,0.037524,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
3031,0.000000,0.012650,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.51407,0.000000,0.000000,0.000000,0.000000,0.000000
3032,0.000000,0.017536,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
3033,0.000000,0.088468,0.070929,0.000000,0.000000,0.000000,0.000000,0.000000,0.086552,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.399447,0.399447,0.000000,0.000000,0.000000


## Clustering

In [30]:
cluster_model = KMeans(n_clusters=4)

In [31]:
cluster = cluster_model.fit(tfidf_df)

In [32]:
cluster.labels_

array([1, 3, 3, ..., 3, 3, 3])

In [33]:
tfidf_cluster_df = pd.DataFrame(tfidf_df)

In [34]:
tfidf_cluster_df.shape

(3035, 6894)

In [35]:
tfidf_cluster_df

Unnamed: 0,along,analysis,analyzing,basis,body,certain,class,critical,curve,direction,...,olfactory,viscent,cvo,pae,skewness,nonvisualization,smartexplore,datapath,datapaths,dnns
0,0.074446,0.029241,0.070333,0.092756,0.217708,0.087174,0.088603,0.315138,0.257473,0.078983,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.147708,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3030,0.000000,0.037524,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
3031,0.000000,0.012650,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.51407,0.000000,0.000000,0.000000,0.000000,0.000000
3032,0.000000,0.017536,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
3033,0.000000,0.088468,0.070929,0.000000,0.000000,0.000000,0.000000,0.000000,0.086552,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.399447,0.399447,0.000000,0.000000,0.000000


In [36]:
sample = tfidf_cluster_df[["along","analysis"]]

In [37]:
alt.Chart(sample).mark_circle(size=60).encode(
    x="along:Q",
    y="analysis:Q",
    color="Pred:N"
)

1. Use the word as column name
2. Calculate the average vector for each cluster
    group by cluster, mean of all tfidf values, calculate the average value across all columns
3. For each cluster group, get the words with the highest tfidf value for each cluster
    argmax() Find argmax for each cluster and plot bar chart for each cluster


In [38]:
# top 10 words for each cluster

group_key_words = []
for label in range(4):
    # get the sum tf-idf for each word, do sum() across rows for each column
    group_df = tfidf_cluster_df[cluster.labels_ == label].mean(axis=0)
    # sort the tf-idf values
    to_sort = [{'freq': group_df[x], 'word': x} for x in group_df.index]
    to_sort = sorted(to_sort, key=lambda d: d['freq'], reverse=True)
    # add the words to the list
    for i in range(10):
        group_key_words.append([label, to_sort[i]['word'], to_sort[i]['freq']])
keyword_df = pd.DataFrame(data=group_key_words, columns=['label', 'keyword', 'tfidf'])

In [39]:
keyword_df

Unnamed: 0,label,keyword,tfidf
0,0,surface,0.109694
1,0,mesh,0.082049
2,0,algorithm,0.035338
3,0,triangle,0.034741
4,0,shape,0.026979
5,0,vertex,0.025144
6,0,method,0.024693
7,0,point,0.024658
8,0,simplification,0.024323
9,0,model,0.02429


In [44]:
alt.Chart(keyword_df[keyword_df['label']==2]).mark_bar().encode(
    x=alt.X('tfidf:Q',  scale=alt.Scale(domain=[0, 0.1]), title='label'),
    y=alt.Y('keyword:N', sort='-x')
).properties(
    width = 200
)

In [49]:
chart = alt.hconcat()

for label in range(4):
    chart |= alt.Chart(keyword_df[keyword_df['label']==label]).mark_bar().encode(
    x=alt.X('tfidf:Q', scale=alt.Scale(domain=[0,0.12]),title='cluster'+str(label)),
    y=alt.Y('keyword:N', sort='-x')
).properties(
    width = 50
)
chart

cluster0 more about surface rendering?

different clusters are about different topics
cluster3 is different from other clusters


next step:
1. understand how the tracks are different from each other
2. for different tracks, want to know how it is different to other tracks, make a large document that contains all the abstracts for each track/conference (create 4 different documents, each contains all the abstracts for that track, generate tfidf for each document)
3. 