In [1]:
import pandas as pd
import numpy as np
import altair as alt
import seaborn as sns
import nltk
import re

import gensim.corpora as corpora
from gensim.sklearn_api import TfIdfTransformer



In [2]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
df = pd.read_csv("./IEEE VIS papers 1990-2018 - Main dataset.csv")

In [4]:
df = df.dropna(subset=['Abstract'])

In [5]:
stopwords = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()
tfidf = TfidfVectorizer()

In [6]:
df['Abstract'] = [abstract.lower() for abstract in df['Abstract']]
df.iloc[0]['Abstract']

'the use of critical point analysis to generate representations of the vector field topology of numerical flow data sets is discussed. critical points are located and characterized in a two-dimensional domain, which may be either a two-dimensional flow field or the tangential velocity field near a three-dimensional body. tangent curves are then integrated out along the principal directions of certain classes of critical points. the points and curves are linked to form a skeleton representing the two-dimensional vector field topology. when generated from the tangential velocity field near a body in a three-dimensional flow, the skeleton includes the critical points and curves which provide a basis for analyzing the three-dimensional structure of the flow separation.&lt;&lt;etx&gt;&gt;'

In [7]:
df['Abstract'] = [abstract[:-19] for abstract in df['Abstract']]
df.iloc[0]['Abstract']

'the use of critical point analysis to generate representations of the vector field topology of numerical flow data sets is discussed. critical points are located and characterized in a two-dimensional domain, which may be either a two-dimensional flow field or the tangential velocity field near a three-dimensional body. tangent curves are then integrated out along the principal directions of certain classes of critical points. the points and curves are linked to form a skeleton representing the two-dimensional vector field topology. when generated from the tangential velocity field near a body in a three-dimensional flow, the skeleton includes the critical points and curves which provide a basis for analyzing the three-dimensional structure of the flow separation.'

In [8]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

df['Abstract'] = df['Abstract'].apply(lambda x: remove_punct(x))

df.iloc[0]['Abstract']

'the use of critical point analysis to generate representations of the vector field topology of numerical flow data sets is discussed critical points are located and characterized in a twodimensional domain which may be either a twodimensional flow field or the tangential velocity field near a threedimensional body tangent curves are then integrated out along the principal directions of certain classes of critical points the points and curves are linked to form a skeleton representing the twodimensional vector field topology when generated from the tangential velocity field near a body in a threedimensional flow the skeleton includes the critical points and curves which provide a basis for analyzing the threedimensional structure of the flow separation'

In [10]:
def tokenize(text):
    tokens = re.split("\W+", text)
    return tokens

df['Abstract'] = df['Abstract'].apply(lambda x: tokenize(x))

df.iloc[0]['Abstract']

['the',
 'use',
 'of',
 'critical',
 'point',
 'analysis',
 'to',
 'generate',
 'representations',
 'of',
 'the',
 'vector',
 'field',
 'topology',
 'of',
 'numerical',
 'flow',
 'data',
 'sets',
 'is',
 'discussed',
 'critical',
 'points',
 'are',
 'located',
 'and',
 'characterized',
 'in',
 'a',
 'twodimensional',
 'domain',
 'which',
 'may',
 'be',
 'either',
 'a',
 'twodimensional',
 'flow',
 'field',
 'or',
 'the',
 'tangential',
 'velocity',
 'field',
 'near',
 'a',
 'threedimensional',
 'body',
 'tangent',
 'curves',
 'are',
 'then',
 'integrated',
 'out',
 'along',
 'the',
 'principal',
 'directions',
 'of',
 'certain',
 'classes',
 'of',
 'critical',
 'points',
 'the',
 'points',
 'and',
 'curves',
 'are',
 'linked',
 'to',
 'form',
 'a',
 'skeleton',
 'representing',
 'the',
 'twodimensional',
 'vector',
 'field',
 'topology',
 'when',
 'generated',
 'from',
 'the',
 'tangential',
 'velocity',
 'field',
 'near',
 'a',
 'body',
 'in',
 'a',
 'threedimensional',
 'flow',
 'the

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/junyuan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopwords]
    return text

df['Abstract'] = df['Abstract'].apply(lambda x: remove_stopwords(x))

df.iloc[0]['Abstract']

['use',
 'critical',
 'point',
 'analysis',
 'generate',
 'representations',
 'vector',
 'field',
 'topology',
 'numerical',
 'flow',
 'data',
 'sets',
 'discussed',
 'critical',
 'points',
 'located',
 'characterized',
 'twodimensional',
 'domain',
 'may',
 'either',
 'twodimensional',
 'flow',
 'field',
 'tangential',
 'velocity',
 'field',
 'near',
 'threedimensional',
 'body',
 'tangent',
 'curves',
 'integrated',
 'along',
 'principal',
 'directions',
 'certain',
 'classes',
 'critical',
 'points',
 'points',
 'curves',
 'linked',
 'form',
 'skeleton',
 'representing',
 'twodimensional',
 'vector',
 'field',
 'topology',
 'generated',
 'tangential',
 'velocity',
 'field',
 'near',
 'body',
 'threedimensional',
 'flow',
 'skeleton',
 'includes',
 'critical',
 'points',
 'curves',
 'provide',
 'basis',
 'analyzing',
 'threedimensional',
 'structure',
 'flow',
 'separation']

In [13]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/junyuan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

df['Abstract'] = df['Abstract'].apply(lambda x: lemmatizing(x))

df.iloc[0]['Abstract']

['use',
 'critical',
 'point',
 'analysis',
 'generate',
 'representation',
 'vector',
 'field',
 'topology',
 'numerical',
 'flow',
 'data',
 'set',
 'discussed',
 'critical',
 'point',
 'located',
 'characterized',
 'twodimensional',
 'domain',
 'may',
 'either',
 'twodimensional',
 'flow',
 'field',
 'tangential',
 'velocity',
 'field',
 'near',
 'threedimensional',
 'body',
 'tangent',
 'curve',
 'integrated',
 'along',
 'principal',
 'direction',
 'certain',
 'class',
 'critical',
 'point',
 'point',
 'curve',
 'linked',
 'form',
 'skeleton',
 'representing',
 'twodimensional',
 'vector',
 'field',
 'topology',
 'generated',
 'tangential',
 'velocity',
 'field',
 'near',
 'body',
 'threedimensional',
 'flow',
 'skeleton',
 'includes',
 'critical',
 'point',
 'curve',
 'provide',
 'basis',
 'analyzing',
 'threedimensional',
 'structure',
 'flow',
 'separation']

In [15]:
df['Abstract']

1       [use, critical, point, analysis, generate, rep...
2       [author, discus, fast, flow, analysis, softwar...
3       [vis5d, system, provides, highly, interactive,...
4       [author, present, simple, procedural, interfac...
5       [idea, technique, visualizing, volumetric, dat...
                              ...                        
3103    [completing, text, analysis, task, continuous,...
3104    [social, data, chart, visual, presentation, qu...
3105    [constructing, latent, vector, representation,...
3106    [present, smartexplore, novel, visual, analyti...
3107    [deep, neural, network, dnns, vulnerable, mali...
Name: Abstract, Length: 3035, dtype: object

In [16]:
# Create Dictionary
id2word = corpora.Dictionary(df['Abstract'])

model = TfIdfTransformer(dictionary=id2word)

In [31]:
id2word.doc2bow(df['Abstract'].iloc[9])

[(9, 1),
 (24, 1),
 (49, 1),
 (56, 1),
 (119, 3),
 (125, 2),
 (127, 1),
 (128, 1),
 (152, 1),
 (168, 2),
 (181, 2),
 (201, 1),
 (209, 1),
 (334, 2),
 (341, 1),
 (342, 1),
 (343, 1),
 (344, 1),
 (345, 1),
 (346, 1),
 (347, 1),
 (348, 1),
 (349, 1),
 (350, 1),
 (351, 2),
 (352, 1),
 (353, 1),
 (354, 1),
 (355, 1),
 (356, 1),
 (357, 1),
 (358, 1),
 (359, 1),
 (360, 1),
 (361, 1),
 (362, 1),
 (363, 1),
 (364, 1),
 (365, 1),
 (366, 1),
 (367, 1),
 (368, 1),
 (369, 1),
 (370, 1),
 (371, 1),
 (372, 1),
 (373, 1),
 (374, 2),
 (375, 3),
 (376, 1),
 (377, 2),
 (378, 1),
 (379, 3),
 (380, 1),
 (381, 2),
 (382, 1),
 (383, 1),
 (384, 1),
 (385, 1)]

In [18]:
# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in df['Abstract']]

num_docs = id2word.num_docs
num_terms = len(id2word.keys())

In [19]:
for doc in corpus[:1]:
    print([[id, id2word[id], freq] for id, freq in doc])

[[0, 'along', 1], [1, 'analysis', 1], [2, 'analyzing', 1], [3, 'basis', 1], [4, 'body', 2], [5, 'certain', 1], [6, 'characterized', 1], [7, 'class', 1], [8, 'critical', 4], [9, 'curve', 3], [10, 'data', 1], [11, 'direction', 1], [12, 'discussed', 1], [13, 'domain', 1], [14, 'either', 1], [15, 'field', 5], [16, 'flow', 4], [17, 'form', 1], [18, 'generate', 1], [19, 'generated', 1], [20, 'includes', 1], [21, 'integrated', 1], [22, 'linked', 1], [23, 'located', 1], [24, 'may', 1], [25, 'near', 2], [26, 'numerical', 1], [27, 'point', 5], [28, 'principal', 1], [29, 'provide', 1], [30, 'representation', 1], [31, 'representing', 1], [32, 'separation', 1], [33, 'set', 1], [34, 'skeleton', 2], [35, 'structure', 1], [36, 'tangent', 1], [37, 'tangential', 2], [38, 'threedimensional', 3], [39, 'topology', 2], [40, 'twodimensional', 3], [41, 'use', 1], [42, 'vector', 2], [43, 'velocity', 2]]


In [24]:
tfidf_corpus = model.fit_transform(corpus)

In [25]:
# construct an array of tf-idf vectors
from gensim.matutils import corpus2dense, corpus2csc

corpus_tfidf_dense = corpus2dense(tfidf_corpus, num_terms, num_docs)

In [26]:
corpus_tfidf_dense.shape

(16319, 3035)

In [38]:
X = corpus_tfidf_dense.T

In [40]:
type(X)

numpy.ndarray

## Clustering

In [41]:
cluster_model = KMeans(n_clusters=4)

In [43]:
cluster = cluster_model.fit(X)

In [None]:
''' 
    explain the cluster 
1. generate the avg tfidf for different clusters
2. what are the first 10,20 most important words that are used in the abstracts of this cluster

'''


