In [1]:
import pandas as pd
import numpy as np
import altair as alt
import seaborn as sns
import nltk
import re

In [2]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
df = pd.read_csv("./IEEE VIS papers 1990-2018 - Main dataset.csv")

In [4]:
df = df.dropna(subset=['Abstract'])

In [5]:
stopwords = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()
tfidf = TfidfVectorizer()

In [6]:
df['Abstract'] = [abstract.lower() for abstract in df['Abstract']]
df.iloc[0]['Abstract']

'the use of critical point analysis to generate representations of the vector field topology of numerical flow data sets is discussed. critical points are located and characterized in a two-dimensional domain, which may be either a two-dimensional flow field or the tangential velocity field near a three-dimensional body. tangent curves are then integrated out along the principal directions of certain classes of critical points. the points and curves are linked to form a skeleton representing the two-dimensional vector field topology. when generated from the tangential velocity field near a body in a three-dimensional flow, the skeleton includes the critical points and curves which provide a basis for analyzing the three-dimensional structure of the flow separation.&lt;&lt;etx&gt;&gt;'

In [7]:
df['Abstract'] = [abstract[:-19] for abstract in df['Abstract']]
df.iloc[0]['Abstract']

'the use of critical point analysis to generate representations of the vector field topology of numerical flow data sets is discussed. critical points are located and characterized in a two-dimensional domain, which may be either a two-dimensional flow field or the tangential velocity field near a three-dimensional body. tangent curves are then integrated out along the principal directions of certain classes of critical points. the points and curves are linked to form a skeleton representing the two-dimensional vector field topology. when generated from the tangential velocity field near a body in a three-dimensional flow, the skeleton includes the critical points and curves which provide a basis for analyzing the three-dimensional structure of the flow separation.'

In [8]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

df['Abstract'] = df['Abstract'].apply(lambda x: remove_punct(x))

df.iloc[0]['Abstract']

'the use of critical point analysis to generate representations of the vector field topology of numerical flow data sets is discussed critical points are located and characterized in a twodimensional domain which may be either a twodimensional flow field or the tangential velocity field near a threedimensional body tangent curves are then integrated out along the principal directions of certain classes of critical points the points and curves are linked to form a skeleton representing the twodimensional vector field topology when generated from the tangential velocity field near a body in a threedimensional flow the skeleton includes the critical points and curves which provide a basis for analyzing the threedimensional structure of the flow separation'

In [10]:
def tokenize(text):
    tokens = re.split("\W+", text)
    return tokens

df['Abstract'] = df['Abstract'].apply(lambda x: tokenize(x))

df.iloc[0]['Abstract']

['the',
 'use',
 'of',
 'critical',
 'point',
 'analysis',
 'to',
 'generate',
 'representations',
 'of',
 'the',
 'vector',
 'field',
 'topology',
 'of',
 'numerical',
 'flow',
 'data',
 'sets',
 'is',
 'discussed',
 'critical',
 'points',
 'are',
 'located',
 'and',
 'characterized',
 'in',
 'a',
 'twodimensional',
 'domain',
 'which',
 'may',
 'be',
 'either',
 'a',
 'twodimensional',
 'flow',
 'field',
 'or',
 'the',
 'tangential',
 'velocity',
 'field',
 'near',
 'a',
 'threedimensional',
 'body',
 'tangent',
 'curves',
 'are',
 'then',
 'integrated',
 'out',
 'along',
 'the',
 'principal',
 'directions',
 'of',
 'certain',
 'classes',
 'of',
 'critical',
 'points',
 'the',
 'points',
 'and',
 'curves',
 'are',
 'linked',
 'to',
 'form',
 'a',
 'skeleton',
 'representing',
 'the',
 'twodimensional',
 'vector',
 'field',
 'topology',
 'when',
 'generated',
 'from',
 'the',
 'tangential',
 'velocity',
 'field',
 'near',
 'a',
 'body',
 'in',
 'a',
 'threedimensional',
 'flow',
 'the

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sydney\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopwords]
    return text

df['Abstract'] = df['Abstract'].apply(lambda x: remove_stopwords(x))

df.iloc[0]['Abstract']

['use',
 'critical',
 'point',
 'analysis',
 'generate',
 'representations',
 'vector',
 'field',
 'topology',
 'numerical',
 'flow',
 'data',
 'sets',
 'discussed',
 'critical',
 'points',
 'located',
 'characterized',
 'twodimensional',
 'domain',
 'may',
 'either',
 'twodimensional',
 'flow',
 'field',
 'tangential',
 'velocity',
 'field',
 'near',
 'threedimensional',
 'body',
 'tangent',
 'curves',
 'integrated',
 'along',
 'principal',
 'directions',
 'certain',
 'classes',
 'critical',
 'points',
 'points',
 'curves',
 'linked',
 'form',
 'skeleton',
 'representing',
 'twodimensional',
 'vector',
 'field',
 'topology',
 'generated',
 'tangential',
 'velocity',
 'field',
 'near',
 'body',
 'threedimensional',
 'flow',
 'skeleton',
 'includes',
 'critical',
 'points',
 'curves',
 'provide',
 'basis',
 'analyzing',
 'threedimensional',
 'structure',
 'flow',
 'separation']

In [13]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sydney\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

df['Abstract'] = df['Abstract'].apply(lambda x: lemmatizing(x))

df.iloc[0]['Abstract']

['use',
 'critical',
 'point',
 'analysis',
 'generate',
 'representation',
 'vector',
 'field',
 'topology',
 'numerical',
 'flow',
 'data',
 'set',
 'discussed',
 'critical',
 'point',
 'located',
 'characterized',
 'twodimensional',
 'domain',
 'may',
 'either',
 'twodimensional',
 'flow',
 'field',
 'tangential',
 'velocity',
 'field',
 'near',
 'threedimensional',
 'body',
 'tangent',
 'curve',
 'integrated',
 'along',
 'principal',
 'direction',
 'certain',
 'class',
 'critical',
 'point',
 'point',
 'curve',
 'linked',
 'form',
 'skeleton',
 'representing',
 'twodimensional',
 'vector',
 'field',
 'topology',
 'generated',
 'tangential',
 'velocity',
 'field',
 'near',
 'body',
 'threedimensional',
 'flow',
 'skeleton',
 'includes',
 'critical',
 'point',
 'curve',
 'provide',
 'basis',
 'analyzing',
 'threedimensional',
 'structure',
 'flow',
 'separation']

In [15]:
df['Abstract']

1       [use, critical, point, analysis, generate, rep...
2       [author, discus, fast, flow, analysis, softwar...
3       [vis5d, system, provides, highly, interactive,...
4       [author, present, simple, procedural, interfac...
5       [idea, technique, visualizing, volumetric, dat...
                              ...                        
3103    [completing, text, analysis, task, continuous,...
3104    [social, data, chart, visual, presentation, qu...
3105    [constructing, latent, vector, representation,...
3106    [present, smartexplore, novel, visual, analyti...
3107    [deep, neural, network, dnns, vulnerable, mali...
Name: Abstract, Length: 3035, dtype: object

In [30]:
tfidf = TfidfVectorizer()

tfidf_matrix = [tfidf.fit_transform(abstract) for abstract in df['Abstract']]

In [31]:
tfidf_matrix

[<71x44 sparse matrix of type '<class 'numpy.float64'>'
 	with 71 stored elements in Compressed Sparse Row format>,
 <38x30 sparse matrix of type '<class 'numpy.float64'>'
 	with 38 stored elements in Compressed Sparse Row format>,
 <67x48 sparse matrix of type '<class 'numpy.float64'>'
 	with 67 stored elements in Compressed Sparse Row format>,
 <53x40 sparse matrix of type '<class 'numpy.float64'>'
 	with 53 stored elements in Compressed Sparse Row format>,
 <69x52 sparse matrix of type '<class 'numpy.float64'>'
 	with 69 stored elements in Compressed Sparse Row format>,
 <76x55 sparse matrix of type '<class 'numpy.float64'>'
 	with 76 stored elements in Compressed Sparse Row format>,
 <71x60 sparse matrix of type '<class 'numpy.float64'>'
 	with 71 stored elements in Compressed Sparse Row format>,
 <79x53 sparse matrix of type '<class 'numpy.float64'>'
 	with 79 stored elements in Compressed Sparse Row format>,
 <67x44 sparse matrix of type '<class 'numpy.float64'>'
 	with 67 stored

In [32]:
tfidf_matrix = [tfidf.todense() for tfidf in tfidf_matrix]

In [33]:
tfidf_matrix

[matrix([[0., 0., 0., ..., 1., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]),
 matrix([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]),
 matrix([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]),
 matrix([[1., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 

In [37]:
tfidf_sample = tfidf_matrix[0]

In [38]:
tfidf_sample

matrix([[0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [39]:
tfidf_sample_df = pd.DataFrame(tfidf_sample, dtype=object)

In [40]:
tfidf_sample_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,43
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
68,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
69,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
tfidf_df = pd.DataFrame(tfidf_matrix, dtype=object)

In [36]:
tfidf_df

Unnamed: 0,0
0,[[[[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0....
1,[[[[[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0....
2,[[[[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0....
3,[[[[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0....
4,[[[[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0....
...,...
3030,[[[[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0....
3031,[[[[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0....
3032,[[[[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0....
3033,[[[[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0....


In [45]:
tfidf_df.iloc[0]

0    [[[[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0....
Name: 0, dtype: object


In [None]:
cluster_model = KMeans(n_clusters=4)

In [None]:
cluster = cluster_model.fit(df)