In [221]:
import pandas as pd
import numpy as np
import altair as alt
import seaborn as sns
import nltk
import re

In [222]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [268]:
df = pd.read_csv("./IEEE VIS papers 1990-2018 - Main dataset.csv")

In [269]:
df = df.dropna(subset=['Abstract'])

In [270]:
stopwords = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()
tfidf = TfidfVectorizer()

In [271]:
df['Abstract'] = [abstract.lower() for abstract in df['Abstract']]
df.iloc[0]['Abstract']

'the use of critical point analysis to generate representations of the vector field topology of numerical flow data sets is discussed. critical points are located and characterized in a two-dimensional domain, which may be either a two-dimensional flow field or the tangential velocity field near a three-dimensional body. tangent curves are then integrated out along the principal directions of certain classes of critical points. the points and curves are linked to form a skeleton representing the two-dimensional vector field topology. when generated from the tangential velocity field near a body in a three-dimensional flow, the skeleton includes the critical points and curves which provide a basis for analyzing the three-dimensional structure of the flow separation.&lt;&lt;etx&gt;&gt;'

In [272]:
df['Abstract'] = [abstract[:-19] for abstract in df['Abstract']]
df.iloc[0]['Abstract']

'the use of critical point analysis to generate representations of the vector field topology of numerical flow data sets is discussed. critical points are located and characterized in a two-dimensional domain, which may be either a two-dimensional flow field or the tangential velocity field near a three-dimensional body. tangent curves are then integrated out along the principal directions of certain classes of critical points. the points and curves are linked to form a skeleton representing the two-dimensional vector field topology. when generated from the tangential velocity field near a body in a three-dimensional flow, the skeleton includes the critical points and curves which provide a basis for analyzing the three-dimensional structure of the flow separation.'

In [273]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [274]:
def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

df['Abstract'] = df['Abstract'].apply(lambda x: remove_punct(x))

df.iloc[0]['Abstract']

'the use of critical point analysis to generate representations of the vector field topology of numerical flow data sets is discussed critical points are located and characterized in a twodimensional domain which may be either a twodimensional flow field or the tangential velocity field near a threedimensional body tangent curves are then integrated out along the principal directions of certain classes of critical points the points and curves are linked to form a skeleton representing the twodimensional vector field topology when generated from the tangential velocity field near a body in a threedimensional flow the skeleton includes the critical points and curves which provide a basis for analyzing the threedimensional structure of the flow separation'

In [275]:
def tokenize(text):
    tokens = re.split("\W+", text)
    return tokens

df['Abstract'] = df['Abstract'].apply(lambda x: tokenize(x))

df.iloc[0]['Abstract']

['the',
 'use',
 'of',
 'critical',
 'point',
 'analysis',
 'to',
 'generate',
 'representations',
 'of',
 'the',
 'vector',
 'field',
 'topology',
 'of',
 'numerical',
 'flow',
 'data',
 'sets',
 'is',
 'discussed',
 'critical',
 'points',
 'are',
 'located',
 'and',
 'characterized',
 'in',
 'a',
 'twodimensional',
 'domain',
 'which',
 'may',
 'be',
 'either',
 'a',
 'twodimensional',
 'flow',
 'field',
 'or',
 'the',
 'tangential',
 'velocity',
 'field',
 'near',
 'a',
 'threedimensional',
 'body',
 'tangent',
 'curves',
 'are',
 'then',
 'integrated',
 'out',
 'along',
 'the',
 'principal',
 'directions',
 'of',
 'certain',
 'classes',
 'of',
 'critical',
 'points',
 'the',
 'points',
 'and',
 'curves',
 'are',
 'linked',
 'to',
 'form',
 'a',
 'skeleton',
 'representing',
 'the',
 'twodimensional',
 'vector',
 'field',
 'topology',
 'when',
 'generated',
 'from',
 'the',
 'tangential',
 'velocity',
 'field',
 'near',
 'a',
 'body',
 'in',
 'a',
 'threedimensional',
 'flow',
 'the

In [276]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sydney\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [253]:
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopwords]
    return text

df['Abstract'] = df['Abstract'].apply(lambda x: remove_stopwords(x))

df.iloc[0]['Abstract']

['use',
 'critical',
 'point',
 'analysis',
 'generate',
 'representations',
 'vector',
 'field',
 'topology',
 'numerical',
 'flow',
 'data',
 'sets',
 'discussed',
 'critical',
 'points',
 'located',
 'characterized',
 'two',
 'dimensional',
 'domain',
 'may',
 'either',
 'two',
 'dimensional',
 'flow',
 'field',
 'tangential',
 'velocity',
 'field',
 'near',
 'three',
 'dimensional',
 'body',
 'tangent',
 'curves',
 'integrated',
 'along',
 'principal',
 'directions',
 'certain',
 'classes',
 'critical',
 'points',
 'points',
 'curves',
 'linked',
 'form',
 'skeleton',
 'representing',
 'two',
 'dimensional',
 'vector',
 'field',
 'topology',
 'generated',
 'tangential',
 'velocity',
 'field',
 'near',
 'body',
 'three',
 'dimensional',
 'flow',
 'skeleton',
 'includes',
 'critical',
 'points',
 'curves',
 'provide',
 'basis',
 'analyzing',
 'three',
 'dimensional',
 'structure',
 'flow',
 'separation',
 '']

In [277]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sydney\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [278]:
def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

df['Abstract'] = df['Abstract'].apply(lambda x: lemmatizing(x))

df.iloc[0]['Abstract']

['the',
 'use',
 'of',
 'critical',
 'point',
 'analysis',
 'to',
 'generate',
 'representation',
 'of',
 'the',
 'vector',
 'field',
 'topology',
 'of',
 'numerical',
 'flow',
 'data',
 'set',
 'is',
 'discussed',
 'critical',
 'point',
 'are',
 'located',
 'and',
 'characterized',
 'in',
 'a',
 'twodimensional',
 'domain',
 'which',
 'may',
 'be',
 'either',
 'a',
 'twodimensional',
 'flow',
 'field',
 'or',
 'the',
 'tangential',
 'velocity',
 'field',
 'near',
 'a',
 'threedimensional',
 'body',
 'tangent',
 'curve',
 'are',
 'then',
 'integrated',
 'out',
 'along',
 'the',
 'principal',
 'direction',
 'of',
 'certain',
 'class',
 'of',
 'critical',
 'point',
 'the',
 'point',
 'and',
 'curve',
 'are',
 'linked',
 'to',
 'form',
 'a',
 'skeleton',
 'representing',
 'the',
 'twodimensional',
 'vector',
 'field',
 'topology',
 'when',
 'generated',
 'from',
 'the',
 'tangential',
 'velocity',
 'field',
 'near',
 'a',
 'body',
 'in',
 'a',
 'threedimensional',
 'flow',
 'the',
 'skele

In [309]:
df['Abstract']

1       [the, use, of, critical, point, analysis, to, ...
2       [the, author, discus, fast, flow, analysis, so...
3       [the, vis5d, system, provides, highly, interac...
4       [the, author, present, a, simple, procedural, ...
5       [some, idea, and, technique, for, visualizing,...
                              ...                        
3103    [completing, text, analysis, task, is, a, cont...
3104    [social, data, chart, visual, presentation, of...
3105    [constructing, latent, vector, representation,...
3106    [we, present, smartexplore, a, novel, visual, ...
3107    [deep, neural, network, dnns, are, vulnerable,...
Name: Abstract, Length: 3035, dtype: object

In [310]:
tfidf = TfidfVectorizer()

tfidf_df = tfidf.fit_transform(df['Abstract'])

AttributeError: 'list' object has no attribute 'lower'

In [None]:
cluster_model = KMeans(n_clusters=4)

In [None]:
cluster = cluster_model.fit(df)