In [2]:
from zipfile import ZipFile


with ZipFile('abstracts.zip', 'r') as zipObj:
    zipObj.extractall()


In [16]:
import os

rootdir = 'abstracts'
documents = []

for sdir, dirs, files in os.walk(rootdir):      
    for file in files:
        
        with open(os.path.join(sdir,file), "rt", encoding="ISO-8859-1") as f:
            documents.append([line.strip() for line in f.readlines()])
            
abstracts = []
for d in documents:
    s = ""
    for i, t in enumerate(d):
        if "Abstract" in t:
            for t in d[i+1:]:
                s += f" {t}"
                
    abstracts.append(s)
print(len(abstracts))


134616


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Set parameters and initialize
tfidf_vectorizer = TfidfVectorizer(min_df=2, use_idf=True, sublinear_tf=True, max_df=0.8, max_features=10000)

# Calcualate term-document matrix with tf-idf scores
tfidf_matrix = tfidf_vectorizer.fit_transform(abstracts)

# Check matrix shape
tfidf_matrix.toarray().shape # N_docs x N_terms


(134616, 10000)

In [19]:
from collections import Counter
terms_in_docs = tfidf_vectorizer.inverse_transform(tfidf_matrix)
token_counter = Counter()
for terms in terms_in_docs:
    token_counter.update(terms)

for term, count in token_counter.most_common(20):
    print("%d\t%s" % (count, term))


107049	for
106162	will
104838	is
95198	be
89786	on
86282	with
79945	research
79385	are
77951	that
77776	by
73434	as
67041	from
66950	an
61325	at
61119	these
60749	project
55330	which
45435	have
43445	new
41971	study


In [20]:
features = tfidf_vectorizer.get_feature_names()

print("Sample word:", features[1000])
print("Occurs in %d documents" % len([x for x in tfidf_matrix.toarray()[:][1000] if x > 0]))
print("out of %d documents" % len(tfidf_matrix.toarray()))


Sample word: bear
Occurs in 62 documents
out of 134616 documents


In [21]:
matrix_sample = tfidf_matrix[:1000]
from sklearn.cluster import KMeans

# Do clustering
km = KMeans(n_clusters=30, random_state=123, verbose=0)
km.fit(matrix_sample)


KMeans(n_clusters=30, random_state=123)

In [22]:
import heapq, numpy as np

# Custom function to print top keywords for each cluster
def print_clusters(matrix, clusters, n_keywords=10):
    for cluster in range(min(clusters), max(clusters)+1):
        cluster_docs = [i for i, c in enumerate(clusters) if c == cluster]
        print("Cluster: %d (%d docs)" % (cluster, len(cluster_docs)))
        
        # Keep scores for top n terms
        new_matrix = np.zeros((len(cluster_docs), matrix.shape[1]))
        for cluster_i, doc_vec in enumerate(matrix[cluster_docs].toarray()):
            for idx, score in heapq.nlargest(n_keywords, enumerate(doc_vec), key=lambda x:x[1]):
                new_matrix[cluster_i][idx] = score

        # Aggregate scores for kept top terms
        keywords = heapq.nlargest(n_keywords, zip(new_matrix.sum(axis=0), features))
        print(', '.join([w for s,w in keywords]))
        print()


In [23]:
print_clusters(matrix_sample, km.labels_)


Cluster: 0 (22 docs)
rings, algebras, algebraic, theory, ring, representation, cohomology, finite, groups, commutative

Cluster: 1 (17 docs)
postdoctoral, dr, infrastructural, professor, sponsorship, supervision, ds, chemistry, williams, synthesis

Cluster: 2 (146 docs)
organic, ice, gas, mexico, compounds, exchange, river, reactions, spectrometer, selenium

Cluster: 3 (44 docs)
students, student, projects, tech, minority, 1990, cornell, reu, expert, oral

Cluster: 4 (46 docs)
reu, faculty, minority, undergraduates, students, participants, summer, engineering, letters, recruit

Cluster: 5 (60 docs)
workshop, continuation, timely, successful, indian, particle, dr, nuclear, ion, action

Cluster: 6 (25 docs)
conference, 1990, international, june, vermont, august, september, japan, real, analysts

Cluster: 7 (13 docs)
vessels, ship, crew, oceanographic, vessel, operate, equipment, projects, operates, constructed

Cluster: 8 (9 docs)
algebraic, geometry, arithmetic, whole, quarter, varietie

In [24]:

# Fast and simple tokenization
new_vectorizer = TfidfVectorizer()
word_tokenizer = new_vectorizer.build_tokenizer()
tokenized_text = [word_tokenizer(doc) for doc in abstracts]

# Train LDA model
from gensim import corpora, models
dictionary = corpora.Dictionary(tokenized_text)
lda_corpus = [dictionary.doc2bow(text) for text in tokenized_text]
lda_model = models.LdaModel(lda_corpus, id2word=dictionary, num_topics=10)


2021-02-24 11:01:31,446 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-02-24 11:01:32,979 : INFO : adding document #10000 to Dictionary(56698 unique tokens: ['BODY', 'BR', 'HREF', 'HTML', 'awd_1990_00']...)
2021-02-24 11:01:34,523 : INFO : adding document #20000 to Dictionary(85780 unique tokens: ['BODY', 'BR', 'HREF', 'HTML', 'awd_1990_00']...)
2021-02-24 11:01:36,082 : INFO : adding document #30000 to Dictionary(112035 unique tokens: ['BODY', 'BR', 'HREF', 'HTML', 'awd_1990_00']...)
2021-02-24 11:01:37,748 : INFO : adding document #40000 to Dictionary(144174 unique tokens: ['BODY', 'BR', 'HREF', 'HTML', 'awd_1990_00']...)
2021-02-24 11:01:39,524 : INFO : adding document #50000 to Dictionary(176883 unique tokens: ['BODY', 'BR', 'HREF', 'HTML', 'awd_1990_00']...)
2021-02-24 11:01:41,417 : INFO : adding document #60000 to Dictionary(207044 unique tokens: ['BODY', 'BR', 'HREF', 'HTML', 'awd_1990_00']...)
2021-02-24 11:01:43,422 : INFO : adding document #70000 to Dict

2021-02-24 11:02:29,992 : INFO : topic #2 (0.100): 0.047*"and" + 0.044*"the" + 0.030*"of" + 0.022*"to" + 0.018*"in" + 0.017*"will" + 0.015*"students" + 0.013*"The" + 0.011*"teachers" + 0.010*"school"
2021-02-24 11:02:30,002 : INFO : topic diff=0.215971, rho=0.447214
2021-02-24 11:02:30,006 : INFO : PROGRESS: pass 0, at document #12000/134616
2021-02-24 11:02:31,720 : INFO : merging changes from 2000 documents into a model of 134616 documents
2021-02-24 11:02:31,895 : INFO : topic #9 (0.100): 0.455*"txt" + 0.230*"HREF" + 0.230*"BR" + 0.001*"of" + 0.001*"BODY" + 0.001*"HTML" + 0.001*"the" + 0.001*"and" + 0.000*"to" + 0.000*"in"
2021-02-24 11:02:31,899 : INFO : topic #3 (0.100): 0.058*"of" + 0.058*"the" + 0.025*"and" + 0.023*"to" + 0.022*"in" + 0.014*"is" + 0.014*"will" + 0.013*"theory" + 0.012*"The" + 0.011*"on"
2021-02-24 11:02:31,903 : INFO : topic #7 (0.100): 0.055*"of" + 0.046*"the" + 0.034*"to" + 0.032*"and" + 0.019*"in" + 0.018*"will" + 0.016*"is" + 0.015*"be" + 0.013*"for" + 0.012

2021-02-24 11:02:45,082 : INFO : topic #0 (0.100): 0.044*"the" + 0.035*"of" + 0.017*"in" + 0.013*"at" + 0.010*"and" + 0.010*"to" + 0.010*"physics" + 0.009*"high" + 0.009*"will" + 0.009*"for"
2021-02-24 11:02:45,086 : INFO : topic #6 (0.100): 0.073*"the" + 0.062*"of" + 0.044*"and" + 0.027*"in" + 0.026*"to" + 0.018*"will" + 0.010*"The" + 0.010*"is" + 0.009*"be" + 0.008*"on"
2021-02-24 11:02:45,097 : INFO : topic diff=0.156977, rho=0.288675
2021-02-24 11:02:45,101 : INFO : PROGRESS: pass 0, at document #26000/134616
2021-02-24 11:02:46,710 : INFO : merging changes from 2000 documents into a model of 134616 documents
2021-02-24 11:02:46,887 : INFO : topic #1 (0.100): 0.066*"the" + 0.051*"and" + 0.038*"of" + 0.027*"to" + 0.021*"will" + 0.021*"for" + 0.018*"The" + 0.015*"in" + 0.012*"be" + 0.010*"is"
2021-02-24 11:02:46,892 : INFO : topic #2 (0.100): 0.039*"and" + 0.039*"the" + 0.027*"students" + 0.025*"of" + 0.023*"will" + 0.022*"in" + 0.021*"to" + 0.011*"The" + 0.011*"teachers" + 0.010*"fa

2021-02-24 11:02:57,563 : INFO : topic #8 (0.100): 0.058*"the" + 0.055*"of" + 0.054*"and" + 0.033*"in" + 0.030*"to" + 0.017*"will" + 0.015*"The" + 0.015*"for" + 0.014*"research" + 0.011*"is"
2021-02-24 11:02:57,566 : INFO : topic #9 (0.100): 0.467*"txt" + 0.234*"BR" + 0.234*"HREF" + 0.001*"BODY" + 0.001*"HTML" + 0.000*"of" + 0.000*"the" + 0.000*"and" + 0.000*"to" + 0.000*"in"
2021-02-24 11:02:57,577 : INFO : topic diff=0.100730, rho=0.229416
2021-02-24 11:03:00,147 : INFO : -7.887 per-word bound, 236.7 perplexity estimate based on a held-out corpus of 2000 documents with 367724 words
2021-02-24 11:03:00,148 : INFO : PROGRESS: pass 0, at document #40000/134616
2021-02-24 11:03:01,778 : INFO : merging changes from 2000 documents into a model of 134616 documents
2021-02-24 11:03:01,954 : INFO : topic #1 (0.100): 0.062*"the" + 0.054*"and" + 0.037*"of" + 0.029*"to" + 0.022*"for" + 0.019*"will" + 0.018*"The" + 0.014*"in" + 0.010*"be" + 0.010*"is"
2021-02-24 11:03:01,959 : INFO : topic #6 (0.

2021-02-24 11:03:12,670 : INFO : topic #4 (0.100): 0.064*"the" + 0.057*"of" + 0.032*"and" + 0.029*"in" + 0.029*"to" + 0.015*"is" + 0.014*"will" + 0.013*"that" + 0.011*"be" + 0.011*"The"
2021-02-24 11:03:12,675 : INFO : topic #1 (0.100): 0.060*"the" + 0.056*"and" + 0.036*"of" + 0.030*"to" + 0.023*"for" + 0.020*"will" + 0.017*"The" + 0.014*"in" + 0.010*"is" + 0.010*"be"
2021-02-24 11:03:12,679 : INFO : topic #6 (0.100): 0.070*"the" + 0.060*"of" + 0.046*"and" + 0.026*"to" + 0.025*"in" + 0.019*"will" + 0.009*"The" + 0.009*"is" + 0.009*"be" + 0.008*"on"
2021-02-24 11:03:12,689 : INFO : topic diff=0.094648, rho=0.196116
2021-02-24 11:03:12,694 : INFO : PROGRESS: pass 0, at document #54000/134616
2021-02-24 11:03:14,324 : INFO : merging changes from 2000 documents into a model of 134616 documents
2021-02-24 11:03:14,511 : INFO : topic #4 (0.100): 0.064*"the" + 0.057*"of" + 0.032*"and" + 0.029*"in" + 0.028*"to" + 0.015*"is" + 0.014*"will" + 0.012*"that" + 0.011*"be" + 0.010*"The"
2021-02-24 11

2021-02-24 11:03:27,571 : INFO : topic #2 (0.100): 0.050*"and" + 0.033*"the" + 0.032*"students" + 0.027*"in" + 0.022*"to" + 0.020*"of" + 0.020*"will" + 0.012*"science" + 0.010*"for" + 0.010*"The"
2021-02-24 11:03:27,576 : INFO : topic #4 (0.100): 0.064*"the" + 0.056*"of" + 0.032*"and" + 0.030*"in" + 0.028*"to" + 0.015*"is" + 0.014*"will" + 0.014*"that" + 0.011*"be" + 0.010*"The"
2021-02-24 11:03:27,580 : INFO : topic #7 (0.100): 0.052*"the" + 0.050*"of" + 0.035*"and" + 0.033*"to" + 0.019*"in" + 0.016*"is" + 0.015*"for" + 0.012*"The" + 0.012*"that" + 0.011*"be"
2021-02-24 11:03:27,585 : INFO : topic #1 (0.100): 0.059*"the" + 0.056*"and" + 0.036*"of" + 0.029*"to" + 0.024*"for" + 0.019*"will" + 0.016*"The" + 0.013*"in" + 0.010*"research" + 0.009*"is"
2021-02-24 11:03:27,596 : INFO : topic diff=0.066937, rho=0.174078
2021-02-24 11:03:27,601 : INFO : PROGRESS: pass 0, at document #68000/134616
2021-02-24 11:03:29,241 : INFO : merging changes from 2000 documents into a model of 134616 docume

2021-02-24 11:03:42,897 : INFO : topic #7 (0.100): 0.052*"the" + 0.049*"of" + 0.035*"and" + 0.033*"to" + 0.019*"in" + 0.016*"is" + 0.015*"for" + 0.012*"that" + 0.012*"The" + 0.011*"be"
2021-02-24 11:03:42,901 : INFO : topic #4 (0.100): 0.064*"the" + 0.056*"of" + 0.032*"and" + 0.030*"in" + 0.028*"to" + 0.015*"is" + 0.014*"that" + 0.014*"will" + 0.011*"be" + 0.010*"The"
2021-02-24 11:03:42,905 : INFO : topic #9 (0.100): 0.467*"txt" + 0.234*"BR" + 0.234*"HREF" + 0.001*"HTML" + 0.001*"BODY" + 0.000*"of" + 0.000*"the" + 0.000*"and" + 0.000*"to" + 0.000*"in"
2021-02-24 11:03:42,911 : INFO : topic #1 (0.100): 0.057*"the" + 0.055*"and" + 0.036*"of" + 0.028*"to" + 0.025*"for" + 0.018*"will" + 0.016*"The" + 0.013*"in" + 0.010*"research" + 0.008*"be"
2021-02-24 11:03:42,915 : INFO : topic #6 (0.100): 0.069*"the" + 0.058*"of" + 0.046*"and" + 0.025*"to" + 0.024*"in" + 0.018*"will" + 0.009*"The" + 0.009*"be" + 0.009*"is" + 0.008*"on"
2021-02-24 11:03:42,926 : INFO : topic diff=0.033484, rho=0.158114

2021-02-24 11:03:55,518 : INFO : merging changes from 2000 documents into a model of 134616 documents
2021-02-24 11:03:55,693 : INFO : topic #3 (0.100): 0.066*"of" + 0.060*"the" + 0.026*"and" + 0.026*"in" + 0.023*"to" + 0.016*"is" + 0.015*"theory" + 0.011*"The" + 0.010*"problems" + 0.009*"on"
2021-02-24 11:03:55,698 : INFO : topic #0 (0.100): 0.050*"the" + 0.032*"of" + 0.014*"at" + 0.014*"in" + 0.013*"will" + 0.011*"optical" + 0.010*"high" + 0.009*"be" + 0.009*"The" + 0.009*"to"
2021-02-24 11:03:55,704 : INFO : topic #8 (0.100): 0.063*"the" + 0.058*"and" + 0.055*"of" + 0.031*"in" + 0.029*"to" + 0.018*"will" + 0.016*"The" + 0.015*"research" + 0.014*"for" + 0.011*"on"
2021-02-24 11:03:55,708 : INFO : topic #7 (0.100): 0.051*"the" + 0.048*"of" + 0.035*"and" + 0.033*"to" + 0.019*"in" + 0.016*"is" + 0.014*"for" + 0.013*"that" + 0.012*"The" + 0.011*"be"
2021-02-24 11:03:55,713 : INFO : topic #2 (0.100): 0.051*"and" + 0.034*"students" + 0.033*"the" + 0.028*"in" + 0.024*"to" + 0.019*"of" + 0.0

2021-02-24 11:04:09,910 : INFO : topic #8 (0.100): 0.064*"the" + 0.058*"and" + 0.055*"of" + 0.031*"in" + 0.029*"to" + 0.018*"will" + 0.016*"The" + 0.015*"research" + 0.014*"for" + 0.010*"on"
2021-02-24 11:04:09,920 : INFO : topic diff=0.079395, rho=0.137361
2021-02-24 11:04:09,925 : INFO : PROGRESS: pass 0, at document #108000/134616
2021-02-24 11:04:11,663 : INFO : merging changes from 2000 documents into a model of 134616 documents
2021-02-24 11:04:11,840 : INFO : topic #4 (0.100): 0.061*"the" + 0.056*"of" + 0.033*"and" + 0.030*"in" + 0.028*"to" + 0.015*"is" + 0.014*"that" + 0.014*"will" + 0.011*"be" + 0.010*"The"
2021-02-24 11:04:11,845 : INFO : topic #5 (0.100): 0.060*"of" + 0.054*"the" + 0.046*"and" + 0.023*"to" + 0.022*"in" + 0.020*"will" + 0.015*"be" + 0.013*"The" + 0.010*"is" + 0.010*"for"
2021-02-24 11:04:11,849 : INFO : topic #2 (0.100): 0.052*"and" + 0.034*"students" + 0.033*"the" + 0.027*"in" + 0.024*"to" + 0.020*"of" + 0.016*"will" + 0.012*"science" + 0.011*"undergraduate"

2021-02-24 11:04:25,960 : INFO : topic #8 (0.100): 0.064*"the" + 0.060*"and" + 0.055*"of" + 0.030*"in" + 0.029*"to" + 0.017*"will" + 0.016*"The" + 0.014*"research" + 0.014*"for" + 0.010*"on"
2021-02-24 11:04:25,965 : INFO : topic #1 (0.100): 0.057*"and" + 0.050*"the" + 0.034*"of" + 0.027*"for" + 0.026*"to" + 0.019*"will" + 0.015*"The" + 0.011*"in" + 0.010*"research" + 0.010*"data"
2021-02-24 11:04:25,970 : INFO : topic #9 (0.100): 0.417*"txt" + 0.210*"BR" + 0.209*"HREF" + 0.002*"HTML" + 0.001*"BODY" + 0.000*"a0096206" + 0.000*"a0096204" + 0.000*"a0096205" + 0.000*"a0096202" + 0.000*"a0096209"
2021-02-24 11:04:25,980 : INFO : topic diff=0.069957, rho=0.129099
2021-02-24 11:04:25,984 : INFO : PROGRESS: pass 0, at document #122000/134616
2021-02-24 11:04:27,664 : INFO : merging changes from 2000 documents into a model of 134616 documents
2021-02-24 11:04:27,843 : INFO : topic #6 (0.100): 0.068*"the" + 0.056*"of" + 0.047*"and" + 0.025*"to" + 0.024*"in" + 0.017*"will" + 0.009*"The" + 0.008*

2021-02-24 11:04:39,424 : INFO : topic #9 (0.100): 0.334*"txt" + 0.168*"BR" + 0.168*"HREF" + 0.002*"HTML" + 0.001*"BODY" + 0.000*"a0096244" + 0.000*"a0096143" + 0.000*"a0096211" + 0.000*"a0096135" + 0.000*"a0096030"
2021-02-24 11:04:39,430 : INFO : topic #3 (0.100): 0.068*"of" + 0.062*"the" + 0.028*"and" + 0.026*"in" + 0.024*"to" + 0.018*"is" + 0.013*"theory" + 0.012*"The" + 0.009*"on" + 0.008*"problems"
2021-02-24 11:04:39,436 : INFO : topic #6 (0.100): 0.068*"the" + 0.055*"of" + 0.047*"and" + 0.025*"to" + 0.024*"in" + 0.017*"will" + 0.009*"The" + 0.009*"be" + 0.008*"is" + 0.008*"on"
2021-02-24 11:04:39,440 : INFO : topic #1 (0.100): 0.058*"and" + 0.049*"the" + 0.034*"of" + 0.027*"for" + 0.026*"to" + 0.020*"will" + 0.015*"The" + 0.011*"in" + 0.010*"data" + 0.010*"research"
2021-02-24 11:04:39,451 : INFO : topic diff=0.030247, rho=0.122169
2021-02-24 11:04:40,490 : INFO : -7.706 per-word bound, 208.8 perplexity estimate based on a held-out corpus of 616 documents with 171077 words
2021

In [25]:
import gensim # Make sure you also have cython installed to accelerate computation!

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Train word2vec model
vectors = gensim.models.Word2Vec(tokenized_text, size=100, window=5, min_count=3, workers=4)


2021-02-24 11:04:41,751 : INFO : collecting all words and their counts
2021-02-24 11:04:41,752 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-02-24 11:04:42,068 : INFO : PROGRESS: at sentence #10000, processed 1592257 words, keeping 56698 word types
2021-02-24 11:04:42,373 : INFO : PROGRESS: at sentence #20000, processed 3174072 words, keeping 85780 word types
2021-02-24 11:04:42,722 : INFO : PROGRESS: at sentence #30000, processed 4841275 words, keeping 112035 word types
2021-02-24 11:04:43,092 : INFO : PROGRESS: at sentence #40000, processed 6653130 words, keeping 144174 word types
2021-02-24 11:04:43,491 : INFO : PROGRESS: at sentence #50000, processed 8607974 words, keeping 176883 word types
2021-02-24 11:04:43,917 : INFO : PROGRESS: at sentence #60000, processed 10750215 words, keeping 207044 word types
2021-02-24 11:04:44,347 : INFO : PROGRESS: at sentence #70000, processed 12983247 words, keeping 236305 word types
2021-02-24 11:04:44,792 : INFO :

2021-02-24 11:05:52,732 : INFO : EPOCH 3 - PROGRESS: at 83.33% examples, 1485552 words/s, in_qsize 7, out_qsize 0
2021-02-24 11:05:53,740 : INFO : EPOCH 3 - PROGRESS: at 88.82% examples, 1493257 words/s, in_qsize 6, out_qsize 2
2021-02-24 11:05:54,742 : INFO : EPOCH 3 - PROGRESS: at 94.32% examples, 1498264 words/s, in_qsize 7, out_qsize 0
2021-02-24 11:05:55,746 : INFO : EPOCH 3 - PROGRESS: at 99.62% examples, 1503226 words/s, in_qsize 7, out_qsize 0
2021-02-24 11:05:55,799 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-02-24 11:05:55,800 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-02-24 11:05:55,809 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-02-24 11:05:55,811 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-02-24 11:05:55,812 : INFO : EPOCH - 3 : training on 29762964 raw words (22758318 effective words) took 15.1s, 1503758 effective words/s
2021-02-24 11:05:56,817 : INFO : EPOCH 

In [30]:
words = [list(vectors.wv.vocab.keys())[(i+1)*1500] for i in range(5)]
print(words)

['example', 'supplied', 'Spur', 'matching', 'Cortez']


In [31]:
print(vectors.wv.most_similar(words[0]))
print(vectors.wv.most_similar(words[1]))
print(vectors.wv.most_similar(words[2]))
print(vectors.wv.most_similar(words[3]))
print(vectors.wv.most_similar(words[4]))



[('instance', 0.9057872295379639), ('reason', 0.5728881359100342), ('examp', 0.5016827583312988), ('substitute', 0.49197518825531006), ('explanation', 0.48618289828300476), ('even', 0.4790940284729004), ('accounting', 0.47868359088897705), ('nonmembers', 0.47086846828460693), ('ordinary', 0.4629876911640167), ('accounted', 0.4454946517944336)]
[('replaced', 0.721525251865387), ('delivered', 0.7046450972557068), ('provided', 0.702826738357544), ('staffed', 0.6989473104476929), ('transferred', 0.6924272179603577), ('handled', 0.67039954662323), ('overseen', 0.6643173098564148), ('produced', 0.6569195985794067), ('consumed', 0.6516040563583374), ('accessed', 0.6462578773498535)]
[('STDS', 0.46486616134643555), ('surround', 0.4621392488479614), ('Conductivity', 0.453136146068573), ('Flat', 0.44033509492874146), ('Icefield', 0.4388269782066345), ('matric', 0.43660205602645874), ('Mounds', 0.43209218978881836), ('OII', 0.4320000112056732), ('Larsemann', 0.43066370487213135), ('398', 0.425633