In [1]:
%env TOKENIZERS_PARALLELISM=true
! pip install sentence-transformers 

env: TOKENIZERS_PARALLELISM=true
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=5dc302cd41ac83c786829bf3c50f280ebc16bab6658a1fecdef9033bb0c7482f
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2


In [2]:
# Read the whole corpus at once, then split on newlines so that each line
# of the file becomes one entry in `essays`.
ESSAYS_PATH = '/kaggle/input/paul-graham-essays/paul_graham_essays.txt'
with open(ESSAYS_PATH, mode='r', encoding='utf-8') as essay_file:
    text = essay_file.read()
essays = text.split('\n')
print(f'essay count: {len(essays)}')

essay count: 55


In [3]:
from arrow import now
from collections import Counter
from nltk import pos_tag
from nltk import word_tokenize
from pandas import DataFrame
from pandas import Series
from plotly.express import bar

# Part-of-speech tags for singular (NNP) and plural (NNPS) proper nouns.
PROPER_NOUN_TAGS = {'NNP', 'NNPS'}


def proper_noun_series(essay_text):
    """Return a Series mapping each proper noun in `essay_text` to its number of occurrences."""
    tagged_tokens = pos_tag(word_tokenize(essay_text))
    tally = Counter(token for token, tag in tagged_tokens if tag in PROPER_NOUN_TAGS)
    return Series(data=dict(tally))


time_start = now()
# One row per essay, one column per proper noun; a noun missing from an essay counts as 0.
per_essay_counts = [proper_noun_series(essay_text) for essay_text in essays]
count_df = DataFrame(data=per_essay_counts).fillna(value=0)
print(f'noun count time: {now() - time_start}')
# Corpus-wide totals, reshaped into a frame with columns 'index' (the word) and 0 (the total).
sum_df = count_df.sum(axis=0).to_frame().reset_index()
top_proper_nouns = sum_df.sort_values(by=0, ascending=False).head(n=50)
bar(data_frame=top_proper_nouns, x='index', y=0)

noun count time: 0:00:40.951847


TextBlob unfortunately takes more than an hour to do this task; it produces better results, but entirely in lowercase.

In [4]:
from spacy import load
from spacy.lang.en import stop_words as stop_words


spacy_start = now()
nlp = load('en_core_web_lg')
# Raise spaCy's maximum accepted text length (in characters) so long inputs are not rejected.
nlp.max_length = 2655263


def noun_chunk_series(essay_text):
    """Return a Series mapping each noun-chunk string in `essay_text` to its number of occurrences."""
    parsed = nlp(essay_text)
    chunk_tally = Counter(chunk.text for chunk in parsed.noun_chunks)
    return Series(data=dict(chunk_tally))


# One row per essay, one column per distinct noun chunk (NaN where a chunk does not occur).
spacy_df = DataFrame(data=[noun_chunk_series(essay_text) for essay_text in essays])
print(f'spacy time: {now() - spacy_start}')

spacy time: 0:02:24.949808


In [5]:
# Total occurrences of every noun chunk over all essays, as a frame with
# columns 'index' (the chunk text) and 0 (the total).
sum_spacy_df = spacy_df.sum(axis=0).to_frame().reset_index()
# Discard chunks that consist solely of a stop word (compared case-insensitively),
# then rank the remainder from most to least frequent.
is_stop_word = sum_spacy_df['index'].str.lower().isin(stop_words.STOP_WORDS)
sorted_df = sum_spacy_df[~is_stop_word].sort_values(by=0, ascending=False)
bar(data_frame=sorted_df.head(n=50), x='index', y=0)

In [6]:
# https://www.kaggle.com/code/ianalyticsgeek/keywords-extraction-using-bert
from arrow import now
from sentence_transformers import SentenceTransformer

# Cap the number of terms so the scatter plot stays readable.
MAX_FEATURES = 500

model_start = now()
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# Essay-level embeddings. Encoding is inference only, so the feature embeddings
# computed below do not depend on this call; `embedding` is not referenced again
# in the cells shown here.
embedding = model.encode(essays)
print(f'{now()}: got embeddings')
# The most frequent noun chunks and their corpus-wide counts, taken from the ranked frame.
top_features_df = sorted_df.head(n=MAX_FEATURES)
features = top_features_df['index'].values
feature_counts = top_features_df[0].values
# One embedding vector per selected term.
feature_embeddings = model.encode(features)
print(f'{now()}: got feature embeddings')
print(f'model time: {now() - model_start}')

Downloading .gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2023-12-01T20:07:12.265613+00:00: got embeddings


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

2023-12-01T20:07:14.550077+00:00: got feature embeddings
model time: 0:00:16.590560


In [7]:
# now we can project the feature vectors into 2-space to visualize 
from pandas import DataFrame
from plotly.express import scatter
from sklearn.manifold import TSNE
# Fixed random_state so the 2-D layout is repeatable between runs.
tsne = TSNE(n_components=2, random_state=2023, verbose=1, n_iter=10000)
projected = tsne.fit_transform(X=feature_embeddings)
# Attach each term and its count to its 2-D coordinates for labelling and hover text.
tsne_df = DataFrame(data=projected, columns=['t0', 't1']).assign(
    **{'word': features, 'count': feature_counts}
)
# Shrink the markers to near-invisible so the text labels carry the plot.
scatter(
    data_frame=tsne_df,
    x='t0',
    y='t1',
    text='word',
    height=900,
    hover_data=['count'],
).update_traces(marker={'size': 1})

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 500 samples in 0.001s...
[t-SNE] Computed neighbors for 500 samples in 0.284s...
[t-SNE] Computed conditional probabilities for sample 500 / 500
[t-SNE] Mean sigma: 4.105580
[t-SNE] KL divergence after 250 iterations with early exaggeration: 59.944176
[t-SNE] KL divergence after 2550 iterations: 0.890005


This is kind of cluttered, but it does a good job of retaining names; it probably needs some cleanup to merge terms that differ only by an article.