In [1]:
! pip install sentence-transformers 

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=52b78721995fb261ac396692dded0b478934a88492dbf4de0faf803f7218210b
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2


In [2]:
with open(file='/kaggle/input/paul-graham-essays/paul_graham_essays.txt', encoding='utf-8', mode='r') as input_fp:
    text = input_fp.read()
essays = text.split('\n')
print('essay count: {}'.format(len(essays)))

essay count: 55


In [3]:
# https://www.kaggle.com/code/ianalyticsgeek/keywords-extraction-using-bert
from arrow import now
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# we can't use more features than we can visualize
MAX_FEATURES = 600

NGRAM_RANGE = (1, 1)
model_start = now()
count = TfidfVectorizer(ngram_range=NGRAM_RANGE, stop_words='english', max_features=MAX_FEATURES).fit(essays)
features = count.get_feature_names_out()
print('{}: got features'.format(now()))
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
embedding = model.encode(essays)
print('{}: got embeddings'.format(now()))
feature_embeddings = model.encode(features)
print('{}: got feature embeddings'.format(now()))
print('model time: {}'.format(now() - model_start))

2023-11-30T19:43:45.564572+00:00: got features


Downloading .gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2023-11-30T19:44:00.794758+00:00: got embeddings


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

2023-11-30T19:44:03.035839+00:00: got feature embeddings
model time: 0:00:18.146638


In [4]:
# post process the features 
keep = set(features)
# filter out numeric values
keep = {item for item in keep if not item.isnumeric()}
# do some basic plural identification and remove the plurals 
plurals = {item for item in keep if (item.endswith('s') and item[:-1] in keep) or (item.endswith('es') and item[:-2] in keep)}
keep = {item for item in keep if item not in plurals}
keep = keep.difference(plurals)
print(len(keep), keep)

534 {'work', 'write', 'position', 'business', 'step', 'simple', 'internet', 'hope', 'matter', 'cost', 'wisdom', 'discover', 'principle', 'programming', 'firm', 'second', 'public', 'used', 'math', 'approach', 'selling', 'yahoo', 'worth', 'microsoft', 'say', 'investment', 'hard', 'far', 'minded', 'sam', 'obvious', 'isn', 'unless', 'spam', 'mind', 'feel', 'imagine', 'series', 'valuable', 'asking', 'patent', 'meant', 'turn', 'steve', 'couldn', 'started', 'living', 'fund', 'possible', 'false', 'case', 'practically', 'head', 'invest', 'self', 'true', 'single', 'asked', 'happened', 'worst', 'past', 'reason', 'wrote', 'looking', 'fix', 'online', 'effect', 'essay', 'universities', 'random', 'low', 'combinator', 'won', 'took', 'data', 'ago', 'hacking', 'likely', 'existing', 'summer', 'hardware', 'money', 'lower', 'thanks', 'actual', 'thing', 'year', 'startup', 'close', 'win', 'till', 'came', 'hire', 'nerds', 'grad', 'silicon', 'kid', 'today', 'competitors', 'hear', 'said', 'class', 'companies', 

In [5]:
from nltk import pos_tag
from nltk import word_tokenize

noun_start = now()
keep_nouns = {word for item in essays for (word, part_of_speech) in pos_tag(word_tokenize(item)) if part_of_speech[:2] == 'NN' and word in keep}
print('filtered nouns in {}'.format(now() - noun_start))
keep = keep_nouns
print(len(keep), keep)

filtered nouns in 0:00:42.477734
348 {'work', 'write', 'position', 'business', 'step', 'internet', 'simple', 'hope', 'matter', 'cost', 'wisdom', 'discover', 'principle', 'programming', 'firm', 'second', 'public', 'math', 'approach', 'selling', 'worth', 'say', 'investment', 'hard', 'spam', 'mind', 'feel', 'imagine', 'series', 'asking', 'patent', 'meant', 'turn', 'living', 'fund', 'case', 'head', 'invest', 'self', 'past', 'reason', 'online', 'fix', 'essay', 'effect', 'universities', 'random', 'data', 'hacking', 'summer', 'money', 'hardware', 'thanks', 'thing', 'year', 'startup', 'close', 'win', 'till', 'hire', 'nerds', 'today', 'grad', 'kid', 'silicon', 'competitors', 'class', 'companies', 'friend', 'century', 'making', 'pick', 'depends', 'power', 'angel', 'word', 'stage', 'trouble', 'web', 'like', 'plus', 'good', 'predict', 'practice', 'cause', 'trick', 'grow', 'taste', 'organizations', 'price', 'saying', 'search', 'rich', 'kind', 'raise', 'mistaken', 'order', 'chance', 'worse', 'qualit

In [6]:
# now we can project the feature vectors into 2-space to visualize 
from pandas import DataFrame
from plotly.express import scatter
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=2023, verbose=1, n_iter=10000)
tsne_df = DataFrame(data=tsne.fit_transform(X=feature_embeddings), columns=['t0', 't1'])
tsne_df['word'] = features
scatter(data_frame=tsne_df[tsne_df['word'].isin(keep)], x='t0', y='t1', text='word', height=900, ).update_traces(marker={'size': 1})

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 600 samples in 0.001s...
[t-SNE] Computed neighbors for 600 samples in 0.191s...
[t-SNE] Computed conditional probabilities for sample 600 / 600
[t-SNE] Mean sigma: 3.295514
[t-SNE] KL divergence after 250 iterations with early exaggeration: 61.629436
[t-SNE] KL divergence after 4550 iterations: 1.019921
