In [1]:
%env TOKENIZERS_PARALLELISM=true
! pip install sentence-transformers 

env: TOKENIZERS_PARALLELISM=true
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 86.0/86.0 kB 3.7 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... done
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=81fab678e9191825b3f0ba4f744c556ae59ca2ec4d904ec4e44a89251c6d50a1
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2


In [2]:
import pandas as pd

# Number of stories to draw from the combined train + validation files.
N = 10000

# Read both splits, draw a reproducible sample of N rows, discard rows without
# text, and attach two simple length measures. reset_index() (without drop)
# keeps the pre-sample row labels as an 'index' column.
# NOTE(review): rows are sampled before dropna, so fewer than N rows can remain
# if the sample happens to contain missing text.
df = (
    pd.concat(
        objs=[
            pd.read_csv(filepath_or_buffer=csv_path)
            for csv_path in (
                '/kaggle/input/tinystories-narrative-classification/train.csv',
                '/kaggle/input/tinystories-narrative-classification/validation.csv',
            )
        ]
    )
    .sample(n=N, random_state=2023)
    .dropna(subset='text')
    .reset_index()
    .assign(
        **{
            # length of each story in characters
            'character count': lambda frame: frame['text'].str.len(),
            # number of whitespace-separated tokens in each story
            'token count': lambda frame: frame['text'].map(lambda story: len(story.split())),
        }
    )
)
df.head()

Unnamed: 0,index,text,character count,token count
0,94727,Susie was very excited today. She was going to...,723,129
1,1575890,"Once upon a time, there was a little girl who ...",941,179
2,1947657,Once upon a time there was a brave boy named J...,679,132
3,49524,Ben and Lily are twins. They like to play with...,1260,250
4,1823885,Sara and Ben were going to the airport with Mo...,1354,265


In [3]:
# Sanity check: the row count shows how many of the N sampled rows survived dropna.
df.shape

(10000, 4)

In [4]:
from arrow import now
from collections import Counter
from spacy import load
from spacy.lang.en import stop_words as stop_words

spacy_start = now()
nlp = load('en_core_web_lg')

# Build one {noun chunk text: occurrences} dict per story, in df row order.
# nlp.pipe streams the texts through the pipeline in batches instead of calling
# nlp() once per row, and plain dicts avoid constructing a pandas Series per
# document (an empty dict also yields a clean all-NaN row when a story has no
# noun chunks).
data = [
    dict(Counter(chunk.text for chunk in document.noun_chunks))
    for document in nlp.pipe(df['text'])
]
# Rows are stories, columns are distinct noun chunks; a chunk absent from a
# story is NaN in that row.
spacy_df = pd.DataFrame(data=data)
# The row-by-row version of this cell logged just under six minutes for 10k
# stories; batching is expected to take no longer than that.
print('spacy time: {}'.format(now() - spacy_start))

spacy time: 0:05:56.066085


In [5]:
from plotly.express import bar
# Total occurrences of each noun chunk over all stories: after reset_index the
# chunk text is in column 'index' and its total in column 0.
sum_spacy_df = (
    spacy_df
    .sum(axis=0)
    .to_frame()
    .reset_index()
)
# Remove chunks that are just a stop word (compared case-insensitively), then
# rank the remainder from most to least frequent.
sorted_df = (
    sum_spacy_df
    .loc[lambda totals: ~totals['index'].str.lower().isin(stop_words.STOP_WORDS)]
    .sort_values(by=0, ascending=False)
)
# Show the 50 most frequent noun chunks.
bar(data_frame=sorted_df.head(n=50), x='index', y=0)

Most of these stories are about someone named Lily.

In [6]:
# https://www.kaggle.com/code/ianalyticsgeek/keywords-extraction-using-bert
from arrow import now
from sentence_transformers import SentenceTransformer

# Upper bound on how many of the top-ranked noun chunks are embedded; kept
# small enough that the labelled scatter plot stays readable.
MAX_FEATURES = 500

model_start = now()
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Embed the full text of every story.
# NOTE(review): no later cell visible here reads `embedding`, and this call
# accounts for 313 of the 329 logged batches -- confirm it is still needed.
embedding = model.encode(df['text'].tolist())
print('{}: got embeddings'.format(now()))

# Take the MAX_FEATURES most frequent noun chunks once, then split them into
# their text and their counts.
top_features = sorted_df.head(n=MAX_FEATURES)
features = top_features['index'].values
feature_counts = top_features[0].values

# Embed the noun chunks themselves; these vectors are what gets projected.
feature_embeddings = model.encode(features)
print('{}: got feature embeddings'.format(now()))

# expect roughly 13-14 minutes in total for 10k documents
print('model time: {}'.format(now() - model_start))

Downloading .gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

2023-12-02T21:55:54.302560+00:00: got embeddings


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

2023-12-02T21:55:56.313844+00:00: got feature embeddings
model time: 0:12:42.107380


In [7]:
# now we can project the feature vectors into 2-space to visualize 
from plotly.express import scatter
from sklearn.manifold import TSNE
# n_iter is only an upper bound; verbose=1 makes t-SNE log its progress.
# NOTE(review): newer scikit-learn releases rename n_iter to max_iter --
# confirm against the installed version before upgrading.
tsne = TSNE(
    n_components=2,
    n_iter=10000,
    random_state=2023,
    verbose=1,
)
projection = tsne.fit_transform(X=feature_embeddings)
# One row per noun chunk: its 2-D coordinates, its text and its total count.
tsne_df = (
    pd.DataFrame(data=projection, columns=['t0', 't1'])
    .assign(word=features, count=feature_counts)
)
# Markers are shrunk to size 1 so the word labels carry the plot; the count is
# available on hover.
scatter(
    data_frame=tsne_df,
    x='t0',
    y='t1',
    text='word',
    hover_data=['count'],
    height=900,
).update_traces(marker={'size': 1})

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 500 samples in 0.001s...
[t-SNE] Computed neighbors for 500 samples in 0.167s...
[t-SNE] Computed conditional probabilities for sample 500 / 500
[t-SNE] Mean sigma: 5.240034
[t-SNE] KL divergence after 250 iterations with early exaggeration: 61.156448
[t-SNE] KL divergence after 2900 iterations: 0.754957


The big takeaway here is that simple stories really look like simple stories when we view them this way.