In [1]:
! pip install --user sentence-transformers 

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=ec47e4ce080d90f681e162ae73d3b1add69ce140cd363ca920e7ba305ef55791
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2


In [2]:
import pandas as pd
from json import loads
from ast import literal_eval
# Load the training data; the 'meta' column holds a Python-literal dict per row
# (it is parsed with ast.literal_eval, not json).
raw_df = pd.read_csv('/kaggle/input/text-and-meta-data-analysis/train.csv')

# Parse each 'meta' string exactly once and reuse the parsed dicts for both
# derived columns instead of re-evaluating the literal for every field.
meta = raw_df['meta'].apply(literal_eval)

df = (
    raw_df
    .assign(
        perplexity_score=meta.map(lambda parsed: parsed['perplexity_score']),
        pile_set_name=meta.map(lambda parsed: parsed['pile_set_name']),
    )
    .drop(columns=['meta'])    # fully unpacked above
    .dropna(subset=['text'])   # rows without text cannot be embedded later
)
df.head()

Unnamed: 0,text,perplexity_score,pile_set_name
0,"It is done, and submitted. You can play “Survi...",327.0,Pile-CC
1,"<?xml version=""1.0"" encoding=""UTF-8""?>\r\n<seg...",977.7,Github
2,Article content\n\nHuman behavior has a tremen...,609.4,OpenWebText2
3,Topic: reinvent midnight madness\n\nAmazon ann...,262.3,Pile-CC
4,About Grand Slam Fishing Charters\n\nAs a fami...,858.9,Pile-CC


In [3]:
# Summarize the frame (row count, non-null counts, dtypes) after the
# rows with missing 'text' were dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99996 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   text              99996 non-null  object 
 1   perplexity_score  99996 non-null  float64
 2   pile_set_name     99996 non-null  object 
dtypes: float64(1), object(2)
memory usage: 3.1+ MB


In [4]:
# Let's clean up the text.
# Replace every embedded line break with a single space. Windows (\r\n) and bare
# carriage-return endings are handled too, so no stray '\r' is left behind.
# regex=True is explicit because the default differs between pandas versions.
df['clean'] = df['text'].str.replace(r'\r\n?|\n', ' ', regex=True)

# Keep only the cleaned text, exposed under the original column name 'text'.
clean_df = (
    df
    .drop(columns=['text', 'perplexity_score', 'pile_set_name'])
    .rename(columns={'clean': 'text'})
)
clean_df.sample(n=5, random_state=2023)

Unnamed: 0,text
31934,Flickr Open / Getty Images Old paint peeling f...
18263,Privacy Policy Samparknow is always there to ...
12731,Limited CapacityfullAdding this to your schedu...
76803,"Introduction {#S0001} ============ Recently, ..."
62813,Q: When I run rails server it is showing erro...


In [5]:
from arrow import now
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer

# Experiment settings: sample size, vocabulary size and n-gram range.
N = 1200
MAX_FEATURES = 200
NGRAM_RANGE = (1, 1)

# While we are testing, work on a small reproducible sample of the documents.
sample_df = clean_df.sample(n=N, random_state=2023)
fit_data = sample_df['text'].tolist()

model_start = now()

# Candidate keywords: the MAX_FEATURES most frequent terms in the sample,
# excluding English stop words.
count = CountVectorizer(
    ngram_range=NGRAM_RANGE,
    stop_words='english',
    max_features=MAX_FEATURES,
)
count.fit(fit_data)
features = count.get_feature_names_out()
print(f'{now()}: got features')

# Embed the documents and the candidate keywords with the same model so the
# two sets of vectors can be compared directly.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
embedding = model.encode(fit_data)
print(f'{now()}: got embeddings')
feature_embeddings = model.encode(features)
print(f'{now()}: got feature embeddings')

# Elapsed time covers vectorizing, loading the model and both encode calls.
print(f'model time: {now() - model_start}')

2023-11-29T22:32:36.521461+00:00: got features


Downloading .gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Batches:   0%|          | 0/38 [00:00<?, ?it/s]

2023-11-29T22:34:54.755743+00:00: got embeddings


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

2023-11-29T22:34:55.531363+00:00: got feature embeddings
model time: 0:02:20.572538


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

TOP_N = N // 20

# One row per document, one column per candidate keyword. Despite the name,
# these are cosine similarities: larger values mean a closer match.
distances = cosine_similarity(embedding, feature_embeddings)

# Rank the keywords for every document, not just the first one. argsort is
# ascending, so the last TOP_N column indices of each row are that document's
# best matches, ordered from least to most similar.
top_feature_indices = distances.argsort(axis=1)[:, -TOP_N:]
keywords_by_document = [
    [features[index] for index in row] for row in top_feature_indices
]

# Top phrases for the first sampled document.
phrases = keywords_by_document[0]
phrases

['year',
 'file',
 'light',
 '24',
 '25',
 'common',
 'nature',
 'power',
 '14',
 'people',
 'degree',
 'com',
 '18',
 'left',
 'model',
 'law',
 'table',
 'development',
 'related',
 'results',
 'figure',
 'code',
 'conditions',
 'general',
 'human',
 'better',
 'frac',
 'animals',
 'long',
 'years',
 '22',
 'little',
 'new',
 'change',
 '50',
 'man',
 'end',
 '30',
 'evidence',
 'large',
 'help',
 'high',
 'need',
 'test',
 'work',
 'home',
 'small',
 'study',
 'different',
 'important',
 'treatment',
 'family',
 'false',
 'mathcal',
 'mathbf',
 '0x0000',
 'patients',
 'class',
 'old',
 'police']

In [7]:
from plotly.express import scatter
from sklearn.manifold import TSNE
# Project the keyword embeddings down to two dimensions for plotting.
tsne = TSNE(n_components=2, random_state=2023, verbose=1)
projection = tsne.fit_transform(X=feature_embeddings)
tsne_df = pd.DataFrame(data=projection, columns=['t0', 't1'])

# Label each point with its keyword; the marker size is set to 1 so the text
# labels dominate the plot.
fig = scatter(data_frame=tsne_df, x='t0', y='t1', text=features, height=900)
fig.update_traces(marker={'size': 1})

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 200 samples in 0.001s...
[t-SNE] Computed neighbors for 200 samples in 0.068s...
[t-SNE] Computed conditional probabilities for sample 200 / 200
[t-SNE] Mean sigma: 3.314540
[t-SNE] KL divergence after 250 iterations with early exaggeration: 65.594070
[t-SNE] KL divergence after 1000 iterations: 0.746157
