In [1]:
import pandas as pd
from json import loads
from ast import literal_eval
# Load the training split. Each row has a `text` column and a `meta` column whose
# string value is evaluated with literal_eval and indexed by key below.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/text-and-meta-data-analysis/train.csv')
# Parse every `meta` string exactly once (the expensive step) and reuse the parsed
# records for both extracted fields; pop() also removes the raw column from df.
meta_records = df.pop('meta').apply(literal_eval)
df = (
    df
    .assign(
        perplexity_score=meta_records.map(lambda record: record['perplexity_score']),
        pile_set_name=meta_records.map(lambda record: record['pile_set_name']),
    )
    # Rows with a missing text cannot be analysed downstream, so discard them.
    .dropna(subset=['text'])
)
df.head()

Unnamed: 0,text,perplexity_score,pile_set_name
0,"It is done, and submitted. You can play “Survi...",327.0,Pile-CC
1,"<?xml version=""1.0"" encoding=""UTF-8""?>\r\n<seg...",977.7,Github
2,Article content\n\nHuman behavior has a tremen...,609.4,OpenWebText2
3,Topic: reinvent midnight madness\n\nAmazon ann...,262.3,Pile-CC
4,About Grand Slam Fishing Charters\n\nAs a fami...,858.9,Pile-CC


In [2]:
# Summarize the frame: row count, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99996 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   text              99996 non-null  object 
 1   perplexity_score  99996 non-null  float64
 2   pile_set_name     99996 non-null  object 
dtypes: float64(1), object(2)
memory usage: 3.1+ MB


In [3]:
# Flag every document whose text opens with an XML declaration.
df['xml'] = df['text'].str.startswith('<?xml')
# Summing the boolean flags counts the flagged rows.
xml_row_count = df['xml'].sum()
print('We have {} rows that are XML.'.format(xml_row_count))

We have 321 rows that are XML.


In [4]:
from numpy import log
# Flatten each document onto a single line by turning newlines into spaces.
df['clean'] = df['text'].str.replace('\n', ' ')
# Build a separate DataFrame from the non-XML rows: keep only the cleaned text
# (renamed to `text`) and derive its character length and the natural log of it.
unused_columns = ['text', 'perplexity_score', 'pile_set_name', 'xml']
clean_df = (
    df.loc[~df['xml']]
    .drop(columns=unused_columns)
    .rename(columns={'clean': 'text'})
    .assign(**{'text length': lambda frame: frame['text'].str.len()})
    .assign(**{'log text length': lambda frame: log(frame['text length'])})
)
clean_df.head()

Unnamed: 0,text,text length,log text length
0,"It is done, and submitted. You can play “Survi...",13274,9.493563
2,Article content Human behavior has a tremendo...,1235,7.118826
3,Topic: reinvent midnight madness Amazon annou...,384,5.950643
4,About Grand Slam Fishing Charters As a family...,1754,7.469654
5,Q: Why was Mundungus banned from the Hog's He...,2014,7.607878


In [5]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

# TF Hub handles for the text preprocessing layer and the BERT encoder.
PREPROCESS_HANDLE = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
ENCODER_HANDLE = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'

# The model consumes raw strings: one scalar string per example.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)

# Raw text -> preprocessing layer -> encoder.
preprocessor = hub.KerasLayer(PREPROCESS_HANDLE)
encoder_inputs = preprocessor(text_input)
encoder = hub.KerasLayer(ENCODER_HANDLE, trainable=True)
outputs = encoder(encoder_inputs)

# The encoder returns a dict; both entries are kept as names, but only the pooled
# output feeds the embedding model defined here.
pooled_output, sequence_output = outputs['pooled_output'], outputs['sequence_output']

embedding_model = tf.keras.Model(inputs=text_input, outputs=pooled_output)
embedding_model.compile(optimizer='adam', loss='mse')

In [6]:
from sklearn.manifold import TSNE
from arrow import now

N = 2000
# Number of documents pushed through the encoder per forward pass.
EMBED_BATCH_SIZE = 32

# Work on an explicit copy of the sample because new columns are added to it below.
tsne_df = clean_df.sample(n=N, random_state=2023).copy()
tsne = TSNE(n_components=2, verbose=1, random_state=2023)
tsne_start = now()
# Embed the sampled texts in mini-batches rather than sending all N documents through
# the encoder in one forward pass, which keeps peak memory bounded by the batch size.
# verbose=0 suppresses the predict progress bar so only the t-SNE log is printed.
embeddings = embedding_model.predict(
    tf.constant(tsne_df['text'].values.tolist()),
    batch_size=EMBED_BATCH_SIZE,
    verbose=0,
)
# Project the embeddings to two dimensions and store the coordinates next to the text.
tsne_df[['t0', 't1']] = tsne.fit_transform(X=embeddings)
# The reported duration covers both the embedding step and the t-SNE fit.
print('done in {}'.format(now() - tsne_start))

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2000 samples in 0.003s...
[t-SNE] Computed neighbors for 2000 samples in 0.262s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2000
[t-SNE] Computed conditional probabilities for sample 2000 / 2000
[t-SNE] Mean sigma: 1.418686
[t-SNE] KL divergence after 250 iterations with early exaggeration: 72.194382
[t-SNE] KL divergence after 1000 iterations: 1.327663
done in 0:09:17.319654


In [7]:
from plotly.express import scatter
# Hover label: the first 40 characters of each document.
tsne_df['short'] = tsne_df['text'].str.slice(stop=40)
# Plot the two t-SNE coordinates, colored by the log of the text length.
scatter(
    data_frame=tsne_df,
    x='t0',
    y='t1',
    color='log text length',
    hover_name='short',
)

In [8]:
from plotly.express import histogram
# Distribution of the log text length across the sampled documents.
histogram(
    data_frame=tsne_df,
    x='log text length',
)