In [10]:
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.manifold import TSNE
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os

In [11]:
# Load your dataset
df = pd.read_csv('data.csv', header=None, names=['text'])

In [14]:
df.describe()

Unnamed: 0,text
count,8266
unique,8266
top,foolishness
freq,1


In [12]:
# Tokenize the words
tokenizer = Tokenizer(num_words=8000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['text'])

sequences = tokenizer.texts_to_sequences(df['text'])
padded_sequences = pad_sequences(sequences, padding='post', maxlen=1)

In [None]:
import en

In [28]:
# Create and train a simple model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=8000, output_dim=16),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [29]:
labels = np.zeros((len(df), 1))

In [31]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(padded_sequences, labels, epochs=5)

Epoch 1/5
[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 635us/step - accuracy: 1.0000 - loss: 0.0012
Epoch 2/5
[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 586us/step - accuracy: 1.0000 - loss: 1.1921e-07
Epoch 3/5
[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 582us/step - accuracy: 1.0000 - loss: 1.1921e-07
Epoch 4/5
[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 611us/step - accuracy: 1.0000 - loss: 1.1921e-07
Epoch 5/5
[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 610us/step - accuracy: 1.0000 - loss: 1.1921e-07


In [32]:
embeddings = model.layers[0].get_weights()[0]

In [33]:
# Run t-SNE
tsne = TSNE(n_components=3, n_iter=1000, random_state=0)
tsne_results = tsne.fit_transform(embeddings)

In [25]:
# # Save embeddings and metadata
# if not os.path.exists('logs'):
#     os.makedirs('logs')
# #
# np.savetxt('logs/vecs.tsv', tsne_results, delimiter='\t')
# metadata = tokenizer.word_index
# with open('logs/meta.tsv', 'w') as f:
#     for word, index in sorted(metadata.items(), key=lambda item: item[1]):
#         f.write(f"{word}\n")

In [36]:
# Set up a logs directory, so TensorBoard knows where to look for files.
log_dir = 'logs'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# Example: Assuming `tsne_results` contains your embedding vectors
# Save embeddings to a TSV file
np.savetxt(os.path.join(log_dir, 'vecs.tsv'), tsne_results, delimiter='\t')

# Save metadata
# Assuming `tokenizer.word_index` is a dictionary with words and their indices
metadata = tokenizer.word_index

# Write metadata to a TSV file
with open(os.path.join(log_dir, 'meta.tsv'), 'w') as f:
    for word, index in sorted(metadata.items(), key=lambda item: item[1]):
        f.write(f"{word}\n")

In [37]:

# Save the weights we want to analyze as a variable. Note that the first
# value represents any unknown word, which is not in the metadata, here
# we will remove this value.
weights = tf.Variable(model.layers[0].get_weights()[0][1:])
# Create a checkpoint from embedding, the filename and key are the
# name of the tensor.
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))



'logs\\embedding.ckpt-1'

In [38]:
# Set up config.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)

In [40]:
# Now run tensorboard against on log data we just saved.
%tensorboard --logdir /logs