# Load dataset and pre-processing

In [1]:
# Placeholder binding for df_summary. Later cells call df_summary.tolist(), which an
# int does not support, so this is presumably rebound by the preprocessing notebook
# executed below — TODO confirm.
df_summary = 0

In [None]:
# This code is downloading the notebook from GitHub and running it
import requests
from pathlib import Path
url = "https://raw.githubusercontent.com/nbakas/NLP/refs/heads/main/02-Preprocessing.ipynb"
filename = url.split("/")[-1]
local_path = Path.cwd() / filename
response = requests.get(url)
response.raise_for_status()
local_path.write_bytes(response.content)
%run $local_path

In [None]:
# Inspect df_summary after the preprocessing notebook has been executed.
df_summary

# Embeddings

In [None]:
# Convert the summaries to a plain Python list (one entry per summary)
my_corpus = df_summary.tolist()
print(len(my_corpus))  # number of entries in the corpus
my_corpus[:5]  # Display first 5 sentences to verify

In [None]:
# Tokenize the corpus: lowercase every sentence and split it on whitespace.
# Punctuation stays attached to the neighbouring word because str.split() with no
# arguments only splits on whitespace.
tokenized_corpus = [sentence.lower().split() for sentence in my_corpus]
# Show only a small sample; echoing the whole corpus floods the notebook output.
tokenized_corpus[:5]

In [None]:
from collections import Counter
# Flatten the tokenized corpus into a single list containing every token occurrence
# in corpus order. Despite its name, unique_words still contains duplicates here.
unique_words = [token for tokens in tokenized_corpus for token in tokens]
# Vocabulary size = number of distinct tokens.
print(len(set(unique_words)))
unique_words[:5]

In [None]:
# Count how often each token occurs in the corpus.
# The counts are built directly from tokenized_corpus rather than from unique_words:
# unique_words is later rebound to the de-duplicated vocabulary, so re-running this
# cell afterwards would otherwise silently give every word a count of 1.
word_counts = Counter(token for tokens in tokenized_corpus for token in tokens)
# Show only the most frequent entries instead of echoing the entire Counter.
word_counts.most_common(10)

In [None]:
# Vocabulary ordered from most to least frequent. most_common() with no argument
# already returns every entry, so there is no need to build a set over all tokens
# just to count the distinct ones.
unique_words = [word for word, count in word_counts.most_common()]
unique_words[:5]

In [7]:
# Map each word to its position in unique_words; that position is used as the
# word's row/column index in the matrices built later.
word_index = dict(zip(unique_words, range(len(unique_words))))

In [8]:
# Context window: how many tokens on each side of a word count as its context
# when co-occurrences are tallied.
context_window = 3

In [None]:
# Build the co-occurrence ("contingency") matrix: entry [a, b] counts how many times
# word b appears within context_window positions of word a, over the whole corpus.
import numpy as np
vocab_size = len(unique_words)
contingency_matrix = np.zeros((vocab_size, vocab_size), dtype=int)

for iter_sentence, sentence in enumerate(tokenized_corpus):
    # Resolve every token to its vocabulary index once per sentence;
    # None marks a token that is not in word_index.
    token_ids = [word_index.get(token) for token in sentence]
    n_tokens = len(token_ids)
    for i, center_id in enumerate(token_ids):
        if center_id is None:
            continue
        window_start = max(0, i - context_window)
        window_stop = min(n_tokens, i + context_window + 1)
        for j in range(window_start, window_stop):
            # A position is never its own context.
            if j == i:
                continue
            context_id = token_ids[j]
            if context_id is not None:
                contingency_matrix[center_id, context_id] += 1
    # Progress report every 100,000 sentences.
    if iter_sentence % 100_000 == 0:
        print(iter_sentence)

In [None]:
# Inspect the populated co-occurrence matrix.
contingency_matrix

In [11]:
# Number of coordinates per word embedding. The plotting cell names the columns
# 'x' and 'y', so it requires this to be 2.
embeddings_dim = 2

In [None]:
# Give every word a random starting position, uniform in [-1, 1) per coordinate.
nof_obj = len(contingency_matrix)  # one embedding row per matrix row
np.random.seed(0)  # fixed seed so the initial layout is reproducible
embeddings_matrix = np.random.rand(nof_obj, embeddings_dim) * 2 - 1
print(embeddings_matrix.shape)
embeddings_matrix

In [None]:
# For every word, record the indices of its strongest co-occurring neighbours
# (the word itself excluded) and the total co-occurrence weight of those neighbours.
sum_weights_all = []
idxs_all = []
nof_nearest_neighbors = 5
for i in range(nof_obj):
    weights = contingency_matrix[i]
    # Candidate neighbours: every column index except the word's own.
    idxs = np.delete(np.arange(len(weights)), i)
    # Order the candidates by descending weight and keep the first few.
    descending_order = np.argsort(weights[idxs])[::-1]
    most_similar_idxs = descending_order[:nof_nearest_neighbors]
    neighbor_idxs = idxs[most_similar_idxs]
    idxs_all.append(neighbor_idxs)
    sum_weights = weights[neighbor_idxs].sum()
    sum_weights_all.append(sum_weights)
    # Progress report every 1000 words.
    if i % 1000 == 0:
        print(i)

In [14]:
from copy import deepcopy

In [15]:
# Number of full passes over all words in the embedding update loop.
# Alternative that scales with vocabulary size:
# nof_iterations = contingency_matrix.shape[0]//20
nof_iterations = 1

In [None]:
# Move each word to the co-occurrence-weighted average position of its nearest
# neighbours. Updates are written in place, so words processed later in a pass
# already see the updated positions of earlier words.
# The outer loop uses a named variable instead of `_`, which IPython uses for the
# most recent cell output.
# To track convergence, snapshot embeddings_matrix before an update and record the
# norm of the change afterwards.
for iteration in range(nof_iterations):
    for i in range(nof_obj):
        total_weight = sum_weights_all[i]
        # Words without any co-occurring neighbour keep their position (avoids 0/0).
        if total_weight != 0:
            neighbor_idxs = idxs_all[i]
            # Column vector of weights so it broadcasts across the embedding coordinates.
            neighbor_weights = contingency_matrix[i][neighbor_idxs, None]
            embeddings_matrix[i] = np.sum(neighbor_weights * embeddings_matrix[neighbor_idxs], axis=0) / total_weight
        # Progress report every 1000 words.
        if i % 1000 == 0:
            print(iteration, i)

In [None]:
# Inspect the embeddings after the update passes.
embeddings_matrix

In [None]:
# Check the embeddings shape: (number of words, embeddings_dim).
embeddings_matrix.shape

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd

# Plot at most the 100 most frequent words. The count is clamped to the vocabulary
# size so the pairwise loop below never indexes past the end of contingency_matrix.
nof_keep_words = min(100, len(unique_words))
# Create a DataFrame for plotting (needs 2-dimensional embeddings: one column for x, one for y)
df = pd.DataFrame(embeddings_matrix[:nof_keep_words], columns=['x', 'y'])
df['word'] = unique_words[:nof_keep_words]
df['count'] = [word_counts[word] for word in unique_words[:nof_keep_words]]

# Create a scatter plot using Plotly with size proportional to word counts.
# The title is derived from nof_keep_words so it always matches what is plotted.
fig = px.scatter(df, x='x', y='y', text='word', size='count',
                 title=f'Top {nof_keep_words} Unique Words in Embeddings Space',
                 hover_data=['count'])
fig.update_traces(textfont_size=15)  # Adjust the font size here

# Add pairwise connecting lines with thickness proportional to contingency_matrix.
# Dividing the maximum by 10 makes the strongest pair in the matrix map to width 10.
max_weight = np.max(contingency_matrix)/10
# Only draw pairs above the 99.99th percentile of the non-zero co-occurrence counts.
threshold_weight = np.quantile(contingency_matrix[contingency_matrix>0], 0.9999)
for i in range(nof_keep_words):
    for j in range(i + 1, nof_keep_words):
        weight = contingency_matrix[i, j]
        if weight > threshold_weight:
            fig.add_trace(go.Scatter(x=[df['x'][i], df['x'][j]],
                                     y=[df['y'][i], df['y'][j]],
                                     mode='lines',
                                     line=dict(width=weight/max_weight, color='rgba(0,0,0,0.2)'),
                                     showlegend=False))

# Show the plot
fig.show()

In [None]:
# Inspect the co-occurrence threshold used to decide which connecting lines are drawn.
threshold_weight

In [None]:
# Based on contingency_matrix, find the 5 words that co-occur most often with word_based.
word_based = 'chocolate'
cooccurrence_row = contingency_matrix[word_index[word_based]]
# Column indices sorted by descending co-occurrence count, truncated to the top 5.
top_related_idxs = np.argsort(cooccurrence_row)[::-1][:5]
top_related_words = [unique_words[idx] for idx in top_related_idxs]
print(top_related_words)

In [None]:
# Check against the raw text: list the summaries that contain all three words.
# df_summary is filtered with a boolean mask produced by its own .apply(), so each
# element is presumably a single summary string — TODO confirm.
word1 = 'chocolate'
word2 = 'best'
word3 = 'dark'

# Keep the entries that contain every one of word1, word2 and word3. On strings,
# `in` is a case-sensitive substring test, so e.g. 'best' also matches 'bestseller'.
required_words = [word1, word2, word3]
contains_all_words = df_summary.apply(lambda text: all(word in text for word in required_words))
matching_entries = df_summary[contains_all_words]

# Display the matching entries
print(matching_entries)
