# Exercise 4 - Part A

### Necessary imports

In [None]:
from sentence_transformers import SentenceTransformer, util
from scipy.spatial import distance
import matplotlib.pyplot as plt
import numpy as np
import random
from nltk import tokenize

### Data loading

In [None]:
# Read the whole corpus, flatten it onto a single line, then split it into sentences
sentences = []
with open("data/data.txt", "r", encoding='utf8') as f:
    # Joining the newline-separated pieces with spaces replaces every "\n" by " "
    lines = " ".join(f.read().split("\n"))
sentences = tokenize.sent_tokenize(lines)

### Obtaining the embeddings of the sentences

In [None]:
# Load the all-MiniLM-L6-v2 sentence-transformers checkpoint and embed
# every sentence of the corpus with a single encode call
model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2'
)
bert_embeddings_definition = model.encode(
    sentences,
    convert_to_tensor=True,
)

### Segmenting the text into blocks using cosine similarity

In [None]:
# Function for drawing `howMany` sorted random integers with a minimum spacing between them

def spreadRandom(theRange, howMany, minSpacing):
    """Draw sorted random integers that are at least `minSpacing` apart.

    Values are sampled with ``random.randint`` and the whole draw is rejected
    and repeated until every pair of neighbouring values is far enough apart.

    Args:
        theRange: Two-element sequence ``(low, high)``; both ends are inclusive.
        howMany: Number of integers to draw.
        minSpacing: Minimum allowed difference between consecutive sorted values.

    Returns:
        A sorted list containing `howMany` integers.

    Raises:
        ValueError: If the range is too narrow to hold `howMany` values with
            the requested spacing (the rejection loop could never succeed).
    """
    low, high = theRange
    if howMany > 1 and (high - low) < (howMany - 1) * minSpacing:
        raise ValueError(
            "range %r cannot hold %d values spaced at least %d apart"
            % (list(theRange), howMany, minSpacing)
        )

    while True:
        candidate = sorted(random.randint(low, high) for _ in range(howMany))
        # With fewer than two values there are no gaps, so all() is trivially true
        if all(b - a >= minSpacing for a, b in zip(candidate, candidate[1:])):
            return candidate

In [None]:
# Sentence-embedding model referenced as a global by the helper functions below
# (same checkpoint name as the one loaded in the embeddings cell above)
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def get_bert_embedding(sentence: str) -> "torch.Tensor":
    """Embed a single sentence with the module-level `model`.

    Args:
        sentence: Text to embed.

    Returns:
        The embedding produced by ``model.encode``. Because the call passes
        ``convert_to_tensor=True`` the result is a torch tensor, not a NumPy
        array.
    """
    return model.encode(sentence, convert_to_tensor=True)

In [None]:
def calculate_cosine_similarity(blocks: list) -> list:
    """Score the internal cohesion of every block.

    For each block the sentences are embedded with the module-level `model`,
    the cosine similarity of every embedding against every embedding of the
    same block is computed (each sentence paired with itself included), and
    the mean of that similarity matrix is taken.

    Args:
        blocks: List of blocks, each block being a list of sentences.

    Returns:
        One float per block, in the same order as `blocks`.
    """
    def _mean_pairwise_similarity(block):
        vectors = model.encode(block, convert_to_tensor=True)
        pairwise = util.cos_sim(vectors, vectors)
        return util.torch.mean(pairwise).item()

    return [_mean_pairwise_similarity(block) for block in blocks]

In [None]:
def calculate_cosine_similarity_between_blocks(blocks: list) -> list:
    """Compare each block with the following one via their summed embeddings.

    Every block is represented by the element-wise sum of its sentence
    embeddings; adjacent representations are compared with ``util.cos_sim``.

    Args:
        blocks: List of blocks, each block being a list of sentences.

    Returns:
        A list with one entry per block. Entry ``i`` (for every block except
        the last) is the ``util.cos_sim`` result between block ``i`` and block
        ``i + 1``. The final entry remains the summed embedding of the last
        block, so three-block callers see the same list layout as before.
    """
    # Sum over dim 0 only, so each block keeps a full embedding vector;
    # an axis-less sum would collapse the whole block to a single scalar.
    blocks_embeddings = [
        model.encode(block, convert_to_tensor=True).sum(dim=0)
        for block in blocks
    ]

    # Overwrite front to back: position i + 1 still holds its embedding
    # at the moment it is compared with position i.
    for i in range(len(blocks_embeddings) - 1):
        blocks_embeddings[i] = util.cos_sim(blocks_embeddings[i], blocks_embeddings[i + 1])

    return blocks_embeddings

In [None]:
# Separate the text into 3 blocks of random dimension
# Two cuts inside [5, len - 5] that are 10 apart need at least 5 + 10 + 5 sentences
if len(sentences) < 20:
    raise ValueError("Need at least 20 sentences to split the text into 3 blocks")
blocks_index = spreadRandom([5, len(sentences) - 5], 2, 10)


def _split_into_blocks(boundaries):
    """Slice `sentences` into the three blocks delimited by the two boundaries."""
    first, second = boundaries
    return [sentences[:first], sentences[first:second], sentences[second:]]


blocks = _split_into_blocks(blocks_index)

changed = True

# Hill climbing: move one boundary by one sentence at a time for as long as
# the move raises the internal similarity of the block that boundary closes.
while changed:
    changed = False
    # Calculate the cosine similarity inside each block
    blocks_similarity = calculate_cosine_similarity(blocks)

    # Boundary k closes block k, so a move of boundary k is judged on block k.
    # Boundaries are tried in order and the first improving move is applied.
    for k in range(len(blocks_index)):
        # Keep every block non-empty: a boundary stays strictly between its neighbours
        lower = blocks_index[k - 1] if k > 0 else 0
        upper = blocks_index[k + 1] if k + 1 < len(blocks_index) else len(sentences)

        # Similarity gain of block k for each legal one-sentence shift of boundary k.
        # Only boundary k moves, so the scored segmentation is the one that gets applied.
        gains = {}
        for step in (1, -1):
            moved = blocks_index[k] + step
            if not lower < moved < upper:
                continue
            candidate_index = list(blocks_index)
            candidate_index[k] = moved
            candidate_block = _split_into_blocks(candidate_index)[k]
            gains[step] = calculate_cosine_similarity([candidate_block])[0] - blocks_similarity[k]

        # An illegal shift can never win
        gain_add = gains.get(1, float("-inf"))
        gain_sub = gains.get(-1, float("-inf"))

        # Take the strictly better of the two shifts, and only if it is an improvement
        if gain_add > 0 and gain_add > gain_sub:
            chosen_step = 1
        elif gain_sub > 0 and gain_sub > gain_add:
            chosen_step = -1
        else:
            continue

        blocks_index[k] += chosen_step
        # Rebuild all blocks so the neighbours of the moved boundary stay in sync
        blocks = _split_into_blocks(blocks_index)
        changed = True
        print(f"Changed Block {k + 1} ({chosen_step:+d}): ", blocks_index)
        break

print("Blocks Index: ", blocks_index)

In [None]:
# Print the three paragraphs, one sentence per line, with a blank gap between paragraphs
first_cut, second_cut = blocks_index[0], blocks_index[1]
paragraphs = (
    sentences[0:first_cut],
    sentences[first_cut:second_cut],
    sentences[second_cut:],
)
for position, paragraph in enumerate(paragraphs):
    # The separator goes between paragraphs only, never after the last one
    if position > 0:
        print("\n\n")
    for sentence in paragraph:
        print(sentence)

In [None]:
# Inspect the sentence at index 23
sentences[23]