In [2]:
import os

import pandas as pd
from sentence_transformers import SentenceTransformer

In [3]:
def read_text_file(filename, encoding='utf-8'):
    """Return the entire contents of a text file as a single string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file's bytes are not valid in ``encoding``.
    """
    # An explicit encoding keeps non-ASCII text (quotes, dashes, names) from
    # being mis-decoded on platforms whose default encoding is not UTF-8.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

def word_windowing(text, window_size, step_size):
    """Split text into windows of whitespace-separated words.

    A window starts every ``step_size`` words and holds up to ``window_size``
    words. The final window may be shorter than ``window_size`` so that
    trailing words are not dropped, and a text with fewer than
    ``window_size`` words yields one window containing all of it.

    Args:
        text: Input string; words are separated by any whitespace.
        window_size: Maximum number of words per window (must be > 0).
        step_size: Number of words between window starts (must be > 0).

    Returns:
        List of windows, each a single-space-joined string of words.
        Empty if ``text`` contains no words.

    Raises:
        ValueError: If ``window_size`` or ``step_size`` is not positive.
    """
    if window_size <= 0 or step_size <= 0:
        raise ValueError("window_size and step_size must be positive integers")

    words = text.split()
    windows = []
    for start in range(0, len(words), step_size):
        windows.append(' '.join(words[start:start + window_size]))
        # Stop once a window has reached the last word; any later start
        # would only repeat words that are already covered.
        if start + window_size >= len(words):
            break
    return windows


def perform_window_operation(windows, model=None):
    """Embed each text window and collect the results in a DataFrame.

    Args:
        windows: Iterable of strings to embed.
        model: Optional object exposing ``encode(list_of_str)`` that returns
            one embedding per input string. When omitted, the
            'sentence-transformers/all-MiniLM-L6-v2' SentenceTransformer is
            loaded. Pass an already-loaded model to avoid reloading it on
            repeated calls.

    Returns:
        DataFrame with a 'Sentence' column (the window text) and an
        'Embedding' column (the embedding for that window), one row per
        window in input order.
    """
    sentences = list(windows)
    if not sentences:
        # Nothing to embed, so skip the model load entirely.
        return pd.DataFrame({'Sentence': [], 'Embedding': []})

    if model is None:
        model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    # Encode all windows in one call so the model can batch them, instead of
    # one call per window. Iterating the result yields one embedding per
    # sentence, in input order.
    embeddings = list(model.encode(sentences))

    return pd.DataFrame({'Sentence': sentences, 'Embedding': embeddings})


In [4]:
# Input corpus and windowing parameters. Sizes are counted in words; a step
# of 50 with a window of 100 makes consecutive windows overlap by 50 words.
filename = 'three_body_problem.txt'
window_size = 100
step_size = 50
# Load the whole file as one string, then cut it into word windows.
text = read_text_file(filename)
windows = word_windowing(text, window_size, step_size)


In [10]:
# Spot-check: display the third window to eyeball the windowing result.
windows[2]



In [5]:
# Embed every window; the result has 'Sentence' and 'Embedding' columns.
embedding_df = perform_window_operation(windows)

In [6]:
# to_pickle does not create missing parent directories, so make sure the
# output folder exists before writing (fresh checkouts would otherwise fail).
os.makedirs("embeddings", exist_ok=True)
embedding_df.to_pickle("embeddings/embedding.pkl")

In [11]:
# Display five randomly chosen rows as a quick sanity check of the frame.
embedding_df.sample(5)

Unnamed: 0,Sentence,Embedding
24,humanity from now on will never be the same. S...,"[-0.028191265, -0.019665377, 0.023111252, 0.01..."
89,to hibernate to conserve oxygen. Once the proc...,"[-0.034970105, -0.008142417, -0.018723663, 0.0..."
67,of Trisolaris's three suns is struck by a rela...,"[-0.021488613, -0.047500413, -0.010815905, 0.0..."
5,"to Tsinghua as a professor, Ye encounters Mike...","[-0.04694913, -0.01860135, -0.00046791031, -0...."
57,Two groups of human starships fled the solar s...,"[-0.07281824, 0.008806537, 0.04961432, 0.02709..."


In [14]:
# Show the embedding of the first row. A single positional lookup replaces
# the chained `iloc[0][1]`, whose integer key on a string-labelled Series
# relies on a deprecated positional fallback.
embedding_df.iloc[0, 1]

  embedding_df.iloc[0][1]


array([-4.83101271e-02,  4.32314351e-02,  5.61867952e-02,  3.33585478e-02,
       -9.85858031e-03, -3.04072537e-02,  7.13311881e-02, -3.68279442e-02,
        5.47659909e-03,  2.23255320e-03,  9.11525339e-02, -3.91898826e-02,
        1.96600445e-02, -8.52200575e-03,  6.42613396e-02,  1.51600363e-03,
        1.86530948e-02, -1.12397261e-02, -9.31401551e-02, -1.12448238e-01,
       -6.46404698e-02, -8.69178586e-03,  7.52593428e-02,  6.25277217e-03,
        3.65514830e-02, -5.63865453e-02,  1.02910124e-01,  9.40488931e-03,
        2.88410522e-02, -2.28482787e-03, -8.42841342e-02,  6.25420883e-02,
       -9.50923711e-02,  6.24949560e-02, -8.16075727e-02,  9.59585980e-02,
        1.45677909e-01, -5.97295612e-02, -6.04661703e-02, -1.01486864e-02,
        3.65829654e-03, -5.07462136e-02,  1.02338426e-01, -1.95213109e-02,
       -1.81333367e-02, -3.07383295e-02,  1.52453957e-02, -8.95272940e-02,
        6.07432332e-03, -2.69291569e-02,  3.58025618e-02, -7.04213220e-04,
       -7.40223052e-03,  