In [7]:
from sentence_transformers import SentenceTransformer
# Name of the pretrained model handed to SentenceTransformer; `model` is reused
# by the encode() calls in the following cells.
MODEL_NAME = "thenlper/gte-large"
model = SentenceTransformer(MODEL_NAME)

# Example input sentence.
sentence = "This framework generates embeddings for each input sentence"

# Encode the sentence, requesting the "token_embeddings" output value
# (token-level output rather than encode()'s default — see the library docs).
embedding = model.encode(
    sentence,
    output_value="token_embeddings",
)

In [None]:
# Encode the example sentence with output_value="token_embeddings" and show the
# shape of the result (presumably tokens x hidden size — TODO confirm).
embedding = model.encode(sentence, output_value="token_embeddings")
embedding.shape

In [None]:
# Shape left after averaging over axis 0 (presumably the token axis, which would
# leave one vector for the whole sentence — TODO confirm).
embedding.mean(axis=0).shape

In [None]:
# Imported here so this cell can run on its own.
from sentence_transformers import SimilarityFunction

w1 = "payment"
w2 = "onion"

# Token-level embeddings for each word. Each result is expected to hold one row
# per token (tokens x hidden size) — TODO confirm against the library docs.
embedding1 = model.encode(w1, output_value="token_embeddings")
embedding2 = model.encode(w2, output_value="token_embeddings")

# Average over axis 0 so each word is represented by a single vector. Passing
# the raw token-level arrays to the similarity function would compare every
# token of w1 with every token of w2 and produce a matrix, not one score.
word_vector1 = embedding1.mean(axis=0)
word_vector2 = embedding2.mean(axis=0)

# Compute the cosine similarity between the two pooled vectors.
cosine_fn = SimilarityFunction.to_similarity_fn("cosine")
cosine_score = cosine_fn(word_vector1, word_vector2)

# cosine_score holds a single element for one pair of vectors; .item() prints
# it as a plain Python float.
print("Cosine-Similarity:", cosine_score.item())

In [None]:
# Inspect the shape of the token-level embedding computed for w2.
embedding2.shape

In [1]:
from report_metrics import *

  from tqdm.autonotebook import tqdm, trange


In [10]:
# First sample text.
text1 = "Hello world! How are you doing?"

# get_embedding is not defined in the visible cells — presumably it comes from
# the `report_metrics` star import; TODO confirm what it returns.
emb1 = get_embedding(text1)

In [11]:
# Second sample text and its embedding (get_embedding is presumably provided by
# the `report_metrics` star import — TODO confirm).
text2 = "Thank you! I am doing well."
emb2 = get_embedding(text2)

In [None]:
# Display whatever emb2 currently holds in the kernel.
# NOTE(review): this shows the full object; consider emb2.shape or a slice if it is large.
emb2

In [2]:
# Sample story reused as input by several metric cells further down.
text3 = "In the heart of the enchanted forest stood a sentient house, its windows like eyes observing the world. It longed to live, to experience life beyond its rooted existence. One moonlit night, a shooting star granted its wish, transforming the house into a towering golem of wood and stone. As it roamed the forest, its window-eyes saw wonders and met creatures it had only dreamed of. With each step, the house-turned-golem learned the essence of life, its walls now filled with memories instead of rooms."
# Ask get_words for dominant_k=5 words with stopword removal switched off; the
# recorded output is a 5-item list that includes stopwords such as 'the'.
get_words(text3, remove_stopwords=False, dominant_k=5)

['the', 'of', 'its', 'a', 'house']

In [None]:
# Distance between emb1 and emb2 as computed by compute_sem_dis.
# NOTE(review): emb1/emb2 are assigned in more than one cell of this notebook,
# so the value depends on which of those cells ran last.
dis = compute_sem_dis(emb1, emb2)
dis

In [3]:
# Second sample story, scored together with text3.
text4 = "As the alien spacecraft hovered silently above the city, its metallic surface gleaming in the moonlight, Dr. Eliza Chen peered anxiously through the observatory's window. Her lifetime of research into extraterrestrial life had led to this moment, but nothing could have prepared her for the sight of the massive vessel, easily the size of a house, descending gracefully towards the Earth. With trembling hands, she reached for the radio transmitter, knowing that her next words could determine whether humanity would live or perish. \"Welcome,\" she breathed into the microphone, her voice barely above a whisper, \"We come in peace.\" The ship's response came not in words, but in a brilliant burst of light that bathed the entire planet in a warm, comforting glow."
# Theme uniqueness for the two stories with cluster_distance_threshold=0.5; the
# recorded output holds one value per input text.
compute_theme_uniqueness([text3, text4], cluster_distance_threshold=0.5)

[1.0, 1.0]

In [7]:
# Surprise score for each story and for both stories concatenated.
# The join uses an empty separator, so the last word of text3 and the first
# word of text4 are fused without a space in the combined input.
tuple(
    compute_surprise(story)
    for story in (text3, text4, "".join([text3, text4]))
)

(0.023017248541417762, 0.016241591014377388, 0.019932572159316275)

In [9]:
# n-gram diversity of text3 with max_n_gram=3; the recorded output holds three
# values (presumably one per n-gram length 1..3 — TODO confirm).
compute_n_gram_diversity(text3, max_n_gram=3)

[0.6213592233009708, 0.9705882352941176, 1.0]

In [1]:
from report_metrics import *

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Space-separated word lists used as metric inputs:
# two lists whose words share one topic (technology, food) ...
rel_text1 = "automation innovation software hardware engineering robotics programming digital network data"
rel_text2 = "food cooking recipe restaurant chef cuisine ingredient flavor"
# ... and one list mixing words from unrelated topics.
unrel_text1 = "ocean laptop basketball poetry mountain refrigerator zebra guitar democracy candle"

In [None]:
# NOTE(review): this cell has no recorded execution count and evaluates the same
# expression as the executed cell that follows it — likely a leftover duplicate.
compute_avg_sem_dis(rel_text1), compute_avg_sem_dis(rel_text2), compute_avg_sem_dis(unrel_text1)

In [3]:
# Apply compute_avg_sem_dis to each word list, in the order:
# technology list, food list, mixed-topic list.
tuple(
    compute_avg_sem_dis(word_list)
    for word_list in (rel_text1, rel_text2, unrel_text1)
)

(0.20274831851323447, 0.15473486483097076, 0.22474116219414605)

In [5]:
# Show what get_words extracts from each word list (default arguments), in the
# order: technology list, food list, mixed-topic list.
tuple(
    get_words(word_list)
    for word_list in (rel_text1, rel_text2, unrel_text1)
)

(['digital',
  'hardware',
  'automation',
  'network',
  'robotic',
  'program',
  'software',
  'engineering',
  'innovation',
  'datum'],
 ['flavor',
  'recipe',
  'chef',
  'restaurant',
  'cooking',
  'ingredient',
  'cuisine',
  'food'],
 ['mountain',
  'laptop',
  'basketball',
  'zebra',
  'refrigerator',
  'guitar',
  'democracy',
  'ocean',
  'candle',
  'poetry'])

In [6]:
# Embed two single words and measure the distance between them.
# NOTE(review): this reassigns emb1/emb2, which other cells in this notebook set
# from text1/text2; results of cells reading emb1/emb2 depend on run order.
emb1 = get_embedding("democracy")
emb2 = get_embedding("candle")

compute_sem_dis(emb1, emb2)

0.24128174781799316

In [10]:
# rel_text2 is passed twice on purpose-looking input; in the recorded output the
# two duplicate entries get 0.5 while the other two texts get 1.0.
compute_theme_uniqueness([rel_text1, rel_text2, rel_text2, unrel_text1], cluster_distance_threshold=0.5)

[1.0, 0.5, 0.5, 1.0]

In [5]:
from utils import read_json, write_json, find_files
from statistics import mean

data_dir = "../experiments/reports/pilot/run1_report2/gemini-1.5-flash"
files = find_files(data_dir, "json")


def average_n_gram_diversity(entries, max_n_gram=5):
    """Average each n-gram diversity score over the entries that carry metrics.

    Args:
        entries: Iterable of per-sample dicts. Only entries containing a
            "metrics" key are used; each of those must provide
            ``entry["metrics"]["n_gram_diversity"]`` with at least
            ``max_n_gram`` values (index 0 is read for n-gram length 1).
        max_n_gram: Largest n-gram length to average. Defaults to 5.

    Returns:
        A list with ``max_n_gram`` means, or ``None`` when no entry has
        metrics (``statistics.mean`` would raise on the empty data).
    """
    # Filter once instead of re-scanning the entries for every n-gram length.
    scored = [
        entry["metrics"]["n_gram_diversity"]
        for entry in entries
        if "metrics" in entry
    ]
    if not scored:
        return None
    return [
        mean([scores[n_gram_len - 1] for scores in scored])
        for n_gram_len in range(1, max_n_gram + 1)
    ]


# Add the per-length average to every report file and write it back in place.
for results_file in files:
    results = read_json(results_file)
    averages = average_n_gram_diversity(results["data"])
    if averages is None:
        # Nothing to average: leave the file untouched rather than aborting
        # the whole batch after some files were already rewritten.
        print(f"Skipping {results_file}: no entries with metrics")
        continue
    results["metrics"]["avg_n_gram_diversity"] = averages
    write_json(results, results_file)