### Problem Statement

This project aims to build a question-answering system (Q&A) using Elon Musk tweets, where users can ask questions about his tweets, and the system retrieves relevant tweets and provides LLM-generated answers based on them.

In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-09-29 08:07:46--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-09-29 08:07:47 (10.6 MB/s) - ‘minsearch.py’ saved [3832/3832]



### Ingestion

In [2]:
import pandas as pd

In [3]:
tweets_df = pd.read_csv("../data/elonmusk_tweets.csv")

In [4]:
tweets_df.head()

Unnamed: 0,tweet_count,tweet_id,username,text,created at,url
0,1,1832563499995725843,Elon Musk,"⏳🇳🇱 Memphis Depay, in attendance for Dutch nat...",Sat Sep 07 23:36:00 +0000 2024,https://twitter.com/FabrizioRomano/status/1832...
1,2,1832553657738358953,Elon Musk,✨🇩🇪 Julian Nagelsmann: “Both Musiala and Flori...,Sat Sep 07 22:56:53 +0000 2024,https://twitter.com/FabrizioRomano/status/1832...
2,3,1832537271574098426,Elon Musk,"✨🇩🇪 Three assists, one goal tonight for Jamal ...",Sat Sep 07 21:51:47 +0000 2024,https://twitter.com/FabrizioRomano/status/1832...
3,4,1832524667846099038,Elon Musk,🚨🔴🔵 No serious injury for Dani Olmo after furt...,Sat Sep 07 21:01:42 +0000 2024,https://twitter.com/FabrizioRomano/status/1832...
4,5,1832496480805859636,Elon Musk,"🔴🇳🇱 First start for Netherlands, first goal fo...",Sat Sep 07 19:09:41 +0000 2024,https://twitter.com/FabrizioRomano/status/1832...


In [5]:
import minsearch

In [6]:
documents = tweets_df.to_dict("records")

In [7]:
documents[0]

{'tweet_count': 1,
 'tweet_id': 1832563499995725843,
 'username': 'Elon Musk',
 'text': '⏳🇳🇱 Memphis Depay, in attendance for Dutch national team game ahead of the formal steps to complete his Corinthians free transfer…\n\n…almost there. 🏁🇧🇷 https://t.co/khXJknhl4L',
 'created at': 'Sat Sep 07 23:36:00 +0000 2024',
 'url': 'https://twitter.com/FabrizioRomano/status/1832563499995725843'}

In [8]:
index = minsearch.Index(
    text_fields=["text"],
    keyword_fields=["text"]
)

In [9]:
index.fit(documents)

<minsearch.Index at 0x73a81dd32fe0>

In [10]:
def minisearch_keyword(query, k=5):
    results = index.search(
        query=query,
        num_results=k
    )

    return results

### Preprocess Tweets

In [11]:
from langchain_huggingface import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

  from tqdm.autonotebook import tqdm, trange


In [12]:
def embedder(text):
    return embedding_model.embed_documents([text])[0]

In [13]:
# Create a new column 'vector' to store embeddings
# tweets_df['vector'] = tweets_df['text'].apply(lambda tweet: embedder(tweet))

In [14]:
# len(tweets_df.at[0, 'vector'])

In [15]:
# documents_with_vectors = tweets_df.to_dict("records")

In [16]:
import lancedb
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry

In [74]:
!rm -rf /tmp/lancedb

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [75]:
db = lancedb.connect("/tmp/lancedb")
model = get_registry().get("sentence-transformers").create(name="all-mpnet-base-v2", device="cpu")

In [76]:
class TweetDocument(LanceModel):
    tweet_count: int
    tweet_id: int
    username: str
    text: str = model.SourceField()
    created_at: str
    url: str
    vector: Vector(model.ndims()) = model.VectorField()

In [77]:
data = tweets_df.apply(
    lambda row: {
        "tweet_count": row["tweet_count"],
        "tweet_id": row["tweet_id"],
        "username": row["username"],
        "text": row["text"],
        "created_at": row["created at"],
        "url": row["url"]
    },
    axis=1
).values.tolist()

In [78]:
table = db.create_table("tweet_table", schema=TweetDocument)

In [79]:
table.add(data)

In [80]:
table.create_fts_index("text")

In [81]:
def lancedb_vector_search(query, k=5):
    return table.search(query, query_type="vector").limit(k).to_list()

In [82]:
def lancedb_text_search(query, k=5):
    return table.search(query, query_type="fts").limit(k).to_list()

In [83]:
from lancedb.rerankers import LinearCombinationReranker

def lancedb_hybrid_search(query, k=5):
    reranker = LinearCombinationReranker(
    weight=0.7 # Weight = 0 Means pure Text Search (BM-25) and 1 means pure Sementic (Vector) Search
)
    return table.search(query, query_type="hybrid").rerank(reranker=reranker).limit(k).to_list()

In [84]:
# Free API key
import getpass
import os
os.environ["COHERE_API_KEY"] = getpass.getpass()

In [85]:
from lancedb.rerankers import CohereReranker

def cohere_reranker_search(query, k=5):
    reranker = CohereReranker()
    return table.search(query, query_type="hybrid").limit(k).rerank(reranker=reranker).to_list()

In [103]:
from lancedb.rerankers import RRFReranker

def rrf_search(query, k=5):
    reranker = RRFReranker()
    return table.search(query, query_type="hybrid").limit(k).rerank(reranker=reranker).to_list()

In [107]:
from lancedb.rerankers import CrossEncoderReranker

def crossencoder_search(query, k=5, query_type="vector", model_name = "cross-encoder/ms-marco-MiniLM-L-2-v2"):
    reranker = CrossEncoderReranker(model_name=model_name, device="cpu")

    if query_type == "fts":
        # Run FTS search with a reranker
        result = table.search(query, query_type="fts").rerank(reranker=reranker).limit(k).to_list()
    elif query_type == "hybrid":
        # Run hybrid search with a reranker
        result = table.search(query, query_type="hybrid").rerank(reranker=reranker).limit(k).to_list()
    else:
        # Default to vector search with a reranker
        result = table.search(query).rerank(reranker=reranker).limit(k).to_list()
    
    return result

In [86]:
query = "Is Memphis Depay currently in attendance for a Dutch national team game?"

In [87]:
minisearch_result = minisearch_keyword(query)
minisearch_result


[{'tweet_count': 1,
  'tweet_id': 1832563499995725843,
  'username': 'Elon Musk',
  'text': '⏳🇳🇱 Memphis Depay, in attendance for Dutch national team game ahead of the formal steps to complete his Corinthians free transfer…\n\n…almost there. 🏁🇧🇷 https://t.co/khXJknhl4L',
  'created at': 'Sat Sep 07 23:36:00 +0000 2024',
  'url': 'https://twitter.com/FabrizioRomano/status/1832563499995725843'},
 {'tweet_count': 27,
  'tweet_id': 1832087535377666226,
  'username': 'Elon Musk',
  'text': '🇧🇷🇳🇱 Memphis Depay and Corinthians. ⏳',
  'created at': 'Fri Sep 06 16:04:41 +0000 2024',
  'url': 'https://twitter.com/FabrizioRomano/status/1832087535377666226'},
 {'tweet_count': 26,
  'tweet_id': 1832093320228810844,
  'username': 'Elon Musk',
  'text': '🚨🇧🇷 Corinthians and Memphis Depay have reached an agreement in principle.\n\nVerbal agreement now as documents are still being drafted, checked with his camp.\n\nKey steps to follow ahead of deal being signed for Memphis.\n\nIt’d be valid until 2026,

In [88]:
lancedb_vector_search_result = lancedb_vector_search(query)
lancedb_vector_search_result

[{'tweet_count': 1,
  'tweet_id': 1832563499995725843,
  'username': 'Elon Musk',
  'text': '⏳🇳🇱 Memphis Depay, in attendance for Dutch national team game ahead of the formal steps to complete his Corinthians free transfer…\n\n…almost there. 🏁🇧🇷 https://t.co/khXJknhl4L',
  'created_at': 'Sat Sep 07 23:36:00 +0000 2024',
  'url': 'https://twitter.com/FabrizioRomano/status/1832563499995725843',
  'vector': [-0.015379980206489563,
   0.003363604424521327,
   0.00021450695930980146,
   0.0009626498795114458,
   0.02140391804277897,
   -0.030299881473183632,
   -0.07087579369544983,
   0.008806933648884296,
   -0.008736197836697102,
   0.02407856658101082,
   0.056122925132513046,
   0.07521718740463257,
   0.06845299154520035,
   -0.011333107948303223,
   0.022110391408205032,
   -0.013782499358057976,
   0.032481636852025986,
   -0.0010966918198391795,
   -0.008758262731134892,
   -0.029274091124534607,
   -0.009060312993824482,
   0.014800844714045525,
   0.010276423767209053,
   0.03074

In [89]:
lancedb_text_search_result = lancedb_text_search(query)
lancedb_text_search_result

[{'tweet_count': 1,
  'tweet_id': 1832563499995725843,
  'username': 'Elon Musk',
  'text': '⏳🇳🇱 Memphis Depay, in attendance for Dutch national team game ahead of the formal steps to complete his Corinthians free transfer…\n\n…almost there. 🏁🇧🇷 https://t.co/khXJknhl4L',
  'created_at': 'Sat Sep 07 23:36:00 +0000 2024',
  'url': 'https://twitter.com/FabrizioRomano/status/1832563499995725843',
  'vector': [-0.015379980206489563,
   0.003363604424521327,
   0.00021450695930980146,
   0.0009626498795114458,
   0.02140391804277897,
   -0.030299881473183632,
   -0.07087579369544983,
   0.008806933648884296,
   -0.008736197836697102,
   0.02407856658101082,
   0.056122925132513046,
   0.07521718740463257,
   0.06845299154520035,
   -0.011333107948303223,
   0.022110391408205032,
   -0.013782499358057976,
   0.032481636852025986,
   -0.0010966918198391795,
   -0.008758262731134892,
   -0.029274091124534607,
   -0.009060312993824482,
   0.014800844714045525,
   0.010276423767209053,
   0.03074

In [90]:
lancedb_hybrid_search_result = lancedb_hybrid_search(query)
lancedb_hybrid_search_result

[{'tweet_count': 1,
  'tweet_id': 1832563499995725843,
  'username': 'Elon Musk',
  'text': '⏳🇳🇱 Memphis Depay, in attendance for Dutch national team game ahead of the formal steps to complete his Corinthians free transfer…\n\n…almost there. 🏁🇧🇷 https://t.co/khXJknhl4L',
  'created_at': 'Sat Sep 07 23:36:00 +0000 2024',
  'url': 'https://twitter.com/FabrizioRomano/status/1832563499995725843',
  'vector': [-0.015379980206489563,
   0.003363604424521327,
   0.00021450695930980146,
   0.0009626498795114458,
   0.02140391804277897,
   -0.030299881473183632,
   -0.07087579369544983,
   0.008806933648884296,
   -0.008736197836697102,
   0.02407856658101082,
   0.056122925132513046,
   0.07521718740463257,
   0.06845299154520035,
   -0.011333107948303223,
   0.022110391408205032,
   -0.013782499358057976,
   0.032481636852025986,
   -0.0010966918198391795,
   -0.008758262731134892,
   -0.029274091124534607,
   -0.009060312993824482,
   0.014800844714045525,
   0.010276423767209053,
   0.03074

In [91]:
cohere_reranker_search_result = cohere_reranker_search(query)
cohere_reranker_search_result

[{'tweet_count': 1,
  'tweet_id': 1832563499995725843,
  'username': 'Elon Musk',
  'text': '⏳🇳🇱 Memphis Depay, in attendance for Dutch national team game ahead of the formal steps to complete his Corinthians free transfer…\n\n…almost there. 🏁🇧🇷 https://t.co/khXJknhl4L',
  'created_at': 'Sat Sep 07 23:36:00 +0000 2024',
  'url': 'https://twitter.com/FabrizioRomano/status/1832563499995725843',
  'vector': [-0.015379980206489563,
   0.003363604424521327,
   0.00021450695930980146,
   0.0009626498795114458,
   0.02140391804277897,
   -0.030299881473183632,
   -0.07087579369544983,
   0.008806933648884296,
   -0.008736197836697102,
   0.02407856658101082,
   0.056122925132513046,
   0.07521718740463257,
   0.06845299154520035,
   -0.011333107948303223,
   0.022110391408205032,
   -0.013782499358057976,
   0.032481636852025986,
   -0.0010966918198391795,
   -0.008758262731134892,
   -0.029274091124534607,
   -0.009060312993824482,
   0.014800844714045525,
   0.010276423767209053,
   0.03074

### Retrival Evaluation

In [92]:
df_question = pd.read_csv('../data/ground_truth_data.csv')

In [93]:
df_question.head()

Unnamed: 0,question,tweet_id
0,Will Memphis Depay complete his Corinthians fr...,1832563499995725843
1,What formal steps is Memphis Depay ahead of?,1832563499995725843
2,Is Memphis Depay currently in attendance for a...,1832563499995725843
3,Is Julian Nagelsmann saying Musiala and Wirtz ...,1832553657738358953
4,Can both Musiala and Wirtz potentially win Bal...,1832553657738358953


In [94]:
ground_truth = df_question.to_dict(orient='records')

In [95]:
ground_truth[0]

{'question': 'Will Memphis Depay complete his Corinthians free transfer?',
 'tweet_id': 1832563499995725843}

In [96]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [109]:
from tqdm import tqdm
def evaluate(ground_truth, search_function, *args, **kwargs):
    relevance_total = []

    for question in tqdm(ground_truth):
        doc_id = question['tweet_id']
        clean_question = question["question"].replace("\n", " ").replace('"', '').strip()
        results = search_function(clean_question, *args, **kwargs)
        relevance = [d['tweet_id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [98]:
minisearch_evaluation = evaluate(ground_truth, minisearch_keyword)
f'Minisearch evaluation is {minisearch_evaluation}'

100%|██████████| 897/897 [00:03<00:00, 286.33it/s]


"Minisearch evaluation is {'hit_rate': 0.9130434782608695, 'mrr': 0.7883872166480868}"

In [99]:
lancedb_text_search_evaluation = evaluate(ground_truth, lancedb_text_search)
f'LanceDB Full-Text Search (fts) evaluation is {lancedb_text_search_evaluation}'

100%|██████████| 897/897 [00:06<00:00, 130.32it/s]


"LanceDB Full-Text Search (fts) evaluation is {'hit_rate': 0.9241917502787068, 'mrr': 0.8379041248606469}"

In [100]:
lancedb_vector_search_evaluation = evaluate(ground_truth, lancedb_vector_search)
f'LanceDB Semantic (Vector) evaluation is {lancedb_vector_search_evaluation}'

100%|██████████| 897/897 [01:32<00:00,  9.66it/s]


"LanceDB Semantic (Vector) evaluation is {'hit_rate': 0.850613154960981, 'mrr': 0.7162393162393166}"

In [101]:
lancedb_hybrid_search_evaluation = evaluate(ground_truth, lancedb_hybrid_search)
f'LanceDB hybrid evaluation with Linear Combination reranker is {lancedb_hybrid_search_evaluation}'

100%|██████████| 897/897 [01:44<00:00,  8.57it/s]


"LanceDB hybrid evaluation with Linear Combination reranker is {'hit_rate': 0.947603121516165, 'mrr': 0.8225380899293951}"

In [114]:
crossencoder_model_name = "cross-encoder/ms-marco-TinyBERT-L-6"

In [None]:
crossencoder_h_search_evaluation = evaluate(
    ground_truth, crossencoder_search, query_type="hybrid", model_name=model_name
)

f'Hybrid evaluation with Crossencoder reranker is {crossencoder_h_search_evaluation}'

In [112]:
f'Hybrid evaluation with Crossencoder reranker is {crossencoder_h_search_evaluation}'

"Hybrid evaluation with Crossencoder reranker is {'hit_rate': 0.959866220735786, 'mrr': 0.8790040876997403}"

In [115]:
crossencoder_f_search_evaluation = evaluate(
    ground_truth, crossencoder_search, query_type="fts", model_name=model_name
)
f'Full-text search evaluation with crossencoder reranker is {crossencoder_f_search_evaluation}'

100%|██████████| 897/897 [26:37<00:00,  1.78s/it]  


"Full-text search evaluation with crossencoder reranker is {'hit_rate': 0.9241917502787068, 'mrr': 0.8605722779635827}"

In [116]:
crossencoder_v_search_evaluation = evaluate(
    ground_truth, crossencoder_search, query_type="vector", model_name=model_name
)
f'Semantic search evaluation with crossencoder reranker is {crossencoder_v_search_evaluation}'

100%|██████████| 897/897 [36:20<00:00,  2.43s/it]  


"Semantic search evaluation with crossencoder reranker is {'hit_rate': 0.850613154960981, 'mrr': 0.7967855815681907}"

In [120]:
crossencoder_model_name = "cross-encoder/ms-marco-TinyBERT-L-6"
crossencoder_h_search_evaluation = evaluate(
    ground_truth, crossencoder_search, query_type="hybrid", model_name=crossencoder_model_name
)

f'Hybrid evaluation with Crossencoder reranker is {crossencoder_h_search_evaluation}'

100%|██████████| 897/897 [1:01:49<00:00,  4.14s/it] 


"Hybrid evaluation with Crossencoder reranker is {'hit_rate': 0.9542920847268673, 'mrr': 0.8607580824972133}"

### RAG EVALUATION

#### LLM As A Judge

In [132]:
import os
from mistralai import Mistral

api_key = "YOUR_API_KEY"
mistral_client = Mistral(api_key=api_key)

In [None]:
query_prompt_template = """
You're a helpful engineer. You impersonate a popular billionaire entrepreneur, engineer, and inventor, Elon Musk.
Answer the QUESTION based on the CONTEXT from the database of his scraped tweets.
Use only the facts from the CONTEXT when answering the QUESTION.

<context>
{context}
</context>

QUESTION: {question}
""".strip()

In [155]:
def ask_mistral_llm(prompt):
    model = "mistral-large-latest"
    
    messages = [
        {
            "role": "user", "content": prompt
        }
    ]
    chat_response = mistral_client.chat.complete(
        model=model,
        messages=messages
    )
    return chat_response.choices[0].message.content

In [147]:
def construct_prompt(query, context):
    return query_prompt_template.format(question=query, context=context).strip()

In [148]:
def rag(query):
    retrive_context = crossencoder_search(query, k=5, query_type="hybrid")
    prompt = construct_prompt(query, retrive_context)
    response = ask_mistral_llm(prompt)
    return response

In [161]:
query = ground_truth[693]['question']

In [162]:
rag(query)

'Yes, Ivan Toney is arriving in Saudi after completing a €40m move to Al Ahli.'

In [163]:
evaluation_prompt_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [168]:
for idx, entry in enumerate(ground_truth):
    entry['id'] = idx + 1

In [173]:
ground_truth[533]

{'question': "Did Eriksen's agent tell Tim van Duijn14 that speculations should stop?",
 'tweet_id': 1830563567877197963,
 'id': 534}

In [189]:
import json

In [200]:
llm_judge_evaluation = []

In [203]:
import time

def evaluate_responses_with_llm(ground_truth, rag, ask_mistral_llm, evaluation_prompt_template):
    processed_ids = {entry['id'] for entry in llm_judge_evaluation}

    while True:
        try:
            for gt_entry in tqdm(ground_truth):
                entry_id = gt_entry['id']

                if entry_id in processed_ids:
                    continue

                question = gt_entry['question']
                llm_answer = rag(question)

                evaluation_prompt = evaluation_prompt_template.format(question=question, answer_llm=llm_answer)
                evaluation_response = ask_mistral_llm(evaluation_prompt)

                evaluation_response = json.loads(evaluation_response)
                
                relevance = evaluation_response.get('Relevance', 'NON_RELEVANT')  # Default to 'NON_RELEVANT' if not found
                explanation = evaluation_response.get('Explanation', '')

                llm_judge_evaluation.append({
                    'id': entry_id,
                    'question': question,
                    'llm_answer': llm_answer,
                    'relevance': relevance,
                    'explanation': explanation
                })

                processed_ids.add(entry_id)
            break

        except Exception as e:
            time.sleep(5)  
            continue

In [None]:
evaluate_responses_with_llm(ground_truth, rag, ask_mistral_llm, evaluation_prompt_template)

In [210]:
df_evaluation = pd.DataFrame(llm_judge_evaluation)
df_evaluation.head()

Unnamed: 0,id,question,llm_answer,relevance,explanation
0,1,Will Memphis Depay complete his Corinthians fr...,"Yes, Memphis Depay will complete his move to C...",RELEVANT,The generated answer directly addresses the qu...
1,2,What formal steps is Memphis Depay ahead of?,Memphis Depay and Corinthians have reached an ...,NON_RELEVANT,The generated answer discusses Memphis Depay's...
2,3,Is Memphis Depay currently in attendance for a...,"Yes, as of the latest update, Memphis Depay is...",PARTLY_RELEVANT,The generated answer confirms that Memphis Dep...
3,4,Is Julian Nagelsmann saying Musiala and Wirtz ...,"Yes, Julian Nagelsmann said: “Musiala will win...",PARTLY_RELEVANT,The generated answer addresses part of the que...
4,5,Can both Musiala and Wirtz potentially win Bal...,"Based on the context provided, there have been...",PARTLY_RELEVANT,The generated answer partly addresses the ques...


In [211]:
relevance_counts = df_evaluation['relevance'].value_counts()
relevance_percentage = (relevance_counts / len(df_evaluation)) * 100
print(relevance_percentage)

relevance
NON_RELEVANT       35.714286
PARTLY_RELEVANT    33.333333
RELEVANT           30.952381
Name: count, dtype: float64


#### cosine similarity

In [212]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [245]:
model1 = SentenceTransformer('paraphrase-MiniLM-L6-v2')



In [246]:
def calculate_cosine_similarity(question, llm_answer, model):
    # Get embeddings for both the question and the LLM answer
    question_embedding = model.encode([question])
    llm_answer_embedding = model.encode([llm_answer])

    # Calculate cosine similarity between the two embeddings
    similarity = cosine_similarity(question_embedding, llm_answer_embedding)[0][0]

    return similarity

In [247]:
def calculate_records_cosine_similarity(document, model):
    for record in document:
        question = record['question']
        llm_answer = record['llm_answer']
        
        # Calculate cosine similarity
        similarity_score = calculate_cosine_similarity(question, llm_answer, model)
        
        # Add the similarity score to the record
        record['cosine_similarity'] = similarity_score
    return document

In [248]:
model1_cosine = calculate_records_cosine_similarity(llm_judge_evaluation, model1)

In [249]:
df_cosine1_evaluation = pd.DataFrame(model1_cosine)
df_cosine1_evaluation.head()

Unnamed: 0,id,question,llm_answer,relevance,explanation,cosine_similarity
0,1,Will Memphis Depay complete his Corinthians fr...,"Yes, Memphis Depay will complete his move to C...",RELEVANT,The generated answer directly addresses the qu...,0.760452
1,2,What formal steps is Memphis Depay ahead of?,Memphis Depay and Corinthians have reached an ...,NON_RELEVANT,The generated answer discusses Memphis Depay's...,0.606593
2,3,Is Memphis Depay currently in attendance for a...,"Yes, as of the latest update, Memphis Depay is...",PARTLY_RELEVANT,The generated answer confirms that Memphis Dep...,0.816695
3,4,Is Julian Nagelsmann saying Musiala and Wirtz ...,"Yes, Julian Nagelsmann said: “Musiala will win...",PARTLY_RELEVANT,The generated answer addresses part of the que...,0.876576
4,5,Can both Musiala and Wirtz potentially win Bal...,"Based on the context provided, there have been...",PARTLY_RELEVANT,The generated answer partly addresses the ques...,0.73479


In [250]:
def calculate_average_cosine_similarity(llm_judge_evaluation):
    total_similarity = 0
    num_records = len(llm_judge_evaluation)

    # Loop through each record and sum up the cosine similarity scores
    for record in llm_judge_evaluation:
        total_similarity += record['cosine_similarity']

    # Calculate the average
    average_similarity = total_similarity / num_records if num_records > 0 else 0

    return average_similarity

In [251]:
average_similarity_model1 = calculate_average_cosine_similarity(model1_cosine)
print(f"Average Cosine Similarity using paraphrase-MiniLM-L6-v2: {average_similarity}")

Average Cosine Similarity using paraphrase-MiniLM-L6-v2: 0.6395113215160867


In [252]:
model2 = SentenceTransformer("all-mpnet-base-v2")



In [253]:
model2_cosine = calculate_records_cosine_similarity(llm_judge_evaluation, model2)

In [254]:
df_cosine2_evaluation = pd.DataFrame(model2_cosine)
df_cosine2_evaluation.head()

Unnamed: 0,id,question,llm_answer,relevance,explanation,cosine_similarity
0,1,Will Memphis Depay complete his Corinthians fr...,"Yes, Memphis Depay will complete his move to C...",RELEVANT,The generated answer directly addresses the qu...,0.794541
1,2,What formal steps is Memphis Depay ahead of?,Memphis Depay and Corinthians have reached an ...,NON_RELEVANT,The generated answer discusses Memphis Depay's...,0.643263
2,3,Is Memphis Depay currently in attendance for a...,"Yes, as of the latest update, Memphis Depay is...",PARTLY_RELEVANT,The generated answer confirms that Memphis Dep...,0.823905
3,4,Is Julian Nagelsmann saying Musiala and Wirtz ...,"Yes, Julian Nagelsmann said: “Musiala will win...",PARTLY_RELEVANT,The generated answer addresses part of the que...,0.82563
4,5,Can both Musiala and Wirtz potentially win Bal...,"Based on the context provided, there have been...",PARTLY_RELEVANT,The generated answer partly addresses the ques...,0.790739


In [256]:
average_similarity_model2 = calculate_average_cosine_similarity(model2_cosine)
print(f"Average Cosine Similarity using all-mpnet-base-v2: {average_similarity_model2}")

Average Cosine Similarity using all-mpnet-base-v2: 0.6852133847451547


In [29]:
import pandas as pd
from langchain_huggingface import HuggingFaceEmbeddings

# Load the CSV
tweets_df = pd.read_csv("../data/elonmusk_tweets.csv")

# Initialize the embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Apply the embedding model to each row in the 'text' column
# tweets_df['vector'] = tweets_df['text'].apply(lambda x: embedding_model.embed_query(x))

# # Save the updated dataframe with embeddings to a new CSV file
# tweets_df.to_csv("../data/fabrizioromano_tweets_with_vectors.csv", index=False)

print("Embeddings generated and saved to new CSV file.")




Embeddings generated and saved to new CSV file.


In [2]:
tweets_df.head()

Unnamed: 0,tweet_count,tweet_id,username,text,created at,url,vector
0,1,1832563499995725843,Elon Musk,"⏳🇳🇱 Memphis Depay, in attendance for Dutch nat...",Sat Sep 07 23:36:00 +0000 2024,https://twitter.com/FabrizioRomano/status/1832...,"[-0.015379995107650757, 0.003363637486472726, ..."
1,2,1832553657738358953,Elon Musk,✨🇩🇪 Julian Nagelsmann: “Both Musiala and Flori...,Sat Sep 07 22:56:53 +0000 2024,https://twitter.com/FabrizioRomano/status/1832...,"[0.006144128739833832, 0.06898245960474014, 0...."
2,3,1832537271574098426,Elon Musk,"✨🇩🇪 Three assists, one goal tonight for Jamal ...",Sat Sep 07 21:51:47 +0000 2024,https://twitter.com/FabrizioRomano/status/1832...,"[-0.005370712373405695, -0.03543499857187271, ..."
3,4,1832524667846099038,Elon Musk,🚨🔴🔵 No serious injury for Dani Olmo after furt...,Sat Sep 07 21:01:42 +0000 2024,https://twitter.com/FabrizioRomano/status/1832...,"[-0.05482368916273117, -0.022041261196136475, ..."
4,5,1832496480805859636,Elon Musk,"🔴🇳🇱 First start for Netherlands, first goal fo...",Sat Sep 07 19:09:41 +0000 2024,https://twitter.com/FabrizioRomano/status/1832...,"[-0.014181919395923615, -0.016410119831562042,..."
