# Prompting

In [None]:
%pip install -U llama-cpp-python faiss-cpu sentence-transformers
!mkdir models
!wget -O models/phi-2.q4.gguf https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q4_K_M.gguf

In [8]:
import pandas as pd
from llama_cpp import Llama
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import torch

In [None]:
# Load the dataset from a path relative to the notebook's working directory.
# Later cells read its "Text" column to build the retrieval index.
df_author = pd.read_csv(
    "../utils/dataset.csv")
# Last expression of the cell: displays a preview of the first rows.
df_author.head()

In [16]:
# Sentence-embedding model; the same `encoder` is reused to embed queries in
# retrieve_similar_tweets, so corpus and query vectors share one embedding space.
encoder = SentenceTransformer("all-MiniLM-L6-v2")
# Embed every entry of the "Text" column and get the result back as a NumPy array.
corpus_embeddings = encoder.encode(df_author["Text"].tolist(), convert_to_numpy=True)

# Build a FAISS L2 index sized to the embedding width and add all corpus vectors.
# Vectors are added in `df_author` row order; retrieve_similar_tweets maps the
# positions returned by the index back to rows with `.iloc`.
dimension = corpus_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(corpus_embeddings)

def retrieve_similar_tweets(df, query, k=5):
    """Return the texts of the tweets nearest to ``query`` and their distances.

    The search runs against the module-level ``index`` (built from
    ``df_author["Text"]``) using the module-level ``encoder``. The positions
    returned by the index are looked up in ``df`` with ``.iloc``, so ``df``
    must have the same row order as the frame the index was built from.

    Args:
        df: DataFrame with a "Text" column, row-aligned with ``index``.
        query: Text to search neighbours for.
        k: Maximum number of neighbours to return.

    Returns:
        A ``(texts, distances)`` tuple: a list of strings and a 1-D array of
        the matching distances. At most ``min(k, index.ntotal)`` items are
        returned; both are empty when the index is empty or ``k <= 0``.
    """
    # FAISS pads the result with label -1 when fewer than k vectors exist;
    # `df.iloc[-1]` would then silently return the LAST row as a "neighbour".
    n_neighbors = min(k, index.ntotal)
    if n_neighbors <= 0:
        return [], np.empty(0, dtype=np.float32)

    query_embedding = encoder.encode([query])
    distances, indices = index.search(query_embedding, n_neighbors)

    # Defensive: drop any remaining "no result" labels before positional lookup.
    found = indices[0] >= 0
    return df.iloc[indices[0][found]]["Text"].tolist(), distances[0][found]

# 4. Build the prompt from a DataFrame and a test tweet.
def create_prompt(df, test_tweet, n_context=3):
    """Assemble the YES/NO authorship prompt for ``test_tweet``.

    The context tweets are obtained from ``get_best_tweets(df, test_tweet,
    n_context)`` and rendered as a quoted bullet list, followed by the tweet
    under test and the question.

    Args:
        df: DataFrame forwarded unchanged to ``get_best_tweets``.
        test_tweet: Tweet whose authorship is being questioned.
        n_context: Number of context tweets requested.

    Returns:
        The complete prompt as a single string (no trailing newline).
    """
    examples = get_best_tweets(df, test_tweet, n_context)
    bullet_list = "\n".join(['- "' + str(example) + '"' for example in examples])

    # One entry per prompt line; empty strings are the blank separator lines.
    prompt_lines = [
        "I will provide you with a list of tweets written by the same person.",
        "",
        "Author's tweets:",
        bullet_list,
        "",
        "Now, consider this new tweet:",
        "",
        '"' + str(test_tweet) + '"',
        "Question: Could this tweet have been written by the same person?",
        "Answer only with YES or NO.",
    ]
    return "\n".join(prompt_lines)

def get_best_tweets(df, tweet, k=5, verbose=False):
    """Return the texts of the ``k`` tweets most similar to ``tweet``.

    Thin wrapper around ``retrieve_similar_tweets`` that keeps only the texts
    and discards the distances.

    Args:
        df: DataFrame forwarded unchanged to ``retrieve_similar_tweets``.
        tweet: Query tweet.
        k: Number of neighbours requested.
        verbose: When true, print the retrieved texts and their distances
            (debugging aid; off by default to keep notebook output clean).

    Returns:
        List of retrieved tweet texts, as returned by
        ``retrieve_similar_tweets``.
    """
    retrieved_texts, distances = retrieve_similar_tweets(df, tweet, k=k)
    if verbose:
        print(retrieved_texts, distances)
    return retrieved_texts

In [12]:
# Load the GGUF model file fetched by the setup cell. n_ctx=2048 matches the
# `phi2.context_length` value reported in the loader metadata printed below.
llm = Llama(model_path="models/phi-2.q4.gguf", n_ctx=2048)

llama_model_load_from_file_impl: using device Metal (Apple M2 Pro) - 9817 MiB free
llama_model_loader: loaded meta data with 20 key-value pairs and 325 tensors from models/phi-2.q4.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = phi2
llama_model_loader: - kv   1:                               general.name str              = Phi2
llama_model_loader: - kv   2:                        phi2.context_length u32              = 2048
llama_model_loader: - kv   3:                      phi2.embedding_length u32              = 2560
llama_model_loader: - kv   4:                   phi2.feed_forward_length u32              = 10240
llama_model_loader: - kv   5:                           phi2.block_count u32              = 32
llama_model_loader: - kv   6:                  phi2.attention.head_count u32              = 32
llama_model_l

In [None]:
# 6. Try the pipeline on a single tweet.
# .strip() removes the newlines the triple-quoted literal adds around the text;
# without it they end up both in the retrieval query and inside the quoted
# tweet in the prompt.
test_tweet = """
Kobe was a legend on the court and just getting started in what would have been just as meaningful a second act. To lose Gianna is even more heartbreaking to us as parents. Michelle and I send love and prayers to Vanessa and the entire Bryant family on an unthinkable day.
""".strip()
prompt = create_prompt(df_author, test_tweet, n_context=5)

# The prompt asks for a bare YES/NO, so cap generation at a few tokens:
# max_tokens=None lets the model keep generating until the context is full.
# temperature=0.0 selects greedy decoding so re-running the cell gives the
# same answer.
response = llm(prompt, max_tokens=8, temperature=0.0, echo=False)
print("Prompt:\n", prompt)
print("\nRisposta:", response['choices'][0]['text'].strip())