In [2]:
import pandas as pd

In [3]:
tweets_df = pd.read_csv("../data/fabrizioromano_tweets.csv")

In [4]:
documents = tweets_df.to_dict(orient='records')

In [5]:
documents[0]

{'tweet_count': 1,
 'tweet_id': 1832563499995725843,
 'username': 'Fabrizio Romano',
 'text': '⏳🇳🇱 Memphis Depay, in attendance for Dutch national team game ahead of the formal steps to complete his Corinthians free transfer…\n\n…almost there. 🏁🇧🇷 https://t.co/khXJknhl4L',
 'created at': 'Sat Sep 07 23:36:00 +0000 2024',
 'url': 'https://twitter.com/FabrizioRomano/status/1832563499995725843'}

In [24]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.llms import Ollama

In [99]:
model = Ollama(model="llama3", base_url="http://localhost:11434")

In [29]:
import json

with open('tweets_json.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

In [115]:
prompt_template = """
You emulate a football fan who wants to get football news from the tweets of a Journalist.
A tweet would be provided, you are to formulate 3 questions this fan might ask based on the tweet. 
The questions are to illustrate a fan asking a question that this tweet will answer, 
So each questions should not use pronouns that refer to people or things that are not named or stated in the question.
I repeat, no pronouns. Each question should state the names of who or what you are referring to from the tweet.
Each question should be a standalone, and not a followup to past questions.
No question should reference another question or names from a previous question. Each question is a standalone and not dependent on other generated questions.
The questions should be complete, short, and consise.
for context, The questions would be used to create a ground truth dataset.


The tweet:
{tweet} 

 Instruction:
Provide the output as a list without using code blocks but the quwstions should be in quotes separated by a comma. 
The format should be, first question in quotes then a comma, second question in quotes then a comma, third question in quotes, that's all, no extra character.
Please do not include any extra text or characters in your response. Just follow the format I stated.
Format:


"question1", "question2", "question3"
""".strip()

In [112]:
formatted_prompt = prompt_template.format(tweet=documents[2]['text'])
formatted_prompt

'You emulate a football fan who wants to get football news from the tweets of a Journalist.\nA tweet would be provided, you are to formulate 3 questions this fan might ask based on the tweet. \nThe questions are to illustrate a fan asking a question that this tweet will answer, \nSo each questions should not use pronouns that refer to people or things that are not named or stated in the question.\nI repeat, no pronouns. Each question should state the names of who or what you are referring to from the tweet.\nEach question should be a standalone, and not a followup to past questions.\nNo question should reference another question or names from a previous question. Each question is a standalone and not dependent on other generated questions.\nThe questions should be complete, short, and consise. \n\n\nThe tweet:\n✨🇩🇪 Three assists, one goal tonight for Jamal Musiala.\n\nKimmich: “I don\'t know what criteria are used to decide the Ballon d’Or list”.\n\n“The best players should actually be

In [116]:
result = model.invoke(formatted_prompt)
print(result)

"Who gave three assists and one goal tonight?", "What are Kimmich's thoughts on the Ballon d'Or list?", "Is Jamal Musiala considered one of the best players?"


In [122]:
ground_truth =[]

In [124]:
from tqdm import tqdm

for doc in tqdm(documents[:100]):  
    formatted_prompt = prompt_template.format(tweet=doc['text'])
    result = model.invoke(formatted_prompt)
    questions = [q.strip().strip('"') for q in result.split(',')]
    document = {"id": doc['tweet_id'], "questions": questions}
    ground_truth.append(document)


100%|██████████| 100/100 [1:31:52<00:00, 55.13s/it]


In [125]:
len(ground_truth)

100

In [126]:
for doc in tqdm(documents[101:300]):  
    formatted_prompt = prompt_template.format(tweet=doc['text'])
    result = model.invoke(formatted_prompt)
    questions = [q.strip().strip('"') for q in result.split(',')]
    document = {"id": doc['tweet_id'], "questions": questions}
    ground_truth.append(document)

100%|██████████| 199/199 [2:41:18<00:00, 48.64s/it]  


In [132]:
len(ground_truth)

299

In [135]:
import copy

ground_truth_copy = copy.deepcopy(ground_truth)

In [136]:
for gt_doc in ground_truth_copy:
    matching_doc = next((doc for doc in documents if doc['tweet_id'] == gt_doc['id']), None)
    if matching_doc:
        gt_doc['tweet_text'] = matching_doc['text']

In [138]:
len(ground_truth_copy)

299

In [143]:
json_file_path = '../data/ground_truth_copy.json'

with open(json_file_path, 'w', encoding='utf-8') as file:
    json.dump(ground_truth_copy, file, ensure_ascii=False, indent=4)

print(f"JSON file saved as {json_file_path}")

JSON file saved as ../data/ground_truth_copy.json


In [145]:
# Save to csv
ground_truth_data = []

for doc in ground_truth_copy:
    tweet_id = doc['id']  
    
    for question in doc['questions']:
        ground_truth_data.append({"question": question, "tweet_id": tweet_id})

In [148]:
len(ground_truth_data)

930

In [151]:
import csv

csv_file_path = '../data/ground_truth_data.csv'

with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=["question", "tweet_id"])
    writer.writeheader()
    writer.writerows(ground_truth_data)

print(f"CSV file saved as {csv_file_path}")

CSV file saved as ../data/ground_truth_data.csv


In [None]:
import csv
from collections import defaultdict

def get_frequent_tweet_ids(csv_file):
    # Dictionary to count occurrences of tweet_ids
    tweet_id_count = defaultdict(int)

    # Open and read the CSV file
    with open(csv_file, mode='r', newline='') as file:
        reader = csv.DictReader(file)
        
        # Loop over each row and count occurrences of tweet_id
        for row in reader:
            tweet_id = row['tweet_id']
            tweet_id_count[tweet_id] += 1

    # Sort out tweet_ids that appear more than 3 times
    frequent_tweet_ids = [tweet_id for tweet_id, count in tweet_id_count.items() if count > 3]
    
    return frequent_tweet_ids

# Example usage
csv_file_path = '../data/ground_truth_data.csv'  # Replace with your actual CSV file path
frequent_ids = get_frequent_tweet_ids(csv_file_path)
print(frequent_ids)
