In [1]:
import pandas as pd

In [2]:
# Try different encodings to handle the CSV file properly
try:
    tweets_df = pd.read_csv("../data/elonmusk_tweets.csv", encoding='utf-8')
except UnicodeDecodeError:
    try:
        tweets_df = pd.read_csv("../data/elonmusk_tweets.csv", encoding='latin-1')
    except UnicodeDecodeError:
        tweets_df = pd.read_csv("../data/elonmusk_tweets.csv", encoding='cp1252')

print(f"Successfully loaded {len(tweets_df)} tweets")
print(f"Columns: {list(tweets_df.columns)}")

Successfully loaded 3218 tweets
Columns: ['tweet_count', 'tweet_id', 'username', 'text', 'created at']


In [3]:
documents = tweets_df.to_dict(orient='records')

In [5]:
documents[0]

{'tweet_count': 1,
 'tweet_id': 2,
 'username': 'elonmusk',
 'text': "@MeltingIce Assuming max acceleration of 2 to 3 g's, but in a comfortable direction. Will feel like a mild to moder? https://t.co/fpjmEgrHfC",
 'created at': '2017-09-29 17:39:19'}

In [4]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.llms import Ollama

In [5]:
model = Ollama(model="llama3", base_url="http://localhost:11434")

  model = Ollama(model="llama3", base_url="http://localhost:11434")


In [6]:
import json

with open('tweets_json.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

In [10]:
prompt_template = """
You emulate a nerd fan who wants to get news from the tweets of a billionaire entrepreneur.
A tweet would be provided, you are to formulate 3 questions this fan might ask based on the tweet. 
The questions are to illustrate a fan asking a question that this tweet will answer, 
So each questions should not use pronouns that refer to people or things that are not named or stated in the question.
I repeat, no pronouns. Each question should state the names of who or what you are referring to from the tweet.
Each question should be a standalone, and not a followup to past questions.
No question should reference another question or names from a previous question. Each question is a standalone and not dependent on other generated questions.
The questions should be complete, short, and consise.
for context, The questions would be used to create a ground truth dataset.


The tweet:
{tweet} 

 Instruction:
Provide the output as a list without using code blocks but the quwstions should be in quotes separated by a comma. 
The format should be, first question in quotes then a comma, second question in quotes then a comma, third question in quotes, that's all, no extra character.
Please do not include any extra text or characters in your response. Just follow the format I stated.
Format:


"question1", "question2", "question3"
""".strip()

In [11]:
formatted_prompt = prompt_template.format(tweet=documents[2]['text'])
formatted_prompt

'You emulate a nerd fan who wants to get news from the tweets of a billionaire entrepreneur.\nA tweet would be provided, you are to formulate 3 questions this fan might ask based on the tweet. \nThe questions are to illustrate a fan asking a question that this tweet will answer, \nSo each questions should not use pronouns that refer to people or things that are not named or stated in the question.\nI repeat, no pronouns. Each question should state the names of who or what you are referring to from the tweet.\nEach question should be a standalone, and not a followup to past questions.\nNo question should reference another question or names from a previous question. Each question is a standalone and not dependent on other generated questions.\nThe questions should be complete, short, and consise.\nfor context, The questions would be used to create a ground truth dataset.\n\n\nThe tweet:\n@bigajm Yup :) \n\n Instruction:\nProvide the output as a list without using code blocks but the quws

In [13]:
result = model.invoke(formatted_prompt)
print(result)

OllamaEndpointNotFoundError: Ollama call failed with status code 404. Maybe your model is not found and you should pull the model with `ollama pull llama3`.

In [14]:
ground_truth =[]

In [15]:
from tqdm import tqdm

for doc in tqdm(documents[:100]):  
    formatted_prompt = prompt_template.format(tweet=doc['text'])
    result = model.invoke(formatted_prompt)
    questions = [q.strip().strip('"') for q in result.split(',')]
    document = {"id": doc['tweet_id'], "questions": questions}
    ground_truth.append(document)


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]



OllamaEndpointNotFoundError: Ollama call failed with status code 404. Maybe your model is not found and you should pull the model with `ollama pull llama3`.

In [None]:
len(ground_truth)

100

In [None]:
for doc in tqdm(documents[101:300]):  
    formatted_prompt = prompt_template.format(tweet=doc['text'])
    result = model.invoke(formatted_prompt)
    questions = [q.strip().strip('"') for q in result.split(',')]
    document = {"id": doc['tweet_id'], "questions": questions}
    ground_truth.append(document)

100%|██████████| 199/199 [2:41:18<00:00, 48.64s/it]  


In [None]:
len(ground_truth)

299

In [None]:
import copy

ground_truth_copy = copy.deepcopy(ground_truth)

In [None]:
for gt_doc in ground_truth_copy:
    matching_doc = next((doc for doc in documents if doc['tweet_id'] == gt_doc['id']), None)
    if matching_doc:
        gt_doc['tweet_text'] = matching_doc['text']

In [None]:
len(ground_truth_copy)

299

In [None]:
json_file_path = '../data/ground_truth_copy.json'

with open(json_file_path, 'w', encoding='utf-8') as file:
    json.dump(ground_truth_copy, file, ensure_ascii=False, indent=4)

print(f"JSON file saved as {json_file_path}")

JSON file saved as ../data/ground_truth_copy.json


In [None]:
# Save to csv
ground_truth_data = []

for doc in ground_truth_copy:
    tweet_id = doc['id']  
    
    for question in doc['questions']:
        ground_truth_data.append({"question": question, "tweet_id": tweet_id})

In [None]:
len(ground_truth_data)

930

In [None]:
import csv

csv_file_path = '../data/ground_truth_data.csv'

with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=["question", "tweet_id"])
    writer.writeheader()
    writer.writerows(ground_truth_data)

print(f"CSV file saved as {csv_file_path}")

CSV file saved as ../data/ground_truth_data.csv


In [None]:
import csv
from collections import defaultdict

def get_frequent_tweet_ids(csv_file):
    # Dictionary to count occurrences of tweet_ids
    tweet_id_count = defaultdict(int)

    # Open and read the CSV file
    with open(csv_file, mode='r', newline='') as file:
        reader = csv.DictReader(file)
        
        # Loop over each row and count occurrences of tweet_id
        for row in reader:
            tweet_id = row['tweet_id']
            tweet_id_count[tweet_id] += 1

    # Sort out tweet_ids that appear more than 3 times
    frequent_tweet_ids = [tweet_id for tweet_id, count in tweet_id_count.items() if count > 3]
    
    return frequent_tweet_ids

# Example usage
csv_file_path = '../data/ground_truth_data.csv'  # Replace with your actual CSV file path
frequent_ids = get_frequent_tweet_ids(csv_file_path)
print(frequent_ids)
