In [None]:
import os
from time import time
import re

import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

nltk.download('stopwords')

from dotenv import load_dotenv

load_dotenv()

from openai import OpenAI
from openai import RateLimitError

import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from tqdm import tqdm

In [None]:
# Shared OpenAI client for the notebook. The key is looked up in the
# OPENAI_API_KEY environment variable; a missing variable yields None here.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [None]:
# Read the raw train/test splits from the local data directory.
train_df = pd.read_csv("./data/nlp-getting-started/train.csv")
test_df = pd.read_csv("./data/nlp-getting-started/test.csv")

# Keep only the columns this notebook works with. The explicit .copy() makes each
# frame independent of its source, so later writes to the 'text' column do not
# raise SettingWithCopyWarning and cannot touch train_df / test_df.
tweet_train_df = train_df[['text', 'target']].copy()
tweet_test_df = test_df[['id', 'text']].copy()

In [None]:
def _tweet_cleaning_resources():
    """Return the shared (tokenizer, stop-word set) pair, building it on first use."""
    resources = getattr(_tweet_cleaning_resources, '_cached', None)
    if resources is None:
        # Loading the stop-word corpus and constructing the tokenizer are
        # loop-invariant, so do them once instead of once per tweet.
        resources = (TweetTokenizer(), frozenset(stopwords.words('english')))
        _tweet_cleaning_resources._cached = resources
    return resources


def clean_tweet(tweet):
    """Normalize a raw tweet into a lowercase, space-separated string of tokens.

    Steps, in order: strip URLs, @mentions, '#' signs, remaining punctuation and
    non-ASCII characters; lowercase; tokenize; drop English stop words; re-join
    the surviving tokens with single spaces.

    Args:
        tweet: The raw tweet text (a ``str``).

    Returns:
        The cleaned tweet as a ``str``; empty if nothing survives the filters.
    """
    # Remove URLs. 'http\S+' already covers 'https...', so no separate
    # alternative is needed for it.
    tweet = re.sub(r'http\S+|www\S+', '', tweet)
    # Remove mentions
    tweet = re.sub(r'@\w+', '', tweet)
    # Remove the hash sign but keep the hashtag's word
    tweet = re.sub(r'#', '', tweet)
    # Remove punctuation
    # NOTE(review): apostrophes are stripped here, before stop-word filtering, so
    # contractions such as "don't" reach the filter as "dont" — confirm that is intended.
    tweet = re.sub(r'[^\w\s]', '', tweet)
    # Remove non-ASCII characters (emojis, accented letters, ...)
    tweet = re.sub(r'[^\x00-\x7F]+', '', tweet)
    # Convert to lowercase
    tweet = tweet.lower()

    tokenizer, stop_words = _tweet_cleaning_resources()
    kept_tokens = [token for token in tokenizer.tokenize(tweet) if token not in stop_words]

    # Reassemble tweet
    return ' '.join(kept_tokens)

In [None]:
# Replace the raw text with its cleaned form. .assign() returns new frames instead
# of writing into frames that were selected out of train_df / test_df, which avoids
# SettingWithCopyWarning.
tweet_train_df = tweet_train_df.assign(text=tweet_train_df['text'].apply(clean_tweet))
tweet_test_df = tweet_test_df.assign(text=tweet_test_df['text'].apply(clean_tweet))

# Preview the cleaned text. Label-based slicing is inclusive, so this shows the rows
# labelled up to and including 10.
tweet_train_df.loc[:10, 'text']

In [None]:
# Rows labelled 0..600 are scored by the evaluation cell (tweet_train_df.loc[:600]),
# so few-shot examples are drawn only from later rows to avoid leaking their labels.
LAST_VALIDATION_LABEL = 600
N_FEW_SHOT_EXAMPLES = 50
FEW_SHOT_SEED = 42  # fixed so the prompt (and therefore the metrics) is reproducible

few_shot = 'You are a twitter tweet analysis assistant. You analyze if a tweet is about a natural disaster or not. Analyze the provided tweet and respond with 0 if the tweet is not about a natural disaster and 1 if it is. \n\n'

few_shot_examples = tweet_train_df.loc[LAST_VALIDATION_LABEL + 1:].sample(
    n=N_FEW_SHOT_EXAMPLES, random_state=FEW_SHOT_SEED
)
# Unpacking the columns into plain names avoids reusing the f-string's own quote
# character inside a replacement field, which is a SyntaxError before Python 3.12.
for example_number, (example_text, example_target) in enumerate(
    zip(few_shot_examples['text'], few_shot_examples['target']), start=1
):
    few_shot += f'Example {example_number} \n'
    few_shot += f'Tweet: {example_text} \n'
    few_shot += f'Is about natural disaster: {example_target} \n\n'

print(few_shot)

def is_natural_disaster_tweet(tweet, max_tries=3, delay=0.5):
    """Ask the model whether a tweet is about a natural disaster.

    The tweet is appended to the module-level ``few_shot`` prompt and sent to the
    completions endpoint. Rate-limit errors are retried after a fixed pause.

    Args:
        tweet: Tweet text to classify.
        max_tries: Maximum number of API attempts; must be at least 1.
        delay: Seconds to wait before retrying after a rate-limit error.

    Returns:
        ``'0'`` or ``'1'`` when the completion starts with one of those labels
        (anything the model generated after the label is discarded); otherwise
        the stripped completion text unchanged.

    Raises:
        ValueError: If ``max_tries`` is less than 1.
        RateLimitError: If every attempt was rate limited.
    """
    # The notebook imports the time() *function* (``from time import time``), so
    # ``time.sleep`` would raise AttributeError; bring sleep in explicitly.
    from time import sleep

    if max_tries < 1:
        raise ValueError('max_tries must be at least 1')

    prompt = few_shot + f'Now analyze the following tweet:\nTweet: {tweet}\nIs about natural disaster:'
    for attempt in range(1, max_tries + 1):
        try:
            response = client.completions.create(
                model="gpt-3.5-turbo-instruct",
                prompt=prompt,
                temperature=0.0,
            )
        except RateLimitError:
            if attempt == max_tries:
                # Out of attempts: propagate immediately rather than sleeping first.
                raise
            print(f'Rate limit exceeded. Wait for {delay*1000}ms and retry.')
            sleep(delay)
            continue

        completion_text = response.choices[0].text.strip()
        # The model may keep generating after the label (e.g. start a new
        # "Example"), which would break callers that pass the result to int().
        leading_label = re.match(r'[01]\b', completion_text)
        return leading_label.group() if leading_label else completion_text

In [None]:
# Validation slice: .loc slicing is label-based and inclusive, so this takes the
# rows of the training frame labelled up to and including 600.
validation_rows = tweet_train_df.loc[:600]
tweet_val = validation_rows['text']
target_val_list = validation_rows['target'].to_list()

# One model call per tweet (with a progress bar); each returned label is cast to int.
is_natural_disaster_list = [
    int(label) for label in map(is_natural_disaster_tweet, tqdm(tweet_val))
]

In [None]:
# Score the predicted labels against the ground-truth targets of the validation slice.
y_true, y_pred = target_val_list, is_natural_disaster_list

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Headings are padded so the two-decimal values line up in one column.
for heading, score in (
    ("Accuracy:  ", accuracy),
    ("Precision: ", precision),
    ("Recall:    ", recall),
    ("F1 Score:  ", f1),
):
    print(f"{heading}{score:.2f}")