In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m70.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import transformers
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix ,f1_score

# Name of the pretrained checkpoint shared by the tokenizer and the model
PRETRAINED_CHECKPOINT = 'gpt2'
# Location of the Disaster Tweets Classification training file
TRAIN_CSV_PATH = 'train.csv'

# Build the GPT-2 tokenizer and the TensorFlow GPT-2 model with its language-modelling head
tokenizer = transformers.GPT2Tokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = transformers.TFGPT2LMHeadModel.from_pretrained(PRETRAINED_CHECKPOINT)

# Read the labelled tweets into a DataFrame
data = pd.read_csv(TRAIN_CSV_PATH)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/498M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

No preprocessing is applied to the tweets for now.

In [None]:
# Hold out part of the labelled tweets (test_size=0.2) as the final test set
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['target'],
    test_size=0.2,
    random_state=42,
)

# Divide the remaining training data again: with test_size=0.9 the larger share
# becomes the evaluation benchmark and the smaller share is used for prompt tuning
X_prompt_tune, X_eval, y_prompt_tune, y_eval = train_test_split(
    X_train,
    y_train,
    test_size=0.9,
    random_state=42,
)

# Names of the prompt engineering techniques to compare
prompt_techniques = [
    'add_keywords',
    'pre_classification',
    'sentiment_analysis',
]

# Define a function to generate prompts based on the selected technique
def generate_prompt(technique, tweet):
    """Return the classification prompt for ``tweet`` built with ``technique``.

    Args:
        technique: One of 'add_keywords', 'pre_classification' or
            'sentiment_analysis'.
        tweet: The tweet text to embed in the prompt.

    Returns:
        The prompt string with the tweet interpolated into it.

    Raises:
        ValueError: If ``technique`` is not one of the supported names.
    """
    if technique == 'add_keywords':
        # Fixed list of disaster vocabulary appended after the tweet
        keyword_text = ', '.join(['hurricane', 'earthquake', 'flood', 'disaster', 'emergency'])
        return f"Classify the following tweet as disaster-related or not: {tweet} Keywords: {keyword_text}"

    if technique == 'pre_classification':
        # Two-stage wording: the tweet appears once per stage
        return f"Classify the following tweet as either weather-related or not: {tweet} Then, if it is weather-related, classify it further as disaster-related or not: {tweet}"

    if technique == 'sentiment_analysis':
        return f"Classify the following tweet as disaster-related or not based on its sentiment: {tweet} Keywords: disaster, emergency"

    # None of the known technique names matched
    raise ValueError("Invalid prompt engineering technique specified.")

# Define a function to test the selected prompt engineering technique
def test_prompt_engineering(technique):
    """Evaluate one prompt technique on the evaluation split and print the metrics.

    For every tweet in the module-level ``X_eval`` a prompt is built with
    ``generate_prompt``, an answer cue is appended, and the language model's
    scores for the next token are compared between the words " yes" and
    " no". The resulting 0/1 predictions are compared with ``y_eval``.

    The language-modelling head returns one score per vocabulary token at
    every input position, so taking ``argmax`` over the whole tensor yields a
    flattened position/vocabulary index rather than a class label; only the
    scores at the last position are used here.

    Args:
        technique: Name of the prompt technique, passed to ``generate_prompt``.

    Raises:
        ValueError: Propagated from ``generate_prompt`` for an unknown technique.
    """
    # Vocabulary ids of the two answer words; the leading space matches how a
    # word following the answer cue would be tokenized.
    yes_token_id = tokenizer.encode(' yes')[0]
    no_token_id = tokenizer.encode(' no')[0]

    # Initialize a list to store the predicted labels
    y_pred = []
    # Loop through the evaluation set and generate prompts for each tweet
    for tweet in X_eval:
        # generate_prompt already embeds the tweet, so it is not appended a
        # second time; the cue asks the model for a one-word answer.
        prompt = generate_prompt(technique, tweet) + "\nAnswer (yes or no):"
        input_ids = tokenizer.encode(prompt, return_tensors='tf')
        # Scores for the token that would follow the prompt (last position of
        # the single sequence in the batch)
        next_token_logits = model(input_ids)[0][0, -1, :].numpy()
        # "yes" maps to label 1 — assumes target == 1 marks a disaster tweet;
        # TODO confirm against the dataset description.
        predicted_class = int(next_token_logits[yes_token_id] > next_token_logits[no_token_id])
        # Append the predicted label to the list
        y_pred.append(predicted_class)
    # Print the classification report and confusion matrix for the model
    print(f"Results for prompt engineering technique: {technique}")
    print(classification_report(y_eval, y_pred))
    print(confusion_matrix(y_eval, y_pred))

In [None]:
# Test each prompt engineering technique on the prompt tuning set and select the best one
best_technique = None
# Start below the smallest possible F1-score (0.0) so that a technique is
# always selected, even when every technique scores 0.0.
best_score = float('-inf')

# Vocabulary ids of the two answer words; the leading space matches how a word
# following the answer cue would be tokenized.
yes_token_id = tokenizer.encode(' yes')[0]
no_token_id = tokenizer.encode(' no')[0]

for technique in prompt_techniques:
    # Initialize a list to store the predicted labels
    y_pred = []
    # Loop through the prompt tuning set and generate prompts for each tweet
    for tweet in X_prompt_tune:
        # generate_prompt already embeds the tweet, so it is not appended a
        # second time; the cue asks the model for a one-word answer.
        prompt = generate_prompt(technique, tweet) + "\nAnswer (yes or no):"
        input_ids = tokenizer.encode(prompt, return_tensors='tf')
        # The language-modelling head scores every vocabulary token at every
        # position, so an argmax over the whole tensor is not a class label.
        # Keep only the scores for the token that would follow the prompt.
        next_token_logits = model(input_ids)[0][0, -1, :].numpy()
        # "yes" maps to label 1 — assumes target == 1 marks a disaster tweet;
        # TODO confirm against the dataset description.
        predicted_class = int(next_token_logits[yes_token_id] > next_token_logits[no_token_id])
        # Append the predicted label to the list
        y_pred.append(predicted_class)
    # Compute the F1-score for the current prompt engineering technique
    score = f1_score(y_prompt_tune, y_pred, average='weighted')
    # Print the F1-score for the current prompt engineering technique
    print(f"F1-score for prompt engineering technique {technique}: {score}")
    # Update the best prompt engineering technique if the current score is higher
    if score > best_score:
        best_score = score
        best_technique = technique

# Print the best prompt engineering technique
print(f"The best prompt engineering technique is {best_technique} with an F1-score of {best_score}.")


F1-score for prompt engineering technique add_keywords: 0.0
F1-score for prompt engineering technique pre_classification: 0.0
F1-score for prompt engineering technique sentiment_analysis: 0.0
The best prompt engineering technique is None with an F1-score of 0.0.
