In [1]:
%load_ext autoreload
%autoreload 2
import os
import pandas as pd
from constants import gcs_main_bucket, gcs_folder
from utils import run_llm_label_flow,disagreement_percentage
from sklearn.metrics import classification_report

In [2]:
# Widen text columns so long post bodies are not truncated when frames are displayed
pd.options.display.max_colwidth = 300

# Step 1. Load in Ground Truth "Test" Data and Send Out Input Data to GCS


**Data Generation**: 
> Data was generated hilariously with ChatGPT-4.

> Prompt included asking GPT to come up with theoretical posts from different social platforms that were in a positive, negative, or neutral tone about Pikachu.

> Example prompt (nothing fancy): *"Come up with 20 theoretrical linkedin posts that mention pikachu in a negative tone and really seem like they could have been written by a person."*



In [8]:
# Ground-truth test set, read as JSON Lines (one record per line)
test_data = pd.read_json(
    'data/test_data.json',
    lines=True,
    orient='records',
)

In [9]:
test_data.head(5)

Unnamed: 0,label,text,source,uid
0,negative,Saw a Pikachu-themed display at the mall. Feels like it's just a cash grab at this point. #Pikachu #PokemonFranchise,linkedin,214
1,positive,Pikachu's design is timeless and iconic! 🎨 #Pikachu,X,79
2,negative,Attending a conference next week where the speaker will discuss Pikachu's impact on culture. Hoping to be proven wrong. #Pikachu #PopularCulture,linkedin,231
3,negative,I don't like how Pikachu is always the center of attention in the Pokémon world.,reddit,110
4,negative,Pikachu is not cute. It's annoying. 😒 #Pikachu #NotCute,X,11


### Send out input data for later use in Pipeline 

In [113]:
# Upload the pipeline input file to GCS.
# os.system only *returns* the command's exit status, so check it explicitly:
# a failed upload should stop a Restart & Run All at this cell instead of
# surfacing later as a missing input file.
upload_status = os.system(f'gsutil cp data/input_data.json gs://{gcs_main_bucket}/{gcs_folder}/preprocessed/input_data.json')
if upload_status != 0:
    raise RuntimeError(f'gsutil cp failed with exit status {upload_status}')
# Keep the exit status as the cell output (0 on success), as before
upload_status

Copying file://data/input_data.json [Content-Type=application/json]...
/ [0 files][    0.0 B/ 19.2 KiB]                                                / [1 files][ 19.2 KiB/ 19.2 KiB]                                                
Operation completed over 1 objects/19.2 KiB.                                     


0

# Step 2. Prompt Engineering 

For this part we will be trying to optimize the prompt based on the ground truth data itself. We will be using run_llm_label_flow located in utils.py to properly format the results of the llm_component class defined in llm_label_flow.py 





In [5]:
# Labeling inputs: class names in id order, and the name -> integer id lookup
sent_labels = ['negative', 'neutral', 'positive']
sent_dict = {name: idx for idx, name in enumerate(sent_labels)}

In [None]:
# NOTE(review): this cell has never been executed (execution count is None) and the
# string below is not assigned to any name; it looks like an early draft of prompt_1.
"""Label the sentiment of the text as
   positive, negative, or neutral

    Output: 
    neutral
    positive
    negative

"""

###  Define your starter prompt 

In [10]:
# Baseline ("starter") prompt: a bare instruction followed by the list of allowed
# output labels -- no labeling guidelines, no description of the data, no role.
prompt_1 = """Label the sentiment of the text as positive, negative, or neutral

Output: 
neutral
positive
negative

"""

# Try out starter prompt on test data 

In [39]:
# Score the starter prompt against the ground-truth 'label' column;
# the LLM's predictions are stored under 'llm_label'.
p1_results = run_llm_label_flow(
    sent_dict,
    prompt_1,
    sent_labels,
    test_data,
    'label',
    label_name='llm_label',
)

100%|█████████████████████████████████████████| 136/136 [00:07<00:00, 18.61it/s]


              precision    recall  f1-score   support

           0       0.97      0.75      0.85        44
           1       0.72      0.54      0.62        48
           2       0.65      0.98      0.78        44

    accuracy                           0.75       136
   macro avg       0.78      0.76      0.75       136
weighted avg       0.78      0.75      0.75       136



# Step 2: Prompt Engineering - Optimizing the Prompt 



There are obviously a number of ways you can optimize a prompt for your specific use case. Some helpful principles from GeeksforGeeks are as follows (source: https://www.geeksforgeeks.org/chatgpt-prompt-engineering-principles/):


Using the guidelines, we will assess if prompt 1 really does a good job of following the principles, and if not, how we will optimize it. 



> **1. Clarity:**
> * Clear Instructions
> * Clear Requirements
> * Clear Goals
> > In the real world, data is rarely clean and the sentiment isn't always clear. Therefore, **to optimize prompt 1 we should add some guidelines describing the characteristics of each sentiment**.

> **2. Specificity/Conciseness:**

> > Prompt 1 is concise, but it lacks specifics (as mentioned above): it does not state the rules that govern each sentiment. **To optimize prompt 1 we will add specific guidelines for each sentiment label**.


> **3. Providing/Elaborating Context:**

> > Prompt 1 does not have context into the kind of data coming in, which could alter the predictions--**to optimize prompt 1 we will note in the prompt that these data are social posts.**

> **4. "I want you to be":**

> > The "I want you to be" statement gives the model additional context by assigning it a role. Prompt 1 does not have this statement. **To optimize prompt 1 we will begin the prompt by asking the model to act as a data curator with a rich understanding of how to correctly label sentiment for social posts.**

> **5. Consistency/Continuous Learning:**

> > Refining prompts based on user feedback/tests, to establish whether a prompt is tangibly working better (in this case, in terms of model performance), can improve results over time. We haven't tried any variation of prompt 1. **To optimize prompt 1 we will assess variations of the prompt, applying the guidelines above piece by piece, to see if performance improves.**











# Prompt Engineering Results 


Using an iterative approach by following the principles mentioned above, we will try to obtain better performance.



Starting with the principles 1-2, we will begin by taking a random sample of positive, negative, and neutral labels and asking ChatGPT-4 to derive guidelines that a curator could use in labeling posts in the future:


Example Prompt:
> * "Given that the posts are labeled negative, develop a list of guidelines based on the posts' characteristics that will help a data curator label the sentiment of similar posts in the future, written in very straightforward and clear terms. Posts: [insert negative post samples]"

In [28]:
# Fixed-seed sample of 10 posts per ground-truth label (seed 42 keeps the draw
# reproducible); presumably these are the example posts inserted into the
# guideline-generation prompt described above.
negative_sample = test_data[test_data['label'].eq('negative')].sample(n=10, random_state=42)
positive_sample = test_data[test_data['label'].eq('positive')].sample(n=10, random_state=42)
neutral_sample = test_data[test_data['label'].eq('neutral')].sample(n=10, random_state=42)


In [33]:
# Prompt 2 (principles 1-2, clarity and specificity): adds explicit labeling
# guidelines for each of the three sentiment classes.
# NOTE(review): unlike prompt_1, this prompt has no "Output:" section listing the
# allowed labels.
prompt_2 = """ Label the sentiment of the text by following the sentiment guidelines below.

Negative Guidelines:

1. **Focus on Sentiment Words:** Look for words that express a negative opinion or dissatisfaction, such as "overrated," "not worth it," "wish was different," "annoying," or "too predictable."

2. **Check for Comparisons:** Pay attention to comparisons that place the subject in a negative light compared to something else. For example, "Pikachu's popularity has overshadowed other great Pokémon" implies that Pikachu doesn't deserve its popularity compared to other Pokémon.

3. **Look for Wishes or Desires for Change:** If the post expresses a wish or desire for something to be different, it may indicate dissatisfaction with the current state. For example, "I wish Ash had a different starter Pokémon" shows a desire for change and potential negativity towards the current situation.

4. **Consider Hashtags:** Hashtags can often provide context about the post's sentiment. For example, #Boycott is a clear indication of a negative sentiment.

5. **Pay Attention to Emoji Usage:** Emojis can often convey the tone of the post. A frustrated or angry emoji can indicate a negative sentiment.

6. **Take into Account the Overall Tone:** Look at the post as a whole and consider the overall tone. If the post seems to be expressing dissatisfaction, disappointment, or frustration, it is likely negative.

7. **Note any Call to Action:** If the post includes a call to action that is based on a negative sentiment, such as boycotting, it is likely negative.

Positive Guidelines:

1. **Look for Positive Words or Phrases:** Focus on words or phrases that express a positive opinion, satisfaction, or praise, such as "love," "pleasure," "cute," "well-made," "charming," "can't get enough of," or "exciting things are coming."

2. **Check for Positive Experiences or Memories:** Pay attention to positive experiences or fond memories mentioned in the post. For example, "I've had a Pikachu plushie since I was a kid" or "I've been a fan of Pikachu since I was a kid" show a positive emotional connection.

3. **Consider Hashtags:** Hashtags can often provide context about the post's sentiment. For example, #Pikachu combined with positive words or phrases is a clear indication of a positive sentiment.

4. **Pay Attention to Emoji Usage:** Emojis can often convey the tone of the post. Positive emojis such as hearts or smiley faces can indicate a positive sentiment.

5. **Take into Account the Overall Tone:** Look at the post as a whole and consider the overall tone. If the post seems to be expressing happiness, satisfaction, or excitement, it is likely positive.

6. **Notice Expressions of Pride or Accomplishment:** If the post expresses pride or accomplishment, such as winning a battle or having a productive meeting, it is likely positive.

7. **Check for Expressions of Affection or Nostalgia:** Posts that express affection for or nostalgia about the subject, such as mentioning a cherished childhood toy or a special place in one's heart, are likely positive.

Neutral Guidelines:

1. **Focus on Factual Statements:** Look for statements that are factual or informative, without expressing a clear positive or negative opinion. For example, "Pikachu's height is 0.4 meters" or "Pikachu can use the move Quick Attack."

2. **Check for Lack of Emotional Language:** Neutral posts often lack emotional language or sentiments, such as "love," "hate," "exciting," or "disappointing."

3. **Consider Hashtags:** Hashtags can provide context about the post's content, but in neutral posts, they often relate to the topic without adding sentiment. For example, #Pikachu combined with a factual statement is likely neutral.

4. **Pay Attention to Emoji Usage:** Emojis can often convey the tone of the post. Neutral posts may use emojis that are related to the content without expressing a strong positive or negative emotion, such as a lightning bolt emoji for Pikachu's tail.

5. **Look for Descriptions or Explanations:** Neutral posts may include descriptions or explanations that add context to the topic without expressing a clear sentiment.

6. **Check for Research or Study References:** Posts that reference research, reports, case studies, or other forms of investigation are likely to be neutral, as they often focus on factual information.

7. **Consider Conversational Posts:** Conversations that simply share information or preferences, without expressing a strong sentiment, are likely neutral. For example, "Had a conversation with a friend about our favorite Pokémon."



"""

In [40]:
# Score prompt_2 (with per-label guidelines) on the same ground-truth data
p2_results = run_llm_label_flow(
    sent_dict,
    prompt_2,
    sent_labels,
    test_data,
    'label',
    label_name='llm_label',
)

100%|█████████████████████████████████████████| 136/136 [00:07<00:00, 18.69it/s]


              precision    recall  f1-score   support

           0       1.00      0.80      0.89        44
           1       0.81      0.71      0.76        48
           2       0.75      1.00      0.85        44

    accuracy                           0.83       136
   macro avg       0.85      0.83      0.83       136
weighted avg       0.85      0.83      0.83       136



Ok, results are now starting to look a LOT better--but... can we do even better? We will use Principle 3, by optimizing prompt_2.

In [6]:
# Prompt 3 applies Principle 3 (context): it is prompt_2 with the input described
# as "social posts" instead of "text". Deriving it from prompt_2 -- rather than
# pasting a second copy of the ~50 lines of guidelines -- guarantees that this
# phrase is the only difference between the two experiments.
prompt_3 = prompt_2.replace(
    "Label the sentiment of the text",
    "Label the sentiment of the social posts",
    1,  # only the opening instruction; leave the guidelines untouched
)

# Guard: if prompt_2's opening sentence is ever reworded, fail here instead of
# silently re-running the prompt_2 experiment under a new name.
assert prompt_3 != prompt_2, "prompt_2 no longer contains the phrase being replaced"

In [65]:
# Score prompt_3 (guidelines + "social posts" context) on the same ground-truth data
p3_results = run_llm_label_flow(
    sent_dict,
    prompt_3,
    sent_labels,
    test_data,
    'label',
    label_name='llm_label',
)

100%|█████████████████████████████████████████| 136/136 [00:07<00:00, 18.64it/s]


              precision    recall  f1-score   support

           0       1.00      0.86      0.93        44
           1       0.87      0.69      0.77        48
           2       0.73      1.00      0.85        44

    accuracy                           0.85       136
   macro avg       0.87      0.85      0.85       136
weighted avg       0.87      0.85      0.84       136



The results are two percentage points better in accuracy than prompt 2... so, finally, can we make it even better by introducing Principle 4 into the prompt?

In [53]:
# Prompt 4 applies Principle 4 ("I want you to be"): it is prompt_3 with a role
# statement prepended. Building it from prompt_3 -- rather than pasting a third
# copy of the guidelines -- guarantees the role sentence is the only difference
# between the two experiments.
prompt_4 = (
    " I want you to act as an expert data curator with a rich understanding of"
    " how to label the sentiment of social posts."
    + prompt_3  # prompt_3 already starts with " Label the sentiment of the social posts ..."
)

In [54]:
# Score prompt_4 (prompt_3 + role statement) on the same ground-truth data
p4_results = run_llm_label_flow(
    sent_dict,
    prompt_4,
    sent_labels,
    test_data,
    'label',
    label_name='llm_label',
)

100%|█████████████████████████████████████████| 136/136 [00:07<00:00, 18.66it/s]


              precision    recall  f1-score   support

           0       1.00      0.86      0.93        44
           1       0.86      0.67      0.75        48
           2       0.72      1.00      0.84        44

    accuracy                           0.84       136
   macro avg       0.86      0.84      0.84       136
weighted avg       0.86      0.84      0.84       136



We actually end up seeing a slight reduction in performance by adding Principle 4---but this highlights Principle 5: Through proper iterative techniques we were able to obtain  prompt_3 that had 10 percentage points higher accuracy than the initial starter prompt. 





# Step 3. Assessing Model Performance Overall: Does it make sense?


Taking a look at the disagreements, why is the model missing the data? Is it how the data was labeled itself?  This part is useful because we want to be able to explain why the model is making the wrong decision and if we can find a way to mitigate it in the future. 

Looking at the classification report for prompt_3, we see that the lowest f1-scores were for the labels neutral and positive. Getting into what makes this concerning---

Recall for neutral is 0.69: the LLM correctly labels only 69% of the posts whose true label is neutral.
Precision for positive is 0.73: 27% of the posts the LLM labels as positive actually belong to another class.


Diving deeper into the disagreements we will assess which labels the model tends to disagree on and why. 


In [11]:
# Compare ground truth with the LLM labels for prompt_3. sent_dict maps
# name -> id, so it is inverted to id -> name before being passed as label_dict.
llm_disagreement_breakdown = disagreement_percentage(
    p3_results['true_label'],
    p3_results['llm_label'],
    label_dict=dict(zip(sent_dict.values(), sent_dict.keys())),
)

The number of disagreements were: 21
Out of the 136 samples the models disagreed: 15.44% of the time


In [78]:
llm_disagreement_breakdown

{('negative', 'neutral'): 23.809523809523807,
 ('neutral', 'positive'): 71.42857142857143,
 ('negative', 'positive'): 4.761904761904762}

From the above output, of the 21 disagreements:
    
    ~71% were posts whose true label was neutral but the model predicted positive
    ~24% were posts whose true label was negative but the model predicted neutral
    ~5% were posts whose true label was negative but the model predicted positive
    
    
Taking a look at the disagreements, we will assess whether they make sense and whether there is a way to mitigate them in the prompt.

In [12]:
# Create a disagreement column to assess which labels tend to get missed and why.
# A vectorized column comparison replaces the row-wise apply(axis=1): it gives the
# same boolean result per row without calling a Python lambda for every row, and
# it still yields a (boolean, empty) column when the frame has no rows.
p3_results['disagreement'] = p3_results['true_label'].ne(p3_results['llm_label'])

In [13]:
# Keep only the rows flagged as disagreements, restricted to the columns needed
# for manual review, and renumber them from 0.
disagreements = (
    p3_results
    .loc[p3_results['disagreement'], ['uid', 'text', 'true_label', 'llm_label', 'source']]
    .reset_index(drop=True)
)

In [16]:
# Positive mislabels: disagreements where the LLM predicted "positive".
# Look the id up in sent_dict instead of hard-coding 2, so the filter stays
# readable and stays correct if the label encoding changes.
disagreements.loc[disagreements['llm_label'] == sent_dict['positive']]

Unnamed: 0,uid,text,true_label,llm_label,source
1,239,Just learned that Pikachu was one of the first Pokémon ever created. A true classic! #Pikachu #Pokemon,1,2,linkedin
2,246,Saw an interesting piece of fan art featuring Pikachu. The creativity of the Pokémon community is impressive. #Pikachu #FanArt,1,2,linkedin
3,186,Pikachu is a popular choice for Pokémon Go players. #PokemonGo #Pikachu,1,2,instagram
4,172,Saw a Pikachu toy at the store today. Reminded me of my childhood. #Pokemon #Pikachu,1,2,instagram
5,182,Pikachu's Thunderbolt move is iconic. #PokemonMoves #Pikachu,1,2,instagram
8,127,I can't believe how many different Pikachu plushies there are!,1,2,reddit
9,159,Pikachu is not the only Pokémon that matters! #Pokemon #Diversity,0,2,instagram
10,112,I just caught a Pikachu in Pokemon Go!,1,2,reddit
11,180,Pikachu is a staple in the Pokémon anime. #Anime #Pokemon,1,2,instagram
13,122,Pikachu's Gigantamax form looks so powerful!,1,2,reddit


In [95]:
# Neutral mislabels: disagreements where the LLM predicted "neutral".
# Look the id up in sent_dict instead of hard-coding 1, so the filter stays
# readable and stays correct if the label encoding changes.
disagreements.loc[disagreements['llm_label'] == sent_dict['neutral']]

Unnamed: 0,uid,text,true_label,llm_label,source
0,231,Attending a conference next week where the speaker will discuss Pikachu's impact on culture. Hoping to be proven wrong. #Pikachu #PopularCulture,0,1,linkedin
6,213,"Working on a project involving Pikachu, but finding the brand restrictions quite limiting. #Pikachu #Project",0,1,linkedin
7,225,"Researching the global appeal of Pikachu for a report. Honestly, I don't get the hype. #Pikachu #GlobalAppeal",0,1,linkedin
12,223,"Had a conversation with a friend about our least favorite Pokémon. Sadly, Pikachu made the list. #Pikachu #Favorites",0,1,linkedin
14,219,Just learned that Pikachu was one of the first Pokémon ever created. It's time for some new faces. #Pikachu #Pokemon,0,1,linkedin


From the data above:

A lot of the "neutral" posts seem to be incorrectly labeled and should in fact be "positive".

The "negative" posts that were misclassified as "neutral" all come from LinkedIn, which gives us an interesting insight: the negative tone on LinkedIn is very professional and may not give many indicators that the sentiment is negative. To fix this we can paste these posts into ChatGPT and see if we can optimize the prompt to account for the specific "negative" LinkedIn language.


For the sake of time, I won't be optimizing the prompt further in this tutorial, though we will reassess the model performance after correcting the mislabeled "neutral" data points. Can you think of some other ways to account for this discrepancy on the LinkedIn posts?




In [102]:
# Disagreements where the LLM predicted "positive" -- the candidates for relabeling.
# Look the id up in sent_dict instead of hard-coding 2.
disagreements.loc[disagreements['llm_label'] == sent_dict['positive']]

Unnamed: 0,uid,text,true_label,llm_label,source
1,239,Just learned that Pikachu was one of the first Pokémon ever created. A true classic! #Pikachu #Pokemon,1,2,linkedin
2,246,Saw an interesting piece of fan art featuring Pikachu. The creativity of the Pokémon community is impressive. #Pikachu #FanArt,1,2,linkedin
3,186,Pikachu is a popular choice for Pokémon Go players. #PokemonGo #Pikachu,1,2,instagram
4,172,Saw a Pikachu toy at the store today. Reminded me of my childhood. #Pokemon #Pikachu,1,2,instagram
5,182,Pikachu's Thunderbolt move is iconic. #PokemonMoves #Pikachu,1,2,instagram
8,127,I can't believe how many different Pikachu plushies there are!,1,2,reddit
9,159,Pikachu is not the only Pokémon that matters! #Pokemon #Diversity,0,2,instagram
10,112,I just caught a Pikachu in Pokemon Go!,1,2,reddit
11,180,Pikachu is a staple in the Pokémon anime. #Anime #Pokemon,1,2,instagram
13,122,Pikachu's Gigantamax form looks so powerful!,1,2,reddit


In [103]:
# Relabel mislabeled true_labels
# uids whose ground-truth label is overridden to 2 (the id of "positive")
relabel_uids = [
    177, 242, 119, 181, 116, 122, 239,
    246, 186, 172, 182, 127, 112,
]

# uid -> corrected label id
relabel_dict = dict.fromkeys(relabel_uids, 2)

In [104]:
# Apply the manual corrections: rows whose uid is in relabel_dict take the
# corrected label, every other row keeps its current one. This is a vectorized
# replacement for the row-wise apply(axis=1) lookup.
# NOTE: true_label is overwritten in place, so the 'disagreement' column and the
# disagreement tables computed earlier still reflect the labels *before* relabeling.
p3_results['true_label'] = (
    p3_results['uid']
    .map(relabel_dict)                        # corrected label; NaN where the uid is not relabeled
    .fillna(p3_results['true_label'])         # fall back to the existing label
    .astype(p3_results['true_label'].dtype)   # the NaNs made the values float; restore the original dtype
)

In [106]:
# Reassess performance of prompt_3 against the corrected ground-truth labels.
# print() is kept on purpose: classification_report returns one multi-line string.
print(
    classification_report(
        y_true=p3_results['true_label'],
        y_pred=p3_results['llm_label'],
    )
)

              precision    recall  f1-score   support

           0       1.00      0.86      0.93        44
           1       0.87      0.94      0.90        35
           2       0.95      1.00      0.97        57

    accuracy                           0.94       136
   macro avg       0.94      0.94      0.94       136
weighted avg       0.95      0.94      0.94       136



Last questions to ask before implementing:
    
If you already have a ML model that labels sentiment, is this performance better? 
>> I.e. if we have a model built with well established libraries, more reliability, consistent answers etc why move over to using a LLM?

If this performance is better, is it worth introducing the headache and risk into your workflow?
>>I.e. is it worth the headache of dealing with the potential hallucinations/new skills you will need to adopt to integrate LLMs vs. using established modeling techniques?


For the sake of the tutorial, we'll just say this is as good as it's going to get--so let's hop on over to how we can label this data in a batch prediction pipeline with Kubeflow and Vertex AI!