In [1]:
from datasets import load_dataset

dataset = load_dataset("imdb")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_train = dataset["train"].to_pandas()
df_train.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


## Zero Shot Learning

In [3]:
from bullet.core.sentiment import SentimentClassifier


df_train_sample = df_train.sample(n = 50)

classifier = SentimentClassifier()
result = classifier.predict_pandas(df_train_sample)

In [8]:
result[:5]

[ClassificationResponse(response='\nPOS', embeddings_for_model='gpt-3.5-turbo-instruct', encoding=<Encoding 'cl100k_base'>, label=1),
 ClassificationResponse(response='\nNEG', embeddings_for_model='gpt-3.5-turbo-instruct', encoding=<Encoding 'cl100k_base'>, label=0),
 ClassificationResponse(response='\nPOS', embeddings_for_model='gpt-3.5-turbo-instruct', encoding=<Encoding 'cl100k_base'>, label=1),
 ClassificationResponse(response='\nPOS', embeddings_for_model='gpt-3.5-turbo-instruct', encoding=<Encoding 'cl100k_base'>, label=1),
 ClassificationResponse(response='\nPOS', embeddings_for_model='gpt-3.5-turbo-instruct', encoding=<Encoding 'cl100k_base'>, label=1)]

In [9]:
df_train_sample["predicted"] = [response.label for response in result]

In [13]:
!pip install scikit-learn

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/0c/2a/d3ff6091406bc2207e0adb832ebd15e40ac685811c7e2e3b432bfd969b71/scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting scipy>=1.5.0 (from scikit-learn)
  Obtaining dependency information for scipy>=1.5.0 from https://files.pythonhosted.org/packages/ef/1b/7538792254aec6850657d5b940fd05fe60582af829ffe40d6c054f065f34/scipy-1.11.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached scipy-1.11.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting joblib>=1.1.1 (from scikit-learn)
  Obtaining dependency information for joblib>=1.1.1 from https://files.pythonhosted.org/packages/10/40/d551139c85db202f1f384ba8bcf96aca2f329440a844f924c8a0040b6d02/joblib-1.3.2

In [14]:
from sklearn.metrics import classification_report

report = classification_report(df_train_sample.label, df_train_sample.predicted)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.86      0.90        21
           1       0.90      0.97      0.93        29

    accuracy                           0.92        50
   macro avg       0.93      0.91      0.92        50
weighted avg       0.92      0.92      0.92        50



## Few-Shot Learning

In [15]:
result = []

df_test = dataset["test"].to_pandas()
df_test.head()

Unnamed: 0,text,label
0,I love sci-fi and am willing to put up with a ...,0
1,"Worth the entertainment value of a rental, esp...",0
2,its a totally average film with a few semi-alr...,0
3,STAR RATING: ***** Saturday Night **** Friday ...,0
4,"First off let me say, If you haven't enjoyed a...",0


In [21]:
# Define Few Shot examples

template = "Review: \"{review}\"\nLabel: \"{label}\""
examples = [
    template.format(
        review = row["text"],
        label = "POS" if row["label"] == 1 else "NEG"
    )
    for _, row
    in df_train.sample(3).iterrows()
]

examples

['Review: "This game is the bomb and this is the 007 game of the year and should be on greatest hits. When I got Agent Under Fire, I thought that was a good game but then Nightfire came around and that was better, but now there is a new type of James Bond game. This time it a 3rd person shooter and there is more than 12 missions, the graphics of the game are out of this house. It even has all of the great actors and actresses in this game like Pierce Bronsan as once again James Bond, William Dafoe as the villain Nikolai Diavolo, and Judi Dench as M (forgive me all if I spell it wrong). This game would be own as the greatest James Bond game around.<br /><br />I give this a 10/10"\nLabel: "POS"',
 'Review: "Jason Lee\'s pecks are back! If that\'s what you are looking for, look no further. If not, better move on...<br /><br />But about the movie. Clichés galore, some poorly shot but kinda exotic fight scenes (used JKD) and lots of bad acting & cheap effects. Poor Lee looks like he\'s in p

In [22]:
df_test_sample = dataset["test"].to_pandas().sample(100)
reviews = df_test_sample.text.values

results = classifier.predict_few_shot(
    reviews = reviews,
    examples = examples
)

In [25]:
results[:5]

[ClassificationResponse(response='\nNEG', embeddings_for_model='gpt-3.5-turbo-instruct', encoding=<Encoding 'cl100k_base'>, label=0),
 ClassificationResponse(response='\nLabel: "POS"', embeddings_for_model='gpt-3.5-turbo-instruct', encoding=<Encoding 'cl100k_base'>, label=1),
 ClassificationResponse(response='\nNEG', embeddings_for_model='gpt-3.5-turbo-instruct', encoding=<Encoding 'cl100k_base'>, label=0),
 ClassificationResponse(response='\nNEG', embeddings_for_model='gpt-3.5-turbo-instruct', encoding=<Encoding 'cl100k_base'>, label=0),
 ClassificationResponse(response='\nPOS', embeddings_for_model='gpt-3.5-turbo-instruct', encoding=<Encoding 'cl100k_base'>, label=1)]

In [26]:
df_test_sample["predicted"] = [response.label for response in results]

report = classification_report(df_test_sample.label, df_test_sample.predicted)
print(report)

              precision    recall  f1-score   support

           0       0.90      0.98      0.94        47
           1       0.98      0.91      0.94        53

    accuracy                           0.94       100
   macro avg       0.94      0.94      0.94       100
weighted avg       0.94      0.94      0.94       100

