In [1]:
import pandas as pd
import numpy as np

# IMDB data
Download the data from https://www.kaggle.com/datasets/marklvl/sentiment-labelled-sentences-data-set

In [2]:
imdb_raw = pd.read_csv("imdb_labelled.txt", sep='\t', header=None)

imdb_raw.columns = ['sentence', 'label']
imdb_raw.head(2)

Unnamed: 0,sentence,label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0


In [3]:
imdb_raw['label'].value_counts(dropna=False)

1    386
0    362
Name: label, dtype: int64

# Split to train-test, 50-50

In [4]:
from sklearn.model_selection import train_test_split
np.random.seed(12)

train, test = train_test_split(imdb_raw, train_size=0.5)

(train.shape, test.shape)

((374, 2), (374, 2))

# create prompt out of examples

In [5]:
n_prompt = 10

examples_df = pd.concat([
    train[train['label'] == 1].sample(n_prompt // 2, replace=False),
    train[train['label'] == 0].sample(n_prompt // 2, replace=False)
]).sample(frac=1, replace=False)
examples_df

Unnamed: 0,sentence,label
234,In short - this was a monumental waste of time...,0
548,"In fact, I liked it better than Interview With...",1
418,My rating: just 3 out of 10.,0
309,"By the time the film ended, I not only dislike...",0
734,;) Recommend with confidence!,1
608,The worst one of the series.,0
134,But this movie is definitely a below average r...,0
395,"But ""Tiny Toons"" kept the 90's vibe and delive...",1
432,I had always known that Errol Flynn was a bril...,1
587,"I loved it, it was really scary.",1


In [6]:
prompt_text = 'Following are sentences followed by their sentiment:\n' + '\n'.join(
    'Sentence: ' + examples_df['sentence'] + '\n' + 
    'Sentiment: ' + examples_df['label'].map({0: 'negative', 1: 'positive'})
)
print(prompt_text)

Following are sentences followed by their sentiment:
Sentence: In short - this was a monumental waste of time and energy and I would not recommend anyone to EVER see this film.  
Sentiment: negative
Sentence: In fact, I liked it better than Interview With a Vampire and I liked this Lestat (Stuart Townsend) better than Cruise's attempt.  
Sentiment: positive
Sentence: My rating: just 3 out of 10.  
Sentiment: negative
Sentence: By the time the film ended, I not only disliked it, I despised it.  
Sentiment: negative
Sentence: ;) Recommend with confidence!  
Sentiment: positive
Sentence: The worst one of the series.  
Sentiment: negative
Sentence: But this movie is definitely a below average rent.  
Sentiment: negative
Sentence: But "Tiny Toons" kept the 90's vibe and delivered one of the most popular, funny, and underrated cartoons ever created.  
Sentiment: positive
Sentence: I had always known that Errol Flynn was a brilliant actor as he was my dads favourite actor, and I grew up watch

# Use Jurassic-1
Go to https://www.ai21.com/studio and register to the AI21 Studio (it's free).

Next, find your API key in https://studio.ai21.com/account.

Finally create a file named api_key, theyre you'll put "Bearer " followed by your api key.  
Alternatively, run in a new Jupyter cell the following:

`!echo "Bearer API_KEY" > api_key`

In [7]:
import requests
import json

def generate_text_from_j1(input_text):
    with open("api_key", "r") as f:
        api_key = f.read().strip()
        
    res = requests.post("https://api.ai21.com/studio/v1/j1-large/complete",
        headers={"Authorization": api_key},
        json={
            "prompt": input_text,
            "numResults": 1,
            "maxTokens": 3,
            "temperature": 0,
            "topKReturn": 0,
            "topP":1,
            "countPenalty": {
                "scale": 0,
                "applyToNumbers": False,
                "applyToPunctuations": False,
                "applyToStopwords": False,
                "applyToWhitespaces": False,
                "applyToEmojis": False
            },
            "frequencyPenalty": {
                "scale": 0,
                "applyToNumbers": False,
                "applyToPunctuations": False,
                "applyToStopwords": False,
                "applyToWhitespaces": False,
                "applyToEmojis": False
            },
            "presencePenalty": {
                "scale": 0,
                "applyToNumbers": False,
                "applyToPunctuations": False,
                "applyToStopwords": False,
                "applyToWhitespaces": False,
                "applyToEmojis": False
          },
          "stopSequences":['\n']
        }
    )
    
    return json.loads(res.content.decode())['completions'][0]['data']['text']

In [8]:
from collections import namedtuple
Res = namedtuple("Res", ['pred', 'generated_text'])

def predict(prompt_text, sentence):
    
    prompt_to_j1 = (prompt_text + '\n'
                   + 'Sentence: ' + sentence + '\n'
                   + 'Sentiment:')
    
    completion = generate_text_from_j1(prompt_to_j1)
    if "positive" in completion:
        return Res(pred=1, generated_text=completion)
    elif "negative" in completion:
        return Res(pred=0, generated_text=completion)
    else:
        return Res(pred=-1, generated_text=completion)

# Predict sentiment for test set

In [9]:
from tqdm.auto import tqdm

results_df = pd.DataFrame()
for _, row in tqdm(list(test.iloc[:30].iterrows())):
    res = predict(prompt_text, row['sentence'])
    results_df = results_df.append({
        'pred': res.pred,
        'label': row['label'],
        'sentence': row['sentence'],
        'generated_text': res.generated_text
    }, ignore_index=True)
    results_known_df = results_df[results_df['pred'] != -1]
    print(f'Accuracy: {(results_known_df["pred"] == results_known_df["label"]).mean()}')
    print(f'Coverage: {len(results_known_df) / len(results_df)}\n')

  0%|          | 0/30 [00:00<?, ?it/s]

  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({


Accuracy: 1.0
Coverage: 1.0

Accuracy: 1.0
Coverage: 1.0



  results_df = results_df.append({
