In [1]:
from transformers import BertTokenizer

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# See how many tokens are in the vocabulary
tokenizer.vocab_size
# 30522

  from .autonotebook import tqdm as notebook_tqdm


30522

In [2]:
# Tokenize the sentence
tokens = tokenizer.tokenize("I heart Generative AI")

# Print the tokens
print(tokens)
# ['i', 'heart', 'genera', '##tive', 'ai']

# Show the token ids assigned to each token
print(tokenizer.convert_tokens_to_ids(tokens))
# [1045, 2540, 11416, 6024, 9932]

['i', 'heart', 'genera', '##tive', 'ai']
[1045, 2540, 11416, 6024, 9932]


In [4]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Load a pre-trained sentiment analysis model
model_name = "textattack/bert-base-uncased-imdb"
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize the input sequence
tokenizer = BertTokenizer.from_pretrained(model_name)
inputs = tokenizer("I love Generative AI", return_tensors="pt")

# Make prediction
with torch.no_grad():
    outputs = model(**inputs).logits
    probabilities = torch.nn.functional.softmax(outputs, dim=1)
    predicted_class = torch.argmax(probabilities)

# Display sentiment result
if predicted_class == 1:
    print(f"Sentiment: Positive ({probabilities[0][1] * 100:.2f}%)")
else:
    print(f"Sentiment: Negative ({probabilities[0][0] * 100:.2f}%)")
# Sentiment: Positive (88.68%)

Sentiment: Positive (88.68%)


In [6]:
%pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-18.1.0-cp311-cp311-macosx_12_0_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-macosx_10_9_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.10-cp311-cp311-macosx_10_9_x86_64.whl.metadata (7.7 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohappyeyeballs-2.4.4-py3-none-any.whl.metadata (6.1 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->dataset

In [8]:
from datasets import load_dataset
from IPython.display import HTML, display

# Load the IMDB dataset, which contains movie reviews
# and sentiment labels (positive or negative)
dataset = load_dataset("imdb")

# Fetch a revie from the training set
review_number = 40
sample_review = dataset["train"][review_number]

display(HTML(sample_review["text"][:450] + "..."))
# WARNING: This review contains SPOILERS. Do not read if you don't want some points revealed to you before you watch the
# film.
# 
# With a cast like this, you wonder whether or not the actors and actresses knew exactly what they were getting into. Did they
# see the script and say, `Hey, Close Encounters of the Third Kind was such a hit that this one can't fail.' Unfortunately, it does.
# Did they even think to check on the director's credentials...

if sample_review["label"] == 1:
    print("Sentiment: Positive")
else:
    print("Sentiment: Negative")
# Sentiment: Negative

Sentiment: Negative


In [9]:
from transformers import (DistilBertForSequenceClassification,
    DistilBertTokenizer,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


dataset = load_dataset("imdb")
tokenized_datasets = dataset.map(tokenize_function, batched=True)

training_args = TrainingArguments(
    per_device_train_batch_size=64,
    output_dir="./results",
    learning_rate=2e-5,
    num_train_epochs=3,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 25000/25000 [02:04<00:00, 201.52 examples/s]
Map: 100%|██████████| 25000/25000 [01:34<00:00, 265.55 examples/s]
Map: 100%|██████████| 50000/50000 [03:36<00:00, 231.46 examples/s]


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`

In [10]:
%pip install openai

Note: you may need to restart the kernel to use updated packages.


In [13]:
from openai import OpenAI
client = OpenAI()

completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": """
                        54 (label=SPAM) -> SMS. ac Sptv: The New Jersey Devils and the Detroit Red Wings play Ice Hockey. Correct or Incorrect? End? Reply END SPTV

55 (label=NOT SPAM) -> Do you know what Mallika Sherawat did yesterday? Find out now @  &lt;URL&gt;

56 (label=SPAM) -> Congrats! 1 year special cinema pass for 2 is yours. call 09061209465 now! C Suprman V, Matrix3, StarWars3, etc all 4 FREE! bx420-ip4-5we. 150pm. Dont miss out! 

57 (label=NOT SPAM) -> Sorry, I'll call later in meeting.

58 (label=NOT SPAM) -> Tell where you reached

59 (label=NOT SPAM) -> Yes..gauti and sehwag out of odi series.


7 -> As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune

8 -> WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.

9 -> Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030

10 -> I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.

11 -> SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info

12 -> URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18

13 -> I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.

14 -> I HAVE A DATE ON SUNDAY WITH WILL!!


---
Classify the unlabelled messages above as SPAM or NOT SPAM.Respond in JSON Format. 
Use the following format : {"0":"NOT SPAM","1":"SPAM"}.
Some examples have been labeled for you.
                        """
        }
    ]
)

print(completion.choices[0].message.content)

```json
{
    "7": "NOT SPAM",
    "8": "SPAM",
    "9": "SPAM",
    "10": "NOT SPAM",
    "11": "SPAM",
    "12": "SPAM",
    "13": "NOT SPAM",
    "14": "NOT SPAM"
}
```


In [None]:
def bar(**kwargs):
    for a in kwargs:
        print(a, kwargs[a])  

bar(name='one', age=27)

def model(**kwargs):
    return f"Hi my name is {kwargs['name']} and I like {kwargs['food']}."

tokens2 = [{'name': 'Michelangelo',
     'food': 'PIZZA'},
            {'name': 'Garfield',
     'food': 'lasanga'},
            {'name': 'Walter',
     'food': 'pancakes'},
            {'name': 'Galactus',
     'food': 'worlds'}]

outputs = [model(**token) for token in tokens2]
for output in outputs:
    print(output)

name one
age 27
Hi my name is Michelangelo and I like PIZZA.
Hi my name is Garfield and I like lasanga.
Hi my name is Walter and I like pancakes.
Hi my name is Galactus and I like worlds.


In [30]:
print(torch.tensor([[1, 1]])@torch.tensor([[1, 1]]))

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x2 and 1x2)

In [39]:
import numpy as np
a = np.arange(9).reshape(3,3) + 10
print(a)
print(np.argmax(a))
print(np.argmax(a, axis=0))
print(np.argmax(a, axis=1))

[[10 11 12]
 [13 14 15]
 [16 17 18]]
8
[2 2 2]
[2 2 2]
