In [23]:
# Can you provide an example technical case study focusing on data science methodologies such as deep learning, LLM. Please provide example code using pytorch or tenasorflow and transformers

# The following code is an example of a simple text classification task using the transformers library. The task is to classify movie reviews as positive or negative. The code uses the BERT model from the transformers library to perform the classification task.

# Import necessary libraries
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Load a public dataset that is available for public use
# In this example, we will use the IMDB movie reviews dataset
# The dataset consists of movie reviews labeled as positive or negative
# Load the dataset
data = pd.read_csv('https://raw.githubusercontent.com/clairett/pytorch-sentiment-classification/master/data/SST2/train.tsv', delimiter='\t', header=None, names=['text', 'label'])

# Split the dataset into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Define a custom dataset class to process the data
class MovieReviewDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['text']
        label = self.data.iloc[idx]['label']
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        return {'input_ids': encoding['input_ids'].squeeze(), 'attention_mask': encoding['attention_mask'].squeeze(), 'label': torch.tensor(label)}




In [9]:
data.shape

(6920, 2)

In [24]:
# Define the dataset and dataloaders
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = MovieReviewDataset(train_data, tokenizer, max_length=128)
test_dataset = MovieReviewDataset(test_data, tokenizer, max_length=128)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [28]:
train_dataset.data.head()

Unnamed: 0,text,label
4457,graphic sex may be what 's attracting audience...,1
2471,perhaps the grossest movie ever made,0
26,"the film is moody , oozing , chilling and hear...",1
4828,watching the chemistry between freeman and jud...,1
63,the wonderfully lush morvern callar is pure pu...,1


In [10]:
# Define the BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define the optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = torch.nn.CrossEntropyLoss()

# Train the model
model.train()

for epoch in range(3):
    for batch in train_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Initialize the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")


def generate_response(prompt):
    # Tokenize the input prompt
    inputs = tokenizer.encode(prompt, return_tensors="pt")
    print(inputs)
    # Generate a response
    outputs = model.generate(inputs, max_length=150, num_return_sequences=1, no_repeat_ngram_size=2, temperature=0.0)

    # Decode the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

In [15]:
prompt = "Satya Sai Baba was"
response = generate_response(prompt)
print(response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[20245,  3972, 25251, 12400,    64,   373]])
Satya Sai Baba was a great man. He was the first to teach the people of India how to live in harmony with nature.

He was also the founder of the Mahabharata. His teachings were the basis of Mahayana Buddhism. Mahatma Gandhi was one of his disciples. The Mahasabha was his disciple. Gandhi's teachings are the foundation of modern Indian Buddhism and Mahavira Buddhism, the most important of all the religions.


In [17]:
!pip install transformers --upgrade

Collecting transformers
  Downloading transformers-4.40.2-py3-none-any.whl (9.0 MB)
[K     |████████████████████████████████| 9.0 MB 4.0 MB/s eta 0:00:01
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.40.1
    Uninstalling transformers-4.40.1:
      Successfully uninstalled transformers-4.40.1
Successfully installed transformers-4.40.2


In [21]:
# use llama model to generate text from a prompt
from transformers import LlamaForQuestionAnswering, LlamaTokenizer

# Initialize the tokenizer and model
tokenizer2 = LlamaTokenizer.from_pretrained("llama")
model2 = LlamaForQuestionAnswering.from_pretrained("llama")

def generate_response2(prompt):
    # Tokenize the input prompt
    inputs = tokenizer2(prompt, return_tensors="pt")
    # Generate a response
    outputs = model2.generate(**inputs, max_length=150, num_return_sequences=1, no_repeat_ngram_size=2, temperature=0.0)

    # Decode the response
    response = tokenizer2.decode(outputs[0], skip_special_tokens=True)

    return response

ImportError: 
LlamaTokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [5]:
# You are being provided with a movie genre
# dataset in CSV format (see link below) containing the following schema:
#● text (string): Description of a movie including title, release year, and a brief intro
#● label (string): Movie genre
#● y (int): Movie genres encoded as numbers (0-indexed)
# create  code that can train and evaluate a multiclass movie genre classifier using a distilbert model from the transformers library.

# Import necessary libraries
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the dataset
data = pd.read_csv('https://raw.githubusercontent.com/clairett/pytorch-sentiment-classification/master/data/SST2/train.tsv', delimiter='\t', header=None, names=['text', 'label'])

# Split the dataset into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Define a custom dataset class to process the data
class MovieGenreDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['text']
        label = self.data.iloc[idx]['label']
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        return {'input_ids': encoding['input_ids'].squeeze(), 'attention_mask': encoding['attention_mask'].squeeze(), 'label': torch.tensor(label)}
    
# Define the dataset and dataloader
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_dataset = MovieGenreDataset(train_data, tokenizer, max_length=128)
test_dataset = MovieGenreDataset(test_data, tokenizer, max_length=128)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)



In [2]:
train_data.head(5)

Unnamed: 0,text,label
4457,graphic sex may be what 's attracting audience...,1
2471,perhaps the grossest movie ever made,0
26,"the film is moody , oozing , chilling and hear...",1
4828,watching the chemistry between freeman and jud...,1
63,the wonderfully lush morvern callar is pure pu...,1


In [7]:
train_data.shape

(5536, 2)

In [3]:
# Define the DistilBERT model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=20)

# Define the optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = torch.nn.CrossEntropyLoss()

# Train the model
model.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [6]:
for epoch in range(3):
    for batch in train_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluate the model
model.eval()
total_correct = 0
total_samples = 0

with torch.no_grad():
    
    for batch in test_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)
        
        total_correct += torch.sum(predictions == labels).item()
        total_samples += len(labels)

accuracy = total_correct / total_samples
print(f'Accuracy: {accuracy}')

KeyboardInterrupt: 

In [None]:
# can you find the bug in the following code:


train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
optim = SGD(model.parameters(), lr=1e-1)

for epoch in range(3):
    model.train()
    batch_loss = []
    print("Epoch", epoch)
    # Train
    for batch in tqdm(train_loader, desc='Train'):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        batch_loss.append(loss.cpu().detach().numpy())
        loss.backward()
        optim.step()
    print("\t train loss:", np.mean(batch_loss))

    # Eval
    model.eval()
    batch_loss = []
    predictions = np.array([])
    for batch in tqdm(val_loader, desc='Eval'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        batch_pred = np.argmax(outputs[1].cpu().detach().numpy(), axis=1)
        predictions = np.concatenate([predictions, batch_pred])
        batch_loss.append(loss.cpu().detach().numpy())
    print("\t val loss:", np.mean(batch_loss))
    acc = np.mean(predictions.astype(int) == np.array(y_val))
    print("\t val acc:", acc)

