In [31]:
from transformers import AutoTokenizer, LlamaForSequenceClassification 
model_name = "../Llama-2-7b-chat-hf/"
num_labels = 2 # replace with the actual number of labels in your classification task

# model = LlamaForSequenceClassification .from_pretrained(model_name, num_labels=num_labels, device_map="auto", load_in_8bit=True)

In [32]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [39]:
inputs = tokenizer.encode_plus(["Hello, my dog is cute", "My name"], return_tensors="pt", return_token_type_ids=True)
inputs

{'input_ids': tensor([[    1, 15043, 29892,   590, 11203,   338,   274,  1082,     1,  1619,
          1024]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [3]:
import torch
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()

# To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
labels = torch.tensor([1])
loss = model(**inputs, labels=labels).loss

tensor(3.7422, dtype=torch.float16, grad_fn=<ToCopyBackward0>)

In [36]:
import transformers
from transformers import LlamaModel
import torch.nn as nn

class llamamodel(nn.Module):
    def __init__(self):
        super(llamamodel, self).__init__()
        self.model_path = "saibo/llama-1B"
        self.llama = LlamaModel.from_pretrained(self.model_path, return_dict=False, device_map="auto")
#         for layer in self.llama.layers:
#             layer.trainable = False
        self.llama_drop = nn.Dropout(0.3)
        self.out = nn.Linear(4096, 1)

    def forward(self, ids, mask, token_type_ids):
        
        o1, o2 = self.llama(ids, attention_mask=mask, position_ids=token_type_ids)
        o1  = torch.mean(o1, dim=1)
        bo = self.llama_drop(o1)
        output = self.out(bo)
        return output

In [34]:
import torch

# Create an instance of your llamamodel
model = llamamodel()

# Generate some sample input data
batch_size = 2
sequence_length = 10
input_ids = torch.randint(0, 1000, (batch_size, sequence_length))
attention_mask = torch.randint(0, 2, (batch_size, sequence_length))
token_type_ids = torch.randint(0, 2, (batch_size, sequence_length))

# Forward pass through the model
output = model(input_ids, attention_mask, token_type_ids)


The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


In [35]:
output.shape

torch.Size([2, 1])

In [31]:
o1.shape

torch.Size([2, 10, 4096])

In [32]:
output_tensor = torch.mean(o1, dim=1)
output_tensor.shape

torch.Size([2, 4096])

In [11]:
o2[0][0].shape

torch.Size([2, 32, 10, 128])

In [37]:
from torch.utils.data import Dataset
class Llamadataset(Dataset):
    def __init__(self, review, target):
        self.review = review
        self.target = target
        self.tokenizer = AutoTokenizer.from_pretrained("saibo/llama-1B", truncation=True, return_token_type_ids=True)
        self.tokenizer.pad_token = self.tokenizer.eos_token 
        self.max_length = 512

    def __len__(self):
        return len(self.review)

    def __getitem__(self, item):
        review = str(self.review[item])
        review = " ".join(review.split())
        inputs = self.tokenizer(
                review,
                None, 
                truncation=True,
                padding='max_length',
                add_special_tokens = True,
                max_length = self.max_length,
                return_token_type_ids=True
                )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        return {
            'input_ids' : torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.target[item], dtype=torch.float),            
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),

        }

In [38]:
def loss_fn(outputs, targets):
    return nn.BCEWithLogitsLoss()(outputs, targets.view(-1,1))

In [62]:
def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train() 
    for batch_idx, dataset in tqdm(enumerate(data_loader), total=len(data_loader)):
        input_ids = dataset['input_ids']
        mask = dataset['mask']
        targets = dataset['targets']
        token_type_ids = dataset['token_type_ids']
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        ids = input_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)
        optimizer.zero_grad()
        outputs = model(
                    ids=ids,
                    mask=mask,
                    token_type_ids=token_type_ids
                    )
        loss = loss_fn(outputs, targets)
        loss.backward()
        # if (batch_idx + 1) % accumulation_steps == 0:
        optimizer.step()
        scheduler.step()

In [68]:
def eval_fn(data_loader, model, device):
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for batch_idx, dataset in tqdm(enumerate(data_loader), total=len(data_loader)):
            input_ids = dataset['input_ids']
            mask = dataset['mask']
            targets = dataset['targets']
            token_type_ids = dataset['token_type_ids']
            ids = input_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)
            
#             print(ids.shape)
#             print(mask.shape)
            

            outputs = model(
                    ids=ids,
                    mask=mask,
                token_type_ids=token_type_ids
                    )
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [30]:
tokenizer(['hello my name is'])

In [41]:
#prepare dataset
import pandas as pd
data = pd.read_csv('./footnotes_cik_test_rp.csv')
data.category.unique()

array(['Business_Combination', 'Compensation', 'Equity',
       'Commitments_Contingencies', 'Investments', 'Transfer_Servising',
       'Consolidation', 'Restructuring', 'Debt', 'PPE', 'Leases',
       'Liabilities', 'Research_Development', 'Goodwill_Intangible',
       'Asset_Retirement', 'Income_Tax', 'Fair_Value',
       'Derivatives_Hedging', 'Post_Retirement', 'Receivables',
       'Related_Party', 'Cash', 'Inventory', 'Real_Estate',
       'Warranty_Guarantee', 'Revenue', 'Collaborative_Arrangements',
       'Deferred_Revenue', 'Other_Assets', 'Financial_Services',
       'Oil_Gas', 'Insurance'], dtype=object)

In [42]:
selected_data = data[data['category'].isin(['Business_Combination', 'Compensation'])]
selected_data = selected_data.loc[:, ['clean_text', 'category']]

In [43]:
selected_data = selected_data.reset_index()

In [44]:
selected_data = selected_data.loc[:, ['clean_text', 'category']]

In [45]:
selected_data = selected_data.head(1000)

In [46]:
dfx = selected_data

In [47]:
dfx.category = dfx.category.apply(
        lambda x : 1 if x == 'Business_Combination' else 0
    )

In [48]:
from sklearn.model_selection import train_test_split
dftrain, dftest = train_test_split(
                                dfx,
                                test_size=0.1,
                                random_state=42,
                                stratify=dfx.category.values
                                )

In [18]:
df_train.category.unique()

In [49]:
df_train = dftrain.reset_index(drop=True)
df_valid = dftest.reset_index(drop=True)


In [50]:
from transformers import AutoTokenizer
train_dataset = Llamadataset(
                                review = df_train.clean_text.values, 
                                target = df_train.category.values
                                )
    
valid_dataset = Llamadataset(
                                review = df_valid.clean_text.values, 
                                target = df_valid.category.values
                                )


Downloading (…)okenizer_config.json: 100%|██████████| 727/727 [00:00<00:00, 6.37MB/s]
Downloading tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 24.8MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 411/411 [00:00<00:00, 4.61MB/s]
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565, and set the legacy attribute accordingly.


In [66]:
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
train_data_loader = DataLoader(
                train_dataset,
                batch_size = 16,
                shuffle=False
                )
valid_data_loader = DataLoader(
                valid_dataset,
                batch_size = 2,
                )

In [54]:
model = llamamodel()
import torch
device = torch.device('cuda')
model.to(device);
param_optimizer = list(model.named_parameters())

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


In [55]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001,
            
        },
         {
            'params': [p for n, p in param_optimizer if  any(nd in n for nd in no_decay)], 'weight_decay': 0.001,
            
        }
    ]

In [56]:
num_train_steps = int(len(df_train) / 4 * 2)
optimizer = AdamW(optimizer_parameters, lr=3e-5)
scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps 
        )



In [43]:
import transformers
from transformers import LlamaModel
import torch.nn as nn

class llamamodel(nn.Module):
    def __init__(self):
        super(llamamodel, self).__init__()
        self.model_path = "saibo/llama-1B"
        self.llama = LlamaModel.from_pretrained(self.model_path, return_dict=False, device_map="auto")
#         for layer in self.llama.layers:
#             layer.trainable = False
        self.llama_drop = nn.Dropout(0.3)
        self.out = nn.Linear(4096, 1)

    def forward(self, ids, mask, token_type_ids):
        o1, o2 = self.llama(ids, attention_mask=mask, position_ids=token_type_ids)
        print(o1.shape)
        bo = self.llama_drop(o1)
        output = self.out(bo)
        return output
model = llamamodel()
model.to(device);
param_optimizer = list(model.named_parameters())
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001,
            
        },
         {
            'params': [p for n, p in param_optimizer if  any(nd in n for nd in no_decay)], 'weight_decay': 0.001,
            
        }
    ]


The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


In [71]:
import numpy as np
best_accuracy = 0
from tqdm import tqdm
from sklearn import metrics
import torch
device = torch.device('cuda')
for epoch in range(2):
    print("train started")
    train_fn(train_data_loader, model, optimizer, device, scheduler)
    outputs, targets = eval_fn(valid_data_loader, model, device)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    print(f"Accuracy Score: {accuracy}")
    if accuracy > best_accuracy:
        torch.save(model.state_dict(), './')
        best_accuracy = accuracy
        


train started


100%|██████████| 57/57 [01:04<00:00,  1.14s/it]
100%|██████████| 50/50 [00:03<00:00, 16.40it/s]


Accuracy Score: 1.0


In [None]:
from tqdm import tqdm
for batch_idx, dataset in tqdm(enumerate(train_data_loader), total=len(train_data_loader)):
    print(batch_idx)

  0%|          | 0/225 [00:00<?, ?it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

