In [None]:
%%capture
!pip install transformers
!pip install --upgrade wandb
!pip install -U accelerate

In [None]:
import pandas as pd
import random
import accelerate
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch
from torch import nn
from torch.utils.data import DataLoader

# Global Config

In [None]:
# Training hyper-parameters.
num_epochs = 5
batch_size = 16

# Binary task; list position matches the label id (0 -> 'Meaningless', 1 -> 'Meaningful').
class_names = ['Meaningless', 'Meaningful']
num_labels = 2

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Mount Google Drive at /content/drive; the dataset CSV read below lives under
# this mount point, so this cell has to run before the data is loaded.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the labelled chunks with the python parser engine, skipping malformed lines.
df = pd.read_csv(
    "/content/drive/MyDrive/Datasets/Classify-Chunk-Coded-Chunks-Set-1.csv",
    engine="python",
    on_bad_lines='skip',
)

# Keep only the first four columns.
column_indices_tokeep = list(range(4))
df = df.iloc[:, column_indices_tokeep]
df.head()

Unnamed: 0,S No.,IndexFilename,chunkText,Chunk Quality [1 = Meaningful; 0 = Meaningless]
0,1,BE.-Center-Stanford-x-Dynamic-Norms-Case-Study...,1. https://farmdocdaily.illinois.edu/2021/05/a...,1
1,2,How Belonging and Social Proof Inspired Sustai...,sustainable-ï¬shing-in-\n \n mongolia%2f&titl...,0
2,3,9789240049680-eng.pdf,"situation/issue (with links, if possible).\n â...",0
3,4,9789240049680-eng.pdf,41\n \n Construct\n \n Question \n \n Indicato...,0
4,5,The Power of TV - Nudging Viewers to Decarboni...,"99 \n \n Kovacs, G., Wu, Z., & Bernstein, M. ...",0


In [None]:
df['chunkText'][0]

'1. https://farmdocdaily.illinois.edu/2021/05/an-overview-of-meat-consumption-in-the-united-states.html\n \n 2\n \n The Science: Why Dynamic Norms Are Effective\n \n The Stanford food hall experiment relied heavily on dynamic norms to increase plant-based lunch purchases. \n Dynamic norms use present and future tenses, whereas static norms use past tense (e.g., are changing versus \n have changed). The former gives the sense that because a shift in norms is currently taking place, anybody can take \n part and change their behavior. This active tense makes a norm more salient, which in turn can make an action, \n like buying a salad instead of a burger, seem achievable. Dynamic norms also highlight the collective action being \n taken. This focus on group change, especially when going against a norm engrained in culture and the status quo, \n can shift the focus to preconformity. According to the study authors, preconformity allows people to â€œanticipate a \n changed future worldâ€\x9d

In [None]:
# Rows per label value, as a plain {label: count} dict.
label_counts = (
    df['Chunk Quality [1 = Meaningful; 0 = Meaningless]']
    .value_counts()
    .to_dict()
)
label_counts

{1: 328, 0: 137}

In [None]:
# Select the labels by how often they occur. Calling max()/min() on the dict
# itself compares its keys (the label ids), which only coincides with the
# majority/minority class when the larger id also has the larger count.
maj_label = max(label_counts, key=label_counts.get)
min_label = min(label_counts, key=label_counts.get)

In [None]:
# Per-class weight = total rows / (number of classes * rows carrying that label),
# i.e. each weight is inversely proportional to that label's row count.
maj_class_wt = len(df) / (num_labels*label_counts[maj_label])
min_class_wt = len(df) / (num_labels*label_counts[min_label])

In [None]:
torch.tensor([min_class_wt, maj_class_wt]).to(device)

tensor([1.6971, 0.7088], device='cuda:0')

In [None]:
model_ckpt = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Size the classification head from the shared `num_labels` setting instead of
# repeating the literal, so the head and the class weights cannot drift apart.
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels)
model = model.to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Smoke test: run two tiny sentences through the tokenizer and the model.
sent = ['Hello', 'Hi Ram']
model_input = tokenizer(
    sent,
    return_tensors='pt',
    max_length=512,
    truncation=True,
    padding=True,
)
model_input = model_input.to(device)
print(model_input['input_ids'].shape)
out = model(**model_input)

torch.Size([2, 4])


In [None]:
out

SequenceClassifierOutput(loss=None, logits=tensor([[-0.3687,  0.1427],
        [-0.3566,  0.2275]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
out.logits.view(-1, model.config.num_labels).shape

torch.Size([2, 2])

In [None]:
# Dummy targets for the two smoke-test sentences, created directly on `device`.
label = torch.tensor([0, 1], device=device)

In [None]:
# Scratch check: weighted cross-entropy of the smoke-test logits against the dummy targets.
criterion = nn.CrossEntropyLoss(
    weight=torch.tensor([0.6, 0.4]).to(device),
)
criterion(out.logits, label)

tensor(0.7660, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
# Raw chunk texts and their 0/1 quality labels.
X = list(df["chunkText"])
y = list(df['Chunk Quality [1 = Meaningful; 0 = Meaningless]'])

# Stratified 80/20 split. `random_state` pins the partition so that re-running
# the notebook reproduces the same train/validation rows; without it, a model
# reloaded from disk in a later run could be scored on rows it was trained on.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# Tokenize each split: truncate to 512 tokens and pad within the split.
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [None]:
X[0], y[0]

('1. https://farmdocdaily.illinois.edu/2021/05/an-overview-of-meat-consumption-in-the-united-states.html\n \n 2\n \n The Science: Why Dynamic Norms Are Effective\n \n The Stanford food hall experiment relied heavily on dynamic norms to increase plant-based lunch purchases. \n Dynamic norms use present and future tenses, whereas static norms use past tense (e.g., are changing versus \n have changed). The former gives the sense that because a shift in norms is currently taking place, anybody can take \n part and change their behavior. This active tense makes a norm more salient, which in turn can make an action, \n like buying a salad instead of a burger, seem achievable. Dynamic norms also highlight the collective action being \n taken. This focus on group change, especially when going against a norm engrained in culture and the status quo, \n can shift the focus to preconformity. According to the study authors, preconformity allows people to â€œanticipate a \n changed future worldâ€\x9

In [None]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized encodings with optional labels.

    Args:
        encodings: Mapping from feature name (must include ``"input_ids"``) to a
            sequence holding one entry per example.
        labels: Optional sequence with one label per example. ``None`` or an
            empty sequence produces items without a ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per feature."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Check for presence explicitly: a bare `if self.labels:` raises for
        # multi-element NumPy arrays, tensors and pandas Series, whose truth
        # value is ambiguous.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` feature."""
        return len(self.encodings["input_ids"])

In [None]:
# Pair each tokenized split with its labels.
train_dataset, val_dataset = (
    Dataset(X_train_tokenized, y_train),
    Dataset(X_val_tokenized, y_val),
)

In [None]:
train_dataset[0]

{'input_ids': tensor([  101,  3265, 10995,  4070,  4335,  3167, 10995, 21447,  2393,  2000,
          3828,  2538,  1012,  1016,  1017,  1012,  1017,  1014,  1012,  1021,
          1014,  1012,  5511,  2629,  1006,  4358,  3296,  3465,  2011, 16798,
          2475,  1011,  2603,  1010,  2004,  1997,  2233,  2760,  1007,  3120,
          1024, 20287,  6599,  1004,  8205,  1010,  4358,  5366,  1997,  4171,
         27670,  1010, 10476,  1025, 27885,  2099,  1010,  3171,  1998, 10807,
         17680,  1010,  2233,  2760,  2028,  1997,  2087,  3928,  4973,  1997,
          9164,  2389, 20062,  2003,  1996,  3742,  1011, 12398,  2075,  1997,
          2111,  2046,  7494,  2005,  2037, 11550,  1024, 16165, 11550,  6577,
          3123,  2013,  4583,  2566,  9358,  2000,  6391,  2566,  9358,  1997,
          7792,  5126,  1999,  1996,  2086,  2044,  8285,  4372, 13153,  3672,
          2001,  3107,  1012,  2324,  2045,  2003,  2036, 10015,  3350,  2006,
          1996,  4022,  2000,  3623, 10

In [None]:
'''inputs = train_dataset[:5]
labels = inputs.get("labels")
outputs = model(**inputs)
logits = outputs.get('logits')
criterion(logits, labels)'''

'inputs = train_dataset[:5]\nlabels = inputs.get("labels")\noutputs = model(**inputs)\nlogits = outputs.get(\'logits\')\ncriterion(logits, labels)'

In [None]:
'''X = balanced_df["chunkText"]
y = balanced_df['Chunk Quality [1 = Meaningful; 0 = Meaningless]']
train_df, val_df = train_test_split(balanced_df, test_size=0.2,stratify=y)
train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)

class TextDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer):
        self.input_text = data["chunkText"]
        self.labels = data['Chunk Quality [1 = Meaningful; 0 = Meaningless]']
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.input_text[idx]
        target = self.labels[idx]
        encoded_input = self.tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

        item = {
            'input_ids': encoded_input['input_ids'].squeeze(),
            'attention_mask': encoded_input['attention_mask'].squeeze(),
            'labels': torch.tensor(target)
        }

        return item


train_dataset = TextDataset(train_df, tokenizer)
val_dataset = TextDataset(val_df, tokenizer)'''

'X = balanced_df["chunkText"]\ny = balanced_df[\'Chunk Quality [1 = Meaningful; 0 = Meaningless]\']\ntrain_df, val_df = train_test_split(balanced_df, test_size=0.2,stratify=y)\ntrain_df.reset_index(drop=True, inplace=True)\nval_df.reset_index(drop=True, inplace=True)\n\nclass TextDataset(torch.utils.data.Dataset):\n    def __init__(self, data, tokenizer):\n        self.input_text = data["chunkText"]\n        self.labels = data[\'Chunk Quality [1 = Meaningful; 0 = Meaningless]\']\n        self.tokenizer = tokenizer\n\n    def __len__(self):\n        return len(self.labels)\n\n    def __getitem__(self, idx):\n        text = self.input_text[idx]\n        target = self.labels[idx]\n        encoded_input = self.tokenizer(text, padding=\'max_length\', truncation=True, max_length=512, return_tensors=\'pt\')\n\n        item = {\n            \'input_ids\': encoded_input[\'input_ids\'].squeeze(),\n            \'attention_mask\': encoded_input[\'attention_mask\'].squeeze(),\n            \'labels\

In [None]:
def compute_metrics(p):
    """Score a batch of predictions with accuracy, precision, recall and F1.

    ``p`` unpacks into ``(scores, labels)``. The predicted class of each row is
    the argmax of its scores along axis 1; every metric is then computed from
    the true labels and those predicted classes.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {
        name: scorer(y_true=labels, y_pred=predicted)
        for name, scorer in scorers.items()
    }

In [None]:
# Define Trainer
args = TrainingArguments(
    output_dir="output",
    # Take epochs and batch size from the global config cell instead of
    # repeating the literals, so there is a single place to tune them.
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    evaluation_strategy= 'epoch',
    save_strategy='epoch',
    seed=0,
    load_best_model_at_end=True,
    # Mixed precision requires an accelerator; enabling it unconditionally makes
    # TrainingArguments raise when the notebook falls back to the CPU.
    fp16=torch.cuda.is_available(),
    overwrite_output_dir=True,
    logging_steps=1
)

In [None]:
# Per-class loss weights ordered by label id: CrossEntropyLoss applies
# weight[i] to class i. Building the tensor from `label_counts` by id keeps
# that alignment even when the minority class is not label 0.
weights = torch.tensor(
    [len(df) / (num_labels * label_counts[label_id]) for label_id in range(num_labels)]
).to(device)
criterion = nn.CrossEntropyLoss(weight=weights).to(device)

class MyTrainer(Trainer):
  """Trainer whose loss is the module-level, class-weighted `criterion`."""

  def compute_loss(self,
                  model,
                  inputs,
                  return_outputs=False,
                  num_items_in_batch=None):
    """Compute the weighted cross-entropy loss for one batch.

    Args:
      model: The model being trained or evaluated.
      inputs: Batch dict; its "labels" entry is used as the loss target.
      return_outputs: When true, return ``(loss, outputs)`` instead of ``loss``.
      num_items_in_batch: Accepted (and ignored) because newer transformers
        releases pass this keyword to ``compute_loss``; without it the
        override fails with an unexpected-keyword TypeError.

    Returns:
      The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
    """
    labels = inputs.get("labels")
    outputs = model(**inputs)
    loss = criterion(outputs.logits, labels)
    return (loss, outputs) if return_outputs else loss

# Weighted-loss trainer: fits on the training split, scores on the validation split.
trainer = MyTrainer(
    args=args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
''' import shutil
shutil.rmtree('Weighted-Trainer-Model')'''

In [None]:
trainer.save_model('Weighted-Trainer-Model')

In [None]:
# Reload from the directory written by `trainer.save_model('Weighted-Trainer-Model')`,
# so the model evaluated below is the one trained in this run rather than
# whatever copy happens to exist under /content/drive/MyDrive.
model_2 = AutoModelForSequenceClassification.from_pretrained('Weighted-Trainer-Model', num_labels=num_labels)
model_2 = model_2.to(device)

In [None]:
# text = "That was good point"
text = df['chunkText'][0]
inputs = tokenizer(text,padding = True, truncation = True, return_tensors='pt').to(device)
# Pure inference: make sure dropout is off and skip building the autograd graph.
model_2.eval()
with torch.no_grad():
    outputs = model_2(**inputs)
    # Softmax over the last axis turns the two logits into class probabilities.
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions = predictions.cpu().detach().numpy()
predictions

array([[0.02119457, 0.9788054 ]], dtype=float32)

In [None]:
# Wrap the reloaded model in a trainer whose evaluation set is the training split.
trainer = MyTrainer(
    args=args,
    model=model_2,
    compute_metrics=compute_metrics,
    eval_dataset=train_dataset,
)

In [None]:
# Metrics on the training split: relies on `trainer` having just been rebound
# with eval_dataset=train_dataset in the preceding cell.
train_results = trainer.evaluate()
print(train_results)

{'eval_loss': 0.11242059618234634, 'eval_accuracy': 0.967741935483871, 'eval_precision': 0.9770992366412213, 'eval_recall': 0.9770992366412213, 'eval_f1': 0.9770992366412213, 'eval_runtime': 4.4136, 'eval_samples_per_second': 84.286, 'eval_steps_per_second': 10.649}


In [None]:
# Wrap the reloaded model in a trainer whose evaluation set is the validation split.
trainer = MyTrainer(
    args=args,
    model=model_2,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
)

In [None]:
# Metrics on the validation split: relies on `trainer` having just been rebound
# with eval_dataset=val_dataset in the preceding cell.
val_results = trainer.evaluate()
print(val_results)

{'eval_loss': 0.18233942985534668, 'eval_accuracy': 0.946236559139785, 'eval_precision': 0.9692307692307692, 'eval_recall': 0.9545454545454546, 'eval_f1': 0.9618320610687022, 'eval_runtime': 1.1353, 'eval_samples_per_second': 81.916, 'eval_steps_per_second': 10.57}


In [None]:
# One row per split, with a leading column that names the split.
results = pd.DataFrame([train_results, val_results])
results.insert(
    loc=0,
    column='Dataset type',
    value=['Train dataset', 'Validation dataset'],
)

In [None]:
results

Unnamed: 0,Dataset type,eval_loss,eval_accuracy,eval_precision,eval_recall,eval_f1,eval_runtime,eval_samples_per_second,eval_steps_per_second
0,Train dataset,0.112421,0.967742,0.977099,0.977099,0.977099,4.4136,84.286,10.649
1,Validation dataset,0.182339,0.946237,0.969231,0.954545,0.961832,1.1353,81.916,10.57


In [None]:
results.to_csv('Results.csv', index=False)