In [None]:
pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m65.9 MB/s[0m eta [36m0:00:0

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install torch



In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
labelled_tweets = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Final_Labels.csv')

train_df, test_df = train_test_split(labelled_tweets, test_size = 0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size = 0.2, random_state=42)

In [None]:
model_name2 = 'vinai/bertweet-large'
tokenizer = AutoTokenizer.from_pretrained(model_name2)
model2 = AutoModelForSequenceClassification.from_pretrained(model_name2,num_labels = 4)

Downloading (…)lve/main/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize_data(data, tokenizer):
  input_ids = []
  attention_masks = []

  for text in data['TranslatedText']:
    encoded_text = tokenizer.encode_plus(
        text,
        add_special_tokens = True,
        max_length=256,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors = 'pt'
    )
    input_ids.append(encoded_text['input_ids'])
    attention_masks.append(encoded_text['attention_mask'])

  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim = 0)
  labels = torch.tensor(data['Label'].map({'Supporter':0,'Against':1,'Manipulator':2,'Neutral': 3}).tolist())

  return input_ids, attention_masks, labels

In [None]:
train_input_ids, train_attention_masks, train_labels = tokenize_data(train_df, tokenizer)
val_input_ids, val_attention_masks, val_labels = tokenize_data(val_df, tokenizer)
test_input_ids, test_attention_masks, test_labels = tokenize_data(test_df, tokenizer)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
val_loader = DataLoader(val_dataset, batch_size=32)

test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_loader = DataLoader(test_dataset, batch_size=32)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model2.to(device)
optimizer = AdamW(model2.parameters(), lr=2e-5)



In [None]:
num_epochs = 6
for epoch in range(num_epochs):
    model2.train()
    for batch in train_loader:
        input_ids, attention_mask, label = batch
        input_ids, attention_mask, label = input_ids.to(device), attention_mask.to(device), label.to(device)

        outputs = model2(input_ids, attention_mask = attention_mask, labels = label)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    #Validation
    model2.eval()
    with torch.no_grad():
        total_val_loss = 0.0
        correct_predictions = 0
        total_predictions = 0
        predictions = []
        ground_truth = []
        for batch in val_loader:
            input_ids, attention_mask, label = batch
            input_ids, attention_mask, label = input_ids.to(device), attention_mask.to(device), label.to(device)
            outputs = model2(input_ids, attention_mask = attention_mask, labels = label)
            loss = outputs.loss
            total_val_loss += loss.item()

            logits = outputs.logits
            predicted_labels = torch.argmax(logits, dim=1)

            predictions.extend(predicted_labels.cpu().numpy())
            ground_truth.extend(label.cpu().numpy())

            correct_predictions += (predicted_labels == label).sum().item()
            total_predictions += label.size(0)

        test_accuracy = correct_predictions/ total_predictions
        average_val_loss = total_val_loss/len(val_loader)
        print(f"Validation Accuracy: {test_accuracy:.4f}")
        print(f"Average Validation Loss: {average_val_loss:.4f}")

         #Calculate the accuracy and F1 score for each label
        target_names = ['Supporter','Against','Manipulator','Neutral']
        report = classification_report(ground_truth, predictions, target_names=target_names,output_dict=True)

        print("Classification Report:")
        print(pd.DataFrame(report).transpose())

Validation Accuracy: 0.6331
Average Validation Loss: 1.2096
Classification Report:
              precision    recall  f1-score     support
Supporter      0.633136  1.000000  0.775362  107.000000
Against        0.000000  0.000000  0.000000   15.000000
Manipulator    0.000000  0.000000  0.000000   30.000000
Neutral        0.000000  0.000000  0.000000   17.000000
accuracy       0.633136  0.633136  0.633136    0.633136
macro avg      0.158284  0.250000  0.193841  169.000000
weighted avg   0.400861  0.633136  0.490910  169.000000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.7041
Average Validation Loss: 0.9362
Classification Report:
              precision    recall  f1-score     support
Supporter      0.762295  0.869159  0.812227  107.000000
Against        0.000000  0.000000  0.000000   15.000000
Manipulator    0.553191  0.866667  0.675325   30.000000
Neutral        0.000000  0.000000  0.000000   17.000000
accuracy       0.704142  0.704142  0.704142    0.704142
macro avg      0.328872  0.433956  0.371888  169.000000
weighted avg   0.580836  0.704142  0.634130  169.000000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.7870
Average Validation Loss: 0.8162
Classification Report:
              precision    recall  f1-score     support
Supporter      0.786260  0.962617  0.865546  107.000000
Against        1.000000  0.200000  0.333333   15.000000
Manipulator    0.781250  0.833333  0.806452   30.000000
Neutral        0.666667  0.117647  0.200000   17.000000
accuracy       0.786982  0.786982  0.786982    0.786982
macro avg      0.808544  0.528399  0.551333  169.000000
weighted avg   0.792311  0.786982  0.740870  169.000000
Validation Accuracy: 0.7219
Average Validation Loss: 0.7848
Classification Report:
              precision    recall  f1-score     support
Supporter      0.811321  0.803738  0.807512  107.000000
Against        0.833333  0.333333  0.476190   15.000000
Manipulator    0.600000  0.900000  0.720000   30.000000
Neutral        0.333333  0.235294  0.275862   17.000000
accuracy       0.721893  0.721893  0.721893    0.721893
macro avg      0.644497  0.568091  0.569891  169.0

In [None]:
model2.eval()
with torch.no_grad():
  total_test_loss = 0.0
  correct_predictions = 0
  total_predictions = 0
  predictions = []
  ground_truth = []
  for batch in test_loader:
    input_ids, attention_mask, label = batch
    input_ids, attention_mask, label = input_ids.to(device), attention_mask.to(device), label.to(device)
    outputs = model2(input_ids, attention_mask=attention_mask, labels = label)
    loss = outputs.loss
    total_test_loss += loss.item()

    logits = outputs.logits
    predicted_labels = torch.argmax(logits, dim=1)

    predictions.extend(predicted_labels.cpu().numpy())
    ground_truth.extend(label.cpu().numpy())

    correct_predictions += (predicted_labels == label).sum().item()
    total_predictions += label.size(0)

test_accuracy = correct_predictions/ total_predictions
average_test_loss = total_test_loss/ len(test_loader)

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Average Test Loss: {average_test_loss:.4f}")

 #Calculate the accuracy and F1 score for each label
target_names = ['Supporter','Against','Manipulator','Neutral']
report = classification_report(ground_truth, predictions, target_names=target_names,output_dict=True)

print("Classification Report:")
print(pd.DataFrame(report).transpose())


Test Accuracy: 0.7075
Average Test Loss: 1.0113
Classification Report:
              precision    recall  f1-score     support
Supporter      0.727273  0.888889  0.800000  135.000000
Against        0.666667  0.250000  0.363636   16.000000
Manipulator    0.666667  0.588235  0.625000   34.000000
Neutral        0.545455  0.222222  0.315789   27.000000
accuracy       0.707547  0.707547  0.707547    0.707547
macro avg      0.651515  0.487337  0.526106  212.000000
weighted avg   0.689823  0.707547  0.677333  212.000000


In [None]:
completetweets = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/completetweets.csv')

In [None]:
import json


In [None]:
print(dir(train_df))


['Datetime', 'Label', 'LanguageUsed', 'T', 'Text', 'TranslatedText', 'TweetID', 'Username', '_AXIS_LEN', '_AXIS_ORDERS', '_AXIS_TO_AXIS_NUMBER', '_HANDLED_TYPES', '__abs__', '__add__', '__and__', '__annotations__', '__array__', '__array_priority__', '__array_ufunc__', '__array_wrap__', '__bool__', '__class__', '__contains__', '__copy__', '__dataframe__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__divmod__', '__doc__', '__eq__', '__finalize__', '__floordiv__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__iand__', '__ifloordiv__', '__imod__', '__imul__', '__init__', '__init_subclass__', '__invert__', '__ior__', '__ipow__', '__isub__', '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__lt__', '__matmul__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pos__', '__pow__', '__radd__', '__rand__', '__rdivmod__', '__reduce

In [None]:
torch.save(model2.state_dict(),'math_BERTweets.pth')

# Save other relevant information
#additional_info = {
 #   'hyperparameters': {
  #      'learning_rate': 0.001,
   #     'batch_size': 32,
    #    'num_epochs': 10
    #},
    #'tokenizer_config': tokenizer.save_pretrained("tokenizer_directory"),
    #'training_data_info': {
     #   'dataset_size': len(train_df),
    #},
    #'optimizer_state': optimizer.state_dict()
    #}

#with open('additional_info.json', 'w') as info_file:
 #   json.dump(additional_info, info_file)

In [None]:
save_path = '/content/drive/MyDrive/Colab Notebooks/FineTunedModels/'

model2.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)



('/content/drive/MyDrive/Colab Notebooks/FineTunedModels/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/FineTunedModels/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/FineTunedModels/vocab.json',
 '/content/drive/MyDrive/Colab Notebooks/FineTunedModels/merges.txt',
 '/content/drive/MyDrive/Colab Notebooks/FineTunedModels/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/FineTunedModels/tokenizer.json')

In [None]:
model = BertForSequenceClassification.from_pretrained('/content/drive/MyDrive/Colab Notebooks/FineTunedModels/')

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/Colab Notebooks/FineTunedModels/ and are newly initialized: ['encoder.layer.7.attention.self.query.bias', 'encoder.layer.19.attention.self.key.weight', 'encoder.layer.13.attention.self.query.weight', 'encoder.layer.13.attention.self.key.bias', 'encoder.layer.13.output.LayerNorm.weight', 'encoder.layer.0.output.dense.weight', 'encoder.layer.2.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.21.output.dense.bias', 'encoder.layer.7.attention.output.LayerNorm.bias', 'encoder.layer.11.attention.self.key.bias', 'encoder.layer.12.output.dense.bias', 'encoder.layer.9.intermediate.dense.weight', 'encoder.layer.20.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.dens

In [None]:
tokenizer = BertTokenizer.from_pretrained('/content/drive/MyDrive/Colab Notebooks/FineTunedModels/')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'BertTokenizer'.


TypeError: ignored