In [None]:
import torch
# Release any CUDA memory cached by a previous run in this kernel.
torch.cuda.empty_cache()

# This notebook needs a GPU: stop right here if none is visible.
assert torch.cuda.is_available()

# Report what we found, then create the device handle used by later cells.
n_gpu = torch.cuda.device_count()
device_name = torch.cuda.get_device_name()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")
device = torch.device("cuda")

Found device: Tesla T4, n_gpu: 1


In [None]:
import random
import numpy as np

def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global RNG and PyTorch (CPU, the
    current CUDA device and all CUDA devices), then switches cuDNN to
    deterministic mode with auto-tuning (benchmark) turned off.

    Args:
        seed: integer seed applied to all generators (default 42).
    """
    # cuDNN: no auto-tuned kernels, deterministic algorithms only.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Same seed for every generator, in the same order as before.
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

seed_everything()

In [None]:
!pip install transformers
!pip install -U -q PyDrive



In [None]:
from helpers import tokenize_and_format, flat_accuracy

In [None]:
import pandas as pd
df = pd.read_csv('/content/hausa.csv')

In [None]:
df

Unnamed: 0,english_translated,label
0,There is nil azzarah God can do for the creati...,negative
1,There is nothing azzarah God can do for the cr...,negative
2,There is nothing azzarah God can do for the cr...,negative
3,There is nil azzarah God can do for the creati...,negative
4,There is nil azzarah God can do for the native...,negative
...,...,...
386617,"The manager cook me, the nasan would match the...",positive
386618,"The director cooked me, the nasan would match ...",positive
386619,The film's manager cook me the nasan would mat...,positive
386620,The film's director cooked me the nasan would ...,positive


In [None]:
from helpers import tokenize_and_format, flat_accuracy
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

seed_everything()

# Load the full training corpus.
df = pd.read_csv('/content/hausa.csv')

# Shuffle the rows. No random_state is passed, so this draws from NumPy's
# global RNG, which seed_everything() seeded just above.
df = df.sample(frac=1).reset_index(drop=True)

# Map the string labels to integer class ids.
label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df['label'])

texts = df.english_translated.values   # text column fed to the tokenizer
class_ids = df.encoded_label.values    # integer class id per row

### tokenize_and_format() is a helper function provided in helpers.py ###
### Make sure you use the correct model name for your tokenizer! ###
input_ids, attention_masks = tokenize_and_format(texts)

# One-hot encode the labels in a single vectorised step: row i of the identity
# matrix is the one-hot vector of class i. (The class count is computed once,
# instead of rebuilding set(labels) for every row.)
num_classes = len(label_encoder.classes_)
one_hot_labels = np.eye(num_classes)[class_ids]

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(one_hot_labels)  # float64, shape (num_rows, num_classes)

# Print sentence 0, now as a list of IDs.
print('Original: ', texts[0])
print('Token IDs:', input_ids[0])

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
df.head()

Unnamed: 0,ID,tweet,label,cleaned_text,translated,encoded_label
0,yo_train_08001,"RT @user: @user omo olojo'bi, emi yin a se pup...",positive,RT omo olojobi emi yin a se pupo re laye ati...,The Heart of Helli I'm Mounted to Has Mine and...,2
1,yo_train_05065,Imototo bori arun mole bi oye se n bori ooru h...,positive,Imototo bori arun mole bi oye se n bori ooru,Santric Sickness Mole Better Assignment,2
2,yo_train_03436,"Kí á tan iná pa agbọ́nrán, k'á f'ọ̀pá gbọọrọ p...",neutral,Kí á tan iná pa agbọ́nrán ká fọ̀pá gbọọrọ pejò...,Let us turn a lot of money to give a seventy c...,1
3,yo_train_03821,RT @user: @user @user @user ohun ni a fi n pe ...,neutral,RT ohun ni a fi n pe ni ogidan oloola ijua...,RT What is called an Operial Olora Nevera with...,1
4,yo_train_05652,RT @user: Ọ̀la ni ọdún tuntun. 2018 ti ti ẹnu ...,positive,RT Ọ̀la ni ọdún tuntun ti ti ẹnu bọ epo A kú...,Rune tomorrow is the last year of oil,2


In [None]:
seed_everything()

total = len(df)

# 80% train, 10% validation, whatever is left for test.
num_train = int(total * .8)
num_val = int(total * .1)
num_test = total - num_train - num_val
val_end = num_train + num_val

# Contiguous index ranges per split (the dataframe was already shuffled
# in the data-preparation cell, so no further shuffling is needed here).
split_ranges = {
    "train": range(0, num_train),
    "val": range(num_train, val_end),
    "test": range(val_end, total),
}

# Each example is a 3-tuple: (input ids, attention mask, label).
train_set, val_set, test_set = (
    [(input_ids[i], attention_masks[i], labels[i]) for i in idx]
    for idx in split_ranges.values()
)

# Raw text for the same rows, kept in the same order.
train_text, val_text, test_text = (
    [texts[i] for i in idx] for idx in split_ranges.values()
)


In [None]:
from transformers import BertForSequenceClassification, BertConfig
# AdamW comes from torch.optim. The transformers AdamW is deprecated in favour
# of it, and the old import was shadowed by this one anyway.
from torch.optim import AdamW

# Pretrained BERT encoder with a freshly initialised 3-way classification head.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
)

# Tell pytorch to run this model on the GPU.
model.cuda()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Number of examples per forward/backward pass (used by the training and evaluation loops).
batch_size = 8
# you can change lr and eps values in the AdamW call if you like
optimizer = AdamW(model.parameters(),lr=5e-05) # lr is set explicitly; eps is not passed, so AdamW's default is used
# Number of full passes over the training set.
epochs = 5

In [None]:
# function to get validation accuracy
def get_validation_performance(val_set):
    """Evaluate the global ``model`` on ``val_set`` and report accuracy and loss.

    Relies on the notebook globals ``model``, ``batch_size`` and ``device``.
    The model is switched to evaluation mode and left in that mode.

    Args:
        val_set: list of ``(input_ids, attention_mask, one_hot_label)`` tensor
            triples; the label is compared via ``argmax`` over its last axis.

    Returns:
        A 3-tuple ``(accuracy, [accuracy], [mean_batch_loss])`` where
        ``accuracy`` is the fraction of correctly classified examples and
        ``mean_batch_loss`` is the loss averaged over the batches processed.

    Raises:
        ValueError: if ``val_set`` is empty.
    """
    if len(val_set) == 0:
        raise ValueError("val_set is empty; nothing to evaluate")

    val_losses = []
    val_accuracies = []

    # Put the model in evaluation mode
    model.eval()

    # Tracking variables
    total_eval_loss = 0
    total_correct = 0

    # Ceiling division: counts exactly the non-empty batches, so the mean loss
    # below is not diluted when len(val_set) is a multiple of batch_size.
    num_batches = (len(val_set) + batch_size - 1) // batch_size

    for i in range(num_batches):
        batch = val_set[i * batch_size:(i + 1) * batch_size]

        # Move tensors to the GPU
        b_input_ids = torch.stack([data[0] for data in batch]).to(device)
        b_input_mask = torch.stack([data[1] for data in batch]).to(device)
        b_labels = torch.stack([data[2] for data in batch]).to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # Note: this line of code might need to change depending on the model
            # the current line will work for bert-base-uncased
            # please refer to huggingface documentation for other models
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

        # Accumulate the validation loss.
        total_eval_loss += outputs.loss.item()

        # Move logits and labels to CPU
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the number of correctly labeled examples in batch
        pred_flat = np.argmax(logits, axis=1).flatten()
        labels_flat = np.argmax(label_ids, axis=1).flatten()
        total_correct += np.sum(pred_flat == labels_flat)

    # Report the final accuracy for this validation run.
    print("Num of correct predictions =", total_correct)
    avg_val_accuracy = total_correct / len(val_set)
    val_accuracies.append(avg_val_accuracy)
    val_losses.append(total_eval_loss / num_batches)
    return avg_val_accuracy, val_accuracies, val_losses



In [None]:
import random
seed_everything()

# Per-epoch training history. Created once, outside the epoch loop, so that it
# holds one entry per epoch when training finishes.
train_loss = []
train_accuracy = []

# training loop

# For each epoch...
for epoch_i in range(0, epochs):
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Reset the running totals for this epoch. These must be initialised here:
    # otherwise the first "+=" raises NameError on a fresh kernel, or silently
    # keeps accumulating across epochs on a stale one.
    total_train_loss = 0
    total_correct = 0
    total_samples = 0

    # Put the model into training mode.
    model.train()

    # For each batch of training data...
    # Ceiling division so that only non-empty batches are counted.
    num_batches = (len(train_set) + batch_size - 1) // batch_size

    for i in range(num_batches):
        batch = train_set[i * batch_size:(i + 1) * batch_size]

        # Move tensors to the GPU
        b_input_ids = torch.stack([data[0] for data in batch]).to(device)
        b_input_mask = torch.stack([data[1] for data in batch]).to(device)
        b_labels = torch.stack([data[2] for data in batch]).to(device)

        optimizer.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        # this line of code might need to change depending on the model
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

        loss = outputs.loss
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

        # Running training accuracy. These predictions come from the
        # training-mode forward pass above.
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        pred_flat = np.argmax(logits, axis=1).flatten()
        labels_flat = np.argmax(label_ids, axis=1).flatten()
        total_correct += np.sum(pred_flat == labels_flat)
        total_samples += len(batch)

    epoch_train_accuracy = total_correct / total_samples
    epoch_train_loss = total_train_loss / num_batches
    train_accuracy.append(epoch_train_accuracy)
    train_loss.append(epoch_train_loss)

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set. Implement this function in the cell above.
    print(f"Total loss: {total_train_loss}")
    # Separate names so the training `loss` tensor is not overwritten.
    val_acc, val_acc_history, val_loss_history = get_validation_performance(val_set)
    print(f"Validation accuracy: {val_acc}")

print("")
print("Training complete!")

# TODO: SAVE YOUR MODEL HERE... (Refer PyTorch documentation for how to save models)



Training...
Total loss: 377.1021184883552
Num of correct predictions = 399
Validation accuracy: 0.5799418604651163

Training...
Total loss: 285.5407759232086
Num of correct predictions = 369
Validation accuracy: 0.5363372093023255

Training...
Total loss: 179.7594067101551
Num of correct predictions = 389
Validation accuracy: 0.565406976744186

Training...
Total loss: 108.92477880448365
Num of correct predictions = 381
Validation accuracy: 0.5537790697674418

Training...
Total loss: 82.09502554322772
Num of correct predictions = 349
Validation accuracy: 0.5072674418604651

Training complete!


# Evaluate your model on the test set
Once you're satisfied with your hyperparameters (i.e., further changes no longer improve validation accuracy), it's time to evaluate your model on the test set! Run the cell below to compute test set accuracy.


In [None]:
seed_everything()

# If your notebook disconnects during training, then here, first load the best
# model you saved (refer PyTorch docs), then check validation performance

get_validation_performance(test_set)

Num of correct predictions = 399


0.579100145137881

In [None]:
import pandas as pd
import re
import string
# Load the tab-separated file whose 'tweet' column is cleaned below.
# Note: this rebinds `df`, which earlier cells bound to the training data frame.


df = pd.read_csv("/content/twi_test.tsv",sep="\t")

# Define the clean_text function
# Built once at definition time instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_EMOJI_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F700-\U0001F77F"  # alchemical symbols
    u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    u"\U0001FA00-\U0001FA6F"  # Chess Symbols
    u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    u"\U00002702-\U000027B0"  # Dingbats
    u"\U000024C2-\U0001F251"  # NOTE(review): very wide range (everything from U+24C2 up) - confirm intended
    "]+", flags=re.UNICODE)


def clean_text(text):
    """Strip URLs, @mentions, ASCII punctuation, digits and emoji from a tweet.

    Whitespace is left untouched, so removed tokens may leave extra spaces.

    Args:
        text: the raw tweet. Anything that is not a ``str`` (for example a
            NaN from a missing cell) is treated as empty.

    Returns:
        The cleaned string, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    # Remove URLs (must run before punctuation stripping, which would break them up)
    text = re.sub(r'http\S+', '', text)
    # Remove mentions (e.g., @user)
    text = re.sub(r'@\w+', '', text)
    # Remove ASCII punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove emojis
    return _EMOJI_PATTERN.sub(r'', text)

# Clean the raw 'tweet' column into a new 'cleaned_text' column.
df['cleaned_text'] = df['tweet'].apply(clean_text)
# index must be the boolean False: the string "False" is truthy, so the row
# index was still written and came back as an "Unnamed: 0" column on reload.
df.to_csv("twi_test.csv", index=False)

# Display the cleaned DataFrame


In [None]:
df

Unnamed: 0,ID,tweet,label,cleaned_text
0,yo_test_00001,mo kí àrẹ wa alàgbà kú àbọ̀ o toò iṣẹ́ yá,positive,mo kí àrẹ wa alàgbà kú àbọ̀ o toò iṣẹ́ yá
1,yo_test_00002,tọ̀tún tòsì ni ẹyẹlé fi nkó ire wọlé,positive,tọ̀tún tòsì ni ẹyẹlé fi nkó ire wọlé
2,yo_test_00003,sá fún iṣẹ́ abẹ yẹra fún oògùn òyìnbó lo egbòg...,positive,sá fún iṣẹ́ abẹ yẹra fún oògùn òyìnbó lo egbòg...
3,yo_test_00004,òwú kì í là kí inú ó bólóko ǹjẹ́ lónìí tí í ṣe...,positive,òwú kì í là kí inú ó bólóko ǹjẹ́ lónìí tí í ṣe...
4,yo_test_00005,beeni oo ko si iro mbe ọmọ ni afẹ́ ayé,positive,beeni oo ko si iro mbe ọmọ ni afẹ́ ayé
...,...,...,...,...
4510,yo_test_04512,nígbàtí ìgbà àti àkókò bá súnkì di àgbákò níbo...,negative,nígbàtí ìgbà àti àkókò bá súnkì di àgbákò níbo...
4511,yo_test_04513,lákòótán ẹni tí ò jẹ́ nǹkan kan tó ń ṣe bí ẹní...,negative,lákòótán ẹni tí ò jẹ́ nǹkan kan tó ń ṣe bí ẹní...
4512,yo_test_04514,adiye ti n jefun araa won o awon omo aregbesol...,negative,adiye ti n jefun araa won o awon omo aregbesol...
4513,yo_test_04515,ibẹ̀ làwọn kan alàìmore ènìyàn ńtà ọmọ tí èléd...,negative,ibẹ̀ làwọn kan alàìmore ènìyàn ńtà ọmọ tí èléd...


In [None]:
# Load the Yoruba test file (tab-separated).
df1 = pd.read_csv('/content/yo_test.tsv', sep="\t")

# Reuse the encoder fitted on the training labels so the label -> id mapping
# matches what the model was trained on. Fitting a fresh encoder on the test
# labels could assign different ids if the label sets differ; transform()
# raises instead if the test file contains a label unseen in training.
df1['encoded_label'] = label_encoder.transform(df1['label'])

# The 'translated' column holds the text that is tokenized for prediction.
texts = df1.translated.values


In [None]:
df1.isnull().sum()

Unnamed: 0       0
ID               0
tweet            0
label            0
cleaned_text     0
translated       0
encoded_label    0
dtype: int64

In [None]:
from helpers import tokenize_and_format, flat_accuracy
input_ids, attention_masks = tokenize_and_format(texts)
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
print('Original: ', texts[0])
print('Token IDs:', input_ids[0])

Original:  I Fear Our Friend Died Dear West - Money
Token IDs: tensor([ 101, 1045, 3571, 2256, 2767, 2351, 6203, 2225, 1011, 2769,  102,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0])


In [None]:
test_set = [(input_ids[i], attention_masks[i]) for i in range(len(df1))]

In [None]:
# Predicted class id for every test example, in test_set order.
# NOTE: `id` shadows the builtin, but later cells read the predictions by this name.
id = []

# Inference only: make sure dropout is disabled.
model.eval()

# Iterate through the test set, one example at a time
for example in test_set:
    # The entries are already tensors; just add a batch dimension of 1 and move
    # them to the GPU (re-wrapping in torch.tensor() only copies and warns).
    b_input_ids = example[0].unsqueeze(0).to(device)
    b_input_mask = example[1].unsqueeze(0).to(device)
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    id.append(torch.argmax(outputs.logits, dim=1).item())


In [None]:
label=df1["encoded_label"].values

In [None]:
# Number of positions where the prediction equals the gold label.
# NOTE: `sum` shadows the builtin; the next cell divides it by len(id).
sum = len([i for i in range(len(id)) if id[i] == label[i]])

In [None]:
sum/len(id)

0.5675928816734311

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Gold labels vs. the predictions collected in `id`.
y_true = df1['encoded_label']
accuracy = accuracy_score(y_true, id)

# Header and overall accuracy.
for line in (
    "------------------------------",
    "Yoruba Language Metrics",
    "-------------------------------",
    f'Accuracy: {accuracy}',
):
    print(line)

# Per-class precision / recall / F1.
print(classification_report(y_true, id))

------------------------------
Yoruba Language Metrics
-------------------------------
Accuracy: 0.5675928816734311
              precision    recall  f1-score   support

           0       0.42      0.50      0.46       757
           1       0.67      0.52      0.59      1332
           2       0.58      0.67      0.62      1114

    accuracy                           0.57      3203
   macro avg       0.56      0.56      0.56      3203
weighted avg       0.58      0.57      0.57      3203





---

Finished? Remember to upload the PDF of this notebook, your report, and your three dataset files (annotator1.tsv, annotator2.tsv, and final_data.tsv) to Gradescope, with the filename formatted as **Firstname_Lastname_HW2**.
