In [12]:
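# %pip install torch transformers nltk scikit-learn pandas  # uncomment to install the packages imported below into the kernel's environment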

In [2]:
# import section
import time, datetime, random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import(
    DataLoader,
    RandomSampler,
    SequentialSampler,
    TensorDataset,
    random_split
)
import transformers
from transformers import(
    AutoTokenizer,
    RobertaForSequenceClassification, # Import the model of your choice
    get_linear_schedule_with_warmup,
    AdamW
)
from nltk.metrics import ConfusionMatrix # Looks better than the sklearn CM
from sklearn.metrics import classification_report

In [3]:
# Single seed used for the train/test split (DataFrame.sample) and for the
# random / numpy / torch RNGs inside train_model().
seed_val = 42

In [4]:
def calculate_stat(predictions, actual):
    """Flatten batched outputs, print evaluation metrics, and return the flat lists.

    Args:
        predictions: iterable of batches; every batch is walked row by row and
            ``np.argmax`` of a row is taken as that row's predicted class.
        actual: iterable of per-batch label arrays that expose ``.tolist()``.

    Returns:
        Tuple ``(predicted_classes, gold_labels)`` of two flat Python lists.
    """
    # One predicted class per row, batch after batch.
    predicted_classes = []
    for batch_scores in predictions:
        for row_scores in batch_scores:
            predicted_classes.append(np.argmax(row_scores))

    # Unpack every per-batch label array into a single flat list.
    gold_labels = [label for batch_labels in actual for label in batch_labels.tolist()]

    # Build both reports before printing either one (gold labels are the reference).
    confusion = ConfusionMatrix(gold_labels, predicted_classes)
    report = classification_report(gold_labels, predicted_classes)
    print(confusion)
    print(report)
    return (predicted_classes, gold_labels)

In [5]:
# Tokenize all of the sentences and map the tokens to their word IDs.
# max_len was set at 355 because the max token count for 200 words was 355 in the sample
# Max is 512 if using BERT-based models, higher for longformer (2000+)
def toke_and_enc(sentences, max_len=355):
    """Tokenize sentences and return padded id / attention-mask tensors.

    Every sentence is encoded with the module-level ``tokenizer`` (it must be
    assigned before this function is called): special tokens are added, the
    sequence is truncated to ``max_len`` tokens and padded up to ``max_len``.

    Args:
        sentences: iterable of texts to encode.
        max_len: fixed length of every encoded row (default 355).

    Returns:
        Tuple ``(input_ids, attention_masks)`` of tensors with one row per
        sentence and ``max_len`` columns. For empty input two empty
        ``(0, max_len)`` integer tensors are returned.
    """
    input_ids = []
    attention_masks = []
    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=max_len,
            # 'max_length' padding replaces the deprecated pad_to_max_length=True.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    if not input_ids:
        # torch.cat() raises on an empty list; hand back correctly shaped
        # empty tensors so callers can still build a (zero-row) dataset.
        empty_ids = torch.empty((0, max_len), dtype=torch.long)
        return empty_ids, empty_ids.clone()

    # Each encoding is a single-row tensor; stack the rows along dim 0.
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

In [6]:
def create_data_loader(train_dataset, batch_size=16):
    """Wrap a dataset in a DataLoader that draws its samples in random order.

    Args:
        train_dataset: dataset to batch.
        batch_size: number of samples per batch (default 16).

    Returns:
        A ``DataLoader`` over ``train_dataset`` driven by a ``RandomSampler``.
    """
    shuffling_sampler = RandomSampler(train_dataset)
    return DataLoader(train_dataset, batch_size=batch_size, sampler=shuffling_sampler)

In [7]:
def train_model(train_dataset, epochs=4, learning_rate=2e-5, num_labels=3,
                model_name="roberta-base", batch_size=16):
    """Fine-tune a RoBERTa sequence classifier on ``train_dataset``.

    The RNGs (``random``, ``numpy``, ``torch``) are seeded with the
    module-level ``seed_val`` *before* the model is created, so the randomly
    initialised classification head, the batch order and the rest of the run
    are repeatable even when this function is called again in the same kernel.

    Args:
        train_dataset: dataset whose items are
            ``(input_ids, attention_mask, label)``.
        epochs: number of passes over the data (default 4).
        learning_rate: AdamW learning rate (default 2e-5).
        num_labels: number of target classes (default 3).
        model_name: pretrained checkpoint to start from (default "roberta-base").
        batch_size: samples per training batch (default 16).

    Returns:
        The fine-tuned model.
    """
    # Seed first: from_pretrained() randomly initialises the classifier head,
    # so seeding after model creation would leave that part unseeded.
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)

    train_dataloader = create_data_loader(train_dataset, batch_size=batch_size)

    model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)

    # Linear decay of the learning rate over every optimisation step, no warmup.
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

    for epoch_i in range(epochs):
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

        total_train_loss = 0
        model.train()

        for b_input_ids, b_input_mask, b_labels in train_dataloader:
            print('.', end="")  # one dot per batch as a progress indicator

            model.zero_grad()

            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
            # Index instead of tuple-unpacking: position 0 is the loss both for
            # plain tuple outputs and for dict-like model output objects.
            loss = outputs[0]

            total_train_loss += loss.item()

            loss.backward()

            # Clip the gradient norm to 1.0 before the optimiser step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        print("  Average training loss: {0:.2f}".format(avg_train_loss))

    print("Training complete!")
    return model

In [8]:
def test_model(test_dataset, model) : 
    
    test_sampler = SequentialSampler(test_dataset)

    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=16)
   
    print('here')
    
    model.eval()

    predictions , true_labels = [], []

    for batch in test_dataloader:
        print(".", end =" ")
        b_input_ids, b_input_mask, b_labels = batch
  
        with torch.no_grad():
              outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        logits = outputs[0]

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
  
        predictions.append(logits)
        true_labels.append(label_ids)

    print('DONE.')
    return(predictions, true_labels)

In [9]:
# Load the prepared sample (CSV, read as UTF-8).
pathtofile = './data/sample2018prep_3.csv'
df = pd.read_csv(pathtofile, encoding='utf-8')

# Split by random sampling: 80% of the rows (seeded with seed_val) become the
# training frame; the rows that were not drawn become the test frame.
dftrain=df.sample(frac=0.8,random_state=seed_val)
dftest=df.drop(dftrain.index)
print('{:>5,} training samples'.format(len(dftrain)))
print('{:>5,} test samples'.format(len(dftest)))

# Tokenizer read by toke_and_enc() as a module-level name, so it has to be
# assigned before toke_and_enc() is called below.
tokenizer = AutoTokenizer.from_pretrained('roberta-base') # Change this checkpoint if using a different model

# Build the training inputs: texts -> (input ids, attention masks), labels -> tensor.
All_text_train = dftrain['Text'].values # Use the appropriate text column name
# NOTE(review): the "Zage" variable names hold labels read from the 'ZGender' column.
Zage_labels_train = dftrain['ZGender'].values
Input_ids_train, Attention_masks_train = toke_and_enc(All_text_train)
Zage_labels_train = torch.tensor(Zage_labels_train)
# Item order (ids, mask, label) is the order train_model() / test_model() unpack each batch in.
Train_dataset = TensorDataset(Input_ids_train, Attention_masks_train, Zage_labels_train)
model_train = train_model(Train_dataset)
# Save model (optional)
# model_train.save_pretrained('path/to/output_dir') # Uncomment to save the model; save_pretrained() needs a target directory

  559 training samples
  140 test samples


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

here
...................................  Average training loss: 1.00
here
...................................  Average training loss: 0.45
here
...................................  Average training loss: 0.31
here
...................................  Average training loss: 0.21
Training complete!


In [11]:
# Evaluate the fine-tuned model on the data it was trained on. Train_dataset and
# model_train come from the training cell. test_model() reads the dataset with a
# SequentialSampler, so the flattened predictions returned by calculate_stat()
# are in the same row order as dftrain and can be assigned as columns directly.
print('Training result')
trainpred, trainactual = test_model(Train_dataset, model_train)
train_pred, train_actual  = calculate_stat(trainpred, trainactual)
# Mpred = predicted class per row, Mactual = label that was fed to the model.
dftrain['Mpred'] = train_pred
dftrain['Mactual'] = train_actual
dftrain.to_csv('outtrain.csv', encoding='utf-8')
print()

# Same procedure for the held-out rows: encode the texts, wrap them with their
# labels in a TensorDataset, run the model, print the metrics and export the
# per-row predictions next to the original columns.
print('Testing result')
All_text_test = dftest['Text'].values
# NOTE(review): the "Zage" variable names hold labels read from the 'ZGender' column.
Zage_labels_test = dftest['ZGender'].values
Input_ids_test, Attention_masks_test = toke_and_enc(All_text_test)
Zage_labels_test = torch.tensor(Zage_labels_test)
Test_dataset = TensorDataset(Input_ids_test, Attention_masks_test, Zage_labels_test)
testpred, testactual = test_model(Test_dataset, model_train)
test_pred, test_actual  = calculate_stat(testpred, testactual)
dftest['Mpred'] = test_pred
dftest['Mactual'] = test_actual
dftest.to_csv('outtest.csv', encoding='utf-8')

Training result
here
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DONE.
  |   0   1   2 |
--+-------------+
0 |<194>  5   3 |
1 |   5<195>  1 |
2 |  13   .<143>|
--+-------------+
(row = reference; col = test)

              precision    recall  f1-score   support

           0       0.92      0.96      0.94       202
           1       0.97      0.97      0.97       201
           2       0.97      0.92      0.94       156

    accuracy                           0.95       559
   macro avg       0.95      0.95      0.95       559
weighted avg       0.95      0.95      0.95       559


Testing result
here
. . . . . . . . . DONE.
  |  0  1  2 |
--+----------+
0 |<43> 2  3 |
1 |  5<45> . |
2 |  9  2<31>|
--+----------+
(row = reference; col = test)

              precision    recall  f1-score   support

           0       0.75      0.90      0.82        48
           1       0.92      0.90      0.91        50
           2       0.91      0.74      0.82        42
