In [156]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [61]:
!cp gdrive/MyDrive/bertModel/bertTest.pkl .

In [62]:
import pandas as pd
dataExam = pd.read_pickle("bertTest.pkl")
dataExam.head(10)

Unnamed: 0,par_id,sentence
0,t_0,in the meantime conservatives are working to w...
1,t_1,in most poor households with no education chil...
2,t_2,the real question is not whether immigration i...
3,t_3,in total the country s immigrant population ha...
4,t_4,members of the church which is part of ken cop...
5,t_5,to ensure that priority agriculture programme ...
6,t_6,the deportees stepped off their flight from el...
7,t_7,pims staffer who raped disabled girl at icu wa...
8,t_9,i conclude yes the general feeling generated i...
9,t_10,after enduring discrimination in the allmale e...


In [63]:
dataExam.sentence.values

array(['in the meantime conservatives are working to weaken clinton and drive down her numbers in early voting states where she is increasingly vulnerable they are in effect doing sanders s dirty work for him while he avoids scrutiny ',
       'in most poor households with no education children are a matter of routine the house maid industry is a sorry tale of dysfunctional families the norm of these families is to make their women and children work while the men do not work are on drugs and either just abuse their wives or produce more children ',
       'the real question is not whether immigration is good for the country or bad for the country the real question is the intent of the immigrant it is a subtle point and one that is very easy to miss ',
       ...,
       'of europe and they re still going to deal with the eu it s not the end of the world folks then again given how the liberal media is obsessed about race we should nt be shocked that they re smearing the entire leave cam

In [64]:
model_dir = F"/content/gdrive/MyDrive/bert_20_preprocessing"

In [65]:
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [66]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
!pip install transformers

In [None]:
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer
MODEL_CLASSES = {
    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer)
}	

# Config class and load a trained model and vocabulary 
config_class, model_class, tokenizer_class = MODEL_CLASSES['bert']
model = model_class.from_pretrained(model_dir)
tokenizer = tokenizer_class.from_pretrained(model_dir)

# Copy the model to the GPU.
model.to(device)

In [69]:
from torch.utils.data import TensorDataset, SequentialSampler, DataLoader
# Report the number of sentences.
print('Number of test sentences: {:,}\n'.format(dataExam.shape[0]))

# Create sentence and label lists
sentences = dataExam.sentence.values

#labels = df.label.values

# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 512,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
batch_size = 16  

# Create the DataLoader.
prediction_data = TensorDataset(input_ids, attention_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Number of test sentences: 4,075





In [70]:
# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions = []

# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions.
      result = model(b_input_ids, 
                     token_type_ids=None, 
                     attention_mask=b_input_mask,
                     return_dict=True)

  logits = result.logits

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  
  # Store predictions and true labels
  predictions.append(logits)

print('    DONE.')

Predicting labels for 4,075 test sentences...
    DONE.


In [71]:
import math
intLoop = int(math.ceil(len(sentences)/batch_size))

In [72]:
import numpy as np
pred_labels_list = []

# For each input batch...
for i in range(intLoop):
  pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
  #print(f'i: {i}')
  #print(f'Prediction: {pred_labels_i}')
  pred_labels_list.extend(pred_labels_i)

In [73]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	with open(outf_path,'w') as outf:
		for pi in p:
			outf.write(str(pi)+'\n')

In [74]:
import os
# first, we need to create the res/ and ref/ folders, which the evaluator expects
!mkdir res

mkdir: cannot create directory ‘res’: File exists


In [75]:
labels2file(pred_labels_list, os.path.join('res/', 'bert_task1.txt'))

In [76]:
dataExam['pedict_label'] = pred_labels_list

In [132]:
dataExam.shape

(4075, 3)

In [147]:
!cp gdrive/MyDrive/classicmodels/allTest.pkl .

In [None]:
allExam = pd.read_pickle("allTest.pkl")
allExam.head(10)

In [151]:
allPar = allExam.par_id.values
labelAll = []
for parAll in allPar:
  mask = (dataExam['par_id'] == parAll)
  df_i = dataExam.loc[mask]
  labelAll.extend(str(max(df_i.pedict_label.values)))


In [152]:
len(labelAll)

3832

In [154]:
labels2file(labelAll, os.path.join('res/', 'bert_task1.txt'))

In [157]:
!cp /content/res/*.txt -r gdrive/MyDrive/bertModel/res/