# Determining the origin site of Spanish news articles
## Applying the trained BERT model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import string
import tqdm
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
import torch
from pytorch_pretrained_bert import BertForSequenceClassification, BertTokenizer
from torch.utils.data import TensorDataset, DataLoader,  SequentialSampler

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [3]:
# Run on CUDA when a device is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Number of CUDA devices reported by torch.
n_gpu = torch.cuda.device_count()

# Load model and apply trained parameters

In [4]:
# Build the multilingual uncased BERT sequence classifier with 15 output labels.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-uncased',
    num_labels=15,
)

In [5]:
# Introduce the path where the pickled model parameters are stored.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point this at a file you produced yourself or otherwise trust.
# The `with` block closes the file even if unpickling raises.
with open("OUTPUT_PATH/model.pkl", "rb") as pickle_in:
    state_dict_with_prefix = pickle.load(pickle_in)

In [6]:
# Copy the unpickled parameters into the model; the value shown as the cell
# output lists any missing or unexpected keys.
model.load_state_dict(state_dict_with_prefix)

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [7]:
# Move the model to the `device` selected at the top of the notebook rather
# than unconditionally to CUDA, so the notebook also runs on CPU-only machines
# and the model sits on the same device the evaluation batches are sent to.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate): BertInterm

# Get Input data and preprocess

In [8]:
# Introduce the path to the CSV holding the articles' text, as created by the
# Preprocess notebook, and preview its first five rows.
dframe = pd.read_csv("PATH_TO_CSV")
dframe.head()

Unnamed: 0.1,Unnamed: 0,pais,domain,site,tipo,autor,texto
1,155383,US,8394,milenio.com,news,Ana Ponce,El coloquio de Pueblos Afroamericanos en Méxic...
5,314592,ES,4291,eldiario.es,news,esglobal,"Cancela ha explicado, en declaraciones a Europ..."
6,293626,IE,8316,meneame.net,news,LuSaifer,cerrado 10 clics por gobolino a lainformacion....
8,165953,AR,1177,lanacion.com.ar,news,lanacion,0 La demora nuestra puesta en régimen de nuest...
9,252314,ES,4291,eldiario.es,news,esglobal,más INFO El PSOE discute dos propuestas de res...


In [9]:
# Keep only the site (label) and text columns and drop rows where either is
# missing. dropna() is chained so a new frame is produced, instead of mutating
# the column selection in place with inplace=True.
dframe = dframe[['site', 'texto']].dropna()

In [12]:
# Translation table that deletes every character in string.punctuation.
table = str.maketrans('', '', string.punctuation)
# Lower-case each article, then strip the punctuation characters from it.
dframe['texto'] = dframe['texto'].map(str.lower)
dframe['texto'] = dframe['texto'].map(lambda text: text.translate(table))

In [13]:
# Split each article on whitespace, record its word count, and keep only the
# articles with more than five words.
dframe['words'] = dframe['texto'].map(lambda text: text.split())
dframe['len'] = dframe['words'].map(len)
dframe = dframe[dframe['len'] > 5]

In [14]:
# Length that every sequence of token ids is padded or truncated to further down.
maxlen=500

In [15]:
# Distinct origin sites present in the data; their count is the cell output.
unique_sites = set(dframe["site"])
tags = list(unique_sites)
n_tags = len(tags)
n_tags

15

In [34]:
# Load the site-to-tag-index mapping created in the training notebook.
# Introduce the path where that dictionary was saved.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point this at a file you produced yourself or otherwise trust.
# The `with` block closes the file even if unpickling raises.
with open("OUTPUT_PATH/tag_dict.pkl", "rb") as pickle_in:
    tag2idx = pickle.load(pickle_in)

In [17]:
# Vocabulary: every distinct word over all articles, plus the "ENDPAD" entry.
# The vocabulary size is displayed as the cell output.
words = list({word for article_words in dframe["words"] for word in article_words})
words.append("ENDPAD")
n_words = len(words)
n_words

203259

## Text Tokenization using Bert model obtained from https://github.com/google-research/bert/blob/master/multilingual.md

In [18]:
# Introduce the path to the BERT model file; the tokenizer lower-cases its input.
tokenizer = BertTokenizer.from_pretrained(
    'PATH_TO_MODEL',
    do_lower_case=True,
)

In [19]:
# Tokenize every vocabulary word once, keyed by the word, so that articles can
# later be tokenized by dictionary lookup.
tokens = {
    word: tokenizer.tokenize(word)
    for word in tqdm.tqdm_notebook(words)
}
        

HBox(children=(IntProgress(value=0, max=203259), HTML(value='')))




In [20]:
def tokenize_sentence(ls,tokens):
    """Concatenate the pre-computed tokens of every word in ``ls``.

    Args:
        ls: iterable of words, in sentence order.
        tokens: mapping from a word to the sequence of tokens it splits into.

    Returns:
        A new list with the tokens of each word, in the order the words appear.

    Raises:
        KeyError: if ``tokens`` is a dict and a word in ``ls`` is not in it.
    """
    t=[]
    for l in ls:
        # extend() grows the result in place; building a fresh list with
        # ``t + tokens[l]`` for every word is quadratic in the sentence length.
        t.extend(tokens[l])
    return t

In [21]:
# Replace each article's words by the concatenation of their cached tokens.
dframe['tokens'] = dframe['words'].map(
    lambda article_words: tokenize_sentence(article_words, tokens)
)

In [22]:
# Convert each article's tokens to vocabulary ids and each site to its tag index.
dframe['input'] = dframe['tokens'].map(
    lambda article_tokens: tokenizer.convert_tokens_to_ids(article_tokens)
)
dframe['label'] = dframe['site'].map(lambda site: tag2idx[site])

In [23]:
# Keep only the first 100 rows for the evaluation below.
dframe = dframe.iloc[:100]

In [24]:
# Pad, or truncate at the end, every id sequence to `maxlen`, then wrap the
# padded inputs and the labels as tensors.
inputs = pad_sequences(
    dframe["input"],
    maxlen=maxlen,
    dtype="long",
    padding="post",
    truncating="post",
)
tensor_inputs = torch.tensor(inputs)
tensor_labels = torch.tensor(dframe["label"].tolist())

## Prepare data and apply model

In [25]:
# Batch size used when iterating the evaluation data.
bs = 8

# Pair each padded input row with its label and serve them in stored order.
test_data = TensorDataset(tensor_inputs, tensor_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(
    test_data,
    batch_size=bs,
    sampler=test_sampler,
)

In [26]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows of ``preds`` whose arg-max equals the label.

    Args:
        preds: 2-D array-like of per-class scores, one row per example.
        labels: array-like of true class ids, one per example. Any shape with
            one entry per example is accepted, e.g. ``(n,)`` or ``(n, 1)``.

    Returns:
        The number of matching rows divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    # Flatten the labels so that an (n, 1) column cannot broadcast against the
    # (n,) predictions into an n-by-n comparison.
    expected = np.asarray(labels).ravel()
    return np.sum(predicted == expected) / len(expected)

In [27]:
# Score the model on every batch of the test loader, accumulating the summed
# batch losses, the summed batch accuracies, the raw logits and the labels.
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
predictions , true_labels = [], []

for batch in tqdm.tqdm_notebook(test_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids,  b_labels = batch
    # Shows the true labels of the batch in the cell output.
    print(b_labels)
    with torch.no_grad():
        # One forward pass per batch: take the logits and derive the loss from
        # them, instead of running the network a second time just for the loss.
        # NOTE(review): assumes the model's own loss is the mean cross-entropy
        # of these logits (as in pytorch_pretrained_bert) - confirm if the
        # library version changes.
        # NOTE(review): no attention_mask is passed, so padded positions are
        # attended to - confirm this matches how the model was trained.
        logits = model(b_input_ids, token_type_ids=None)
        tmp_eval_loss = torch.nn.functional.cross_entropy(logits, b_labels)
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    # predictions is a flat list with one logit row per example, whereas
    # true_labels holds one label array per batch.
    predictions.extend([list(p) for p in logits])
    true_labels.append(label_ids)

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1

HBox(children=(IntProgress(value=0, max=13), HTML(value='')))

tensor([ 9, 14,  2,  1, 14,  2,  5,  3], device='cuda:0')
tensor([1, 6, 1, 4, 6, 0, 2, 3], device='cuda:0')
tensor([3, 6, 9, 6, 6, 3, 5, 4], device='cuda:0')
tensor([ 4,  1, 14,  2, 10,  1,  9,  6], device='cuda:0')
tensor([ 3,  6,  2,  1,  2, 14,  1,  4], device='cuda:0')
tensor([ 5,  6,  1,  3,  9, 10, 10,  6], device='cuda:0')
tensor([ 7,  4,  9, 10,  6,  6,  6, 14], device='cuda:0')
tensor([ 1,  3,  5, 10,  2,  6, 10, 10], device='cuda:0')
tensor([13,  5,  2, 14, 14,  2,  6,  2], device='cuda:0')
tensor([14,  9, 14,  9,  6,  6,  6,  4], device='cuda:0')
tensor([ 2,  9,  4,  6,  9, 14,  6, 14], device='cuda:0')
tensor([14,  4, 10, 10,  6, 10,  2,  0], device='cuda:0')
tensor([ 5, 14,  9,  3], device='cuda:0')



In [29]:
# Collapse the collected per-example logits to predicted class ids. The ndim
# guard keeps the cell safe to re-run once `predictions` is already 1-D, where
# an arg-max over axis 1 would raise.
predictions = np.asarray(predictions)
if predictions.ndim == 2:
    predictions = np.argmax(predictions, axis=1)

In [None]:
# Average the summed per-batch losses over the number of batches and report
# that average; printing `eval_loss` would show the un-normalised running sum.
test_loss = eval_loss/nb_eval_steps
print("Test loss: {}".format(test_loss))
# NOTE(review): this is the mean of the per-batch accuracies, so a smaller
# final batch weighs as much as a full one.
print("Test Accuracy: {}".format(eval_accuracy/nb_eval_steps))