In [7]:
import logging

from transformers import AutoTokenizer
import json
import tqdm
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader
import logging
from transformers import BartTokenizer, BartModel
from sklearn.preprocessing import LabelBinarizer
from pytorch_lightning import Trainer
import pandas as pd
import numpy as np
import re

# Huggingface transformers
import transformers
from transformers import BertModel,BertTokenizer,AdamW, get_linear_schedule_with_warmup, RobertaTokenizer,RobertaModel

import torch
from torch import nn ,cuda
from torch.utils.data import DataLoader,Dataset,RandomSampler, SequentialSampler

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Plotting (seaborn / matplotlib)
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

In [8]:
# Load the raw CSV; later cells read its 'text' and 'task_1' columns.
raw_datasets = pd.read_csv(filepath_or_buffer='t1_without_ge.csv')

In [9]:
import torch
# Ask PyTorch to release cached, currently unused CUDA memory before the heavy cells run.
torch.cuda.empty_cache()

In [10]:
# Pull the input sentences and their raw task_1 tags out of the frame.
x = raw_datasets['text'].tolist()
y = raw_datasets['task_1'].tolist()

# Binarize the tags and keep them as a float tensor (the loss used later expects float targets).
mlb = LabelBinarizer()
yt = torch.FloatTensor(mlb.fit_transform(y))

In [15]:
# Checkpoint shared by the encoder and its tokenizer.
model_name = 'bert-base-multilingual-cased'

# The two loads are independent of each other.
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
def get_bertembedding(text):
    """Return the pooled BERT embedding of a single text as a list of floats.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        text: One input string.

    Returns:
        list[float]: ``pooler_output`` of the first (and only) sequence in
        the encoded batch.
    """
    # Truncate to the tokenizer's maximum model length: without it, a text
    # that tokenizes to more positions than the encoder supports makes the
    # forward pass fail.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    # Inference only -- no autograd graph is needed, which saves time and memory
    # over the tens of thousands of calls made by the embedding loop.
    with torch.no_grad():
        embedding = model(**inputs).pooler_output
    return embedding[0].tolist()


In [17]:
# Embed every sentence in x, driving the progress bar by iterating over it
# instead of calling update() by hand.
emb = []
par = tqdm.tqdm(x, total=len(x), ncols=100)
for text in par:
    emb.append(get_bertembedding(text))

par.close()

100%|███████████████████████████████████████████████████████| 34705/34705 [1:09:46<00:00,  8.29it/s]


In [18]:
# One row per sample: raw text, its embedding (a list of floats) and the original tag.
df = pd.DataFrame(dict(text=x, embedding=emb, Tags=y))

In [19]:
# Persist the embedded dataset as a tab-separated file.
df.to_csv(path_or_buf='t1_emb_without_ge.tsv', sep='\t')

In [20]:
# Hold out 10% of all samples as the test set ...
x_train, x_test, y_train, y_test = train_test_split(
    emb,
    yt,
    test_size=0.1,
    random_state=42,
    shuffle=True,
)

# ... then take 20% of the remainder as the validation set.
x_tr, x_val, y_tr, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [25]:
class HasocmDataset(Dataset):
    """Map-style dataset pairing precomputed embeddings with their labels.

    Args:
        embedding: Indexable collection with one embedding vector per sample.
        labels: Indexable collection with one label per sample, aligned with
            ``embedding``.
    """

    def __init__(self, embedding, labels):
        self.embedding = embedding
        self.labels = labels

    def __len__(self):
        """Number of samples (length of the embedding collection)."""
        return len(self.embedding)

    def __getitem__(self, item_idx):
        """Return ``{'embedding': float32 tensor, 'label': label}`` for one sample."""
        # Convert to a float32 tensor here. If a plain Python list of floats
        # is returned, the DataLoader's default collate function treats it as
        # a sequence and transposes the batch into a list of per-feature
        # float64 tensors, which nn.Linear cannot consume.
        embedding = torch.as_tensor(self.embedding[item_idx], dtype=torch.float32)

        return {
            'embedding': embedding,
            'label': self.labels[item_idx],
        }

class HasocDataModule(pl.LightningDataModule):
    """Lightning data module serving the train / validation / test splits.

    Args:
        x_tr, y_tr: Training embeddings and labels.
        x_val, y_val: Validation embeddings and labels.
        x_test, y_test: Test embeddings and labels.
        batch_size: Batch size used by all three dataloaders.
        max_token_len: Stored on the instance; no method of this class reads it.
    """

    def __init__(self, x_tr, y_tr, x_val, y_val, x_test, y_test, batch_size=16, max_token_len=200):
        super().__init__()
        self.tr_text = x_tr
        self.tr_label = y_tr
        self.val_text = x_val
        self.val_label = y_val
        self.test_text = x_test
        self.test_label = y_test
        # Notebook-level tokenizer, kept so the attribute stays available;
        # no method of this class reads it.
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len

    def setup(self, stage=None):
        """Build the three datasets.

        ``stage`` is accepted (and ignored) because the Lightning Trainer
        invokes this hook as ``setup(stage=...)``; without the parameter that
        call raises ``TypeError``. Calling ``setup()`` manually still works.
        """
        self.train_dataset = HasocmDataset(embedding=self.tr_text, labels=self.tr_label)
        self.val_dataset = HasocmDataset(embedding=self.val_text, labels=self.val_label)
        self.test_dataset = HasocmDataset(embedding=self.test_text, labels=self.test_label)

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4)

    def val_dataloader(self):
        """Sequential loader over the validation split."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=4)

    def test_dataloader(self):
        """Sequential loader over the test split."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=4)

In [28]:
# Training hyper-parameters.
N_EPOCHS, BATCH_SIZE, MAX_LEN, LR = 20, 512, 150, 1e-04

# Wrap the three splits in a data module and build its datasets right away.
datamodule = HasocDataModule(
    x_tr, y_tr,
    x_val, y_val,
    x_test, y_test,
    batch_size=BATCH_SIZE,
    max_token_len=MAX_LEN,
)
datamodule.setup()

In [29]:
class HasocClassifier(pl.LightningModule):
    """Three-layer MLP classifier over 768-dimensional sentence embeddings.

    Args:
        steps_per_epoch: Number of optimizer steps in one epoch. Used to size
            the warmup / linear-decay schedule. When ``None`` (or 0) no
            scheduler is configured.
        n_epochs: Number of training epochs the schedule should span.
        lr: Peak learning rate.
    """

    def __init__(self, steps_per_epoch=None, n_epochs=3, lr=2e-5):
        super().__init__()
        self.layer1 = nn.Linear(768, 128)
        self.layer2 = nn.Linear(128, 32)
        self.layer3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(p=0.1)
        self.steps_per_epoch = steps_per_epoch
        self.n_epochs = n_epochs
        self.lr = lr
        # Expects raw logits: the sigmoid is applied inside the loss.
        self.criterion = nn.BCEWithLogitsLoss()

    def _logits(self, inputs):
        """Run the MLP and return the raw (pre-sigmoid) scores."""
        output = self.dropout(self.relu(self.layer1(inputs)))
        output = self.dropout(self.relu(self.layer2(output)))
        return self.layer3(output)

    def forward(self, inputs):
        """Return probabilities in [0, 1] (sigmoid of the logits)."""
        return self.sigmoid(self._logits(inputs))

    def _shared_step(self, batch, stage):
        """Compute and log the loss for one batch; returns (loss, logits, labels)."""
        labels = batch['label']
        # The loss is computed on logits, not on forward()'s probabilities:
        # feeding sigmoid outputs into BCEWithLogitsLoss would apply the
        # sigmoid twice and squash the effective predictions.
        logits = self._logits(batch['embedding'])
        loss = self.criterion(logits, labels)
        self.log(f'{stage}_loss', loss, prog_bar=True, logger=True)
        return loss, logits, labels

    def training_step(self, batch, batch_idx):
        loss, logits, labels = self._shared_step(batch, 'train')
        # Predictions are reported as probabilities and detached, so they do
        # not keep the autograd graph alive.
        return {"loss": loss, "predictions": self.sigmoid(logits).detach(), "labels": labels}

    def validation_step(self, batch, batch_idx):
        loss, _, _ = self._shared_step(batch, 'val')
        return loss

    def test_step(self, batch, batch_idx):
        loss, _, _ = self._shared_step(batch, 'test')
        return loss

    def configure_optimizers(self):
        """AdamW plus, when ``steps_per_epoch`` is known, linear warmup/decay per step."""
        optimizer = AdamW(self.parameters(), lr=self.lr)
        if not self.steps_per_epoch:
            # Without a step count the schedule cannot be sized; train at constant lr.
            return optimizer

        warmup_steps = self.steps_per_epoch // 3
        # num_training_steps is the full horizon *including* warmup; subtracting
        # the warmup would drive the learning rate to zero before training ends.
        total_steps = self.steps_per_epoch * self.n_epochs

        scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)

        # The schedule is expressed in optimizer steps, so Lightning must step
        # it every batch (its default interval is once per epoch).
        return [optimizer], [{"scheduler": scheduler, "interval": "step", "frequency": 1}]

In [30]:
# Use the real number of batches per epoch: the training DataLoader keeps the
# last partial batch, so len(x_tr) // BATCH_SIZE would under-count by one.
steps_per_epoch = len(datamodule.train_dataloader())
# NOTE: this rebinds `model`, which earlier cells use for the BERT encoder;
# re-running the embedding cells after this one requires reloading the encoder.
model = HasocClassifier(steps_per_epoch=steps_per_epoch, n_epochs=N_EPOCHS, lr=LR)

checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',  # monitored quantity
    filename='QTag-{epoch:02d}-{val_loss:.2f}',
    save_top_k=1,  # keep only the single best checkpoint
    mode='min',  # lower val_loss is better
)

trainer = Trainer(
    max_epochs=N_EPOCHS,
    # Fall back to CPU instead of failing when no CUDA device is present.
    gpus=1 if torch.cuda.is_available() else 0,
    callbacks=[checkpoint_callback, EarlyStopping(monitor="val_loss")],
    progress_bar_refresh_rate=30,
    num_sanity_val_steps=0,
)
trainer.fit(model, datamodule)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type              | Params
------------------------------------------------
0 | layer1    | Linear            | 98.4 K
1 | layer2    | Linear            | 4.1 K 
2 | layer3    | Linear            | 33    
3 | relu      | ReLU              | 0     
4 | sigmoid   | Sigmoid           | 0     
5 | dropout   | Dropout           | 0     
6 | criterion | BCEWithLogitsLoss | 0     
------------------------------------------------
102 K     Trainable params
0         Non-trainable params
102 K     Total params
0.410     Total estimated model params size (MB)
  rank_zero_warn(


Epoch 0:   0%|          | 0/62 [00:00<00:00, 999.83it/s]

In [1]:
from transformers import RobertaTokenizer, RobertaModel

In [3]:
import torch

In [4]:
# Tokenizer matching the 'roberta-base' checkpoint.
tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path='roberta-base')

In [5]:
# Bare RoBERTa encoder loaded from the 'roberta-base' checkpoint.
model = RobertaModel.from_pretrained(pretrained_model_name_or_path='roberta-base')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
# Encode the first probe sentence as PyTorch tensors.
inputs1 = tokenizer(text="我是中国人", return_tensors="pt")

In [18]:
# Encode the second probe sentence the same way, for comparison with inputs1.
inputs2 = tokenizer(text="我是北京人", return_tensors="pt")

In [19]:
# Display the encoding of the first sentence (input_ids and attention_mask).
inputs1

{'input_ids': tensor([[    0, 47876,  3602, 48569, 47643, 47516, 10809, 47973,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [20]:
# Display the encoding of the second sentence (input_ids and attention_mask).
inputs2

{'input_ids': tensor([[    0, 47876,  3602, 48569, 48418,  6800, 46499, 11582, 47973,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [7]:
# `inputs` is never defined in this notebook (only inputs1 / inputs2 are), so
# the original call fails on a fresh kernel; run the encoder on inputs1.
# Inference only, so no autograd graph is built.
with torch.no_grad():
    outputs = model(**inputs1)

In [11]:
# NOTE(review): despite the variable name, this stores the model's `pooler_output`,
# not a per-token `last_hidden_state` -- consider renaming (e.g. `pooled_output`).
last_hidden_states = outputs.pooler_output 

In [13]:
# Shape of the pooled output assigned above: one vector per input sequence.
last_hidden_states.shape

torch.Size([1, 768])