In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from transformers import *
import torch
import matplotlib.pyplot as plt


In [None]:
# Read the competition's training and test CSV files into DataFrames.
df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
# Last expression of the cell: displays the first rows of the training frame.
df.head()

In [None]:
# Keep only the columns used below: text plus target for training,
# text alone for the test frame. Every other column is dropped.
df  = df[['text', 'target']]
test = test[['text']]

In [None]:
import re
import nltk
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
import nltk
# Download the NLTK data packages this notebook relies on
# (presumably 'punkt' for word_tokenize and 'wordnet' for the lemmatizer).
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance, used by string_process for every text.
lemmatizer = WordNetLemmatizer()
def string_process(input):
    """Normalise one raw text into a lower-cased, lemmatised string.

    Steps: strip HTML markup, replace every character that is not an ASCII
    letter with a space, tokenise, lemmatise each token, re-join with spaces.

    Args:
        input: Raw text (str). The name shadows the ``input`` builtin but is
            kept so that existing keyword callers keep working.

    Returns:
        str: The lemmas separated by single spaces (empty when the text
        contains no ASCII letters).
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and warns about the guess, so the
    # extracted text could differ from one environment to another.
    review_text = BeautifulSoup(input, "html.parser").get_text()
    # Digits, punctuation and any non-ASCII characters all become spaces.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = word_tokenize(review_text.lower())
    # Reduce each token to its lemma using the shared WordNet lemmatizer.
    lemma_words = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(lemma_words)


# Clean the training texts in place (the raw text column is overwritten).
df['text'] = df['text'].apply(string_process)

In [None]:
# Apply the same cleaning as the training texts so both frames share one
# preprocessing; the raw test text column is overwritten.
test['text'] = test['text'].apply(string_process)

In [None]:
# Preview the training frame after text cleaning.
df.head()

In [None]:
# Histogram of the values in the target column.
df['target'].hist()

In [None]:
# Count how often each cleaned text occurs; a count above 1 means the same
# cleaned text appears in several rows.
df['text'].value_counts()

In [None]:
def lenstr(t):
    """Return the number of whitespace-separated words in the string ``t``."""
    words = t.split()
    return len(words)
# Word count of every cleaned training text, stored in a new 'len' column.
df['len'] = df['text'].apply(lenstr)

In [None]:
# Word count of every cleaned test text, stored in a new 'len' column.
test['len'] = test['text'].apply(lenstr)

In [None]:
# Histograms of the word counts of the training and test texts.
# NOTE(review): no axes are passed, so both calls presumably draw onto the
# same current axes and the two histograms overlay each other - confirm.
df['len'].hist()
test['len'].hist()


In [None]:
# Registry of supported backbones:
#   key -> (model class, tokenizer class, pretrained checkpoint name).
model = {'albert':(AlbertModel, AlbertTokenizer,"albert-base-v2"),
        'roberta':(RobertaModel, RobertaTokenizer,'roberta-base')}
# Select the RoBERTa entry. Note that `model_name` holds the model *class*
# (not a string), and that the name `model` is rebound later in this notebook
# to the classifier instance.
model_name, model_tokenizer, pretrain = model['roberta']

In [None]:
# Tokenise the training texts with the tokenizer class selected from the
# model registry (RoBERTa in this notebook, not ALBERT).
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenize = model_tokenizer.from_pretrained(pretrain)
# Sub-word tokens for every training text.
token = [tokenize.tokenize(text) for text in df['text']]
# Vocabulary ids, then the same ids wrapped in the model's special tokens.
ids = [tokenize.convert_tokens_to_ids(pieces) for pieces in token]
ids = [tokenize.build_inputs_with_special_tokens(seq) for seq in ids]
# Fixed length of 24 ids per text: longer sequences are cut at the end,
# shorter ones are filled at the end with pad_sequences' default value 0.
ids = pad_sequences(ids, maxlen=24, truncating='post', dtype='long', padding='post')

In [None]:
# Store the padded id sequences of the training texts, one list per row.
df['token'] = ids.tolist()

# The test frame needs exactly the same encoding: later cells read
# test['token'] to build the attention masks and the test tensors, and
# nothing else in the notebook creates that column.
test_ids = [
    tokenize.build_inputs_with_special_tokens(
        tokenize.convert_tokens_to_ids(tokenize.tokenize(text))
    )
    for text in test['text']
]
test_ids = pad_sequences(test_ids, maxlen=24, truncating='post', dtype='long', padding='post')
test['token'] = test_ids.tolist()

In [None]:
def mask(t):
    """Build the attention mask for one end-padded sequence of token ids.

    The sequences in this notebook are padded at the end with 0, so only the
    trailing run of zeros is padding. A zero anywhere else is a real token
    (for example RoBERTa's leading ``<s>`` has id 0) and must stay visible
    to the model; testing ``id > 0`` on every position would hide it.

    Args:
        t: Sequence of integer token ids, padded at the end with 0.

    Returns:
        list[int]: Same length as ``t``; 1 for real tokens, 0 for the
        trailing padding.
    """
    real_len = len(t)
    # Walk back from the end over the padding zeros.
    while real_len > 0 and t[real_len - 1] == 0:
        real_len -= 1
    return [1] * real_len + [0] * (len(t) - real_len)
# Attention mask for every training row, derived from its padded token ids.
df['mask'] = df['token'].apply(mask)

In [None]:
# Attention mask for every test row; requires the test['token'] column of
# padded token ids to exist.
test['mask'] = test['token'].apply(mask)

In [None]:
# Preview the training frame with the added len, token and mask columns.
df.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
train , valid = train_test_split(df, test_size = 0.15, random_state = 345026)

In [None]:

# Convert the training and validation frames to tensors. Each list-valued
# column is first stacked into one 2-D ndarray: handing torch.tensor a Python
# list of separate ndarrays is a slow path that PyTorch warns about.
# Naming: x_label holds the *training* targets, x_valid the *validation*
# targets, x_val the validation token ids.
x_train = torch.tensor(np.array(train['token'].tolist()), dtype=torch.long)
x_mask = torch.tensor(np.array(train['mask'].tolist()), dtype=torch.long)
x_label = torch.tensor(train['target'].values)

x_val = torch.tensor(np.array(valid['token'].tolist()), dtype=torch.long)
x_maskvalid = torch.tensor(np.array(valid['mask'].tolist()), dtype=torch.long)
x_valid = torch.tensor(valid['target'].values)

In [None]:
def seed_all(seed_value):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch on the CPU. When CUDA is
    available it also seeds the GPU generators and switches cuDNN to
    deterministic mode with benchmarking disabled.

    Args:
        seed_value: Integer seed passed to every generator.
    """
    # Python, NumPy and PyTorch CPU generators, in that order.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)

    if not torch.cuda.is_available():
        return

    # GPU generators: the current device first, then all devices.
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
from torch.utils.data import TensorDataset, RandomSampler, DataLoader
import random
# Fix all random seeds before the samplers are created and used.
seed_all(123)
# Training batches: (token ids, attention mask, target), 8 rows per batch,
# drawn in random order.
train_tensor = TensorDataset(x_train, x_mask, x_label)
sample = RandomSampler(train_tensor)
train_loader = DataLoader(train_tensor, sampler=sample, batch_size=8)

# Validation batches with the same layout and batch size, also drawn in
# random order.
valid_tensor = TensorDataset(x_val, x_maskvalid, x_valid)
valid_sample = RandomSampler(valid_tensor)
valid_loader = DataLoader(valid_tensor, sampler=valid_sample, batch_size=8)

In [None]:
# Test tensors: token ids and attention masks, stacked into 2-D arrays first
# (torch.tensor on a list of separate ndarrays is a slow, warned-about path).
x_test = torch.tensor(np.array(test['token'].tolist()), dtype=torch.long)
test_mask = torch.tensor(np.array(test['mask'].tolist()), dtype=torch.long)
test_tensor = TensorDataset(x_test, test_mask)
# Iterate in dataset order. A random sampler would shuffle the rows, and the
# predictions could then no longer be matched back to the rows of `test`.
# (`sample` is therefore no longer rebound here; it stays the train sampler.)
test_loader = DataLoader(test_tensor, shuffle=False, batch_size=1)

In [None]:
class AlbertClassification(torch.nn.Module):
    """Pretrained transformer encoder with a mean-pooled linear head.

    Despite the class name, the encoder is whatever model class the
    notebook-level ``model_name`` refers to, loaded from the ``pretrain``
    checkpoint.
    """

    def __init__(self, numbers):
        """Load the encoder and create the output layer.

        Args:
            numbers: Number of output units of the linear head.
        """
        super(AlbertClassification, self).__init__()
        self.model = model_name.from_pretrained(pretrain)
        # Take the input width from the loaded encoder instead of assuming 768,
        # so checkpoints with a different hidden size also work.
        self.dense = torch.nn.Linear(self.model.config.hidden_size, numbers)
        torch.nn.init.xavier_normal_(self.dense.weight)

    def forward(self, ids, attention_mask=None, token_type_ids=None):
        """Return sigmoid-activated scores for a batch of token ids.

        Args:
            ids: Token ids fed to the encoder as ``input_ids``.
            attention_mask: Optional attention mask forwarded to the encoder.
            token_type_ids: Optional segment ids forwarded to the encoder.

        Returns:
            torch.Tensor: ``sigmoid(dense(mean-pooled hidden states))``.
            These are probabilities in (0, 1), not raw logits, so they must
            not be fed to a ``*WithLogits`` loss, which would apply the
            sigmoid a second time.
        """
        encoder_output = self.model(
            input_ids=ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        pooled = self.pool_hidden_state(encoder_output)
        return torch.sigmoid(self.dense(pooled))

    def frezze_parameters(self, index=0):
        """Freeze encoder parameters.

        Args:
            index: Position of a single parameter tensor to freeze. Any falsy
                value (the default 0) freezes every encoder parameter.
        """
        self._set_requires_grad(index, False)

    def unfree_parameters(self, index):
        """Unfreeze encoder parameters.

        Args:
            index: Position of a single parameter tensor to unfreeze. Any
                falsy value unfreezes every encoder parameter.
        """
        self._set_requires_grad(index, True)

    def _set_requires_grad(self, index, flag):
        """Set ``requires_grad`` on one encoder parameter or on all of them."""
        # parameters() is a method returning an iterator; it has to be called
        # and materialised before it can be indexed.
        params = list(self.model.parameters())
        if index:
            params[index].requires_grad = flag
        else:
            for param in params:
                param.requires_grad = flag

    def pool_hidden_state(self, last_hidden_state):
        """Average the encoder's hidden states over dimension 1.

        Args:
            last_hidden_state: The encoder output; its first element is taken
                as the hidden-state tensor.

        Returns:
            torch.Tensor: Mean of that tensor along dimension 1.
        """
        hidden = last_hidden_state[0]
        return torch.mean(hidden, 1)
    
# Instantiate the classifier with a single output unit. This rebinds the name
# `model`, which until now referred to the backbone registry dict.
model = AlbertClassification(1)        


In [None]:
# AdamW over every parameter of the classifier (encoder and head).
optim = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
# The classifier's forward() already applies a sigmoid, so the loss has to
# accept probabilities. BCEWithLogitsLoss would apply a second sigmoid and
# squash every prediction into (0.5, 0.73) before computing the loss.
loss_f = torch.nn.BCELoss()

In [None]:
from tqdm import tqdm
from torch.autograd import Variable

def fit_cycle(model, epochs, train_loader, valid_loader, loss_f, optim, device='cpu'):
    """Train ``model`` for ``epochs`` passes, then run one validation pass.

    Args:
        model: Module called as ``model(ids, attention_mask=mask)``.
        epochs: Number of passes over ``train_loader``.
        train_loader: Iterable of ``(ids, mask, label)`` tensor batches.
        valid_loader: Iterable of ``(ids, mask, label)`` tensor batches.
        loss_f: Callable ``loss_f(prediction, target)`` returning a scalar
            tensor; assumed to be the mean over the batch.
        optim: Optimizer that updates ``model``'s parameters.
        device: Device the model and every batch are moved to. Works for CPU
            as well as CUDA devices.

    Returns:
        tuple: ``(model, train_loss_set, valid_loss_set)``. Each list holds
        the running per-sample mean loss after every batch; the training
        average restarts at the beginning of each epoch.
    """
    model.to(device)
    train_loss_set = []
    train_loss = 0.0
    num_set = 0
    for i in range(epochs):
        print("epochs: ", i)
        model.train()
        train_loss = 0.0
        num_set = 0
        for batch in tqdm(train_loader):
            # .to(device) works for CPU and GPU alike (.cuda() fails on CPU).
            x_train, x_mask, x_label = (part.to(device) for part in batch)
            optim.zero_grad()
            ypred = model(x_train, attention_mask=x_mask)
            loss = loss_f(ypred.reshape(-1), x_label.float())
            loss.backward()
            optim.step()
            batch_size = x_train.shape[0]
            # .item() detaches the value, so no autograd graph is kept alive.
            # Weighting by batch size turns the batch mean back into a sum,
            # which makes train_loss / num_set a true per-sample mean.
            train_loss += loss.item() * batch_size
            num_set += batch_size
            train_loss_set.append(train_loss / num_set)
    print("Training loss is : ", train_loss / num_set if num_set else 0.0)

    model.eval()
    valid_loss = 0.0
    valid_loss_set = []
    num_valid = 0
    with torch.no_grad():
        for batch in tqdm(valid_loader):
            x_train, x_mask, x_label = (part.to(device) for part in batch)
            ypred = model(x_train, attention_mask=x_mask)
            loss = loss_f(ypred.reshape(-1), x_label.float())
            batch_size = x_train.shape[0]
            # Accumulate into the validation total, not the training one.
            valid_loss += loss.item() * batch_size
            num_valid += batch_size
            valid_loss_set.append(valid_loss / num_valid)
    print("Valid loss is : ", valid_loss / num_valid if num_valid else 0.0)
    return model, train_loss_set, valid_loss_set
            
        

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if use_cuda else 'cpu')
# Show whether a GPU was found.
print(use_cuda)

In [None]:

# Train for one epoch and validate once. train_loss and valid_loss receive
# the per-batch loss histories returned by fit_cycle.
model , train_loss, valid_loss = fit_cycle(model, 1, train_loader, valid_loader, loss_f, optim,device)

In [None]:
def predict(model, loader, device=None):
    """Run ``model`` over ``loader`` without gradients and collect the outputs.

    Args:
        model: Module called as ``model(ids, attention_mask=mask)``. It is
            switched to evaluation mode before inference.
        loader: Iterable of ``(ids, mask)`` tensor batches.
        device: Device the batches are moved to. Defaults to the device of
            the model's first parameter, so the function works on CPU and GPU
            without depending on a notebook-level ``device`` variable.

    Returns:
        list[torch.Tensor]: One output tensor per batch, in loader order,
        left on the device where it was computed.
    """
    if device is None:
        device = next(model.parameters()).device
    model.eval()
    outputs = []
    with torch.no_grad():
        for batch in tqdm(loader):
            # .to(device) works for CPU and GPU alike (.cuda() fails on CPU).
            x_train, x_mask = (part.to(device) for part in batch)
            outputs.append(model(x_train, attention_mask=x_mask))
    return outputs
# One output tensor per test batch.
ypred = predict(model, test_loader)
# Bring every output back to host memory; `pred` stays a list of tensors.
pred = [batch_output.cpu() for batch_output in ypred]
# plt.hist needs one flat array of values, not a list of 2-D tensors, so the
# batch outputs are concatenated and flattened before plotting.
plt.hist(torch.cat(pred).detach().reshape(-1).numpy(), bins=2)

In [None]:
%matplotlib inline
plt.style.use('ggplot')
plt.plot(train_loss)
plt.plot(valid_loss)
plt.show()