In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import zipfile
# Unpack the zipped train/test TSV files into the writable working directory.
f = ['train', 'test']
for split_name in f:
    archive_path = "/kaggle/input/sentiment-analysis-on-movie-reviews/" + str(split_name) + ".tsv.zip"
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall("/kaggle/working")


In [None]:
# Load the extracted tab-separated files: `df` is the training table, `test` the test table.
df = pd.read_csv("/kaggle/working/train.tsv", sep='\t')
test = pd.read_csv("/kaggle/working/test.tsv", sep='\t')

In [None]:
# Preview the first rows of the raw training table.
df.head()

In [None]:
# Keep only the phrase and its sentiment, under the short names used downstream.
# rename() ignores keys that are missing, so re-running this cell is a no-op
# rather than a KeyError; .copy() detaches the result from the raw table so the
# later column assignments do not trigger pandas' SettingWithCopyWarning.
df = df.rename(columns={'Phrase': 'text', 'Sentiment': 'label'})[['text', 'label']].copy()

In [None]:
# Keep only the phrase column of the test table, renamed to `text`.
# rename() ignores missing keys, so the cell can be re-run safely; .copy()
# detaches the result from the raw table before later column assignments.
test = test.rename(columns={'Phrase': 'text'})[['text']].copy()

In [None]:
# Show column dtypes and non-null counts of the reduced training table.
df.info()

In [None]:
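# Preprocess the text: strip HTML, replace non-alphabetic characters with spaces,
# lowercase, tokenize and lemmatize each word (stop words are NOT removed here).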
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize

import re
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer
# Module-level lemmatizer instance, reused by clean_sentences for every word.
lemmatizer = WordNetLemmatizer()
def clean_sentences(df):
    """Normalize every phrase in ``df['text']`` and return the cleaned strings.

    Each phrase is stripped of HTML markup, has every non-alphabetic character
    replaced by a space, is lowercased, tokenized and lemmatized word by word.

    Args:
        df: Any object whose ``['text']`` item is an iterable of strings
            (here a pandas DataFrame).

    Returns:
        list[str]: One cleaned, space-joined string per input phrase, in the
        input order. A phrase with no alphabetic characters becomes ``""``.
    """
    # Compile once instead of re-parsing the pattern for every phrase.
    non_alpha = re.compile("[^a-zA-Z]")
    reviews = []

    for sent in tqdm(df['text']):
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed (and warns about it), so the result
        # could differ between environments.
        review_text = BeautifulSoup(sent, "html.parser").get_text()

        # Replace digits, punctuation and any other non-letter with a space.
        review_text = non_alpha.sub(" ", review_text)

        # Lowercase, then split into word tokens.
        words = word_tokenize(review_text.lower())

        # Reduce each token to its lemma and rebuild the phrase.
        reviews.append(" ".join(lemmatizer.lemmatize(word) for word in words))

    return reviews

In [None]:
# Overwrite the raw phrases with their cleaned form in both tables.
# NOTE: this replaces the `text` column in place, so re-running the cell cleans already-cleaned text.
df['text'] = clean_sentences(df)
test['text'] = clean_sentences(test)

In [None]:
# Inspect the first 20 cleaned training phrases.
df['text'][:20]

In [None]:
# Inspect the first 20 cleaned test phrases.
test['text'][:20]

In [None]:
def strl(t):
    """Return how many whitespace-separated tokens the string ``t`` contains."""
    tokens = t.split()
    return len(tokens)
# Add a `len` column holding the token count of every cleaned phrase, for both tables.
for frame in (df, test):
    frame['len'] = frame['text'].apply(strl)

In [None]:
# Overlay the phrase-length distributions of both tables on one set of axes.
# Transparency, labels and a legend keep the two histograms distinguishable.
ax = df['len'].hist(alpha=0.5, label='train')
test['len'].hist(ax=ax, alpha=0.5, label='test')
ax.set_xlabel('tokens per phrase')
ax.set_ylabel('number of phrases')
ax.legend();


In [None]:
# Make identical inputs map to identical outputs: every cleaned phrase that
# occurs more than twice gets the most frequent label among its occurrences.
# Phrases occurring once or twice are left untouched (same threshold as before).
# The majority label is computed in one grouped pass instead of rescanning the
# whole frame once per duplicated phrase.
phrase_counts = df['text'].map(df['text'].value_counts())
is_frequent = phrase_counts > 2
if is_frequent.any():
    majority_label = (
        df.loc[is_frequent]
        .groupby('text')['label']
        # value_counts() sorts by frequency, so index[0] is the most common label.
        .transform(lambda labels: labels.value_counts().index[0])
    )
    # Assignment aligns on the row index, so only the frequent phrases are updated.
    df.loc[is_frequent, 'label'] = majority_label

In [None]:
# Histogram of the training labels.
df['label'].hist()

In [None]:
# Label distribution of the rows whose `text` is the empty string.
df[df['text']=='']['label'].value_counts()

In [None]:
import torch
from transformers import *


In [None]:
# List the checkpoint names registered on this class. The
# `pretrained_model_archive_map` attribute is absent from newer transformers
# releases, so fall back to an empty mapping instead of raising AttributeError
# and breaking "Run All".
getattr(XLMRobertaForSequenceClassification, 'pretrained_model_archive_map', {}).keys()


In [None]:
# Registry of supported encoders: key -> (model class, tokenizer class, checkpoint name).
# The DistilBERT entry uses 'distilbert-base-uncased': 'distilbert-base' is not a
# published checkpoint name, and the text is lowercased during preprocessing.
MODEL_CLASS = {
    'distilbert': (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'),
    'xlmroberta': (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),
    'roberta': (RobertaModel, RobertaTokenizer, 'roberta-base'),
    'xlm': (XLMModel, XLMTokenizer, 'xlm-mlm-en-2048'),
    'albert': (AlbertModel, AlbertTokenizer, 'albert-base-v2'),
}


In [None]:
# Run configuration.
bs = 16  # batch size for the train and validation DataLoaders
fpt16 = False  # NOTE(review): not referenced anywhere in the visible notebook; presumably a mixed-precision flag
seed = 345026  # seed passed to seed_all() and to train_test_split
model_name = 'roberta'  # key into MODEL_CLASS

In [None]:
import random
def seed_all(seed_value):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU). When CUDA is available
    it also seeds the GPU generators and switches cuDNN to deterministic mode
    with benchmarking disabled.

    Args:
        seed_value: Integer seed applied to all generators.
    """
    # Python, NumPy and torch CPU generators, in that order.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    if not torch.cuda.is_available():
        return

    # Current GPU first, then every visible GPU.
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
# Seed all generators with the configured seed before any stochastic step.
seed_all(seed)

In [None]:
# Resolve the configured key into (model class, tokenizer class, checkpoint name).
# `model_name` is rebound from the string key to the model class because later
# cells pass it on as the class. The guard makes the cell safe to re-run: once
# `model_name` is a class, it is no longer a valid key into MODEL_CLASS.
if isinstance(model_name, str):
    model_name, model_tokenizer, pretrain = MODEL_CLASS[model_name]

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
def tokenize(df, tokenizer, pretrain, max_len=64):
    """Convert ``df.text`` into fixed-length token-id lists stored in ``df['token']``.

    Args:
        df: DataFrame with a ``text`` column; it is modified in place.
        tokenizer: Tokenizer *class* exposing ``from_pretrained``.
        pretrain: Checkpoint name handed to ``tokenizer.from_pretrained``.
        max_len: Length every id sequence is padded or truncated to
            (defaults to 64, the value previously hard-coded).

    Returns:
        The same ``df`` object, with a new ``token`` column of ``max_len``-long
        lists of ints.
    """
    tokenizer = tokenizer.from_pretrained(pretrain)

    ids = []
    for text in df.text:
        pieces = tokenizer.tokenize(text)
        piece_ids = tokenizer.convert_tokens_to_ids(pieces)
        # Wrap the ids with the model's special tokens.
        ids.append(tokenizer.build_inputs_with_special_tokens(piece_ids))

    # Pad and truncate at the end of each sequence. No pad value is passed, so the
    # pad_sequences default applies. Truncating at the end also cuts off the
    # trailing special token of any sequence longer than max_len.
    padded = pad_sequences(ids, maxlen=max_len, truncating='post', dtype='long', padding='post')
    df['token'] = padded.tolist()
    return df

    

In [None]:
# Add the padded token-id column to the training table.
df = tokenize(df, model_tokenizer,pretrain)

In [None]:
# Add the padded token-id column to the test table.
test = tokenize(test, model_tokenizer, pretrain)

In [None]:
def mask_create(t, pad_id=0):
    """Build the attention mask for one end-padded token-id sequence.

    Only the trailing run of ``pad_id`` values counts as padding. A ``pad_id``
    value elsewhere in the sequence is a real token and stays attended: the
    sequences are padded with 0, but RoBERTa's ``<s>`` start token also has
    id 0, so a plain ``id > 0`` test would mask out the first token of every
    sequence.

    Args:
        t: Sequence of token ids, padded at the end with ``pad_id``.
        pad_id: Value used for padding (0 by default).

    Returns:
        list[int]: 1 for every position up to and including the last
        non-padding token, 0 for the trailing padding. Same length as ``t``.
    """
    real_len = len(t)
    # Walk back from the end over the padding run.
    while real_len > 0 and t[real_len - 1] == pad_id:
        real_len -= 1
    return [1] * real_len + [0] * (len(t) - real_len)
# Derive the attention-mask column from the token ids, for the test and training tables.
for frame in (test, df):
    frame['mask'] = frame['token'].apply(mask_create)

In [None]:
# Preview the training table with its added columns.
df.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training table for validation; the split is seeded for reproducibility.
# NOTE(review): `valid` is rebound to a list of losses by the later fit_cycle call.
train ,valid = train_test_split(df, random_state=seed, test_size=0.2)

In [None]:
# Build the model inputs: token ids, attention masks and one-hot targets.
from keras.utils import to_categorical

# Size the one-hot encoding from the full label column so the train and
# validation targets have the same width even if a split lacks the highest label.
num_classes = int(df['label'].max()) + 1


def _stack_column(column):
    """Stack a column of equal-length int lists into one 2-D long tensor."""
    # Going through a single ndarray avoids the slow list-of-arrays tensor construction.
    return torch.tensor(np.array(column.tolist()), dtype=torch.long)


def _one_hot_targets(labels):
    """One-hot encode integer labels as a float tensor."""
    # BCEWithLogitsLoss needs floating-point targets shaped like the logits.
    return torch.tensor(to_categorical(labels, num_classes=num_classes), dtype=torch.float)


train_x = _stack_column(train['token'])
train_mask = _stack_column(train['mask'])
train_y = _one_hot_targets(train['label'].values)
valid_x = _stack_column(valid['token'])
valid_mask = _stack_column(valid['mask'])
valid_y = _one_hot_targets(valid['label'].values)

In [None]:
# Stack the per-row id and mask lists of the test table into 2-D tensors.
test_x = torch.tensor(np.array(test['token'].tolist()))
test_mask = torch.tensor(np.array(test['mask'].tolist()), dtype=torch.long)


In [None]:
from torch.utils.data import TensorDataset, RandomSampler, DataLoader

# Wrap the tensors in DataLoaders.
# Training batches are drawn in random order.
train_tensor = TensorDataset(train_x, train_mask, train_y)
trainloader = DataLoader(train_tensor, sampler=RandomSampler(train_tensor), batch_size=bs)

# Validation does not benefit from shuffling, so iterate in order.
valid_tensor = TensorDataset(valid_x, valid_mask, valid_y)
validloader = DataLoader(valid_tensor, shuffle=False, batch_size=bs)

# The test loader MUST be sequential: predictions are collected in loader order
# and have to line up with the rows of `test`. A random sampler here would
# scramble them. Batch size stays 1 as before.
test_tensor = TensorDataset(test_x, test_mask)
testloader = DataLoader(test_tensor, shuffle=False, batch_size=1)

In [None]:
# Define the model: a pretrained transformer encoder followed by one dense output layer.

class RobertaMultilayerClassification(torch.nn.Module):
    """Pretrained transformer encoder followed by a single linear output layer.

    The encoder's per-token hidden states are mean-pooled over the sequence
    (ignoring masked positions) and projected to ``out_layers`` logits, giving
    one row of logits per input sequence.

    Args:
        model: Encoder *class* exposing ``from_pretrained``.
        pretrain: Checkpoint name handed to ``model.from_pretrained``.
        out_layers: Number of output logits.
    """

    def __init__(self, model, pretrain, out_layers=1):
        super().__init__()
        self.model = model.from_pretrained(pretrain)
        # Read the encoder width from its config so checkpoints wider than 768
        # work too; fall back to the former hard-coded 768 when it is not exposed.
        encoder_config = getattr(self.model, 'config', None)
        hidden_size = getattr(encoder_config, 'hidden_size', 768)
        self.dense = torch.nn.Linear(hidden_size, out_layers)
        torch.nn.init.xavier_normal_(self.dense.weight)

    def forward(self, ids, attention_mask=None, token_type_ids=None):
        """Return logits of shape ``(batch, out_layers)`` for the token ids ``ids``."""
        encoder_kwargs = {'input_ids': ids, 'attention_mask': attention_mask}
        # Only forward token_type_ids when given: not every encoder accepts the argument.
        if token_type_ids is not None:
            encoder_kwargs['token_type_ids'] = token_type_ids
        encoder_output = self.model(**encoder_kwargs)
        pooled = self.pool_hidden_state(encoder_output, attention_mask)
        return self.dense(pooled)

    def frezze_parameters(self, index=0):
        """Freeze encoder weights: all of them when ``index`` is falsy, else only parameter ``index``."""
        self._set_requires_grad(index, False)

    def unfree_parameters(self, index=0):
        """Unfreeze encoder weights: all of them when ``index`` is falsy, else only parameter ``index``."""
        self._set_requires_grad(index, True)

    def _set_requires_grad(self, index, flag):
        """Set ``requires_grad`` on one encoder parameter or on all of them."""
        # parameters() is a generator method, so materialize it before indexing.
        params = list(self.model.parameters())
        if index:
            params[index].requires_grad = flag
        else:
            for param in params:
                param.requires_grad = flag

    def pool_hidden_state(self, last_hidden_state, attention_mask=None):
        """Mean-pool the encoder's token states into one vector per sequence.

        Args:
            last_hidden_state: Encoder output whose element 0 holds the
                per-token states, assumed ``(batch, seq, hidden)``.
            attention_mask: Optional ``(batch, seq)`` 0/1 mask; positions with
                0 are left out of the average.

        Returns:
            Tensor of shape ``(batch, hidden)``.
        """
        hidden = last_hidden_state[0]
        if attention_mask is None:
            return hidden.mean(dim=1)
        weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * weights).sum(dim=1)
        # Clamp so a fully masked row divides by 1 instead of 0.
        counts = weights.sum(dim=1).clamp(min=1.0)
        return summed / counts
    

In [None]:
# One output logit per distinct label value in the training table.
labels = len(df['label'].unique())
# `model_name` holds the encoder class at this point (rebound when MODEL_CLASS was unpacked).
roberta = RobertaMultilayerClassification(model_name,pretrain,labels)

In [None]:
# Display the module structure of the wrapped pretrained encoder.
roberta.model

In [None]:

# AdamW over every parameter of the wrapper (encoder and dense layer), lr 2e-5, weight decay 0.01.
optim = torch.optim.AdamW(params=roberta.parameters(), lr=2e-5,weight_decay=0.01)

In [None]:
from tqdm import tqdm
from torch.autograd import Variable
def fit_cycle(model, epochs, optim, train_loader, valid_loader, device):
    """Train ``model`` for ``epochs`` epochs, then run one validation pass.

    Args:
        model: Module called as ``model(ids, attention_mask=mask)`` and
            returning logits shaped like the targets.
        epochs: Number of passes over ``train_loader``.
        optim: Optimizer already bound to ``model``'s parameters.
        train_loader: Iterable of ``(ids, mask, one_hot_targets)`` batches.
        valid_loader: Iterable of batches with the same layout.
        device: ``torch.device`` (or device string) to run on; CPU works too.

    Returns:
        tuple: ``(model, loss_set, valid_set)``. ``loss_set`` holds the
        sample-weighted running mean of the training loss after each batch
        (the running mean restarts every epoch; the list spans all epochs).
        ``valid_set`` is the same running mean over the validation pass.
        Both are lists of plain floats. The model is returned in eval mode.
    """
    # Instantiate the loss once and call the instance on (logits, targets).
    criterion = torch.nn.BCEWithLogitsLoss()
    model.to(device)

    loss_set = []
    for _ in range(epochs):
        model.train()
        loss_sum, steps = 0.0, 0
        for x_train, x_mask, y_train in tqdm(train_loader):
            # .to(device) works on CPU as well, unlike .cuda().
            x_train = x_train.to(device)
            x_mask = x_mask.to(device)
            # BCEWithLogitsLoss needs float targets.
            y_train = y_train.to(device).float()

            optim.zero_grad()
            ypred = model(x_train, attention_mask=x_mask)
            loss = criterion(ypred, y_train)
            loss.backward()
            optim.step()

            # The loss is a batch mean, so weight it by the batch size before
            # dividing by the sample count. .item() detaches it from the autograd
            # graph so graphs are not kept alive across iterations.
            batch_size = y_train.shape[0]
            loss_sum += loss.item() * batch_size
            steps += batch_size
            loss_set.append(loss_sum / steps)

    model.eval()
    valid_set = []
    loss_sum, steps = 0.0, 0
    # The forward pass itself runs without gradient tracking.
    with torch.no_grad():
        for x_valid, x_mask, y_valid in tqdm(valid_loader):
            x_valid = x_valid.to(device)
            x_mask = x_mask.to(device)
            y_valid = y_valid.to(device).float()

            ypred = model(x_valid, attention_mask=x_mask)
            loss = criterion(ypred, y_valid)

            batch_size = y_valid.shape[0]
            loss_sum += loss.item() * batch_size
            steps += batch_size
            valid_set.append(loss_sum / steps)

    return model, loss_set, valid_set
        
        

In [None]:

# Pick the first GPU when CUDA is available, otherwise the CPU, and report which.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(use_cuda)

In [None]:
# Train for one epoch, then validate.
# NOTE(review): this rebinds `valid` (previously the validation DataFrame) and `loss`
# to the lists returned by fit_cycle.
roberta, loss, valid = fit_cycle(roberta, 1,optim, trainloader, validloader, device = device)

In [None]:
import matplotlib.pyplot as plt
# Plot the per-batch loss curves returned by fit_cycle. Labels and a legend
# make the training and validation curves distinguishable.
fig, ax = plt.subplots()
ax.plot(loss, label='train')
ax.plot(valid, label='validation')
ax.set_xlabel('batch')
ax.set_ylabel('running mean loss')
ax.legend()
plt.show()

In [None]:
def predict(model, loader, device=None):
    """Run ``model`` over every batch of ``loader`` and collect the raw outputs.

    Args:
        model: Module called as ``model(ids, attention_mask=mask)``.
        loader: Iterable of ``(ids, mask)`` batches.
        device: Device to run on. Defaults to CUDA when available, else CPU,
            so the function also works on machines without a GPU.

    Returns:
        list: One output tensor per batch, in loader order, left on ``device``.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model.to(device)
    # Inference mode: turns off dropout so repeated calls give identical outputs.
    model.eval()

    outputs = []
    with torch.no_grad():
        for x_batch, mask_batch in tqdm(loader):
            logits = model(x_batch.to(device), attention_mask=mask_batch.to(device))
            outputs.append(logits)
    return outputs
# Collect the model outputs for every test batch (one tensor per batch).
ypred = predict(roberta, testloader)


In [None]:
# Move every batch output to the CPU, stack them into one array and flatten
# each entry into a single row.
pred = np.array([np.array(batch_logits.cpu()) for batch_logits in ypred])
pred = pred.reshape(len(pred), -1)

In [None]:
# Histogram of the argmax column index of each prediction row.
plt.hist(np.argmax(pred, axis=1))

In [None]:
# Inspect the raw model outputs of the first five test batches.
ypred[:5]