In [None]:
# List every file found under the Kaggle input directory so the dataset paths
# used by the later cells can be checked against what is actually mounted.
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Print the full path (directory + file name) of each file.
        print(os.path.join(dirname, filename))

In [None]:
# !pip install ktrain

## All imports

In [None]:
import numpy as np # linear algebra
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
import os
from torch.utils.data import TensorDataset, DataLoader
import re
import string
import transformers
import torch.optim as optim
from tqdm.autonotebook import tqdm
from sklearn.metrics import f1_score,precision_score,recall_score


## Reading Data

In [None]:
# Load the three competition CSV files. For each frame, rows containing any
# missing value are dropped and the index is renumbered from zero.
train=pd.read_csv("../input/tweet-sentiment-extraction/train.csv").dropna().reset_index(drop=True)
test=pd.read_csv("../input/tweet-sentiment-extraction/test.csv").dropna().reset_index(drop=True)
sample=pd.read_csv("../input/tweet-sentiment-extraction/sample_submission.csv").dropna().reset_index(drop=True)

## Sentiment Classification using BERT

In [None]:
# Fine-tune a BERT sentiment classifier with ktrain and report its accuracy.
# DATA_PATH is a .csv file with a 'text' column and a 'sentiment' label column;
# the .csv passed as val_filepath must provide the same two columns.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# The variable CUDA honours is CUDA_VISIBLE_DEVICES; "VISIBLE_DEVICES" has no effect.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import ktrain
from ktrain import text
DATA_PATH = '../input/tweet-sentiment-extraction/train.csv'
NUM_WORDS = 25000
MAXLEN = 128
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_csv(DATA_PATH,
                      'text',
                      label_columns = ["sentiment"],
                      val_filepath='../input/tweet-sentiment-extraction/test.csv', # if None, 10% of data will be used for validation
                      max_features=NUM_WORDS, maxlen=MAXLEN,
                      ngram_range=1,preprocess_mode='bert')
model=text.text_classifier('bert',(x_train,y_train),preproc=preproc)
learner=ktrain.get_learner(model,(x_train,y_train),(x_test,y_test),batch_size=32)
learner.fit(3e-5,3,cycle_len=1,cycle_mult=1)
# learner.lr_find(max_epochs=3)
# learner.lr_plot()
# Wrap the trained model so it can be applied to raw text, then label the test tweets.
predictor=ktrain.get_predictor(learner.model, preproc)
output=predictor.predict(np.array(test['text']))
# Accuracy = fraction of test tweets whose predicted label equals the true sentiment.
count=0
for i,samp in enumerate(output):
    if samp==test['sentiment'].iloc[i]:
        count+=1
print(f'accuracy on test data: {count/len(test)}')

In [None]:
# Install torchsummary, which the "Model Summary" cell imports.
!pip install torchsummary

## Sentiment Classification using DistilBERT

In [None]:
# Fine-tune a DistilBERT sentiment classifier with ktrain on the raw tweet text.
# Inputs are the 'text' column and targets the 'sentiment' column of each frame.
x_train=train['text'].values
x_test=test['text'].values
y_train=train['sentiment'].values
y_test=test['sentiment'].values
# NOTE(review): other cells compare the 'sentiment' values against the strings
# 'positive'/'negative', while class_names lists 0/1/2 — confirm how ktrain
# reconciles string labels with these class names.
trn, val, preproc = text.texts_from_array(x_train=x_train, y_train=y_train,
                                          x_test=x_test, y_test=y_test,
                                          class_names=[0,1,2],
                                          preprocess_mode='distilbert',
                                          maxlen=150)
# Show the classifier architectures ktrain offers.
text.print_text_classifiers()
model = text.text_classifier('distilbert', train_data=trn, preproc=preproc)
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
# Train with the one-cycle policy: learning rate 3e-5 for 4 epochs.
learner.fit_onecycle(3e-5, 4)
# Predictor bundling the trained model with its preprocessing.
p = ktrain.get_predictor(model, preproc)

In [None]:
# Predict sentiment labels for the test tweets. The predictor applies its own
# preprocessing, so it is given the raw texts (x_test) rather than `val`,
# which is the already-preprocessed validation dataset.
p.predict(x_test)

In [None]:
# Score the BERT predictions (`output`) against the true test sentiments.
# scikit-learn metrics take (y_true, y_pred) in that order: passing them the
# other way round swaps precision with recall and weights the 'weighted'
# averages by the predicted instead of the true class frequencies.
f1macro=f1_score(test['sentiment'],output,average='macro')
f1avg=f1_score(test['sentiment'],output,average='weighted')
recall=recall_score(test['sentiment'],output,average='macro')
recallw=recall_score(test['sentiment'],output,average='weighted')
precision=precision_score(test['sentiment'],output,average='macro')
precisionw=precision_score(test['sentiment'],output,average='weighted')
print(f'f1macro: {f1macro} recall: {recall} precision :{precision}')
print("below result for average")
print(f'f1avg: {f1avg} recall: {recallw} precision :{precisionw}')

## Data cleaning

In [None]:
def clean(text):
    """Normalise a piece of text for modelling.

    The input is converted to ``str`` and lower-cased, then the following are
    removed in order: square-bracketed spans, URLs (``http(s)://...`` and
    ``www....``), angle-bracketed tags, punctuation, newlines and digits.

    Args:
        text: Any object; it is passed through ``str()`` first.

    Returns:
        The cleaned string. Whitespace left behind by removed pieces is kept.
    """
    text = str(text).lower()
    # Raw string literals keep the regex escapes (\[, \S, \d) from being parsed
    # as invalid Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\d', '', text)
    return text

## Q&A model supervised

In [None]:
class QAModel(nn.Module):
    """BERT encoder with a linear head scoring every token as span start / span end.

    The head consumes the concatenation of the last two BERT hidden layers
    (2 * 768 features per token) and emits two logits per token.
    """

    def __init__(self):
        super(QAModel,self).__init__()
        # output_hidden_states=True makes BERT also return the hidden states of
        # every layer, which forward() needs for the two-layer concatenation.
        self.bert=transformers.BertModel.from_pretrained("../input/bert-base-uncased",output_hidden_states=True)
        self.dropout=nn.Dropout(0.2)
        self.layer1=nn.Linear(768*2,2)
        # Not applied in forward(); available to turn the returned logits into
        # probabilities over token positions.
        self.softmax=nn.Softmax(dim=-1)

    def forward(self, ids,attn,token_type):
        """Return per-token start and end logits.

        Args:
            ids: Token id tensor, indexed (batch, position).
            attn: Attention mask passed to BERT as ``attention_mask``.
            token_type: Segment ids passed to BERT as ``token_type_ids``.

        Returns:
            Tuple ``(start_logits, end_logits)``, each with one raw
            (un-normalised) score per input position.
        """
        outputs=self.bert(ids,attention_mask=attn,token_type_ids=token_type)
        # Index 2 holds the per-layer hidden states. Joining the last two layers
        # gives the 768*2 features per token that layer1 was built for; the last
        # hidden state alone (768 features) does not fit that layer.
        hidden_states=outputs[2]
        out=torch.cat((hidden_states[-1],hidden_states[-2]),dim=-1)
        out=self.dropout(out)
        out=self.layer1(out)
        start_logits, end_logits = out.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
        # Raw logits are returned because loss_fn applies CrossEntropyLoss to them.
        return start_logits,end_logits
        

## Model Summary

In [None]:
# Build the Q&A model on the available device and print its layer structure.
from torchsummary import summary  # imported, but `summary` is not called in this cell
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model1=QAModel().to(device)
print(model1)

BERT vocabulary ids of the sentiment words and special tokens used in preprocessing:

* negative: 4997
* positive: 3893
* neutral:  8699
* [CLS]   : 101
* [SEP]   : 102

## Preprocessing
Builds the model inputs (`token_ids`, `attn_masks`, `token_type_ids`) and the one-hot start and end position targets of the selected span.

In [None]:
# Module-level views of the features built by the most recent preprocess() call.
# They are refreshed (not appended to) on every call.
token_ids=[]
attn_masks=[]
token_type_ids=[]
start_logits=[]
end_logits=[]

# BERT vocabulary ids of the special tokens and of the three sentiment words.
CLS_ID=101
SEP_ID=102
SENTIMENT_IDS={'positive':3893,'negative':4997,'neutral':8699}


def _find_span(haystack,needle):
    """Return (first, last) inclusive indices of `needle` inside `haystack`, or None."""
    width=len(needle)
    if width==0:
        return None
    for pos in range(len(haystack)-width+1):
        if haystack[pos:pos+width]==needle:
            return pos,pos+width-1
    return None


def preprocess(data,typ,tokenizer=None,maxlen=115):
    """Add model inputs (and, for training data, span targets) to `data` in place.

    Every row is encoded as
    ``[CLS] <sentiment word> [SEP] <tweet tokens> [SEP] <padding>``
    with a fixed length of ``maxlen + 2``.

    Args:
        data: DataFrame with 'text' and 'sentiment' columns, plus
            'selected_text' when ``typ == 'train'``.
        typ: ``'train'`` to also build the one-hot 'start'/'end' targets; any
            other value builds the inputs only.
        tokenizer: Object with an ``encode_plus(text)`` method returning a
            mapping with 'input_ids' wrapped in [CLS]/[SEP]. Defaults to the
            'bert-base-uncased' BertTokenizer.
        maxlen: Length budget; each encoded row has ``maxlen + 2`` positions.

    Returns:
        None. The columns 'token_ids', 'token_type', 'attn_masks' (and
        'start', 'end' for training data) are written onto `data`.
    """
    if tokenizer is None:
        tokenizer=transformers.BertTokenizer.from_pretrained('bert-base-uncased')

    seq_len=maxlen+2
    prefix_len=3                      # [CLS], sentiment word, [SEP]
    max_text=seq_len-prefix_len-1     # room left for tweet tokens before the final [SEP]

    # Per-call accumulators, so a second call never sees rows from a previous one.
    rows_ids,rows_type,rows_mask,rows_start,rows_end=[],[],[],[],[]

    for row in range(len(data)):
        # Tweet tokens without the tokenizer's own [CLS]/[SEP], cut to the budget.
        text_ids=list(tokenizer.encode_plus(data['text'].iloc[row])['input_ids'][1:-1])[:max_text]
        n_text=len(text_ids)

        # Any sentiment other than positive/negative is treated as neutral.
        sentiment_id=SENTIMENT_IDS.get(data['sentiment'].iloc[row],SENTIMENT_IDS['neutral'])

        n_real=prefix_len+n_text+1
        pad=seq_len-n_real
        rows_ids.append([CLS_ID,sentiment_id,SEP_ID]+text_ids+[SEP_ID]+[0]*pad)
        rows_mask.append([1]*n_real+[0]*pad)
        # Segment layout kept as originally designed: 1 for the sentiment prefix,
        # 0 for the tweet, 1 for the (masked-out) padding.
        rows_type.append(([1]*prefix_len+[0]*(n_text+2)+[1]*pad)[:seq_len])

        if typ=='train':
            selected_ids=list(tokenizer.encode_plus(data['selected_text'].iloc[row])['input_ids'][1:-1])
            span=_find_span(text_ids,selected_ids)
            if span is None:
                # Selected text not found token-for-token (e.g. it starts mid-word
                # or was truncated away): fall back to the whole tweet.
                span=(0,max(n_text-1,0))
            # Shift by the prefix so the targets index the final input sequence.
            start_vec=[0]*seq_len
            start_vec[prefix_len+span[0]]=1
            end_vec=[0]*seq_len
            end_vec[prefix_len+span[1]]=1    # inclusive position of the last selected token
            rows_start.append(start_vec)
            rows_end.append(end_vec)

    if typ=='train':
        data['start']=rows_start
        data['end']=rows_end
        start_logits[:]=rows_start
        end_logits[:]=rows_end
    data['token_ids']=rows_ids
    data['token_type']=rows_type
    data['attn_masks']=rows_mask
    token_ids[:]=rows_ids
    token_type_ids[:]=rows_type
    attn_masks[:]=rows_mask

In [None]:
# Add the model input columns and the start/end span targets to the training frame (in place).
preprocess(train,'train')

## Data Loader

In [None]:
class Loader:
    """Map-style dataset over a preprocessed DataFrame; one row becomes one dict of tensors."""

    # (output key, DataFrame column, tensor dtype) for every field of a sample.
    _FIELDS=(
        ('ids','token_ids',torch.long),
        ('token_type_ids','token_type',torch.long),
        ('attn_masks','attn_masks',torch.long),
        ('start','start',torch.float),
        ('end','end',torch.float),
    )

    def __init__(self,data):
        # Frame holding the columns named in _FIELDS.
        self.data=data

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self,idx):
        """Return the row at position `idx` as a dict of tensors."""
        sample={}
        for key,column,dtype in self._FIELDS:
            sample[key]=torch.tensor(self.data[column].iloc[idx],dtype=dtype)
        return sample
        

## Checking gpu

In [None]:
# Name of the device to run on: the GPU when CUDA is usable, otherwise the CPU.
device='cuda' if torch.cuda.is_available() else 'cpu'

## Loss function

In [None]:
def loss_fn(start_logits,end_logits,start_pos,end_pos):
    """Sum of the cross-entropy losses for the span start and the span end.

    `start_pos` and `end_pos` are one-hot style targets; the class index used
    for each row is the position of its maximum along the last dimension.
    """
    criterion=nn.CrossEntropyLoss()
    start_target=start_pos.argmax(dim=-1)
    end_target=end_pos.argmax(dim=-1)
    start_loss=criterion(start_logits,start_target)
    end_loss=criterion(end_logits,end_target)
    return start_loss+end_loss

In [None]:
def training(model,train_data,device,optimizer):
    """Train `model` for one pass over `train_data` and return the summed batch loss.

    Each batch is a mapping with 'ids', 'attn_masks', 'token_type_ids',
    'start' and 'end' tensors; all are moved to `device` before use.
    """
    model.train()
    running_loss=0
    progress=tqdm(train_data,total=len(train_data))
    for batch in progress:
        optimizer.zero_grad()
        ids=batch['ids'].to(device)
        mask=batch['attn_masks'].to(device)
        segments=batch['token_type_ids'].to(device)
        start_scores,end_scores=model(ids,mask,segments)
        start_target=batch['start'].to(device)
        end_target=batch['end'].to(device)
        batch_loss=loss_fn(start_scores,end_scores,start_target,end_target)
        batch_loss.backward()
        optimizer.step()
        running_loss+=batch_loss.item()
    return running_loss

def evaluation(model,eval_data,device,optimizer=None):
    """Run `model` over `eval_data` without gradients and collect predicted positions.

    Args:
        model: Callable returning ``(start_logits, end_logits)`` for a batch.
        eval_data: Iterable of batches; each batch is a mapping with 'ids',
            'attn_masks' and 'token_type_ids' tensors.
        device: Device the batch tensors are moved to.
        optimizer: Unused. Accepted only so existing four-argument calls keep
            working; inference needs no optimizer.

    Returns:
        Tuple ``(total_loss, start_positions, end_positions, eval_data)``.
        ``total_loss`` is always 0 because no loss is computed here; the two
        position lists hold one integer array per batch (arg-max over the last
        dimension of the logits).
    """
    model.eval()
    start_positions=[]
    end_positions=[]
    total_loss=0
    with torch.no_grad():
        for batch in eval_data:
            start,end=model(batch['ids'].to(device),batch['attn_masks'].to(device),
                            batch['token_type_ids'].to(device))
            # Most likely start / end position for every sample of the batch.
            start_positions.append(np.argmax(start.cpu().numpy(),axis=-1))
            end_positions.append(np.argmax(end.cpu().numpy(),axis=-1))
    return total_loss,start_positions,end_positions,eval_data


In [None]:
# Ask PyTorch to release the GPU memory it is caching before the Q&A model is trained.
torch.cuda.empty_cache()

## Training model

In [None]:
# Hold out 20% of the preprocessed training rows for evaluation. The fixed
# random_state keeps the split, and therefore the reported results, the same
# on every run.
# NOTE: this rebinds `train` and `test`; from here on `test` is the held-out
# part of the training data, not the frame read from test.csv.
train,test=train_test_split(train,test_size=0.2,random_state=42)
train_data=Loader(train)
test_data=Loader(test)
train_data=torch.utils.data.DataLoader(train_data,batch_size=16,shuffle=True)
test_data=torch.utils.data.DataLoader(test_data,batch_size=16,shuffle=False)
epochs=10
model=QAModel()
model.to(device)
optimizer=optim.Adam(model.parameters(),lr=1e-5)
for _ in range(epochs):
    # training() returns the loss summed over all batches of the epoch.
    loss=training(model,train_data,device,optimizer)
    print(loss)
# Persist the trained weights so the evaluation cell can reload them.
torch.save(model.state_dict(),'bertmodel.pkl')


## Evaluating function

In [None]:
# Rebuild the model and restore the weights saved by the training cell.
model=QAModel()
# load_state_dict() copies the saved weights into the model; state_dict() only
# returns the current weights and would leave the model untrained.
# map_location lets weights saved on a GPU be restored on a CPU-only machine.
model.load_state_dict(torch.load('./bertmodel.pkl',map_location=device))
model.to(device)
# The held-out frame normally already carries the model input columns, because
# it was split off after preprocessing; only build them when they are missing.
if 'token_ids' not in test.columns:
    preprocess(test,'test')
test_data=Loader(test)
test_data=torch.utils.data.DataLoader(test_data,batch_size=16,shuffle=False)
loss,start_idx,end_idx,data=evaluation(model,test_data,device,optimizer)

In [None]:
# Flatten the per-batch predictions into one array each, then count the
# predictions whose start position lies after their end position.
start=np.concatenate(start_idx)
end=np.concatenate(end_idx)
count=int((start>end).sum())
count

## Selecting the extracted text

In [None]:
selec_text=[]
def result(start,end,tokenizer=None):
    """Fill `selec_text` with the predicted text span of every row of `test`.

    `start` and `end` are token positions in each row's 'token_ids' sequence
    (`end` is inclusive), so the span is recovered by decoding those token ids
    rather than by slicing the characters of the raw tweet.

    Args:
        start: Predicted start position per row.
        end: Predicted (inclusive) end position per row.
        tokenizer: Object with a ``decode(ids, skip_special_tokens=True)``
            method. Defaults to the 'bert-base-uncased' BertTokenizer.

    Returns:
        None. Results are written to the module-level list `selec_text`; rows
        with an invalid span (start after end) or an empty decoded span fall
        back to the full tweet text.
    """
    if tokenizer is None:
        tokenizer=transformers.BertTokenizer.from_pretrained('bert-base-uncased')
    # Start from an empty list so repeated calls do not pile up predictions.
    del selec_text[:]
    for i in range(len(start)):
        full_text=test['text'].iloc[i]
        first,last=int(start[i]),int(end[i])
        if first>last:
            selec_text.append(full_text)
            continue
        span_ids=test['token_ids'].iloc[i][first:last+1]
        decoded=tokenizer.decode(span_ids,skip_special_tokens=True)
        # A span made only of special/padding tokens decodes to nothing.
        selec_text.append(decoded if decoded.strip() else full_text)
result(start,end)
# NOTE(review): `sample` must have exactly one row per prediction for this
# assignment to succeed, and the file is written without a header row —
# confirm both against the expected submission format.
sample['selected_text']=selec_text
sample.to_csv('sample_submission.csv',header=None,index=None)

## Jaccard score

In [None]:
def _jaccard(truth,pred):
    """Word-level Jaccard similarity of two strings, compared case-insensitively."""
    truth_words=set(str(truth).lower().split())
    pred_words=set(str(pred).lower().split())
    union=truth_words|pred_words
    if not union:
        # Both strings are empty: avoid dividing by zero.
        return 0.0
    return len(truth_words&pred_words)/len(union)

# Mean Jaccard similarity between the true selected text of each held-out row
# and the predicted span stored at the same position in `selec_text`.
score=0
for i in range(len(selec_text)):
    score+=_jaccard(test['selected_text'].iloc[i],selec_text[i])
score=score/len(selec_text) if selec_text else 0.0
score