In [3]:

 # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/tweet-sentiment-extraction/sample_submission.csv
/kaggle/input/tweet-sentiment-extraction/train.csv
/kaggle/input/tweet-sentiment-extraction/test.csv
/kaggle/input/bert-base-uncased/config.json
/kaggle/input/bert-base-uncased/pytorch_model.bin
/kaggle/input/bert-base-uncased/vocab.txt


In [None]:
# !pip install ktrain

## All imports

In [4]:
import numpy as np # linear algebra
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
import os
from torch.utils.data import TensorDataset, DataLoader
import re
import string
import transformers
import torch.optim as optim
from tqdm.autonotebook import tqdm
from sklearn.metrics import f1_score,precision_score,recall_score


## Reading Data

In [112]:
train=pd.read_csv("../input/tweet-sentiment-extraction/train.csv").dropna().reset_index(drop=True)
test=pd.read_csv("../input/tweet-sentiment-extraction/test.csv").dropna().reset_index(drop=True)
sample=pd.read_csv("../input/tweet-sentiment-extraction/sample_submission.csv").dropna().reset_index(drop=True)

In [25]:
# data path is .csv file with 2 columns, having name 'text', 'sentiment'.
# test is also .csv file with 1 column, name 'text' only
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["VISIBLE_DEVICES"]="0"
import ktrain
from ktrain import text
DATA_PATH = '../input/tweet-sentiment-extraction/train.csv'
NUM_WORDS = 25000
MAXLEN = 128
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_csv(DATA_PATH,
                      'text',
                      label_columns = ["sentiment"],
                      val_filepath='../input/tweet-sentiment-extraction/test.csv', # if None, 10% of data will be used for validation
                      max_features=NUM_WORDS, maxlen=MAXLEN,
                      ngram_range=1,preprocess_mode='bert')
model=text.text_classifier('bert',(x_train,y_train),preproc=preproc)
learner=ktrain.get_learner(model,(x_train,y_train),(x_test,y_test),batch_size=32)
learner.fit(3e-5,3,cycle_len=1,cycle_mult=1)
# learner.lr_find(max_epochs=3)
# learner.lr_plot()
predictor=ktrain.get_predictor(learner.model, preproc)
output=predictor.predict(np.array(test['text']))
count=0;
for i,samp in enumerate(output):
    if samp==test['sentiment'].iloc[i]:
        count+=1
print(f'accuracy on test data: {count/len(test)}')

detected encoding: utf-8 (if wrong, set manually)
preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


Is Multi-Label? False
maxlen is 128
done.
Epoch 1/3
Epoch 2/3
Epoch 3/3
accuracy on test data: 0.7891907187323146


In [23]:
x_test

[array([[ 101, 2130, 2295, ...,    0,    0,    0],
        [ 101, 2204, 2000, ...,    0,    0,    0],
        [ 101, 1045, 2081, ...,    0,    0,    0],
        ...,
        [ 101, 4283,  999, ...,    0,    0,    0],
        [ 101, 2138, 1045, ...,    0,    0,    0],
        [ 101, 1045, 2428, ...,    0,    0,    0]]),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])]

## Distil bert

In [18]:
x_train=train['text'].values
x_test=test['text'].values
y_train=train['sentiment'].values
y_test=test['sentiment'].values
trn, val, preproc = text.texts_from_array(x_train=x_train, y_train=y_train,
                                          x_test=x_test, y_test=y_test,
                                          class_names=[0,1,2],
                                          preprocess_mode='distilbert',
                                          maxlen=150)
text.print_text_classifiers()
model = text.text_classifier('distilbert', train_data=trn, preproc=preproc)
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
learner.fit_onecycle(3e-5, 4)
p = ktrain.get_predictor(model, preproc)

preprocessing train...
language: en
train sequence lengths:
	mean : 13
	95percentile : 25
	99percentile : 28




Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 13
	95percentile : 25
	99percentile : 28


task: text classification
fasttext: a fastText-like model [http://arxiv.org/pdf/1607.01759.pdf]
logreg: logistic regression using a trainable Embedding layer
nbsvm: NBSVM model [http://www.aclweb.org/anthology/P12-2018]
bigru: Bidirectional GRU with pretrained fasttext word vectors [https://fasttext.cc/docs/en/crawl-vectors.html]
standard_gru: simple 2-layer GRU with randomly initialized embeddings
bert: Bidirectional Encoder Representations from Transformers (BERT) from keras_bert [https://arxiv.org/abs/1810.04805]
distilbert: distilled, smaller, and faster BERT from Hugging Face transformers [https://arxiv.org/abs/1910.01108]
Is Multi-Label? False
maxlen is 150
done.


begin training using onecycle policy with max lr of 3e-05...
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
p.predict(val)

In [None]:
test['sentiment']

In [None]:
f1macro=f1_score(output,test['sentiment'],average='macro')
f1avg=f1_score(output,test['sentiment'],average='weighted')
recall=recall_score(output,test['sentiment'],average='macro')
recallw=recall_score(output,test['sentiment'],average='weighted')
precision=precision_score(output,test['sentiment'],average='macro')
precisionw=precision_score(output,test['sentiment'],average='weighted')
print(f'f1macro: {f1macro} recall: {recall} precision :{precision}')
print("below result for average")
print(f'f1avg: {f1avg} recall: {recallw} precision :{precisionw}')

## Q&A model supervised

In [95]:
class QAModel(nn.Module):
    
    def __init__(self):
        super(QAModel,self).__init__()
        self.bert=transformers.BertForQuestionAnswering.from_pretrained("../input/bert-base-uncased")
        # make it correct
#         self.dropout=nn.Dropout(0.2)
#         self.layer1=nn.Linear(768*2,2)
        self.softmax=nn.Softmax()
    
    def forward(self, ids,attn,token_type):
        start,end=self.bert(ids,attention_mask=attn,token_type_ids=token_type)
#         out=self.dropout(out2)
#         out=self.layer1(out)
#         start=self.softmax(start)
#         end=self.softmax(end)
        

        return start, end
        

## unsupervised model

In [7]:
## coming soon

## Training, evaluation, inference function

In [8]:
# def training():
    
    
# def evaluate():
    

# def inference():



## Clean Data

In [96]:
def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\d', '', text)
    return text

* negative: 4997
* positive: 3893
* neutral:  8699
* [CLS]   : 101
* [SEP]   : 102

## token_ids, attn_masks, token_type_ids, start & end logits

In [108]:
token_ids=[]
attn_masks=[]
token_type_ids=[]
start_logits=[]
end_logits=[]
def preprocess(data,typ):
    
    maxlen=115
    tokenizer=transformers.BertTokenizer.from_pretrained('bert-base-uncased')
    
    for i,_ in enumerate(range(len(data))):
        
        #finding start & end logits
        out1=tokenizer.encode_plus(data['text'].iloc[i])
        if typ=='train':
            out2=tokenizer.encode_plus(data['selected_text'].iloc[i])
            l=len(out2['input_ids'])
        if typ=='train':
            for i,_ in enumerate(out1['input_ids']):
                if out1['input_ids'][i:i+l-2]==out2['input_ids'][1:-1]:
                    break
            start=i
            end=i+l-2
            temp=[0]*(maxlen+2)
            temp[start]=1
            start_logits.append(temp)
            temp=[0]*(maxlen+2)
            temp[end-1]=1
            end_logits.append(temp)
        #making length to maximum
        if data['sentiment'].iloc[i]=='positive':
            sentiment_id=4893
        elif data['sentiment'].iloc[i]=='negative':
            sentiment_id=4893
        else:
            sentiment_id=4893
        tkn_len=len(out1['input_ids'])
        out1['input_ids']=[101]+[sentiment_id]+[102]+out1['input_ids'][1:]+(maxlen-tkn_len)*[0]
        token_ids.append(out1['input_ids'])
        
        token_type_ids.append(3*[1]+out1['token_type_ids']+(maxlen-tkn_len-1)*[1])
        attn_masks.append(2*[1]+out1['attention_mask']+(maxlen-tkn_len)*[0])
    if typ=='train':
        data['start']=start_logits
        data['end']=end_logits
    data['token_ids']=token_ids
    data['token_type']=token_type_ids
    data['attn_masks']=attn_masks
       
    
#     start_logits=[]
#     end_logits=[]
#     for i in range(len(train)):
#         a=train['text'].iloc[i].index(train['selected_text'].iloc[i])
#         start_logits.append(a)
#         end_logits.append(a+len(train['selected_text'].iloc[i]))
    

In [11]:
preprocess(train,'train')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [12]:
np.shape(train['attn_masks'][29])


(117,)

## Train 

In [13]:
class Loader:
    def __init__(self,data):
        self.data=data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,idx):
        data= {
                'ids':torch.tensor(self.data['token_ids'].iloc[idx],dtype=torch.long),
                'token_type_ids':torch.tensor(self.data['token_type'].iloc[idx],dtype=torch.long),
                'attn_masks':torch.tensor(self.data['attn_masks'].iloc[idx],dtype=torch.long),
                'start':torch.tensor(self.data['start'].iloc[idx],dtype=torch.float),
                'end':torch.tensor(self.data['end'].iloc[idx],dtype=torch.float)
#                 'ids':torch.tensor(self.data['token_ids'][idx],dtype=torch.long),
        }
        return data
        

## Training Function

In [14]:
if torch.cuda.is_available():
    device='cuda'
else:
    device='cpu'

## Loss function

In [31]:
def loss_fn(start_logits,end_logits,start_pos,end_pos):
    f=nn.CrossEntropyLoss()
    loss1=f(start_logits,torch.argmax(start_pos,dim=-1))
    loss2=f(end_logits,torch.argmax(end_pos,dim=-1))
    return loss1+loss2

In [29]:
def training(model,train_data,device,optimizer):
    total_loss=0
    model.train()
    train_data=tqdm(train_data,total=len(train_data))
    for i,train_data1 in enumerate(train_data):
        optimizer.zero_grad()
        start,end=model(train_data1['ids'].to(device),train_data1['attn_masks'].to(device),train_data1['token_type_ids'].to(device))
        loss_=loss_fn(start,end,train_data1['start'].to(device),train_data1['end'].to(device))
        loss_.backward()
        optimizer.step()
        total_loss+=loss_.item()
    return total_loss

def evaluation(model,eval_data,device,optimizer):
    model.eval()
    start_logits=[]
    end_logits=[]
    total_loss=0
    with torch.no_grad():
        for eval_data1 in eval_data:
            optimizer.zero_grad()
            start,end=model(eval_data1['ids'].to(device),eval_data1['attn_masks'].to(device),
                            eval_data1['token_type_ids'].to(device))
#             loss_=loss(start,end,eval_data1['start'].to(device),
#                        eval_data1['end'].to(device))
#             total_loss+=loss_.item()
            
            start=(start.cpu().detach().numpy())
            end=(end.cpu().detach().numpy())
            start=np.argmax(start,axis=-1)
            end=np.argmax(end,axis=-1)
            start_logits.append(start)
            end_logits.append(end)
    return total_loss,start_logits,end_logits,eval_data


In [34]:
torch.cuda.empty_cache()

In [80]:
# def train_fn(train):
    
train,test=train_test_split(train,test_size=0.2)
train_data=Loader(train)
test_data=Loader(test)
#     print(train['text'].iloc[0])
#     print(train['selected_text'].iloc[0])
train_data=torch.utils.data.DataLoader(train_data,batch_size=16,shuffle=True)
test_data=torch.utils.data.DataLoader(test_data,batch_size=16,shuffle=False)
epochs=15
model=QAModel()
model.to(device)
optimizer=optim.Adam(model.parameters(),lr=1e-5)
for _ in range(epochs):
    loss=training(model,train_data,device,optimizer)
    print(loss)
torch.save(model.state_dict(),'bertmodel.pkl')
#     for train_data1 in train_data:
#         start1,start2=model(train_data1['ids'],train_data1['attn_masks'],train_data1['token_type_ids'])
#         break
#     return start1,start2,train_data1['start'],train_data1['end']
# a,b,c,d=train_fn(train)
# train_fn(train)

Some weights of the model checkpoint at ../input/bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at ../input/b

HBox(children=(FloatProgress(value=0.0, max=563.0), HTML(value='')))


2330.219277024269


HBox(children=(FloatProgress(value=0.0, max=563.0), HTML(value='')))


1671.6843252182007


HBox(children=(FloatProgress(value=0.0, max=563.0), HTML(value='')))


1436.8315666913986


HBox(children=(FloatProgress(value=0.0, max=563.0), HTML(value='')))


1238.4238522052765


HBox(children=(FloatProgress(value=0.0, max=563.0), HTML(value='')))


1042.2496291399002


HBox(children=(FloatProgress(value=0.0, max=563.0), HTML(value='')))


873.5108373761177


HBox(children=(FloatProgress(value=0.0, max=563.0), HTML(value='')))


706.7730885744095


HBox(children=(FloatProgress(value=0.0, max=563.0), HTML(value='')))


562.1287714540958


HBox(children=(FloatProgress(value=0.0, max=563.0), HTML(value='')))


457.0490790605545


HBox(children=(FloatProgress(value=0.0, max=563.0), HTML(value='')))


357.32158890366554


HBox(children=(FloatProgress(value=0.0, max=563.0), HTML(value='')))


294.252699457109


HBox(children=(FloatProgress(value=0.0, max=563.0), HTML(value='')))


255.48266354203224


HBox(children=(FloatProgress(value=0.0, max=563.0), HTML(value='')))


208.87947703152895


HBox(children=(FloatProgress(value=0.0, max=563.0), HTML(value='')))


188.66268216073513


HBox(children=(FloatProgress(value=0.0, max=563.0), HTML(value='')))


157.11604451015592


In [1]:
model=QAModel()
model.state_dict(torch.load('./bertmodel.pkl'))
model.to(device)
preprocess(test,'test')
test_data=Loader(test)
test_data=torch.utils.data.DataLoader(test_data,batch_size=16,shuffle=False)
loss,start_idx,end_idx,data=evaluation(model,test_data,device,optimizer)

NameError: name 'QAModel' is not defined

In [110]:

s1=np.concatenate(start_idx)
e1=np.concatenate(end_idx)
count=0
for i in range(len(s1)):
    if(s1[i]>e1[i]):
        count+=1
count

932

In [111]:
e1.shape

(2251,)

In [84]:
selec_text=[]
def jaccard_score(start,end):
    score=0
    for i in range(len(start)):
#         if start[i]>=end[i]:
#             selec_text.append(test['text'].iloc[i])
#         else:
        selec_text.append(test['text'].iloc[i][start[i]:end[i]])
jaccard_score(s1,e1)            
sample['selected_text']=selec_text
sample.to_csv('sample_submission.csv',header=None,index=None)

ValueError: Length of values (2251) does not match length of index (3518)

In [79]:
k=19
print(test['text'].iloc[k],test['selected_text'].iloc[k],s1[k],e1[k],np.argmax(test['start'].iloc[k]),
     np.argmax(test['end'].iloc[k]),test['sentiment'].iloc[k])

Just got my heart ripped out  i love you guys Just got my heart ripped out  i love you guys 15 0 1 10 positive


In [None]:
for i,value in enumerate(data):
    print(len(samp),value)
    if i==3:
        break

In [None]:
print(np.argmax(test['start'].iloc[i]),np.argmax(test['end'].iloc[i]))
print(np.argmax(s[0][i]),np.argmax(e[0][i]))
data{'ids'}

In [68]:
a=np.random.rand((3))
a=[3,4,5]
b=[0,0,1]

array([0.14084189, 0.80429543, 0.92213003])

In [None]:
f=torch.nn.BCEWithLogitsLoss()
# z=f(a[0],c[0])
# z
# max(a[0])
output = f(a,c)
output

In [None]:
target = target.squeeze()
target

In [None]:
target = torch.ones(3, 5, requires_grad=True)
target.shape
target=target.squeeze(dim=1)
target.shape

In [None]:
train,test=train_test_split(train,test_size=0.2)
train_data=Loader(train)
test_data=Loader(test)

In [None]:
train['token_ids']

In [None]:
a=train['text'].iloc[6]
b=train['selected_text'].iloc[6]
tokenizer=transformers.BertTokenizer.from_pretrained('bert-base-uncased')