In [103]:
import nltk
import string
import pandas as pd
from tqdm import tqdm
from scipy import stats
from os.path import join
from nltk.util import ngrams
from sklearn.utils import shuffle
from nltk.tokenize import word_tokenize
tqdm.pandas()
# nltk.download('all')
import json
from pathlib import Path
from transformers import BertTokenizerFast, pipeline, AutoModelForQuestionAnswering
from torch.utils.data import DataLoader
from transformers import AdamW




In [93]:
def clean(df):
    return df

def read_data(path, test_ratio = 0.3):
    csv_path = Path(path)
    df = pd.read_csv(csv_path)
    df = clean(df)

    df['POI'] = df.progress_apply(lambda row: row['POI/street'].split('/')[0], axis=1)
    df['street'] = df.progress_apply(lambda row: row['POI/street'].split('/')[-1], axis=1)
    
    df_poi = df.copy() 
    df_poi['question'] = 'minat?'

    df_poi = df_poi[df_poi.POI != ''].reset_index(drop=True)

    df_poi['start'] = df_poi.progress_apply(lambda row: \
        row['raw_address'].find(row['POI']) , axis=1)
    df_poi = df_poi[df_poi.start != -1].reset_index(drop=True)

    df_street = df.copy() 
    df_street['question'] = 'jalan?'
    
    df_street = df_street[df_street.street != ''].reset_index(drop=True)

    df_street['start'] = df_street.progress_apply(lambda row: \
        row['raw_address'].find(row['street']) , axis=1)
    df_street = df_street[df_street.start != -1].reset_index(drop=True)
    

    df_poi['POI'] = df_poi.progress_apply(lambda row: \
        {'text':row['POI'],'answer_start':row.start}, axis=1)
    df_street['street'] = df_street.progress_apply(lambda row: \
        {'text':row['street'],'answer_start':row.start}, axis=1)

    result = pd.DataFrame()
    result['answers']   =   \
        df_poi.POI.values.tolist() + df_street.street.values.tolist()
    result['questions'] =   \
        df_poi.question.values.tolist() + df_street.question.values.tolist()
    result['contexts']  =   \
        df_poi.raw_address.values.tolist() + df_street.raw_address.values.tolist()
    
    result = shuffle(result)
    
    train = result.iloc[: int(len(result) * (1 - test_ratio)), :]
    test =  result.iloc[int(len(result) * (1 - test_ratio)) : , :]
    print(len(train), len(test))
    return train, test
    # return train['contexts'], train['questions'], train['answers']
    
train, test = read_data('train.csv',test_ratio = 0.1)

train_contexts, train_questions, train_answers = \
    train['contexts'].values.tolist(),  \
    train['questions'].values.tolist(), \
    train['answers'].values.tolist()    

val_contexts, val_questions, val_answers = \
    test['contexts'].values.tolist(),   \
    test['questions'].values.tolist(),  \
    test['answers'].values.tolist() 

100%|██████████| 300000/300000 [00:01<00:00, 167212.16it/s]
100%|██████████| 300000/300000 [00:01<00:00, 169234.14it/s]
100%|██████████| 121491/121491 [00:01<00:00, 112241.61it/s]
100%|██████████| 229857/229857 [00:01<00:00, 116819.82it/s]
100%|██████████| 75351/75351 [00:00<00:00, 94292.79it/s]
100%|██████████| 212470/212470 [00:02<00:00, 95145.36it/s]
259038 28783


In [94]:
list(zip(train_contexts, train_questions, train_answers))[:10]

[('lion star plastics toni, bintang rangkui',
  'minat?',
  {'text': 'lion star plastics', 'answer_start': 0}),
 ('pedurungan tengah gg. buntu 3 50192 pedurungan',
  'jalan?',
  {'text': 'gg. buntu', 'answer_start': 18}),
 ('gedong haji tai 5 rt 8 rw 10 13760 pasar rebo',
  'jalan?',
  {'text': 'haji tai', 'answer_start': 7}),
 ('kamp kali 36 kedaung kali angke rt 6 3 cengkareng',
  'jalan?',
  {'text': 'kamp kali', 'answer_start': 0}),
 ('pangkalan lareh kel.ikurkoto koto panjang.kec koto tangah kota padang',
  'minat?',
  {'text': 'pangkalan lareh', 'answer_start': 0}),
 ('sibo bor - para longat balige',
  'jalan?',
  {'text': 'sibo bor - para', 'answer_start': 0}),
 ('leba pasir jaya cikupa', 'jalan?', {'text': 'leba', 'answer_start': 0}),
 ('teru cimun, cibeunying kidul',
  'jalan?',
  {'text': 'teru cimun', 'answer_start': 0}),
 ('sur, jend ahmad yani, polewali',
  'jalan?',
  {'text': 'jend ahmad yani', 'answer_start': 5}),
 ('got roy no 37 jun high sch pa van der steur, jatiraha

In [95]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two – fix this
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters

add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [97]:
from transformers import BertTokenizerFast, pipeline, AutoModelForQuestionAnswering

tokenizer = BertTokenizerFast.from_pretrained(
    'Wikidepia/indobert-lite-squad'
)

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [98]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [100]:
import torch

class ShopeeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = ShopeeDataset(train_encodings)
val_dataset = ShopeeDataset(val_encodings)

In [102]:
model = AutoModelForQuestionAnswering.from_pretrained('Wikidepia/indobert-lite-squad')

In [105]:

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
num_epochs = 1
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

def train_epoch(train_loader, device, optimizer, model):
    model.train()
    for batch in tqdm(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()

def val_epoch(val_loader, device, model):
    model.eval()
    with torch.no_grad():
        pass
for epoch in range(num_epochs):
    train_epoch(train_loader = train_loader, device = device, optimizer = optim, model = model)
    # val_epoch(train_loader = train_loader, device = device, optimizer = optim, model = model)

  0%|          | 3/16190 [00:05<8:05:44,  1.80s/it]


KeyboardInterrupt: 