In [1]:
# Mount Google Drive at /content/drive; every data and checkpoint path used
# later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install pytorch-transformers, the package the import cell takes
# BertTokenizer, BertForSequenceClassification and BertConfig from.
# NOTE(review): the version is not pinned, so a re-run may pick up a different release.
!pip install pytorch-transformers



In [3]:
import os
import csv
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from pytorch_transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from torch.optim import Adam
import torch.nn.functional as F

In [4]:
def load_csv( path ) : 
  """Read a UTF-8 CSV file and split it into its header row and data rows.

  Args:
    path: Location of the CSV file to read.

  Returns:
    A ``(header, data)`` tuple. ``header`` is the first row as a list of
    strings, or ``None`` when the file yields no rows at all. ``data`` is a
    list with every remaining row, each one a list of strings.
  """
  # newline='' passes line endings to the csv module untouched. Without it,
  # universal-newline translation rewrites "\r\n" inside quoted fields before
  # the csv reader ever sees them.
  with open( path, encoding='utf-8', newline='' ) as csvfile:
    reader = csv.reader( csvfile ) 
    header = next( reader, None )  # None signals an empty file
    data   = list( reader )
  return header, data

In [5]:
def write_csv( data, location ) : 
  """Write rows to a UTF-8 CSV file and print a confirmation line.

  Args:
    data: Iterable of rows, each row an iterable of field values; it is
      handed to ``csv.writer.writerows`` as is.
    location: Path of the file to create or overwrite.

  Returns:
    None.
  """
  # newline='' lets the csv writer control row terminators itself. Without it,
  # platforms that translate "\n" on write emit "\r\r\n" after every row.
  with open( location, 'w', encoding='utf-8', newline='' ) as csvfile:
    writer = csv.writer( csvfile ) 
    writer.writerows( data ) 
  print( "Wrote {}".format( location ) ) 

In [6]:
def _get_train_data( data_location, file_name, include_context, include_idiom ) :
    """Turn a training CSV into ``[header] + rows`` ready for ``write_csv``.

    Args:
        data_location: Directory that contains the training file.
        file_name: Name of the training CSV inside ``data_location``.
        include_context: When true, ``sentence1`` is the 'Previous', 'Target'
            and 'Next' fields joined with single spaces; otherwise it is the
            'Target' field alone.
        include_idiom: When true, the 'MWE' field is appended as ``sentence2``.

    Returns:
        A list whose first element is the output header and whose remaining
        elements are ``[label, sentence1]`` or ``[label, sentence1, sentence2]``
        rows, in input order.
    """
    header, rows = load_csv( os.path.join( data_location, file_name ) )

    if include_idiom :
        out_header = [ 'label', 'sentence1', 'sentence2' ]
    else :
        out_header = [ 'label', 'sentence1' ]

    def column( row, name ) :
        # Look the field up by its header name.
        return row[ header.index( name ) ]

    # train: ['DataID', 'Language', 'MWE', 'Setting', 'Previous', 'Target', 'Next', 'Label']
    out_rows = []
    for row in rows :
        label = column( row, 'Label' )
        text  = column( row, 'Target' )
        if include_context :
            text = ' '.join( column( row, name ) for name in ( 'Previous', 'Target', 'Next' ) )

        record = [ label, text ]
        if include_idiom :
            record.append( column( row, 'MWE' ) )

        out_rows.append( record )
        assert len( out_header ) == len( record )
    return [ out_header ] + out_rows

In [7]:
def _get_dev_eval_data( data_location, input_file_name, gold_file_name, include_context, include_idiom ) :
    """Turn a dev/eval CSV (plus optional gold labels) into ``[header] + rows``.

    Args:
        data_location: Directory that contains the input and gold files.
        input_file_name: Name of the dev/eval CSV inside ``data_location``.
        gold_file_name: Name of the gold-label CSV, or ``None`` when no gold
            labels exist. With a gold file, the two files must have the same
            number of rows and matching 'ID' values row by row (asserted).
        include_context: When true, ``sentence1`` is the 'Previous', 'Target'
            and 'Next' fields joined with single spaces; otherwise it is the
            'Target' field alone.
        include_idiom: When true, the 'MWE' field is appended as ``sentence2``.

    Returns:
        A list whose first element is the output header and whose remaining
        elements are the output rows in input order. The label is the gold
        'Label' field, or the integer 1 for every row when there is no gold file.
    """
    input_headers, input_data = load_csv( os.path.join( data_location, input_file_name ) )

    has_gold    = gold_file_name is not None
    gold_header = gold_data = None
    if has_gold :
        gold_header, gold_data = load_csv( os.path.join( data_location, gold_file_name ) )
        assert len( input_data ) == len( gold_data )

    # dev, eval: ['ID', 'Language', 'MWE', 'Previous', 'Target', 'Next']
    # gold: ['ID', 'DataID', 'Language', 'Label']

    if include_idiom :
        out_header = [ 'label', 'sentence1', 'sentence2' ]
    else :
        out_header = [ 'label', 'sentence1' ]

    def input_field( row, name ) :
        # Look an input field up by its header name.
        return row[ input_headers.index( name ) ]

    out_data = []
    for position, elem in enumerate( input_data ) :
        label = 1 # without a gold file every row gets the placeholder label 1
        if has_gold :
            this_input_id = input_field( elem, 'ID' )
            gold_row      = gold_data[ position ]
            this_gold_id  = gold_row[ gold_header.index( 'ID' ) ]
            assert this_input_id == this_gold_id

            label = gold_row[ gold_header.index( 'Label' ) ]

        sentence1 = input_field( elem, 'Target' )
        if include_context :
            sentence1 = ' '.join( input_field( elem, name ) for name in ( 'Previous', 'Target', 'Next' ) )

        this_row = [ label, sentence1 ]
        if include_idiom :
            this_row.append( input_field( elem, 'MWE' ) )

        assert len( out_header ) == len( this_row ) 
        out_data.append( this_row )

    return [ out_header ] + out_data

In [8]:
def create_data( input_location, output_location ) :
    """Build the ZeroShot and OneShot train/dev/eval CSV files.

    Reads 'train_zero_shot.csv', 'train_one_shot.csv', 'dev.csv',
    'dev_gold.csv' and 'eval.csv' from ``input_location`` and writes
    'train.csv', 'dev.csv' and 'eval.csv' into the 'ZeroShot' and 'OneShot'
    sub-directories of ``output_location``. Those sub-directories must
    already exist; this function does not create them.

    Args:
        input_location: Directory holding the source CSV files.
        output_location: Directory holding the 'ZeroShot' and 'OneShot' folders.

    Returns:
        None.
    """

    def destination( setting, file_name ) :
        # Output path for one file of one setting.
        return os.path.join( output_location, setting, file_name )

    ## Zero shot data: context sentences joined in, no idiom column
    zero_shot = dict( include_context = True, include_idiom = False )

    train_data = _get_train_data(
        data_location = input_location,
        file_name     = 'train_zero_shot.csv',
        **zero_shot
    )
    write_csv( train_data, destination( 'ZeroShot', 'train.csv' ) )

    dev_data = _get_dev_eval_data(
        data_location   = input_location,
        input_file_name = 'dev.csv',
        gold_file_name  = 'dev_gold.csv',
        **zero_shot
    )
    write_csv( dev_data, destination( 'ZeroShot', 'dev.csv' ) )

    eval_data = _get_dev_eval_data(
        data_location   = input_location,
        input_file_name = 'eval.csv',
        gold_file_name  = None, ## Don't have gold evaluation file -- submit to CodaLab
        **zero_shot
    )
    write_csv( eval_data, destination( 'ZeroShot', 'eval.csv' ) )

    ## OneShot Data (combine both for training): target sentence only, plus the idiom column
    one_shot = dict( include_context = False, include_idiom = True )

    train_zero_data = _get_train_data(
        data_location = input_location,
        file_name     = 'train_zero_shot.csv',
        **one_shot
    )
    train_one_data = _get_train_data(
        data_location = input_location,
        file_name     = 'train_one_shot.csv',
        **one_shot
    )

    assert train_zero_data[0] == train_one_data[0] ## Headers
    # One-shot rows (with their header) first, then the zero-shot rows without a second header.
    write_csv( train_one_data + train_zero_data[1:], destination( 'OneShot', 'train.csv' ) )

    dev_data = _get_dev_eval_data(
        data_location   = input_location,
        input_file_name = 'dev.csv',
        gold_file_name  = 'dev_gold.csv',
        **one_shot
    )
    write_csv( dev_data, destination( 'OneShot', 'dev.csv' ) )

    eval_data = _get_dev_eval_data(
        data_location   = input_location,
        input_file_name = 'eval.csv',
        gold_file_name  = None,
        **one_shot
    )
    write_csv( eval_data, destination( 'OneShot', 'eval.csv' ) )


In [9]:
def make_input(df):
    """Add a 'sentence' column holding 'sentence1' followed by 'sentence2'.

    Both source values are passed through ``str`` and concatenated directly.
    The frame is modified in place.

    Args:
        df: DataFrame with 'sentence1' and 'sentence2' columns.

    Returns:
        None.
    """
    # One whole-column assignment replaces the per-row chained assignment
    # ``df['sentence'][i] = ...``, which raised SettingWithCopyWarning, is not
    # guaranteed to write through to ``df``, and looked rows up by label so it
    # broke on any index other than 0..n-1.
    # NOTE(review): no separator is inserted between the two parts, as in the
    # original code - confirm that is intended for the model input.
    df['sentence'] = df['sentence1'].map(str) + df['sentence2'].map(str)

In [10]:
# Source CSVs are read from input_location; the preprocessed files are written below output_location.
input_location = '/content/drive/MyDrive/Colab Notebooks/data/'
output_location = '/content/drive/MyDrive/Colab Notebooks/data/preproc/'

# create_data writes into these two folders, so make sure both exist first.
for setting in ( 'ZeroShot', 'OneShot' ) :
    Path( output_location, setting ).mkdir( parents=True, exist_ok=True )

create_data( input_location, output_location )

Wrote /content/drive/MyDrive/Colab Notebooks/data/preproc/ZeroShot/train.csv
Wrote /content/drive/MyDrive/Colab Notebooks/data/preproc/ZeroShot/dev.csv
Wrote /content/drive/MyDrive/Colab Notebooks/data/preproc/ZeroShot/eval.csv
Wrote /content/drive/MyDrive/Colab Notebooks/data/preproc/OneShot/train.csv
Wrote /content/drive/MyDrive/Colab Notebooks/data/preproc/OneShot/dev.csv
Wrote /content/drive/MyDrive/Colab Notebooks/data/preproc/OneShot/eval.csv


In [11]:
# OneShot: load the three preprocessed splits written by create_data.
# NOTE(review): the name `eval` shadows the Python builtin for the rest of the notebook.
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/preproc/OneShot/train.csv', encoding='utf-8')
dev = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/preproc/OneShot/dev.csv', encoding='utf-8')
eval = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/preproc/OneShot/eval.csv', encoding='utf-8')

# Add the combined 'sentence' column to every split, in the same order as before.
for frame in (train, dev, eval):
    make_input(frame)

train.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,label,sentence1,sentence2,sentence
0,1,Despite having the riches to afford the high l...,high life,Despite having the riches to afford the high l...
1,1,Minister of Family and Social Policy Marlena M...,birth rate,Minister of Family and Social Policy Marlena M...
2,0,So Aaron faced the same brutal racism other Bl...,home run,So Aaron faced the same brutal racism other Bl...
3,0,Program leaders said the scholarship defines p...,public service,Program leaders said the scholarship defines p...
4,1,"In the ensuing years, Wennberg might not have ...",public service,"In the ensuing years, Wennberg might not have ..."


In [12]:
class Idiom_Dataset(Dataset):
    """Dataset that serves ``(text, label)`` pairs out of a DataFrame.

    Columns are addressed by position, not by name: the text comes from the
    column at position 3 and the label from the column at position 0.
    """

    # Positional column indices read by __getitem__.
    _TEXT_POS = 3
    _LABEL_POS = 0

    def __init__(self, df):
        """Keep a reference to ``df``; the frame is not copied."""
        self.df = df

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored in row ``idx``."""
        cells = self.df.iloc
        return cells[idx, self._TEXT_POS], cells[idx, self._LABEL_POS]

In [13]:
# Wrap the training frame and serve it in shuffled mini-batches of 8, loaded in the main process.
train_dataset = Idiom_Dataset(train)
train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    num_workers=0,
)

In [13]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell no longer fails on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and classifier are both loaded from the same pretrained checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [15]:
optimizer = Adam(model.parameters(), lr=1e-6)

itr = 1              # global iteration counter across all epochs
p_itr = 100          # print running metrics every p_itr iterations
epochs = 3
max_len = 512        # every sequence is padded (and now truncated) to this many token ids
total_loss = 0
total_len = 0
total_correct = 0

# Create the checkpoint folder up front so torch.save cannot fail on a missing
# directory after a whole epoch of training.
checkpoint_dir = '/content/drive/MyDrive/Colab Notebooks/checkpoint/'
os.makedirs(checkpoint_dir, exist_ok=True)

# NOTE(review): no attention_mask is passed to the model, so it also attends to
# the zero padding. The evaluation cell feeds the model the same way; if a mask
# is introduced it has to be added to training and evaluation together.
model.train()
for epoch in range(epochs):

    for text, label in train_loader:
        optimizer.zero_grad()

        encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
        # Cap over-long sequences at max_len, keeping the final id (presumably
        # the closing special token added by add_special_tokens=True). Before,
        # `[0] * (512 - len(e))` was simply empty for such inputs, so the batch
        # became ragged or exceeded the 512 positions the model embeds.
        encoded_list = [e if len(e) <= max_len else e[:max_len - 1] + e[-1:] for e in encoded_list]
        padded_list = [e + [0] * (max_len - len(e)) for e in encoded_list]

        sample = torch.tensor(padded_list).to(device)
        # `label` is already a tensor coming out of the DataLoader; moving it is
        # enough. Re-wrapping it in torch.tensor() only made a copy and warned.
        labels = label.to(device)

        outputs = model(sample, labels=labels)
        loss, logits = outputs

        # softmax does not change which logit is largest, so argmax is taken on
        # the logits directly (the dim-less F.softmax call also warned).
        pred = torch.argmax(logits, dim=1)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

        if itr % p_itr == 0:
            print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch, epochs-1, itr, total_loss/p_itr, total_correct/total_len))
            # Restart the running window after every report.
            total_loss = 0
            total_len = 0
            total_correct = 0
        itr+=1

    # Save the whole model object after every epoch; the evaluation cell
    # reloads it with torch.load.
    torch.save(model, checkpoint_dir + 'epochs{}.pt'.format(epoch))
        



[Epoch 0/2] Iteration 100 -> Train Loss: 0.6875, Accuracy: 0.568
[Epoch 0/2] Iteration 200 -> Train Loss: 0.6881, Accuracy: 0.546
[Epoch 0/2] Iteration 300 -> Train Loss: 0.6772, Accuracy: 0.580
[Epoch 0/2] Iteration 400 -> Train Loss: 0.6710, Accuracy: 0.586
[Epoch 0/2] Iteration 500 -> Train Loss: 0.6629, Accuracy: 0.615
[Epoch 1/2] Iteration 600 -> Train Loss: 0.6575, Accuracy: 0.631
[Epoch 1/2] Iteration 700 -> Train Loss: 0.6385, Accuracy: 0.661
[Epoch 1/2] Iteration 800 -> Train Loss: 0.6161, Accuracy: 0.662
[Epoch 1/2] Iteration 900 -> Train Loss: 0.5926, Accuracy: 0.705
[Epoch 1/2] Iteration 1000 -> Train Loss: 0.5329, Accuracy: 0.744
[Epoch 1/2] Iteration 1100 -> Train Loss: 0.5481, Accuracy: 0.731
[Epoch 2/2] Iteration 1200 -> Train Loss: 0.5354, Accuracy: 0.735
[Epoch 2/2] Iteration 1300 -> Train Loss: 0.4729, Accuracy: 0.770
[Epoch 2/2] Iteration 1400 -> Train Loss: 0.4713, Accuracy: 0.779
[Epoch 2/2] Iteration 1500 -> Train Loss: 0.4470, Accuracy: 0.805
[Epoch 2/2] Iterati

In [16]:
# NOTE(review): torch.load unpickles a whole model object, which can execute
# arbitrary code - only load checkpoint files produced by this notebook.
# map_location lets the checkpoint load on whatever `device` is in use, not
# only on the device it was saved from.
model = torch.load('/content/drive/MyDrive/Colab Notebooks/checkpoint/epochs2.pt', map_location=device)
model.to(device)
model.eval()

eval_dataset = Idiom_Dataset(dev)
eval_loader = DataLoader(eval_dataset, batch_size=1, shuffle=False, num_workers=0)

max_len = 512  # same fixed input length the training loop pads to
total_loss = 0
total_len = 0
total_correct = 0

# Inference only: no_grad stops autograd from recording the forward passes,
# which saves memory and time.
with torch.no_grad():
    for text, label in eval_loader:
        encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
        # Cap over-long sequences at max_len, keeping the final id (presumably
        # the closing special token); previously such inputs were left longer
        # than the 512 positions the model embeds.
        encoded_list = [e if len(e) <= max_len else e[:max_len - 1] + e[-1:] for e in encoded_list]
        padded_list = [e + [0] * (max_len - len(e)) for e in encoded_list]

        sample = torch.tensor(padded_list).to(device)
        # `label` is already a tensor; re-wrapping it in torch.tensor() only copied it and warned.
        labels = label.to(device)

        outputs = model(sample, labels=labels)
        _, logits = outputs

        # argmax over the logits equals argmax over their softmax.
        pred = torch.argmax(logits, dim=1)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)

print('Test accuracy: ', total_correct / total_len)



Test accuracy:  0.6968876860622463
