In [1]:
config = {
    "pretrained_model": "bert-base-cased",
    "tokenizer": "bert-base-cased",
    "max_seq_length": 256,
    "batch_size": 10,
    "lr": 2e-5,
    "epochs": 10,
    "device": "cuda",
    "gpu_ids": "0,1,2,3",
    "seed": 2020,
    "fp16": False,
    "loss_scale": 0,
    "gradient_accumulation_steps":1,
    "warmup_proportion": 0.1,
    "gradient_accumulation_steps": 1,
    "num_labels": 4,
    "is_multilabel": False,
    "valid_metric": "macro_f1",
    "model_save_dir": "../checkpoints/bert_cased_512_biocaster_4cate_20200903/",
    "patience": 4,
}

In [2]:
def set_seed(seed=2020):
    import numpy as np
    import random
    import torch

    if type(seed) != int:
        raise ValueError("Error: seed is invalid type")
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

In [3]:
import sys
sys.path.append("../")
from beta_nlp.utils.data_util import biocaster2df
data_file = "/home/zm324/workspace/doc_cls/datasets/biocaster/BioCaster.3.xml"
data_df = biocaster2df(data_file)
data_df["source"] = "Biocaster"
data_df.head(3)

parse biocaser data from /home/zm324/workspace/doc_cls/datasets/biocaster/BioCaster.3.xml, docs number:1003, lablels number:1003


Unnamed: 0,docs,labels,source
0,\nBird Flu Outbreak Drill Spooks Manitoba Town...,0,Biocaster
1,\nTyphoid outbreak in Agusan del Sur town unde...,3,Biocaster
2,\n Typhoid Outbreak In Central Nepal November...,3,Biocaster


In [4]:
from sklearn.utils import shuffle
set_seed()
data_df = shuffle(data_df).reset_index()
data_df["flag"]=None
train_index = int(len(data_df.index)*0.8)
valid_index = train_index + round(len(data_df.index)*0.1)
data_df.loc[:train_index,"flag"]="train"
data_df.loc[train_index:valid_index,"flag"]="valid"
data_df.loc[valid_index:,"flag"]="test"

In [5]:
train_set = data_df[data_df["flag"]=="train"]
dev_set = data_df[data_df["flag"]=="valid"]
test_set = data_df[data_df["flag"]=="test"]
# train_set.to_csv("/home/zm324/workspace/doc_cls/datasets/biocaster/train.csv")
# dev_set.to_csv("/home/zm324/workspace/doc_cls/datasets/biocaster/dev.csv")
# test_set.to_csv("/home/zm324/workspace/doc_cls/datasets/biocaster/test.csv")

In [6]:
len(train_set.index),len(dev_set.index),len(test_set.index)

(802, 100, 101)

In [7]:
import pandas as pd
import os
from google.cloud import translate_v2 as translate
import numpy as np

def gtranslate(text,  source_language='es', target_language='es'):
    os.environ['GOOGLE_APPLICATION_CREDENTIALS']="/home/zm324/.google/apikey.json"
    # Instantiates a client
    translate_client = translate.Client()
    translation = translate_client.translate(text, target_language=target_language, source_language=source_language)
    return [x['translatedText'] for x in translation]


# language codes: https://cloud.google.com/translate/docs/languages

def back_gtranslate(text, source_language='en', target_language='zh'):
    target_text = gtranslate(text, source_language=source_language, target_language=target_language)
    english = gtranslate(target_text, source_language=target_language, target_language=source_language)
    return english

def reduce_text(text, length=5500):
    if len(text)>length:
        new_text = text[:length]
        if text[length]==" ":
            return new_text
    else:
        return text
    while len(new_text)>0:
        if new_text[len(new_text)-1]==" ":
            new_text=new_text[:len(new_text)-1]
            break
        else:
            new_text=new_text[:len(new_text)-1]
    return new_text

data_test  = train_set
lang_list = ["ar","zh","fr","es","pt","ru"]
# lang_list = ["zh"]
docs = list(data_test.docs.values)
# docs = [reduce_text(doc) for doc in docs]
docs = [doc for doc in docs]
labels = data_test.labels
flag = data_test.flag
data_test.loc[:,"lan"]="en"
for lang in lang_list:
    bt_docs = [back_gtranslate([doc], target_language=lang)[0] for doc in docs]
    lan = [lang]*len(docs)
    data_test = data_test.append(pd.DataFrame({"docs":bt_docs,"labels":labels,"flag":flag,"lan":lan}))
data_test.to_csv("/home/zm324/workspace/doc_cls/datasets/biocaster/train_bt_full.csv")
len(data_test.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


5614

In [None]:
from beta_nlp.models.bert_cls import BertModel
cls = BertModel(config)

In [15]:
len(train_set.index),len(dev_set.index),len(test_set.index)

(802, 100, 101)

In [None]:
cls.train(train_set,dev_set)

In [None]:
cls.test(test_set)