In [None]:
!pip install spacy==3.2.4
!pip install ginza==5.1.0
!pip install ja-ginza==5.1.0

GiNZA をインストールした後、新しく入れたパッケージを読み込ませるためにカーネル(ランタイム)を再起動し、以降のセルを実行してください。

In [None]:
# Mount Google Drive at /content/drive (Colab only). The project path stored in
# `base_folder` presumably resolves inside this mount — confirm for your setup.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Set this to the project path in your own environment.
# Raw string: the backslash before the space is kept literally without relying
# on Python's invalid "\ " escape sequence (which emits a warning). The value is
# later interpolated into `cd {base_folder}`, where the backslash presumably
# escapes the space in the directory name.
base_folder = r"drive/MyDrive/Colab\ Notebooks/cpt-hanrei-1st-refactor/src"

In [None]:
cd {base_folder}

In [None]:
from bs4 import BeautifulSoup
import requests
import time
from tqdm.notebook import  tqdm
import spacy
import pandas
import re
import pandas as pd
import numpy as np
import random

# Load the `ja_ginza` pipeline once at module level; `parse` calls this `nlp` object.
nlp = spacy.load('ja_ginza')

In [None]:
def parse(txt):
    """Tokenize ``txt`` with the module-level ``nlp`` pipeline.

    Args:
        txt: Text passed as-is to ``nlp``.

    Returns:
        list: The ``.text`` attribute of every token produced by ``nlp(txt)``,
        in order.
    """
    surfaces = []
    for tok in nlp(txt):
        surfaces.append(tok.text)
    return surfaces

def get_replace_map(tag_count, prob, seed=32):
    """Draw a reproducible boolean mask of length ``tag_count``.

    Each entry is True when an independent uniform draw from [0, 1) is
    strictly below ``prob``.

    Args:
        tag_count: Number of entries to draw.
        prob: Threshold compared against each uniform draw.
        seed: Seed of the random stream; the same seed gives the same mask.

    Returns:
        list[bool]: One flag per entry.
    """
    # A private RandomState yields the same stream as seeding the global NumPy
    # generator with `seed`, but leaves the global state untouched, so calling
    # this function no longer alters unrelated np.random / pandas sampling.
    rng = np.random.RandomState(seed)
    return (rng.rand(tag_count) < prob).tolist()
def get_tag_token(token_list, category, file_id):
    """Label a token sequence with BIO tags for one entity.

    The first token is tagged ``B-<category>`` and every following token
    ``I-<category>``.

    Args:
        token_list: Tokens that make up the entity. May be empty.
        category: Entity category appended to the ``B-`` / ``I-`` prefix.
        file_id: Identifier copied into every produced row.

    Returns:
        list[list]: ``[tag, token, file_id]`` rows, one per token; an empty
        list when ``token_list`` is empty (previously an IndexError).
    """
    return [
        [f"{'B' if position == 0 else 'I'}-{category}", token, file_id]
        for position, token in enumerate(token_list)
    ]

def get_seq_list(data):
    """Split a flat stream of ``(tag, token, file_id)`` rows into consecutive runs.

    A new run starts at every row whose tag begins with "B", and at the first
    "O" row that follows a run opened by such a tag. Every other row extends
    the current run. Progress is reported through ``tqdm``.

    Args:
        data: Sized iterable of ``(tag, token, file_id)`` triples.

    Returns:
        list[list[tuple]]: The runs in input order; empty for empty input.
    """
    runs = []
    current = []
    inside_entity = False
    for _, (tag, token, file_id) in tqdm(enumerate(data), total=len(data)):
        opens_entity = tag.startswith("B")
        closes_entity = (not opens_entity) and inside_entity and tag == "O"
        # Flush the run collected so far when an entity begins or ends.
        if (opens_entity and current) or closes_entity:
            runs.append(current)
            current = []
        if opens_entity:
            inside_entity = True
        elif closes_entity:
            inside_entity = False
        current.append((tag, token, file_id))
    # The run still being collected after the last row is emitted as well.
    if current:
        runs.append(current)
    return runs

def create_new_df(new_seq):
    """Build a DataFrame from tagged rows and number the tokens per file.

    Args:
        new_seq: Iterable of ``[tag, token, file_id]`` rows. May be empty.

    Returns:
        pandas.DataFrame: Columns ``tag``, ``token``, ``file_id`` and
        ``token_id``. Rows are grouped by ``file_id`` (in the group order
        produced by ``groupby``) and keep their original index labels;
        ``token_id`` counts from 0 within each file. Empty input yields an
        empty frame with the same four columns instead of raising.
    """
    frame = pd.DataFrame(new_seq, columns=["tag", "token", "file_id"])
    # `assign` works on a copy of each group, so no column is written onto a
    # groupby slice of `frame`.
    per_file = [
        group.assign(token_id=range(len(group)))
        for _, group in frame.groupby("file_id")
    ]
    if not per_file:
        # pd.concat([]) raises ValueError; return an empty, fully-shaped frame.
        return frame.iloc[0:0].assign(token_id=range(0))
    return pd.concat(per_file)

In [None]:
# Read the preprocessed CSV inputs; the paths are relative to the current
# working directory. Only `orgs_df` has rows with missing values dropped.
ginza_train_data = pd.read_csv("data/preprocessed/ginza_train_data.csv")
pi_df = pd.read_csv("data/preprocessed/pi_df.csv")
orgs_df = pd.read_csv("data/preprocessed/orgs_df.csv").dropna()
disease_df = pd.read_csv("data/preprocessed/disease_df.csv")

In [None]:
# Preview the first rows of the training data.
ginza_train_data.head()

In [None]:
# One (tag, token, file_id) tuple per row, in the order `get_seq_list` unpacks them.
seq_columns = ["tag", "token", "file_id"]
data = ginza_train_data[seq_columns].apply(tuple, axis=1)

In [None]:
# Shuffle the replacement candidates. A fixed seed makes the order (and hence
# the augmented files written later) identical on every fresh run; without it
# `.sample` draws from the unseeded global NumPy generator.
SAMPLE_SEED = 32
pi_list = pi_df["氏名"].sample(len(pi_df), random_state=SAMPLE_SEED).tolist()
org_list = orgs_df["orgs"].sample(len(orgs_df), random_state=SAMPLE_SEED).tolist()
disease_list = disease_df["tag"].sample(len(disease_df), random_state=SAMPLE_SEED).tolist()

In [None]:
# Tokenize every candidate person name and organization name once, up front.
raw_pi_token_list = list(map(parse, pi_list))
raw_org_token_list = list(map(parse, org_list))

In [None]:
# Group the (tag, token, file_id) rows into runs; see `get_seq_list` for the splitting rule.
seq_list =  get_seq_list(data)

In [None]:
# Write three augmented copies of the training data. In every round each run
# whose first tag belongs to PERSON / ORGFACPOS / TIMEX is replaced by pseudo
# data; "O" runs and runs of any other category are copied through unchanged.
N_AUGMENTED_COPIES = 3
# Largest valid day per month (February allows the leap day), so impossible
# dates such as "2月31日" are never generated.
DAYS_IN_MONTH = (31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)

for i in range(N_AUGMENTED_COPIES):
    # Per-round copies: pop() below consumes these pools.
    pi_token_list = list(raw_pi_token_list)
    org_token_list = list(raw_org_token_list)
    random.Random(i).shuffle(pi_token_list)
    random.Random(i+1).shuffle(org_token_list)
    # Dedicated seeded generator for the dates, so a round's output does not
    # depend on the state of the global `random` module.
    date_rng = random.Random(i)
    new_seq = []
    for seq in seq_list:
        tag = seq[0][0]
        file_id = seq[0][2]
        if tag == "O":
            new_seq.extend(seq)
            continue
        category = tag.split("-")[1]
        if category == "PERSON":
            # NOTE: pop() raises IndexError when the data holds more PERSON
            # runs than candidate names (same for ORGFACPOS below).
            token_list = pi_token_list.pop()
        elif category == "ORGFACPOS":
            token_list = org_token_list.pop()
        elif category == "TIMEX":
            month = date_rng.randint(1, 12)
            day = date_rng.randint(1, DAYS_IN_MONTH[month - 1])
            token_list = [str(month), "月", str(day), "日"]
        else:
            # Categories without a pseudo-data source are kept as they are.
            new_seq.extend(seq)
            continue
        new_seq.extend(get_tag_token(token_list, category, file_id))
    new_df = create_new_df(new_seq)
    save_path = f"data/preprocessed/ginza_train_data_aug_{i}.csv"
    new_df.to_csv(save_path,index=False)