---
## 任意の単語数で分割する関数

In [64]:
import os
import re
import gc
import json
import glob
import pickle
import string
import random
import itertools
from collections import defaultdict
from dotenv import load_dotenv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

import nltk
from nltk.corpus import stopwords

from tensorflow.keras.preprocessing.sequence import pad_sequences

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR
import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.loggers import CometLogger
from torchcrf import CRF
import transformers
from transformers import BertModel, BertForTokenClassification

%matplotlib inline

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_colwidth', 300)
pd.options.display.float_format = '{:.5f}'.format

load_dotenv('.env')

True

In [65]:
def load_data(data_dir):
    # takapyさんのNERDatasetを読み込む
    TRAIN_FOR_NER_PATH = '../input/train_0528.pkl'
    with open(TRAIN_FOR_NER_PATH, 'rb') as f:
        train = pickle.load(f)
    
    # Testデータの読み込み
    test_files = glob.glob(data_dir + "test/*.json")

    test = pd.DataFrame()

    # jsonからDataFrameに
    for tar in test_files:
        file_data = pd.read_json(tar)
        file_data.insert(0,'pub_id', tar.split('/')[-1].split('.')[0])
        test = pd.concat([test, file_data])
    

    # testデータのIdはデータフレームにまとめておく
    test_ids = pd.DataFrame({
        'Id': test['pub_id'].unique()
    })
    
    return train, test, test_ids


def preprocess_text(text: str) -> str:
    """
    テキストの前処理　クリーニング
    """
    text = re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()
    
    return text

In [66]:
data_dir = '../input/'
train, test, test_ids = load_data(data_dir)

test['text'] = test['text'].apply(lambda x: preprocess_text(x) if isinstance(x, str) else x)

In [67]:
test.head()

Unnamed: 0,pub_id,section_title,text
0,2100032a-7c33-4bff-97ef-690822c43466,Abstract,cognitive deficits and reduced educational achievement are common in psychiatric illness understanding the genetic basis of cognitive and educational deficits may be informative about the etiology of psychiatric disorders a recent large genomewide association study gwas reported a genome wide si...
1,2100032a-7c33-4bff-97ef-690822c43466,Introduction,a general cognitive ability factor also termed g typically captures just under half of the overall variance in performance on diverse laboratory measures of neurocognitive functioning general performance on neurocognitive tests has remarkable predictive value across a diverse range of social hea...
2,2100032a-7c33-4bff-97ef-690822c43466,Methods and Materials,cogent is an international gwas collaboration formed to conduct genetic analyses of g and related neurocognitive processes in healthy individuals donohoe et al 2012 though common gwas markers have been proposed to account for 30 or more of the variance in general intelligence in adults individua...
3,2100032a-7c33-4bff-97ef-690822c43466,2011) Plink was used to analyze datasets comprising unrelated individuals,and gcta used to analyze five datasets in which multiple family members were known to be included a priori gcta has implemented a mixed linear model association mlma analytic function that corrects for population or relatedness structure through a correction that is specific to the structure of ...
4,2100032a-7c33-4bff-97ef-690822c43466,Results,


## Expand Dataset

In [68]:
# 確認のため一つのpub_idに絞る
test_ids = test['pub_id'].unique()
tar = test[test['pub_id'] == test_ids[0]]

In [69]:
tar

Unnamed: 0,pub_id,section_title,text
0,2100032a-7c33-4bff-97ef-690822c43466,Abstract,cognitive deficits and reduced educational achievement are common in psychiatric illness understanding the genetic basis of cognitive and educational deficits may be informative about the etiology of psychiatric disorders a recent large genomewide association study gwas reported a genome wide si...
1,2100032a-7c33-4bff-97ef-690822c43466,Introduction,a general cognitive ability factor also termed g typically captures just under half of the overall variance in performance on diverse laboratory measures of neurocognitive functioning general performance on neurocognitive tests has remarkable predictive value across a diverse range of social hea...
2,2100032a-7c33-4bff-97ef-690822c43466,Methods and Materials,cogent is an international gwas collaboration formed to conduct genetic analyses of g and related neurocognitive processes in healthy individuals donohoe et al 2012 though common gwas markers have been proposed to account for 30 or more of the variance in general intelligence in adults individua...
3,2100032a-7c33-4bff-97ef-690822c43466,2011) Plink was used to analyze datasets comprising unrelated individuals,and gcta used to analyze five datasets in which multiple family members were known to be included a priori gcta has implemented a mixed linear model association mlma analytic function that corrects for population or relatedness structure through a correction that is specific to the structure of ...
4,2100032a-7c33-4bff-97ef-690822c43466,Results,
5,2100032a-7c33-4bff-97ef-690822c43466,COGENT Analysis,using the approach described above the two snps previously associated with the dichotomous variable of completing college rs11584700 and rs4851266 in rietveld et al 2013 were not significantly associated with g in cogent1 p s 05 the third snp rs9320913 associated with years of schooling was neit...
6,2100032a-7c33-4bff-97ef-690822c43466,SSGAC Meta-Analysis,the ssgac reanalyzed their gwas data utilizing a two stage design in order to examine whether educational attainment is a valid proxy phenotype for cognitive ability rietveld et al 2014b subjects with available neurocognitive data n 24 189 were removed from the larger cohort and a gwas of years ...
7,2100032a-7c33-4bff-97ef-690822c43466,CHARGE Meta-Analysis,a gwas of general cognitive function that included 53 000 individuals from 31 population based cohorts was recently published by the cohorts for heart and aging research in genomic epidemiology charge consortium davies et al 2015 the study reported genome wide significant snp associations with g...
8,2100032a-7c33-4bff-97ef-690822c43466,Conclusions,the current study identified a significant association of a snp at chromosome 6q16 1 with general cognitive ability across 21 well characterized international samples of europeanancestry individuals from the general population sensitivity analysis suggested that the association between rs1906252...
9,2100032a-7c33-4bff-97ef-690822c43466,Supplementary Material,refer to web version on pubmed central for supplementary material


In [70]:
# それぞれのセクションのテキストの数を計算する
tar['text_len'] = tar['text'].apply(lambda x: len(x.split(' ')) if isinstance(x, str) else 0)
tar[['pub_id', 'section_title', 'text_len']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tar['text_len'] = tar['text'].apply(lambda x: len(x.split(' ')) if isinstance(x, str) else 0)


In [72]:
tar.drop('text_len', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [73]:
# 指定したmax_lenより大きい場合は分割して行を分ける
# 重複も許すように設計する

# テキストの数
max_len = 32
# 重複する単語数
override = 10
# 結果格納用データフレーム
res = pd.DataFrame()

for i in range(len(tar)):
    row = tar.iloc[i]
    tar_text = row['text'].split(' ')
    text_len = len(tar_text)
    
    # 単語数がmax_lenより小さい場合はそのまま
    if text_len < max_len:
        res = pd.concat([res, pd.DataFrame(row).T], axis=0)
        continue
    
    # 単語数がmax_lenより大きい場合は分割する
    elif text_len > max_len:
        # 分割する数を計算する
        num_divide = int(np.ceil(text_len / (max_len - override)))
        # 分割する分行を複製しておく（データフレーム化）
        tmp_df = pd.DataFrame([row] * num_divide)
        # 分割後のテキストを格納しておくリスト
        divided_texts = []
        
        # max_lenごとのテキストに分割する
        for i in range(len(tmp_df)):
            div_text = tar_text[int(i * (max_len - override)) : int(i * (max_len - override) + max_len)]
            # リストから文字列に直す
            div_text = ' '.join(div_text)
            # 結果を一旦リストにまとめておく
            divided_texts.append(div_text)
            
        # 複製しておいたデータフレームに置換
        tmp_df['text'] = divided_texts
        # 全体のデータフレームに結合
        res = pd.concat([res, tmp_df], axis=0)
        
    
    break

In [74]:
res

Unnamed: 0,pub_id,section_title,text
0,2100032a-7c33-4bff-97ef-690822c43466,Abstract,cognitive deficits and reduced educational achievement are common in psychiatric illness understanding the genetic basis of cognitive and educational deficits may be informative about the etiology of psychiatric disorders a recent large
0,2100032a-7c33-4bff-97ef-690822c43466,Abstract,informative about the etiology of psychiatric disorders a recent large genomewide association study gwas reported a genome wide significant locus for years of education which subsequently demonstrated association to general cognitive ability
0,2100032a-7c33-4bff-97ef-690822c43466,Abstract,of education which subsequently demonstrated association to general cognitive ability g in overlapping cohorts the current study was designed to test whether gwas hits for educational attainment are involved in general cognitive
0,2100032a-7c33-4bff-97ef-690822c43466,Abstract,gwas hits for educational attainment are involved in general cognitive ability in an independent large scale collection of cohorts using cohorts in the cognitive genomics consortium cogent up to 20 495 healthy
0,2100032a-7c33-4bff-97ef-690822c43466,Abstract,the cognitive genomics consortium cogent up to 20 495 healthy individuals we examined the relationship between g and variants associated with educational attainment we next conducted meta analyses with 24 189 individuals
0,2100032a-7c33-4bff-97ef-690822c43466,Abstract,attainment we next conducted meta analyses with 24 189 individuals with neurocognitive data from the educational attainment studies and then with 53 188 largely independent individuals from a recent gwas of cognition
0,2100032a-7c33-4bff-97ef-690822c43466,Abstract,188 largely independent individuals from a recent gwas of cognition a snp rs1906252 located at chromosome 6q16 1 previously associated with years of schooling was significantly associated with g p 1 47
0,2100032a-7c33-4bff-97ef-690822c43466,Abstract,of schooling was significantly associated with g p 1 47 10 4 in cogent the first joint analysis of 43 381 non overlapping individuals for this a priori designated locus was strongly
0,2100032a-7c33-4bff-97ef-690822c43466,Abstract,overlapping individuals for this a priori designated locus was strongly significant p 4 94 10 7 and the second joint analysis of 68 159 non overlapping individuals was even more robust p
0,2100032a-7c33-4bff-97ef-690822c43466,Abstract,68 159 non overlapping individuals was even more robust p 1 65 10 9 these results provide independent replication in a large scale dataset of a genetic locus associated with cognitive function


In [75]:
# 単語数の確認
res['text_len'] = res['text'].apply(lambda x: len(x.split(' ')))
res

In [77]:
# 対象のテキストを一応表示しておく
tar['text'].values[0]

'cognitive deficits and reduced educational achievement are common in psychiatric illness understanding the genetic basis of cognitive and educational deficits may be informative about the etiology of psychiatric disorders a recent large genomewide association study gwas reported a genome wide significant locus for years of education which subsequently demonstrated association to general cognitive ability g in overlapping cohorts the current study was designed to test whether gwas hits for educational attainment are involved in general cognitive ability in an independent large scale collection of cohorts using cohorts in the cognitive genomics consortium cogent up to 20 495 healthy individuals we examined the relationship between g and variants associated with educational attainment we next conducted meta analyses with 24 189 individuals with neurocognitive data from the educational attainment studies and then with 53 188 largely independent individuals from a recent gwas of cognition 

## 関数化

上の処理を関数化しておく

In [84]:
def expand_data(df, max_len, override=0) -> pd.DataFrame:
    """
    指定したmax_lenを超えるテキストに対して分割を行う関数
    
    ---------------------------------------
    Parameters
    
    df: pd.DataFrame
        拡張対象のデータフレーム
        pub_id, section_title, textが存在していること
    max_len: int
        分割する単語数
    override: int
        分割する際に重複する単語数
        
    ---------------------------------------
    Returns
    
    res: pd.DataFrame
        分割したテキストで構成されたデータフレーム
    
    """
    # 結果格納用データフレーム
    res = pd.DataFrame()
    ids = df['pub_id'].unique()
    
    for _id in ids:   
        tar = df[df['pub_id'] == _id]

        for i in range(len(tar)):
            row = tar.iloc[i]
            tar_text = row['text'].split(' ')
            text_len = len(tar_text)

            # 単語数がmax_lenより小さい場合はそのまま
            if text_len <= max_len:
                res = pd.concat([res, pd.DataFrame(row).T], axis=0)

            # 単語数がmax_lenより大きい場合は分割する
            elif text_len > max_len:
                # 分割する数を計算する
                num_divide = int(np.ceil(text_len / (max_len - override)))
                # 分割する分行を複製しておく（データフレーム化）
                tmp_df = pd.DataFrame([row] * num_divide)
                # 分割後のテキストを格納しておくリスト
                divided_texts = []

                # max_lenごとのテキストに分割する
                for i in range(len(tmp_df)):
                    div_text = tar_text[int(i * (max_len - override)) : int(i * (max_len - override) + max_len)]
                    # リストから文字列に直す
                    div_text = ' '.join(div_text)
                    # 結果を一旦リストにまとめておく
                    divided_texts.append(div_text)

                # 複製しておいたデータフレームに置換
                tmp_df['text'] = divided_texts
                # 全体のデータフレームに結合
                res = pd.concat([res, tmp_df], axis=0)
                
    res = res.reset_index(drop=True)
    # 余計な列を削除する
    res = res[['pub_id', 'section_title', 'text']]
                
    return res

In [87]:
%%time
res = expand_data(test, max_len=128, override=3)

CPU times: user 202 ms, sys: 0 ns, total: 202 ms
Wall time: 194 ms


In [88]:
res

Unnamed: 0,pub_id,section_title,text
0,2100032a-7c33-4bff-97ef-690822c43466,Abstract,cognitive deficits and reduced educational achievement are common in psychiatric illness understanding the genetic basis of cognitive and educational deficits may be informative about the etiology of psychiatric disorders a recent large genomewide association study gwas reported a genome wide si...
1,2100032a-7c33-4bff-97ef-690822c43466,Abstract,educational attainment studies and then with 53 188 largely independent individuals from a recent gwas of cognition a snp rs1906252 located at chromosome 6q16 1 previously associated with years of schooling was significantly associated with g p 1 47 10 4 in cogent the first joint analysis of 43 ...
2,2100032a-7c33-4bff-97ef-690822c43466,Abstract,other polygenic quantitative traits which may be relevant to psychiatric illness
3,2100032a-7c33-4bff-97ef-690822c43466,Introduction,a general cognitive ability factor also termed g typically captures just under half of the overall variance in performance on diverse laboratory measures of neurocognitive functioning general performance on neurocognitive tests has remarkable predictive value across a diverse range of social hea...
4,2100032a-7c33-4bff-97ef-690822c43466,Introduction,understanding the neurobiology of human cognition is potentially critical to improving physical and mental health outcomes in society deary et al 2010 while both genetic background and environmental experience interact to shape cognitive development twin and family studies have consistently demo...
...,...,...,...
562,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,Conclusion,we offer two notable findings on store choices about limited assortment supermarkets and supercenters the analysis of the intercept survey shows that one in four urban residents use limited assortment stores as their primary store and that a larger share of program participants shop at limited a...
563,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,Conclusion,show superstores with increased market share we did not find that trend from the iri or the intercept data for the ne our review of the literature also indicates that there is a paucity of research on rural households food purchasing habits a gap that should be filled with more studies and teste...
564,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,Conclusion,to rahkovsky and snyder 2015 see page 6 additional research on the role of children in shifting purchase patterns would help define opportunities for intervention another finding worth emphasizing is that while differences in purchasing patterns across household income levels may be statisticall...
565,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,Conclusion,these types of differences as it is to acknowledge that several recent studies point out that demand for healthy foods is a much greater driver of purchases than supply is research that focuses on why these differences exist may be more useful to programs and interventions and even policies than...


In [89]:
print('拡張前')
print(test.shape)
print('拡張後')
print(res.shape)

拡張前
(118, 3)
拡張後
(567, 3)
