## 医学論文の自動仕分けチャレンジ - 前処理

In [1]:
import os
import sys
import gc
#from google.colab import drive
#drive.mount('/gdrive')

#!cp /gdrive/MyDrive/Datasets/signate-471/train.csv .
#!cp /gdrive/MyDrive/Datasets/signate-471/test.csv .
#!cp /gdrive/MyDrive/Datasets/signate-471/sample_submit.csv .

In [2]:
#!pip install -q transformers

In [19]:
!pip install nltk
!pip install bs4



In [None]:
#import nltk
#nltk.download()

In [3]:
import math
import random
import time
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers as T
from sklearn.metrics import fbeta_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

from transformers import AutoTokenizer, AutoModel
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    logging,
    MODEL_FOR_QUESTION_ANSWERING_MAPPING,
)

import matplotlib.pyplot as plt

In [4]:
class CFG:
    batch_size = 12 #5 #16
    num_workers = 4 #4
    max_length =  256 #72
    n_splits =  5 #5
    version = 135
    drop_rate = 0 #0.1
    output_size = 1

    model = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
    tokenizer = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
    epochs = 3 #3
    iters_to_accumulate = 2

In [5]:
DATA_DIR = "../input/"
OUTPUT_DIR = "../output/"

In [6]:
warnings.filterwarnings("ignore")

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = 'cpu'

In [9]:
def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed =  471 #471
seed_torch(seed)

## データ読み込み

In [43]:
train = pd.read_csv(DATA_DIR + "train.csv")
test = pd.read_csv(DATA_DIR + "test.csv")
sub = pd.read_csv(DATA_DIR + "sample_submit.csv", header=None)
sub.columns = ["id", "judgement"]
TARGET = "judgement"

## 前処理

In [44]:
 # preprocess
train["text"] = train["title"] + " " + train["abstract"].fillna("")
test["text"] = test["title"] + " " + test["abstract"].fillna("")

In [None]:
def add_cv_number(train):
    # 交差検証 用の番号を振ります。
    Fold = StratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=seed) #5
    for n, (train_index, val_index) in enumerate(Fold.split(train, train[TARGET])):
        train.loc[val_index, "fold"] = int(n)
    train["fold"] = train["fold"].astype(np.uint8)

    return train

In [None]:
def get_train_data(train):

    # 交差検証 用の番号を振ります。
    Fold = StratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=seed) #5
    for n, (train_index, val_index) in enumerate(Fold.split(train, train[TARGET])):
        train.loc[val_index, "fold"] = int(n)
    train["fold"] = train["fold"].astype(np.uint8)

    return train

In [14]:
def get_test_data(test):
    return test

In [29]:
train = get_train_data(train)
test = get_test_data(test)

## text前処理

In [20]:
#https://zenn.dev/deepblackinc/books/ad568c611643c6/viewer/c37a9f
import re
import requests
import unicodedata
import nltk
from nltk.corpus import wordnet
from bs4 import BeautifulSoup

def clean_text(text):
    replaced_text = text.lower()
    replaced_text = re.sub(r'[【】]', ' ', replaced_text)       # 【】の除去
    replaced_text = re.sub(r'[（）()]', ' ', replaced_text)     # （）の除去
    replaced_text = re.sub(r'[［］\[\]]', ' ', replaced_text)   # ［］の除去
    replaced_text = re.sub(r'[@＠]\w+', '', replaced_text)  # メンションの除去
    replaced_text = re.sub(
        r'https?:\/\/.*?[\r\n ]', '', replaced_text)  # URLの除去
    replaced_text = re.sub(r'　', ' ', replaced_text)  # 全角空白の除去
    return replaced_text


def clean_html_tags(html_text):
    soup = BeautifulSoup(html_text, 'html.parser')
    cleaned_text = soup.get_text()
    cleaned_text = ''.join(cleaned_text.splitlines())
    return cleaned_text

def clean_html_and_js_tags(html_text):
    soup = BeautifulSoup(html_text, 'html.parser')
    [x.extract() for x in soup.findAll(['script', 'style'])]
    cleaned_text = soup.get_text()
    cleaned_text = ''.join(cleaned_text.splitlines())
    return cleaned_text

def clean_url(html_text):
    cleaned_text = re.sub(r'http\S+', '', html_text)
    return cleaned_text

def normalize(text):
    normalized_text = normalize_unicode(text)
    normalized_text = normalize_number(normalized_text)
    normalized_text = lower_text(normalized_text)
    return normalized_text

def lower_text(text):
    return text.lower()

def normalize_unicode(text, form='NFKC'):
    normalized_text = unicodedata.normalize(form, text)
    return normalized_text

def normalize_number(text):
    replaced_text = re.sub(r'\d+', '0', text)
    return replaced_text

def lemmatize_term(term, pos=None):
    if pos is None:
        synsets = wordnet.synsets(term)
        if not synsets:
            return term
        pos = synsets[0].pos()
        if pos == wordnet.ADJ_SAT:
            pos = wordnet.ADJ
    return nltk.WordNetLemmatizer().lemmatize(term, pos=pos)

def clean_text2(text):
    replaced_text = text.lower()
    replaced_text = re.sub(r'objective:', ' ', replaced_text)       # ""の除去
    replaced_text = re.sub(r'background:', ' ', replaced_text)       # ""の除去
    replaced_text = re.sub(r'copyright', ' ', replaced_text)       # ""の除去
    replaced_text = re.sub(r'the', ' ', replaced_text)       # ""の除去
    return replaced_text


def text_cleaning(text):
    text = clean_text(text)
    text = clean_text2(text)
    text = clean_html_tags(text)
    text = clean_html_and_js_tags(text)
    text = clean_url(text)
    text = normalize(text)
    text = lower_text(text)
    text = normalize_unicode(text)
    #text = "".join(lemmatize_term(e) for e in text.split())
    return text

def data_cleaning(data):
    return [text_cleaning(text) for text in data]

In [22]:
#https://zenn.dev/deepblackinc/books/ad568c611643c6/viewer/c37a9f
import re
import requests
import unicodedata
import nltk
from nltk.corpus import wordnet
from bs4 import BeautifulSoup

def text_freq(tmp):
    token = nltk.word_tokenize(tmp)
    frequency = nltk.FreqDist(w.lower() for w in token)
    return ' '.join(map(str, list(frequency.keys())))
    #return ' '.join(map(str, list(frequency)))

In [45]:
column='text'
test["text2"] = test["id"]
for i in range(len(test[column])):
    test['text2'][i] = text_cleaning(test[column][i])

In [46]:
column='text'
train["text2"] = train["id"]
for i in range(len(train[column])):
    train['text2'][i] = text_cleaning(train[column][i])

In [47]:
test.head()

Unnamed: 0,id,title,abstract,text,text2
0,27145,Estimating the potential effects of COVID-19 p...,The objective of the paper is to analyse chang...,Estimating the potential effects of COVID-19 p...,estimating potential effects of covid-0 pand...
1,27146,Leukoerythroblastic reaction in a patient with...,,Leukoerythroblastic reaction in a patient with...,leukoerythroblastic reaction in a patient with...
2,27147,[15O]-water PET and intraoperative brain mappi...,[15O]-water PET was performed on 12 patients w...,[15O]-water PET and intraoperative brain mappi...,0o -water pet and intraoperative brain mappin...
3,27148,Adaptive image segmentation for robust measure...,We present a method that significantly improve...,Adaptive image segmentation for robust measure...,adaptive image segmentation for robust measure...
4,27149,Comparison of Epidemiological Variations in CO...,The objective of this study is to compare the ...,Comparison of Epidemiological Variations in CO...,comparison of epidemiological variations in co...


In [48]:
train.head()

Unnamed: 0,id,title,abstract,judgement,text,text2
0,0,One-year age changes in MRI brain volumes in o...,Longitudinal studies indicate that declines in...,0,One-year age changes in MRI brain volumes in o...,one-year age changes in mri brain volumes in o...
1,1,Supportive CSF biomarker evidence to enhance t...,The present study was undertaken to validate t...,0,Supportive CSF biomarker evidence to enhance t...,supportive csf biomarker evidence to enhance ...
2,2,Occurrence of basal ganglia germ cell tumors w...,Objective: To report a case series in which ba...,0,Occurrence of basal ganglia germ cell tumors w...,occurrence of basal ganglia germ cell tumors w...
3,3,New developments in diagnosis and therapy of C...,The etiology and pathogenesis of idiopathic ch...,0,New developments in diagnosis and therapy of C...,new developments in diagnosis and rapy of cro...
4,4,Prolonged shedding of SARS-CoV-2 in an elderly...,,0,Prolonged shedding of SARS-CoV-2 in an elderly...,prolonged shedding of sars-cov-0 in an elderly...
