# ファイルの準備

* ファイルの解凍
* 一定の行数に分割
* tsv や csv の DB 化

In [None]:
import os
import shutil
import zipfile
import gzip
import pandas as pd
from tqdm import tqdm
import csv, sqlite3

## 解凍

In [None]:
# zip ファイルを解凍

def unpack_zip(zip_path, output_dir):
    """Extract the archive at ``zip_path`` into ``output_dir``.

    Extraction is delegated to ``shutil.unpack_archive``; a confirmation
    line is printed to stdout once it returns.

    Args:
        zip_path: Path of the archive to extract.
        output_dir: Directory that receives the extracted contents.
    """
    shutil.unpack_archive(filename=zip_path, extract_dir=output_dir)
    message = f"Unpacked {zip_path} to {output_dir}"
    print(message)

In [None]:
# .gz 形式のファイルを解凍

def unpack_zip(gz_path, output_path):
    """Decompress the gzip file ``gz_path`` and write the result to ``output_path``.

    NOTE: despite the name, this variant handles gzip data. It shares its
    name with the archive-extracting ``unpack_zip`` defined earlier in this
    file, so whichever definition is executed last is the one in effect.

    Args:
        gz_path: Path of the gzip-compressed input file.
        output_path: Path of the decompressed file to create (overwritten
            if it already exists).
    """
    # Stream the bytes across so the whole payload never sits in memory.
    with gzip.open(gz_path, mode="rb") as compressed, \
            open(output_path, mode="wb") as plain:
        shutil.copyfileobj(compressed, plain)

## 分割

In [None]:
# 50000 行ごとに区切る
# もとのファイルサイズが大きく、エンコーディングが不正な箇所があるので、pandas などではなく with open で 1 行ずつ読み書きして対応

def split_csv(input_file, output_dir, chunk_size=50000):
    """Split a text file into numbered chunk files of ``chunk_size`` lines.

    The input is streamed line by line, so it never has to fit in memory.
    Chunks are written to ``output_dir`` as ``1_semmed_pred.csv``,
    ``2_semmed_pred.csv``, ... in UTF-8.

    A chunk file is only opened when there is a line to put into it, so no
    empty trailing file is produced when the number of input lines is an
    exact multiple of ``chunk_size``, and an empty input produces no files.

    Args:
        input_file: Path of the file to split. It is decoded as UTF-8 and
            bytes that cannot be decoded are dropped (``errors="ignore"``).
        output_dir: Existing directory that receives the chunk files.
        chunk_size: Maximum number of lines per chunk file.
    """
    output_file = os.path.join(output_dir, "%d_semmed_pred.csv")

    file_seqno = 0
    lines_in_chunk = 0
    out_file = None

    with open(input_file, "r", encoding="utf-8", errors="ignore") as in_file:
        try:
            for line in in_file:
                # Rotate lazily: start a new chunk only when a line is
                # actually waiting to be written.
                if out_file is None or lines_in_chunk >= chunk_size:
                    if out_file is not None:
                        out_file.close()
                    file_seqno += 1
                    lines_in_chunk = 0
                    out_file = open(output_file % file_seqno, "w", encoding="utf-8")

                out_file.write(line)
                lines_in_chunk += 1
        finally:
            # Close the current chunk even if reading or writing failed.
            if out_file is not None:
                out_file.close()

## tsv、csv を DB に変換

In [None]:
# 複数の csv を 1 つの db に変換

def tsv_to_db(input_path, db_path, file_number):
    """Append one numbered sentence CSV chunk to the ``raw`` table of a SQLite DB.

    Reads ``{input_path}/{file_number}_semmed_sent.csv`` (comma-separated)
    and inserts columns 0, 1, 2, 4 and 5 of every row as ``sent_id``,
    ``pmid``, ``type``, ``start`` and ``sentence``. Column 3 is skipped.
    The table is created if it does not exist yet, so calling this once per
    chunk accumulates all chunks in the same database.

    NOTE: a second ``tsv_to_db`` with a different signature is defined later
    in this file; whichever definition is executed last is the one in effect.

    Args:
        input_path: Directory containing the numbered chunk files.
        db_path: Path of the SQLite database file (created if missing).
        file_number: Number used as the prefix of the chunk file name.

    Raises:
        IndexError: If a row has fewer than six columns.
    """
    create_sql = """CREATE TABLE IF NOT EXISTS raw (
                    sent_id INT,
                    pmid INT,
                    type TEXT,
                    start INT,
                    sentence TEXT
                );"""
    insert_sql = "INSERT INTO raw (sent_id, pmid, type, start, sentence) VALUES (?, ?, ?, ?, ?);"
    csv_path = f"{input_path}/{file_number}_semmed_sent.csv"

    conn = sqlite3.connect(db_path)
    try:
        curs = conn.cursor()
        curs.execute(create_sql)

        # UTF-8 matches what split_csv writes; newline="" lets the csv
        # module handle line endings inside quoted fields itself.
        with open(csv_path, "r", encoding="utf-8", newline="") as csv_file:
            reader = csv.reader(csv_file)
            curs.executemany(
                insert_sql,
                ((row[0], row[1], row[2], row[4], row[5]) for row in reader),
            )

        conn.commit()
    finally:
        # Closing without a commit discards a partially inserted chunk.
        conn.close()

In [None]:
# 単一の csv(カンマ区切り)を db に変換

def tsv_to_db(input_path, db_path):
    """Append the rows of one predication CSV file to the ``raw`` table of a SQLite DB.

    Reads ``input_path`` (comma-separated) and inserts columns 0, 1, 2, 3,
    5, 6, 9 and 10 of every row as ``pred_id``, ``sent_id``, ``pmid``,
    ``predicate``, ``sub_name``, ``sub_ty``, ``obj_name`` and ``obj_ty``.
    The table is created if it does not exist yet.

    NOTE: this definition shares its name with the three-argument
    ``tsv_to_db`` defined earlier in this file; whichever definition is
    executed last is the one in effect.

    Args:
        input_path: Path of the CSV file to load.
        db_path: Path of the SQLite database file (created if missing).

    Raises:
        IndexError: If a row has fewer than eleven columns.
    """
    create_sql = """CREATE TABLE IF NOT EXISTS raw (
                    pred_id TEXT,
                    sent_id TEXT,
                    pmid TEXT,
                    predicate TEXT,
                    sub_name TEXT,
                    sub_ty TEXT,
                    obj_name TEXT,
                    obj_ty TEXT
                );"""
    insert_sql = (
        "INSERT INTO raw (pred_id, sent_id, pmid, predicate, sub_name, sub_ty, obj_name, obj_ty) "
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?);"
    )

    conn = sqlite3.connect(db_path)
    try:
        curs = conn.cursor()
        curs.execute(create_sql)

        # UTF-8 matches what split_csv writes; newline="" lets the csv
        # module handle line endings inside quoted fields itself.
        with open(input_path, "r", encoding="utf-8", newline="") as csv_file:
            reader = csv.reader(csv_file)
            curs.executemany(
                insert_sql,
                (
                    (row[0], row[1], row[2], row[3], row[5], row[6], row[9], row[10])
                    for row in reader
                ),
            )

        conn.commit()
    finally:
        # Closing without a commit discards a partially inserted file.
        conn.close()

## index の作成

In [6]:
# 高速化のため index 作成

def add_index_to_raw(db_dir, file_number):
    """Create the ``(sent_id, pmid)`` index on the ``raw`` table of one chunk DB.

    Opens ``{db_dir}/{file_number}_semmed_pred.db`` and creates the index
    ``idx_raw_sentid_pmid`` if it does not exist yet, so repeated calls are
    harmless.

    Args:
        db_dir: Directory containing the numbered database files.
        file_number: Number used as the prefix of the database file name.

    Raises:
        sqlite3.OperationalError: If the database has no ``raw`` table.
    """
    conn = sqlite3.connect(f'{db_dir}/{file_number}_semmed_pred.db')
    try:
        cursor = conn.cursor()
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_raw_sentid_pmid ON raw (sent_id, pmid)')
        conn.commit()
    finally:
        # Release the file handle even when index creation fails.
        conn.close()