# アンサンブルと予測結果の補正

---

## セットアップ

### モジュールの読み込み

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from matplotlib import pyplot as plt

### データの読み込み

In [2]:
# Read the training and test tables, then stack them (train rows first) into a
# single frame with a fresh 0..n-1 index.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
traintest = pd.concat((train, test), ignore_index=True)

---

## アンサンブル

In [3]:
# Simple mean ensemble: average the LOAN_AMOUNT predictions of earlier
# submission files into the sample-submission frame.
ensemble_paths = [
    "submit/maruyama_20220213_02_epoch10.csv",
    "submit/maruyama_20220213_02_epoch09.csv",
    "submit/maruyama_20220213_02_epoch08.csv",
    "submit/maruyama_20220213_01_epoch10.csv",
    "submit/maruyama_20220213_01_epoch09.csv",
    "submit/maruyama_20220213_01_epoch08.csv",
    "submit/maruyama_20220205_01.csv",
    "submit/maruyama_20220123_01.csv",
    "submit/maruyama_20220118_01.csv",
    "submit/maruyama_20220116_02.csv",
    "submit/maruyama_20220116_01.csv",
    "submit/maruyama_20220108_02.csv",
    "submit/maruyama_20220105_01.csv",
    "submit/maruyama_20220104_01.csv",
    "submit/maruyama_20220103_01.csv",
    "submit/maruyama_20220101_01.csv",
]

submit = pd.read_csv("data/sample_submission.csv")
submit["LOAN_AMOUNT"] = 0
count = 0
for path in ensemble_paths:
    print(path)
    submit_ = pd.read_csv(path)
    # Look each prediction up by LOAN_ID instead of relying on row position, so
    # a file saved in a different row order is still averaged against the
    # right loans. NOTE(review): assumes every submission file carries a
    # LOAN_ID column with unique values, like the file written at the end of
    # this notebook.
    aligned = submit["LOAN_ID"].map(submit_.set_index("LOAN_ID")["LOAN_AMOUNT"])
    submit["LOAN_AMOUNT"] = submit["LOAN_AMOUNT"] + aligned
    count = count + 1
submit["LOAN_AMOUNT"] = submit["LOAN_AMOUNT"] / count

submit/maruyama_20220213_02_epoch10.csv
submit/maruyama_20220213_02_epoch09.csv
submit/maruyama_20220213_02_epoch08.csv
submit/maruyama_20220213_01_epoch10.csv
submit/maruyama_20220213_01_epoch09.csv
submit/maruyama_20220213_01_epoch08.csv
submit/maruyama_20220205_01.csv
submit/maruyama_20220123_01.csv
submit/maruyama_20220118_01.csv
submit/maruyama_20220116_02.csv
submit/maruyama_20220116_01.csv
submit/maruyama_20220108_02.csv
submit/maruyama_20220105_01.csv
submit/maruyama_20220104_01.csv
submit/maruyama_20220103_01.csv
submit/maruyama_20220101_01.csv


---

## 予測結果の補正

説明に希望金額が書かれている場合、予測結果をドル換算した希望金額に書き換える。

### 希望金額の抽出

In [4]:
def extract(data, currency_list):
    """Extract the requested amount, in local currency, from loan descriptions.

    Each description is split into sentences, and every sentence is searched
    for a number written directly after or directly before the loan's own
    currency code (e.g. ``PHP 10,000`` or ``5000 PHP``). When several amounts
    are found for one loan, the largest is kept.

    Args:
        data: DataFrame with the columns ``LOAN_ID``, ``CURRENCY``,
            ``DESCRIPTION`` and ``DESCRIPTION_TRANSLATED``. It is not modified.
        currency_list: Iterable of currency codes to search for. Entries that
            are not strings (e.g. NaN from a missing currency) are skipped.

    Returns:
        DataFrame with the columns ``LOAN_ID``, ``CURRENCY`` and
        ``LOAN_AMOUNT_LOCAL``, left-joined onto the rows of ``data``;
        ``LOAN_AMOUNT_LOCAL`` is NaN where no amount was found.
    """
    import re  # local import: only needed here to escape the currency code

    # Number spellings that are recognised, in the order they are tried.
    number_forms = [
        r"[0-9]{1,3} ?, ?[0-9]{3} ?, ?[0-9]{3}",           # 1,234,567
        r"[0-9]{1,3} ?, ?[0-9]{3}",                         # 1,234
        r"[0-9]{1,9}",                                       # 1234
        r"[0-9]{1,3} ?, ?[0-9]{3} ?, ?[0-9]{3}\.[0-9]{2}",  # 1,234,567.89
        r"[0-9]{1,3} ?, ?[0-9]{3}\.[0-9]{2}",               # 1,234.56
        r"[0-9]{1,9}\.[0-9]{2}",                            # 1234.56
    ]

    # Fall back to the untranslated description where the translation is
    # missing. This works on a new Series so the caller's frame stays untouched.
    descriptions = data["DESCRIPTION_TRANSLATED"].fillna(data["DESCRIPTION"])

    # Split into sentences, one row per sentence, keyed by (LOAN_ID, CURRENCY).
    sentences = (
        data[["LOAN_ID", "CURRENCY"]]
        .assign(DESCRIPTION_TRANSLATED=descriptions)
        .set_index(["LOAN_ID", "CURRENCY"])["DESCRIPTION_TRANSLATED"]
        .str.replace("<br />", " ", regex=False)
        .str.split(r"\. ")
        .explode()
    )
    # The "currency first" patterns require a space after the number, so pad
    # every sentence with one to let a number at the very end still match.
    sentences = sentences + " "
    currency_level = sentences.index.get_level_values("CURRENCY")

    frames = []
    for currency in tqdm(currency_list):
        if not isinstance(currency, str):
            continue
        code = re.escape(currency)
        # "<code> <number> " alternatives first, then " <number> <code>".
        pattern = "|".join(
            [f"{code} ?({number}) " for number in number_forms]
            + [f" ({number}) ?{code}" for number in number_forms]
        )

        # Only sentences of loans denominated in this currency can contribute,
        # so restrict to them before running the (expensive) regex.
        candidates = sentences[currency_level == currency]
        # na=False: a loan with no description at all yields a NaN sentence,
        # which must count as "no match" rather than break the boolean mask.
        candidates = candidates.loc[
            candidates.str.contains(currency, regex=False, na=False)
        ]

        frames.append(
            candidates.str.extractall(pattern)
            .stack()  # one value per match: whichever capture group matched
            .str.replace(r"[, ]", "", regex=True)  # "1, 234" -> "1234"
            .astype(float)
            .groupby(["LOAN_ID", "CURRENCY"]).max()
            .rename("LOAN_AMOUNT_LOCAL")
            .reset_index()
        )

    keys = data[["LOAN_ID", "CURRENCY"]]
    if not frames:
        # Nothing was searched; pd.concat would raise on an empty list.
        return keys.assign(LOAN_AMOUNT_LOCAL=float("nan")).reset_index(drop=True)

    return keys.merge(pd.concat(frames), how="left", on=["LOAN_ID", "CURRENCY"])

In [5]:
# Search both splits for every currency code that occurs in either of them.
currency_list = traintest["CURRENCY"].unique()
train_local, test_local = (
    extract(frame, currency_list) for frame in (train, test)
)

100%|██████████| 51/51 [00:20<00:00,  2.46it/s]
100%|██████████| 51/51 [00:20<00:00,  2.48it/s]


### 希望金額のドル換算

In [6]:
def convert_usd(train_local, test_local, train, *, recent_loan_id_threshold=1800000):
    """Convert amounts extracted in local currency into US dollars.

    An exchange rate per currency is estimated from the training data as the
    median of ``LOAN_AMOUNT_LOCAL / LOAN_AMOUNT`` (amount quoted in the
    description divided by the funded amount in dollars), and every extracted
    amount is divided by the rate of its currency.

    Args:
        train_local: DataFrame with ``LOAN_ID``, ``CURRENCY`` and
            ``LOAN_AMOUNT_LOCAL`` for the training loans.
        test_local: Same columns for the test loans.
        train: Training DataFrame providing ``LOAN_ID`` and ``LOAN_AMOUNT``.
        recent_loan_id_threshold: Only training loans whose ``LOAN_ID`` is
            strictly greater than this value are used to estimate the rates,
            so that the rates reflect loans close to the test period.

    Returns:
        Tuple ``(train_usd, test_usd)`` of DataFrames with the columns
        ``LOAN_ID`` and ``LOAN_AMOUNT_DESC``. ``LOAN_AMOUNT_DESC`` is NaN where
        no amount was extracted or no usable rate exists for the currency.
    """
    recent = (
        train_local
        .merge(train[["LOAN_ID", "LOAN_AMOUNT"]], how="left", on="LOAN_ID")
        .loc[lambda df: df["LOAN_ID"] > recent_loan_id_threshold, :]
        .assign(rate=lambda df: df["LOAN_AMOUNT_LOCAL"] / df["LOAN_AMOUNT"])
    )
    rate = recent.groupby("CURRENCY")["rate"].median()
    # A zero or infinite rate would turn every amount of that currency into
    # inf or 0 dollars; treat such a rate as unknown instead.
    rate = rate.where(np.isfinite(rate) & (rate > 0))

    def _to_usd(local):
        # Attach the currency's rate to each loan and divide.
        return (
            local
            .merge(rate, how="left", on="CURRENCY")
            .assign(LOAN_AMOUNT_DESC=lambda df: df["LOAN_AMOUNT_LOCAL"] / df["rate"])
            [["LOAN_ID", "LOAN_AMOUNT_DESC"]]
            .copy()
        )

    return _to_usd(train_local), _to_usd(test_local)

In [7]:
# Replace the local-currency frames by their dollar-converted counterparts.
train_local, test_local = convert_usd(
    train_local=train_local,
    test_local=test_local,
    train=train,
)

### 予測結果の補正

In [8]:
# Currencies for which the amount read from the description overrides the
# ensemble prediction; for every other currency the prediction is kept.
trusted_currencies = ["PHP", "KES", "UGX", "INR", "PKR", "KGS", "IDR", "TJS", "KHR", "JOD", "HNL", "USD"]

# Attach each test loan's currency and its description-derived dollar amount.
submit2 = submit.merge(test[["LOAN_ID", "CURRENCY"]], how="left", on="LOAN_ID")
submit2 = submit2.merge(test_local, how="left", on="LOAN_ID")

# Blank out description amounts of currencies that are not in the list above.
is_trusted = submit2["CURRENCY"].isin(trusted_currencies)
submit2["LOAN_AMOUNT_DESC"] = submit2["LOAN_AMOUNT_DESC"].where(is_trusted)

# Use the description amount where one is left, else the ensemble prediction.
submit2["LOAN_AMOUNT_CORECCTED"] = submit2["LOAN_AMOUNT_DESC"].fillna(submit2["LOAN_AMOUNT"])
submit2 = (
    submit2[["LOAN_ID", "LOAN_AMOUNT_CORECCTED"]]
    .rename(columns={"LOAN_AMOUNT_CORECCTED": "LOAN_AMOUNT"})
)

In [9]:
# Write the corrected predictions as the new submission file (header row, no
# index column).
submit2.to_csv(
    "submit/maruyama_20220213_03.csv",
    header=True,
    index=False,
)