---
# 將以下4個模型的預測結果做ensembling
1. SATRN
2. SAR
3. RobustScanner
4. NRTR

In [None]:
import os
import numpy as np
import pandas as pd
from ast import literal_eval

## 讀取4個模型預測出的結果

In [None]:
def _load_predictions(csv_path, suffix=''):
    """Load one model's prediction CSV and normalise its columns.

    Args:
        csv_path: Path to a CSV that has at least ``text`` and ``score`` columns.
        suffix: String appended to every column name so the frames of
            different models can sit side by side without name clashes.

    Returns:
        A DataFrame whose ``score`` cells are parsed Python objects and
        whose ``text`` cells are strings.
    """
    # Force `text` to be read as a string: with dtype inference a
    # numeric-looking prediction such as "0123" would be parsed as the
    # number 123 and lose its leading zero before astype(str) runs.
    df = pd.read_csv(csv_path, dtype={'text': str})
    # `score` is stored as the textual form of a Python literal; parse it
    # back (presumably one confidence per character -- it is indexed per
    # character position later in this file).
    df['score'] = df['score'].apply(literal_eval)
    # Keep every cell a str so len() and indexing work on it downstream.
    df['text'] = df['text'].astype(str)
    if suffix:
        df.columns = [f'{col}{suffix}' for col in df.columns]
    return df


# Model 0 keeps the bare column names; the others get a numeric suffix.
df0 = _load_predictions('./mmocr/sub_test_satrn_score.csv')
df1 = _load_predictions('./mmocr/sub_test_sar_score.csv', '_1')
df2 = _load_predictions('./mmocr/sub_test_robustscanner_score.csv', '_2')
df3 = _load_predictions('./mmocr/sub_test_nrtr_score.csv', '_3')

# Side-by-side concat aligns rows on the index.
# NOTE(review): this assumes all four CSVs list the samples in the same
# order -- the ids are not cross-checked here; confirm upstream.
mdf = pd.concat([df0, df1, df2, df3], axis=1)

## 取出4個模型預測不完全一樣的部分

In [None]:
# Row labels for which the four models did not all predict the same text.
disagreeing_rows = [
    idx
    for idx in mdf.index
    if not (
        mdf.loc[idx, 'text']
        == mdf.loc[idx, 'text_1']
        == mdf.loc[idx, 'text_2']
        == mdf.loc[idx, 'text_3']
    )
]

# Only these rows need ensembling; unanimous rows keep their text as is.
diff = mdf.loc[disagreeing_rows, :]

## 進行ensemble

In [None]:
TEXT_COLS = ['text', 'text_1', 'text_2', 'text_3']
SCORE_COLS = ['score', 'score_1', 'score_2', 'score_3']
MIN_SCORE_COLS = ['min_score', 'min_score_1', 'min_score_2', 'min_score_3']


def _merge_by_char(texts, char_scores):
    """Merge equal-length predictions position by position.

    Args:
        texts: One predicted string per model, all of the same length.
        char_scores: One indexable score sequence per model; element ``i``
            is used as that model's confidence for character ``i``.

    Returns:
        A string that takes, at every position, the character from the
        model with the highest score there (first model wins ties).
    """
    merged = []
    for pos in range(len(texts[0])):
        pos_scores = np.array([scores[pos] for scores in char_scores])
        merged.append(texts[int(pos_scores.argmax())][pos])
    return ''.join(merged)


def _pick_most_confident(texts, min_scores, min_len=8):
    """Pick the whole prediction with the highest ``min_score``.

    Args:
        texts: One predicted string per model.
        min_scores: One sentence-level confidence per model, same order.
        min_len: Predictions shorter than this are ignored while at least
            one prediction reaches it.

    Returns:
        The selected prediction string (first model wins ties).
    """
    eligible = [k for k, text in enumerate(texts) if len(text) >= min_len]
    if not eligible:
        # No prediction reaches min_len. Previously this left the candidate
        # list empty and argmax() raised ValueError; fall back to comparing
        # all models instead of crashing.
        eligible = list(range(len(texts)))
    best = int(np.array([min_scores[k] for k in eligible]).argmax())
    return texts[eligible[best]]


res = []
for idx in diff.index:
    texts = [diff.loc[idx, col] for col in TEXT_COLS]
    if len({len(text) for text in texts}) == 1:
        # Same length: vote character by character.
        cur = _merge_by_char(texts, [diff.loc[idx, col] for col in SCORE_COLS])
    else:
        # Different lengths: characters cannot be aligned by position, so
        # take the single most confident prediction as a whole.
        cur = _pick_most_confident(
            texts, [diff.loc[idx, col] for col in MIN_SCORE_COLS]
        )
    res.append(cur)

# assign() builds a new frame rather than writing into a slice of mdf.
diff = diff.assign(result=res)
# Write the ensembled text back; rows where the models agreed are untouched.
mdf.loc[diff.index, ['text']] = diff['result']

## 輸出包含預測信心值的預測結果

In [None]:
# Write the ensembled text together with a confidence value.
# The exported min_score is the largest of the four models' min_score
# values for the row, whichever model(s) the final text came from.
per_model_min_scores = mdf[["min_score", "min_score_1", "min_score_2", "min_score_3"]]
ensemble_score_df = mdf[["id", "text"]].assign(
    min_score=per_model_min_scores.max(axis=1)
)
ensemble_score_df.to_csv("ensemble_4models_private_score_v2.csv", index=False)

## 輸出上傳使用的預測結果

In [None]:
# Export the ensemble result for submission: an "id,text" header followed by
# one line per sample, with the file extension stripped from the id.
output_filename = 'ensemble_4models_private_v2.csv'
filenames = mdf['id'].to_list()
results = mdf['text'].to_list()
res = [['id', 'text']]
for image_name, text in zip(filenames, results):
    print(text)
    res.append([os.path.splitext(image_name)[0], text])

res = [','.join(row) for row in res]
# Lines are joined with CRLF, so there is no trailing newline to strip
# (the old `sub_text.rstrip()` discarded its result and did nothing).
sub_text = '\r\n'.join(res)
# newline='' disables newline translation: without it, text mode on Windows
# rewrites each '\n' and the explicit '\r\n' would be written as '\r\r\n'.
with open(output_filename, 'w', newline='') as fout:
    fout.write(sub_text)