In [153]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import openslide

# Lift pandas' display limits so DataFrames are shown without row/column truncation.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [154]:
# `df`: per-image score table (score_1..score_5 per image_id, as shown in the preview below).
# `df_org`: the original training labels (data_provider, isup_grade, gleason_score).
df = pd.read_csv('../data/res_score_from_level_0_ver2.csv')
df_org = pd.read_csv('../data/input/train.csv')

In [155]:
# Preview the first rows of the score table.
df.head()

Unnamed: 0,image_id,score_1,score_2,score_3,score_4,score_5
0,0018ae58b01bdadc8e347995b69f99aa,0.775559,0.006396,0.0,0.218045,0.0
1,004dd32d9cd167d9cc31c13b704498af,0.957907,0.042093,0.0,0.0,0.0
2,0068d4c7529e34fd4c9da863ce01a161,0.95604,0.0439,5.9e-05,0.0,0.0
3,006f6aa35a78965c92fffd1fbd53a058,0.720316,0.0,0.053168,0.226516,0.0
4,007433133235efc27a39f11df6940829,0.952311,0.047689,0.0,0.0,0.0


In [156]:
# Preview the first rows of the original training labels.
df_org.head()

Unnamed: 0,image_id,data_provider,isup_grade,gleason_score
0,0005f7aaab2800f6170c399693a96917,karolinska,0,0+0
1,000920ad0b612851f8e01bcc880d9b3d,karolinska,0,0+0
2,0018ae58b01bdadc8e347995b69f99aa,radboud,4,4+4
3,001c62abd11fa4b57bf7a6c603a11bb9,karolinska,4,4+4
4,001d865e65ef5d2579c190a0e0350d8f,karolinska,0,0+0


In [157]:
# Attach the original labels to every scored image (left join keyed on image_id),
# then preview the combined table.
df = df.merge(df_org, how='left', on='image_id')
df.head()

Unnamed: 0,image_id,score_1,score_2,score_3,score_4,score_5,data_provider,isup_grade,gleason_score
0,0018ae58b01bdadc8e347995b69f99aa,0.775559,0.006396,0.0,0.218045,0.0,radboud,4,4+4
1,004dd32d9cd167d9cc31c13b704498af,0.957907,0.042093,0.0,0.0,0.0,radboud,1,3+3
2,0068d4c7529e34fd4c9da863ce01a161,0.95604,0.0439,5.9e-05,0.0,0.0,radboud,3,4+3
3,006f6aa35a78965c92fffd1fbd53a058,0.720316,0.0,0.053168,0.226516,0.0,radboud,3,4+3
4,007433133235efc27a39f11df6940829,0.952311,0.047689,0.0,0.0,0.0,radboud,0,negative


In [158]:
# (rows, columns) of the merged table.
df.shape

(5058, 9)

In [159]:
def _pick_gleason(ratios, dominance):
    """Convert thresholded pattern ratios into a 'primary+secondary' Gleason string.

    Parameters
    ----------
    ratios : np.ndarray of float, length 3
        Shares of pattern 3, 4 and 5 (in that order), so index ``i`` stands for
        Gleason pattern ``i + 3``. The array is not modified.
    dominance : float
        If the largest share is strictly greater than this, the image is
        treated as a single-pattern case ('X+X').

    Returns
    -------
    str
        '0+0' when every share is zero, otherwise '<primary>+<secondary>'.
    """
    if ratios.sum() == 0.0:
        return '0+0'

    primary = int(np.argmax(ratios))
    if ratios[primary] > dominance:
        secondary = primary
    else:
        rest = ratios.copy()
        rest[primary] = 0
        # np.argmax over an all-zero array returns index 0, which would report a
        # secondary pattern 3 that has no support. That happens when the
        # runner-up shares were all removed by the minimum-ratio filter; fall
        # back to the primary pattern in that case.
        secondary = int(np.argmax(rest)) if rest.max() > 0 else primary

    return '{}+{}'.format(primary + 3, secondary + 3)


def check_wrong(df, min_ratio=0.05, dominance=0.95):
    """Recompute a Gleason score per image from the score_3/4/5 columns.

    The three pattern columns are renormalised so they sum to 1 per row, shares
    below ``min_ratio`` are zeroed, and the remaining shares are turned into a
    'primary+secondary' string stored in a new ``new_score`` column.

    The input frame is left untouched; all work happens on a copy, so the
    function can be called repeatedly on the same frame.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``image_id``, ``score_1`` .. ``score_5`` and
        ``gleason_score``.
    min_ratio : float, default 0.05
        Shares strictly below this value are set to 0.
    dominance : float, default 0.95
        A share strictly above this value yields a single-pattern score.

    Returns
    -------
    res : dict
        Maps image_id to the recomputed Gleason string.
    out : pd.DataFrame
        Copy of ``df`` without ``score_1``/``score_2``, with renormalised and
        thresholded ``score_3``..``score_5``, a ``new_score`` column, and
        'negative' in ``gleason_score`` rewritten as '0+0'.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    pattern_cols = ['score_3', 'score_4', 'score_5']

    # Work on a copy so the caller's frame is not modified.
    out = df.copy()

    # Express score_3, 4, 5 as shares of their own total.
    total = out['score_3'] + out['score_4'] + out['score_5']
    for col in pattern_cols:
        out[col] = out[col] / total
    out = out.drop(columns=['score_1', 'score_2'])

    # Rows whose three scores are all zero divide 0/0 -> NaN; treat them as 0.
    # As before, this fills NaN in every column of the frame, not only the scores.
    out = out.fillna(0)

    # Drop patterns that cover less than `min_ratio` of the graded area.
    for col in pattern_cols:
        out[col] = out[col].where(out[col] >= min_ratio, 0)

    # Select the pattern columns by name rather than by position.
    ratios = out[pattern_cols].values.astype(float)
    scores = [_pick_gleason(row, dominance) for row in ratios]

    res = dict(zip(out['image_id'], scores))

    out['new_score'] = scores
    # Use the same '0+0' spelling as `new_score` so the two columns are comparable.
    out['gleason_score'] = out['gleason_score'].mask(out['gleason_score'] == 'negative', '0+0')

    return res, out
    

In [160]:
# Recompute Gleason scores: `res` maps image_id -> new score string, and `df` is
# rebound to the returned table, which carries the extra `new_score` column.
res, df = check_wrong(df)

In [161]:
# Preview the recomputed `new_score` next to the original `gleason_score`.
df.head()

Unnamed: 0,image_id,score_3,score_4,score_5,data_provider,isup_grade,gleason_score,new_score
0,0018ae58b01bdadc8e347995b69f99aa,0.0,1.0,0.0,radboud,4,4+4,4+4
1,004dd32d9cd167d9cc31c13b704498af,0.0,0.0,0.0,radboud,1,3+3,0+0
2,0068d4c7529e34fd4c9da863ce01a161,1.0,0.0,0.0,radboud,3,4+3,3+3
3,006f6aa35a78965c92fffd1fbd53a058,0.190101,0.809899,0.0,radboud,3,4+3,4+3
4,007433133235efc27a39f11df6940829,0.0,0.0,0.0,radboud,0,0+0,0+0


In [162]:
# First few images whose recomputed score disagrees with the original label.
df.loc[df['gleason_score'].ne(df['new_score'])].head()

Unnamed: 0,image_id,score_3,score_4,score_5,data_provider,isup_grade,gleason_score,new_score
1,004dd32d9cd167d9cc31c13b704498af,0.0,0.0,0.0,radboud,1,3+3,0+0
2,0068d4c7529e34fd4c9da863ce01a161,1.0,0.0,0.0,radboud,3,4+3,3+3
5,0076bcb66e46fb485f5ba432b9a1fe8a,0.0,0.993653,0.0,radboud,3,4+3,4+4
7,00928370e2dfeb8a507667ef1d4efcbb,0.0,0.161978,0.838022,radboud,5,4+5,5+4
11,00bbc1482301d16de3ff63238cfd0b34,0.35923,0.64077,0.0,radboud,2,3+4,4+3


In [163]:
# Re-read the original training labels and work on a copy of them.
df_org = pd.read_csv('../data/input/train.csv')
new_train = df_org.copy()

# Look up the recomputed Gleason score for each image; ids absent from `res`
# come back as NaN.
new_train['new_score'] = new_train['image_id'].map(res)


def rep_score(row):
    """Return the recomputed score for `row`, or its original label if there is none."""
    # pd.isna rather than `is np.nan`: an identity test only matches the exact
    # np.nan singleton and misses other NaN objects (float('nan'), np.float64 NaN).
    if pd.isna(row['new_score']):
        return row['gleason_score']
    return row['new_score']


new_train['new_score'] = new_train.apply(rep_score, axis=1)

# Gleason score string -> ISUP grade.
rep_dict = {
    'negative': 0,
    '0+0': 0,
    '3+3': 1,
    '3+4': 2,
    '4+3': 3,
    '4+4': 4,
    '3+5': 4,
    '5+3': 4,
    '4+5': 5,
    '5+4': 5,
    '5+5': 5
}

new_train['new_grade'] = new_train['new_score'].map(rep_dict)

# Fail with a readable message instead of an int-cast error on NaN when a score
# string has no entry in `rep_dict`.
unmapped = new_train.loc[new_train['new_grade'].isna(), 'new_score'].unique()
if len(unmapped) > 0:
    raise ValueError('No ISUP grade defined for Gleason score(s): {}'.format(list(unmapped)))

# Replace the original label columns with the recomputed ones, keeping the original names.
new_train = new_train.drop(['gleason_score', 'isup_grade'], axis=1)
new_train = new_train.rename(columns={'new_grade': 'isup_grade', 'new_score': 'gleason_score'})
new_train['isup_grade'] = new_train['isup_grade'].astype(int)

new_train.to_csv('../data/input/modified_train_v2.csv', index=False)

In [152]:
# (rows, columns) of the relabelled training table.
new_train.shape

(10616, 4)