# Import libraries

In [None]:
import pandas as pd
import numpy as np
from openpyxl import load_workbook
from tqdm.notebook import tqdm
from typing import List
import os
from numpy import arange
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, f1_score
from collections import Counter

# Late fusion

In [None]:
# Integration utility functions
def __get_all_accs(df, classes, d_types):
    """Compute the per-class accuracy of every data type's predictions.

    For each data type, rows whose 'Has <d_type>' value is -1 are excluded
    (presumably samples lacking that modality -- confirm against the data).
    The remaining rows are then restricted to one true class at a time
    ('Real' == class) and scored against the '<d_type> Pred' column.

    Args:
        df: frame with 'Real', 'Has <d_type>' and '<d_type> Pred' columns.
        classes: class labels to evaluate, in output order.
        d_types: data type names used as column prefixes.

    Returns:
        (accs, sum_of_accs): accs maps each data type to a list of
        accuracies aligned with `classes`; sum_of_accs is a numpy array with
        the per-class sum of those accuracies over all data types.
    """
    sum_of_accs = np.zeros(len(classes))
    accs = {}
    for d_type in d_types:
        pred_col = d_type + ' Pred'
        has_data = df['Has ' + d_type] != -1
        per_class = []
        for idx, cls in enumerate(classes):
            subset = df.loc[(df['Real'] == cls) & has_data]
            score = accuracy_score(subset['Real'].values, subset[pred_col].values)
            per_class.append(score)
            sum_of_accs[idx] += score
        accs[d_type] = per_class
    return accs, sum_of_accs

def __get_all_f1s(df, classes, d_types):
    """Compute the per-class F1 score of every data type's predictions.

    Rows whose 'Has <d_type>' value is -1 are excluded before scoring.
    `classes` is passed to `f1_score` as `labels`, so the returned scores are
    always aligned with `classes` (one entry per class, in that order) even
    when a class is absent from the filtered rows. Previously the first three
    entries of the unlabeled result were taken, which only lined up when
    exactly the expected three classes were present.

    Args:
        df: frame with 'Real', 'Has <d_type>' and '<d_type> Pred' columns.
        classes: class labels to score, in output order.
        d_types: data type names used as column prefixes.

    Returns:
        (f1s, sum_of_f1s): f1s maps each data type to a list of F1 scores
        aligned with `classes`; sum_of_f1s is a numpy array with the
        per-class sum of those scores over all data types.
    """
    f1s = {}
    sum_of_f1s = np.zeros(len(classes))
    for d_type in d_types:
        df_c = df.loc[df['Has ' + d_type] != -1]
        f1_scores = f1_score(df_c['Real'].values, df_c[d_type + ' Pred'].values,
                             labels=classes, average=None)
        f1s[d_type] = list(f1_scores)
        # one score per entry of `classes`, so the element-wise add is aligned
        sum_of_f1s += f1_scores
    return f1s, sum_of_f1s

def __get_all_f1s_total(df, classes, d_types):
    """Compute one weighted-average F1 score per data type.

    Rows whose 'Has <d_type>' value is -1 are excluded before scoring.
    `classes` is accepted for signature parity with the per-class helpers
    but is not used here.

    Returns:
        (f1s, sum_of_f1s): f1s maps each data type to its weighted F1 score;
        sum_of_f1s is the sum of those scores over all data types.
    """
    running_total = 0
    weighted_f1 = {}
    for d_type in d_types:
        available = df.loc[df['Has ' + d_type] != -1]
        y_true = available['Real'].values
        y_pred = available[d_type + ' Pred'].values
        score = f1_score(y_true, y_pred, average='weighted')
        weighted_f1[d_type] = score
        running_total += score
    return weighted_f1, running_total

def __get_all_error_rates(df, classes, d_types):
    """Compute the per-class error rate (in percent) of every data type.

    The error rate is (1 - accuracy) * 100, where accuracy is measured on
    the rows of one true class ('Real' == class) whose 'Has <d_type>' value
    is not -1.

    Returns:
        (error_rates, sum_of_er): error_rates maps each data type to a list
        of error rates aligned with `classes`; sum_of_er is a numpy array
        with the per-class sum of those rates over all data types.
    """
    sum_of_er = np.zeros(len(classes))
    error_rates = {}
    for d_type in d_types:
        pred_col = d_type + ' Pred'
        has_data = df['Has ' + d_type] != -1
        rates = []
        for idx, cls in enumerate(classes):
            subset = df.loc[(df['Real'] == cls) & has_data]
            accuracy = accuracy_score(subset['Real'].values, subset[pred_col].values)
            rate = (1 - accuracy) * 100
            rates.append(rate)
            sum_of_er[idx] += rate
        error_rates[d_type] = rates
    return error_rates, sum_of_er

def get_probs_alphas_notgeneral(df: pd.DataFrame, classes: List[int], d_types: List[str]) -> dict:
    """Derive per-class fusion weights for exactly two data types.

    For every class, the per-class accuracies of `d_types[0]` and
    `d_types[1]` are compared; the more accurate one gets `0.5 + diff` and
    the other `0.5 - diff`, where `diff` is the absolute accuracy gap. On a
    tie both get 0.5. Only the first two entries of `d_types` are used.

    NOTE: when the gap exceeds 0.5 the weights fall outside [0, 1]; this
    rule is kept as originally written.

    Args:
        df: frame with 'Real', 'Has <d_type>' and '<d_type> Pred' columns.
        classes: class labels, in output order.
        d_types: data type names; the first two are weighted.

    Returns:
        Dict mapping each of the two data types to a list of weights aligned
        with `classes`.
    """
    first, second = d_types[0], d_types[1]

    def class_accuracy(d_type, cls):
        # accuracy on rows of true class `cls` whose 'Has <d_type>' is not -1
        df_c = df.loc[(df['Real'] == cls) & (df['Has ' + d_type] != -1)]
        return accuracy_score(df_c['Real'].values, df_c[d_type + ' Pred'].values)

    alphas = {first: [], second: []}
    for cls in classes:
        acc_first = class_accuracy(first, cls)
        acc_second = class_accuracy(second, cls)
        diff = abs(acc_first - acc_second)
        if acc_first > acc_second:
            alphas[first].append(0.5 + diff)
            alphas[second].append(0.5 - diff)
        else:
            alphas[first].append(0.5 - diff)
            alphas[second].append(0.5 + diff)

    return alphas

def get_probs_alphas(df: pd.DataFrame, classes: List[int], d_types: List[str]) -> dict:
    """Derive per-class fusion weights from per-class F1 scores.

    The weight of a data type for a class is its F1 score for that class
    divided by the sum of that class's F1 scores over all data types, so the
    weights of one class sum to 1 across data types.

    If every data type scores 0 for a class, the sum is 0 and the ratio is
    undefined; the weight then falls back to the uniform value
    `1 / len(d_types)` instead of NaN.

    Args:
        df: frame with 'Real', 'Has <d_type>' and '<d_type> Pred' columns.
        classes: class labels, in output order.
        d_types: data type names used as column prefixes.

    Returns:
        Dict mapping each data type to a list of weights aligned with
        `classes`.
    """
    f1s, sum_of_f1s = __get_all_f1s(df, classes, d_types)
    alphas = {}
    for d_type in d_types:
        alphas[d_type] = [
            f1s[d_type][i] / sum_of_f1s[i] if sum_of_f1s[i] > 0 else 1.0 / len(d_types)
            for i in range(len(classes))
        ]
    return alphas

def get_probs_alphas_total(df: pd.DataFrame, classes: List[int], d_types: List[str]) -> dict:
    """Derive one fusion weight per data type from weighted-average F1 scores.

    Each data type's weight is its weighted F1 score divided by the sum of
    the weighted F1 scores of all data types. If that sum is 0 the ratio is
    undefined, so every data type gets the uniform weight
    `1 / len(d_types)` instead of NaN.

    Args:
        df: frame with 'Real', 'Has <d_type>' and '<d_type> Pred' columns.
        classes: forwarded to the F1 helper.
        d_types: data type names used as column prefixes.

    Returns:
        Dict mapping each data type to a single scalar weight.
    """
    f1s, sum_of_f1s = __get_all_f1s_total(df, classes, d_types)
    if sum_of_f1s > 0:
        return {d_type: f1s[d_type] / sum_of_f1s for d_type in d_types}
    return {d_type: 1.0 / len(d_types) for d_type in d_types}

def optimize_alpha_train(df: pd.DataFrame, classes: List[int], d_types: List[str]) -> List[float]:
    """Grid-search the mixing weight between two data types.

    Only rows where both 'Has <d_type>' values are not -1 are used. For each
    alpha in arange(0.1, 1, 0.05) the class probabilities are blended as
    `alpha * first + (1 - alpha) * second`, the argmax over the
    LUAD / HLT / LUSC columns (in that order) is taken as the prediction,
    and the accuracy against 'Real' is measured. The first alpha reaching
    the highest accuracy wins. The best accuracy is printed.

    Args:
        df: frame with 'Real', 'Has <d_type>' and
            '<d_type> Prob LUAD|HLT|LUSC' columns.
        classes: not used by this function.
        d_types: data type names; the first two are blended.

    Returns:
        [alpha, 1 - alpha] for the best alpha, or [0, 0] if no alpha
        achieved an accuracy above 0.
    """
    # The second name was previously unpacked into `dtype_2` while the body
    # read `d_type2`, which raised NameError or picked up a notebook global.
    d_type1, d_type2 = d_types[0], d_types[1]
    class_names = ('LUAD', 'HLT', 'LUSC')

    # The row filter and the probability matrices do not depend on alpha.
    df_c = df.loc[(df['Has ' + d_type1] != -1) & (df['Has ' + d_type2] != -1)]
    real = df_c['Real'].values
    probs1 = df_c[[d_type1 + ' Prob ' + cls for cls in class_names]].values
    probs2 = df_c[[d_type2 + ' Prob ' + cls for cls in class_names]].values

    best_alpha = [0, 0]
    best_acc = 0
    for alpha in arange(0.1, 1, 0.05):
        fused = probs1 * alpha + (1 - alpha) * probs2
        preds = fused.argmax(axis=1)

        acc = accuracy_score(real, preds)
        if best_acc < acc:
            best_acc = acc
            best_alpha[0] = alpha
            best_alpha[1] = 1 - alpha

    print(best_acc)
    return best_alpha

def integrate_probs(probs: List[float], class_index: int = None, alphas=None) -> float:
    """Fuse the probabilities that several data types assign to one class.

    With weights (`alphas` non-empty) the result is the weighted sum
    `sum(probs[k] * weight[k])`. Without weights the result is the mean of
    the non-zero entries of `probs`.

    Args:
        probs: one probability per data type (list or numpy array).
        class_index: position of the class inside per-class weight lists.
            Only read when `alphas` is a dict; with None, the dict values
            themselves are used as the weights.
        alphas: None or empty for the unweighted mean; a dict mapping data
            type to a per-class weight list (or to a scalar weight when
            `class_index` is None); or a plain sequence of weights.

    Returns:
        The fused probability. In the unweighted case, 0.0 is returned when
        `probs` is empty or contains only zeros.

    Raises:
        ValueError: if the number of weights differs from `len(probs)`.
    """
    if alphas is not None and len(alphas) > 0:
        if isinstance(alphas, dict):
            if class_index is not None:
                weights_class = [weights[class_index] for weights in alphas.values()]
            else:
                weights_class = list(alphas.values())
        else:
            weights_class = alphas
        if len(probs) != len(weights_class):
            raise ValueError(
                'expected one weight per probability, got {} probabilities and {} weights'.format(
                    len(probs), len(weights_class)))
        # use weights for each type of data
        return np.sum(np.multiply(probs, weights_class))

    # naive integration: convert first so that masking also works on lists
    values = np.asarray(probs, dtype=float)
    # omit zeros
    non_zero = values[values != 0]
    if non_zero.size == 0:
        return 0.0
    return np.sum(non_zero) / non_zero.size

def integrate_preds(preds: List[int]) -> int:
    """Fuse class predictions by majority vote.

    A single prediction is returned unchanged. Otherwise more than two
    predictions are required (enforced with an assert), and the most common
    one is returned; among equally common labels the one seen first wins.
    """
    from collections import Counter

    n_votes = len(preds)
    if n_votes == 1:
        return preds[0]
    assert n_votes > 2

    (winner, _votes), = Counter(preds).most_common(1)
    return winner
    
def _training_split_alphas(data_types, sheets):
    """Compute per-class fusion weights for every sheet of the training workbook."""
    print('Getting alphas from training set...')
    data_train = pd.read_excel('early_integration/RNA6_CNV12_train_new.xlsx',
                               sheet_name=sheets, engine='openpyxl')
    splits_alphas = {}
    for df_name, df in data_train.items():
        # weights are estimated only on rows where both data types are available
        df = df.loc[(df['Has ' + data_types[0]] != -1) & (df['Has ' + data_types[1]] != -1)]
        alphas = get_probs_alphas(df, [0, 1, 2], data_types)
        splits_alphas[df_name] = alphas
        print(alphas)
    return splits_alphas


def _naive_fusion(local_probs, class_names):
    """Unweighted fusion of the collected probabilities, one value per class."""
    # An array plus an explicit class_index keeps this call valid whether or not
    # integrate_probs accepts plain lists / a defaulted class_index.
    return [integrate_probs(np.asarray(local_probs[cls], dtype=float), None)
            for cls in class_names]


def _fuse_row(row, data_types, class_names, fusion_type, use_alphas, row_alphas, renormalize):
    """Fuse one sample; return ([prob per class], predicted class index)."""
    local_probs = {cls: [] for cls in class_names}
    local_preds = []
    for d_type in data_types:
        if row['Has ' + d_type] != -1:
            raw = [row[d_type + ' Prob ' + cls] for cls in class_names]
            total = sum(raw)
            # rescale so this data type's class probabilities sum to 1
            for cls, prob in zip(class_names, raw):
                local_probs[cls].append(prob / total)
            local_preds.append(row[d_type + ' Pred'])
        elif use_alphas:
            # keep positions aligned with the weights: an unavailable data type adds 0
            for cls in class_names:
                local_probs[cls].append(0)

    if fusion_type == 'probs':
        if use_alphas:
            fused = [integrate_probs(local_probs[cls], idx, row_alphas)
                     for idx, cls in enumerate(class_names)]
            if renormalize:
                total = sum(fused)
                fused = [prob / total for prob in fused]
        else:
            fused = _naive_fusion(local_probs, class_names)
        return fused, np.argmax(fused, axis=0)

    # fusion_type == 'preds'
    if len(local_preds) == 2:
        # if there are only two predictions, we need to fuse the probabilities
        fused = _naive_fusion(local_probs, class_names)
        return fused, np.argmax(fused, axis=0)

    pred = integrate_preds(local_preds)
    # one-hot probabilities for the voted class; any value other than 0 or 1
    # is treated as the last class
    if pred == 0:
        fused = [1, 0, 0]
    elif pred == 1:
        fused = [0, 1, 0]
    else:
        fused = [0, 0, 1]
    return fused, pred


def integration_model(data_types: List[str], datasets: List[str], name: str, path: str, 
                      fusion_type='probs', use_alphas=False, m_alphas=None) -> None:
    """Late-fuse per-data-type classifier outputs and write them to Excel.

    For every entry `d` of `datasets`, sheets 0-9 of
    'early_integration/RNA6_CNV12_<d>_new.xlsx' are read, each row is fused,
    and the sheets are written with the extra columns
    'Integration Prob LUAD|HLT|LUSC' and 'Integration Pred' to
    '<path>data_integration_model_<d>_<fusion_type>_<name>.xlsx'
    (one sheet 'split_<k>' per input sheet).

    Args:
        data_types: data type names used as column prefixes.
        datasets: dataset names inserted into the input/output file names.
        name: suffix of the output file name.
        path: prefix (directory) of the output file name.
        fusion_type: 'probs' fuses class probabilities and takes the argmax;
            'preds' votes over the per-data-type predictions, except that
            exactly two predictions are resolved by fusing probabilities.
        use_alphas: in 'probs' mode, use a weighted sum instead of the mean
            of the non-zero probabilities.
        m_alphas: manual weights passed straight to `integrate_probs`; the
            result is not renormalized. When empty/None and `use_alphas` is
            set, per-split weights are computed from the training workbook
            and the fused probabilities are renormalized to sum to 1.

    Raises:
        ValueError: if `fusion_type` is neither 'probs' nor 'preds'.
    """
    if fusion_type not in ('probs', 'preds'):
        raise ValueError("fusion_type must be 'probs' or 'preds', got {!r}".format(fusion_type))

    class_names = ('LUAD', 'HLT', 'LUSC')
    sheets = list(range(10))

    # Training-derived weights do not depend on the dataset being fused, so they
    # are computed once, and only when no manual weights override them.
    splits_alphas = None
    if use_alphas and not m_alphas and datasets:
        splits_alphas = _training_split_alphas(data_types, sheets)

    for d in datasets:
        data = pd.read_excel('early_integration/RNA6_CNV12_'+d+'_new.xlsx',
                             sheet_name=sheets, engine='openpyxl')
        out_file = path+'data_integration_model_'+d+'_'+fusion_type+'_'+name+'.xlsx'

        # the context manager closes the workbook even if fusing a sheet fails
        with pd.ExcelWriter(out_file, engine='openpyxl') as writer:
            for df_name, df in data.items():
                if m_alphas:
                    row_alphas, renormalize = m_alphas, False
                elif use_alphas:
                    row_alphas, renormalize = splits_alphas[df_name], True
                else:
                    row_alphas, renormalize = None, False

                integration_probs = {cls: [] for cls in class_names}
                integration_preds = []
                for _, row in tqdm(df.iterrows(), total=len(df)):
                    fused, pred = _fuse_row(row, data_types, class_names, fusion_type,
                                            use_alphas, row_alphas, renormalize)
                    for cls, prob in zip(class_names, fused):
                        integration_probs[cls].append(prob)
                    integration_preds.append(pred)

                for cls in class_names:
                    df['Integration Prob '+ cls] = integration_probs[cls]
                df['Integration Pred'] = integration_preds

                # save to sheet
                df.to_excel(writer, sheet_name='split_'+str(df_name), index=False)

In [None]:
# Fuse the RNA and CNV class probabilities of the test and train workbooks with
# weights computed from the training workbook (use_alphas=True, no manual
# weights). Output goes to the working directory as
# 'data_integration_model_<dataset>_probs_RNA-CNV-alphas-XGBOOST_both.xlsx'.
integration_model(
    data_types=["RNA", "CNV"],
    datasets=['test', 'train'],
    name="RNA-CNV-alphas-XGBOOST_both",
    path='',
    fusion_type='probs',
    use_alphas=True,
)

In [None]:
# Evaluate the fused predictions split by split: accuracy, weighted F1 and
# one-vs-rest AUC on rows where both data types are available. The evaluated
# rows are saved to a results workbook, one sheet per split.
accs = {}
f1_scores = {}
aucs = {}

# NOTE(review): this file name ends in '_both_total.xlsx', whereas a call to
# integration_model with name="RNA-CNV-alphas-XGBOOST_both" writes
# '..._both.xlsx' -- confirm which workbook is meant to be evaluated.
data_model = pd.read_excel('data_integration_model_test_probs_RNA-CNV-alphas-XGBOOST_both_total.xlsx',
              sheet_name=[0,1,2,3,4,5,6,7,8,9],engine='openpyxl')
d_type1 = 'RNA'
d_type2 = 'CNV'
name = d_type1 + d_type2
accs[name] = []
f1_scores[name] = []
aucs[name] = []

# columns written to the results workbook, in output order
columns_save = ['Case IDs', 'Has RNA', 'Has CNV', 'RNA Prob LUAD', 'RNA Prob HLT',
                'RNA Prob LUSC', 'RNA Pred', 'CNV Prob LUAD', 'CNV Prob HLT',
                'CNV Prob LUSC', 'CNV Pred', 'Integration Prob LUAD', 'Integration Prob HLT',
                'Integration Prob LUSC', 'Integration Pred', 'Real']
prob_columns = ['Integration Prob LUAD', 'Integration Prob HLT', 'Integration Prob LUSC']

# the context manager closes the workbook even if a split fails midway
with pd.ExcelWriter('excel_results/RNA-CNV-Integration-late-fusion-XGBOOST_both.xlsx', engine='openpyxl') as writer:
    for df_name, df in data_model.items():
        # keep only the rows where both sources have data
        df_only = df.loc[(df['Has '+ d_type1] != -1) & (df['Has ' + d_type2] != -1)]

        probs = df_only[prob_columns].values.tolist()
        preds = df_only['Integration Pred'].values
        real = df_only['Real'].values
        acc = accuracy_score(real, preds)*100
        f1 = f1_score(real, preds, average='weighted')*100
        print(Counter(real))
        print('Acc: {}; F1; {}'.format(acc, f1))
        try:
            auc = roc_auc_score(real, probs, multi_class='ovr')
        except ValueError as err:
            # AUC stays best-effort: a split for which it is undefined is
            # reported and skipped, so `aucs` may hold fewer entries than `accs`
            print('AUC skipped for split {}: {}'.format(df_name, err))
        else:
            aucs[name].append(auc)
        accs[name].append(acc)
        f1_scores[name].append(f1)
        new_df = df_only[columns_save]
        new_df.to_excel(writer, sheet_name='split_'+str(df_name), index=False)