In [138]:
import ascends as asc
import numpy as np
from minepy import MINE
import pandas as pd
import matplotlib.pyplot as plt

In [156]:
def correlation_analysis_all(data_df, target_col, top_k=10, file_to_save = None, save_chart = None):
    """Score every column of ``data_df`` against ``target_col`` and rank the columns.

    Two families of scores are computed for each non-target column:

    * Pearson correlation with the target (``PCC``) and its absolute value
      (``PCC_SQRT``; the column name is kept for compatibility with
      previously written reports).
    * The statistics exposed by ``minepy.MINE`` after ``compute_score``:
      ``mic()``, ``mas()``, ``mev()``, ``mcn(0)``, ``mcn_general()``,
      ``gmic()`` and ``tic()``.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Table holding the feature columns and the target column.
        NOTE(review): columns are passed straight to ``MINE.compute_score``,
        so they are presumably expected to be numeric -- confirm with caller.
    target_col : str
        Name of the column every other column is compared against.
    top_k : int, default 10
        Maximum number of column names kept in each ranking. Rankings are
        simply shorter when fewer columns are available.
    file_to_save : str or None, default None
        When given, the report is written to this path with ``to_csv``.
    save_chart : str or None, default None
        When given, one bar chart per report column is saved to
        ``save_chart + "," + <column> + " (target_col = '<target_col>').png"``.

    Returns
    -------
    tuple
        ``(final_report, top_k_pcc, top_k_pcc_sqrt, top_k_mic, top_k_mas,
        top_k_mev, top_k_mcn, top_k_mcn, top_k_mcn_general, top_k_gmic,
        top_k_tic)``. ``final_report`` is a DataFrame indexed by column name
        (sorted) with columns ``MIC, MAS, MEV, MCN, MCN_general, GMIC, TIC,
        PCC_SQRT, PCC``; the remaining items are lists of column names in
        descending score order. ``top_k_mcn`` appears twice: existing callers
        unpack eleven values, so the tuple shape is preserved.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of the correlation matrix.
    """
    # Pearson correlation of every column with the target. The target's own
    # entry is dropped with errors="ignore" because it may be absent or NaN
    # (e.g. a constant target), in which case a plain `del` would fail.
    pcc = (
        data_df.corr()[target_col]
        .drop(labels=[target_col], errors="ignore")
        .dropna()
        .sort_values(ascending=False)
        .rename("PCC")
    )
    # Magnitude of the correlation, so strong negative relations rank high too.
    pcc_sqrt = pcc.abs().sort_values(ascending=False).rename("PCC_SQRT")

    # (report column name, accessor on a scored MINE object), in report order.
    mine_metrics = (
        ("MIC", lambda m: m.mic()),
        ("MAS", lambda m: m.mas()),
        ("MEV", lambda m: m.mev()),
        ("MCN", lambda m: m.mcn(0)),
        ("MCN_general", lambda m: m.mcn_general()),
        ("GMIC", lambda m: m.gmic()),
        ("TIC", lambda m: m.tic()),
    )
    metric_names = [name for name, _ in mine_metrics]

    feature_cols = [col for col in data_df.columns if col != target_col]
    y = data_df[target_col].values  # loop-invariant, so fetched once
    scores = {name: [] for name in metric_names}
    for col in feature_cols:
        mine = MINE()
        mine.compute_score(data_df[col].values, y)
        for name, getter in mine_metrics:
            scores[name].append(getter(mine))

    def _top_columns(metric_name):
        # Stable descending sort: ties keep the DataFrame's column order.
        ranked = sorted(
            zip(feature_cols, scores[metric_name]),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return [col for col, _ in ranked[:top_k]]

    top_k_pcc = list(pcc.keys())[:top_k]
    top_k_pcc_sqrt = list(pcc_sqrt.keys())[:top_k]
    top_k_mic = _top_columns("MIC")
    top_k_mas = _top_columns("MAS")
    top_k_mev = _top_columns("MEV")
    top_k_mcn = _top_columns("MCN")
    top_k_mcn_general = _top_columns("MCN_general")
    top_k_gmic = _top_columns("GMIC")
    top_k_tic = _top_columns("TIC")

    # The report is always built: it is returned and drives the charts even
    # when no CSV file is requested.
    mine_df = pd.DataFrame(scores, index=feature_cols, columns=metric_names)
    final_report = mine_df.sort_index().join(pcc_sqrt).join(pcc)

    if file_to_save is not None:
        # save to correlation report
        final_report.to_csv(file_to_save)

    if save_chart is not None:
        for col in final_report.columns:
            fig, ax = plt.subplots()
            final_report[col].sort_values(ascending=False).plot(kind='bar', alpha=0.8, ax=ax)
            ax.set_ylabel("{} (target_col = '{}')".format(col, target_col), fontsize=12)
            ax.axhline(0, color='k')
            fig.savefig("{},{} (target_col = '{}').png".format(save_chart, col, target_col))
            # Close explicitly so figures do not pile up across the loop.
            plt.close(fig)

    return final_report, top_k_pcc, top_k_pcc_sqrt, top_k_mic, top_k_mas, top_k_mev,\
top_k_mcn, top_k_mcn, top_k_mcn_general, top_k_gmic, top_k_tic

In [157]:
# Load the creep dataset through the ascends helper.
# NOTE(review): the positional arguments are presumably (csv path, input columns,
# columns to drop, target column) -- confirm against the asc.data_load_shuffle docs.
# random_state=None is passed explicitly; presumably the shuffle is then unseeded.
(
    data_df,
    x_train,
    y_train,
    header_x,
    header_y,
) = asc.data_load_shuffle(
    'data/creep.csv',
    None,
    ['id','Name','logRT'],
    'LMP',
    random_state=None,
)

In [158]:
# Rank the features against 'LMP', keeping the five best columns per metric.
# 'test' is used both as the CSV report path and as the chart filename prefix.
# The function returns eleven values with the MCN ranking in two adjacent
# slots, hence `top_k_mcn` is listed twice in the unpacking target.
(
    final_report,
    top_k_pcc,
    top_k_pcc_sqrt,
    top_k_mic,
    top_k_mas,
    top_k_mev,
    top_k_mcn,
    top_k_mcn,
    top_k_mcn_general,
    top_k_gmics,
    top_k_tic,
) = correlation_analysis_all(
    data_df,
    'LMP',
    5,
    file_to_save='test',
    save_chart='test',
)