In [1]:
import numpy as np
import random
import pandas as pd
import os

from common import OUTPUTPATH
from models import linear_pcc
import data



In [2]:
# Fit the classifier chain on the x1 feature set and the label matrix from the
# project's `data` module. The cell output is the repr of the value returned by
# fit(); the fitted sub-estimators are read back later via `linear_pcc.fitted_`.
linear_pcc.fit(data.x1, data.y)

<modules.multilabel.ProbabilisticClassifierChain at 0x7fd328188460>

In [3]:
# Cap on the number of rows shown per importance table.
# NOTE(review): the calls visible in this notebook pass num_importance
# explicitly (100000 and 5), so this global looks unused -- confirm before
# relying on it.
num_importance = 100
from sklearn.inspection import permutation_importance

def individual_importance_dataframe(est, X, y, scoring='neg_log_loss', num_importance=100000, n_repeats=100, seeds=None):
    """
    Rank the features of one fitted linear estimator by permutation importance.

    Every column of ``X`` is scored with ``permutation_importance`` (mean over
    ``n_repeats`` shuffles, measured with ``scoring``; ``seeds`` is passed as
    its ``random_state``). Only features whose mean importance is strictly
    positive are kept. They are ordered from most to least important and the
    list is cut off after ``num_importance`` entries.

    Returns a DataFrame with three columns:
      * ``variables``  - column names of ``X``
      * ``coef``       - the matching entries of ``est.coef_[0]``
      * ``importance`` - the mean permutation importance
    """
    result = permutation_importance(est, X, y, n_repeats=n_repeats, scoring=scoring, random_state=seeds)
    mean_scores = result.importances_mean

    # Positions of the features that helped the score, then those positions
    # reordered so the largest mean importance comes first.
    positive = np.flatnonzero(mean_scores > 0)
    ranked = positive[np.argsort(mean_scores[positive])[::-1]]
    top = ranked[:num_importance]

    return pd.DataFrame({
        'variables': X.columns[top],
        'coef': est.coef_[0][top],
        'importance': mean_scores[top],
    })

def linear_importance_dataframe(est, X, y, scoring = 'neg_log_loss', num_importance=100000, n_repeats=100, seeds=None):
    """
    Build one side-by-side importance table for the Sphere, Worm and Vesicle estimators.

    The estimators are treated as a chain: estimator ``i`` is scored on the
    columns of ``X`` followed by the first ``i`` columns of ``y``, with column
    ``i`` of ``y`` as its target.

    Parameters
    ----------
    est : sequence
        Fitted estimators indexed by chain position; ``est[0]``, ``est[1]``
        and ``est[2]`` are used.
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.DataFrame
        Label matrix; its first three columns are used, in the order
        Sphere, Worm, Vesicle. Columns are joined to ``X`` with
        ``pd.concat(axis=1)``, so the two frames should share an index.
    scoring, num_importance, n_repeats, seeds
        Forwarded unchanged to ``individual_importance_dataframe``.

    Returns
    -------
    pandas.DataFrame
        One block of ``variables`` / ``coef`` / ``importance`` columns per
        phase, under a ``(Phase, Property)`` column MultiIndex. Phases with
        fewer ranked features are padded with NaN by the column-wise concat.
    """
    phases = ['Sphere', 'Worm', 'Vesicle']
    properties = ['variables', 'coef', 'importance']

    chain_data = pd.concat([X, y], axis=1)
    n_features = X.shape[1]

    per_phase = []
    for i in range(len(phases)):
        # Count columns from the left instead of using a fixed negative offset,
        # so the slice does not depend on how many label columns y carries.
        chain_X = chain_data.iloc[:, :n_features + i]
        # Take the target from the y argument (not from a module-level object)
        # and keep the X / y parameters themselves untouched.
        target = y.iloc[:, i]
        per_phase.append(
            individual_importance_dataframe(
                est=est[i], X=chain_X, y=target, scoring=scoring,
                num_importance=num_importance, n_repeats=n_repeats, seeds=seeds,
            )
        )

    # Single concat once all per-phase tables exist.
    df = pd.concat(per_phase, axis=1)
    df.columns = pd.MultiIndex.from_product([phases, properties], names=['Phase', 'Property'])
    return df

Show the full importance dataframe for the Sphere, Worm, and Vesicle phases.

In [4]:
# Full ranking for all three phases: num_importance=100000 is effectively no
# cap here (the table below has 10 rows). seeds=1000 is forwarded down to the
# permutation step as its random_state.
linear_importance_dataframe(linear_pcc.fitted_, data.abbrev_x1, data.y, num_importance=100000, seeds=1000)



Phase,Sphere,Sphere,Sphere,Worm,Worm,Worm,Vesicle,Vesicle,Vesicle
Property,variables,coef,importance,variables,coef,importance,variables,coef,importance
0,mw_tot_cre,0.000568,6.478597,mw_tot_cre,-0.000434,6.301531,mw_tot_cre,-0.000203,2.329613
1,dp_cre,-0.04612,3.091151,mv_tot_cre,0.024796,1.595216,mv_tot_cre,0.01301,1.140847
2,mv_tot_cre,-0.034102,2.686515,dp_cre,0.02743,1.47201,dp_cre,0.015465,1.115013
3,apol_cna,0.338995,1.689558,mw_tot_cna,-0.000255,0.1530948,mw_tot_cna,-0.000153,0.05825421
4,mv_tot_cna,0.112893,1.088051,apol_cna,-0.060722,0.1323355,psa_cna,0.009726,0.03360638
5,mw_tot_cna,-0.000677,0.5488024,mv_tot_cna,0.023598,0.07744897,mw_cna,-0.003527,0.01776039
6,dp_cna,0.070488,0.4030307,conc,0.071982,0.04393955,apol_cna,-0.011883,0.00831973
7,psa_cna,-0.048058,0.3688261,mw_cna,0.004913,0.03258519,temp,0.013655,0.007317012
8,psa_cre,-0.292014,0.2539096,psa_cre,0.054039,0.02486725,conc,0.010836,0.003864777
9,conc,-0.148649,0.09256409,psa_cna,0.007714,0.01873763,mv_tot_cna,0.003911,0.003705008


In [6]:
# Same ranking as the full table, cut to at most five rows per phase
# (num_importance=5) and using the same seed.
top5_linear_dataframe = linear_importance_dataframe(linear_pcc.fitted_, data.abbrev_x1, data.y, num_importance=5, seeds=1000)
# Persist the table as CSV under the project's OUTPUTPATH directory.
top5_linear_dataframe.to_csv(os.path.join(OUTPUTPATH, 'Top5_linear_dataframe.csv'))
# Bare last expression so the notebook renders the table.
top5_linear_dataframe



Phase,Sphere,Sphere,Sphere,Worm,Worm,Worm,Vesicle,Vesicle,Vesicle
Property,variables,coef,importance,variables,coef,importance,variables,coef,importance
0,mw_tot_cre,0.000568,6.478597,mw_tot_cre,-0.000434,6.301531,mw_tot_cre,-0.000203,2.329613
1,dp_cre,-0.04612,3.091151,mv_tot_cre,0.024796,1.595216,mv_tot_cre,0.01301,1.140847
2,mv_tot_cre,-0.034102,2.686515,dp_cre,0.02743,1.47201,dp_cre,0.015465,1.115013
3,apol_cna,0.338995,1.689558,mw_tot_cna,-0.000255,0.153095,mw_tot_cna,-0.000153,0.058254
4,mv_tot_cna,0.112893,1.088051,apol_cna,-0.060722,0.132336,psa_cna,0.009726,0.033606
