In [5]:
import numpy as np
import random
import pandas as pd
import os

from common import OUTPUTPATH
from models import linear_pcc
import data

In [6]:
# Fit the imported linear_pcc model on the data.x1 features and data.y labels.
# Later cells read linear_pcc.fitted_ (presumably populated by this fit call),
# so this cell has to run before them.
linear_pcc.fit(data.x1, data.y)

<modules.multilabel.ProbabilisticClassifierChain at 0x12be45040>

In [3]:
# Module-level row cap.
# NOTE(review): the functions defined in this cell take their own `num_importance`
# argument (default 100000, i.e. effectively "show all variables"), and the visible
# calls pass it explicitly, so this global does not appear to be read anywhere in
# view -- confirm whether it is still needed.
num_importance = 100 # not used by the visible calls; see note above
from sklearn.inspection import permutation_importance

def individual_importance_dataframe(est, X, y, scoring='neg_log_loss', num_importance=100000, n_repeats=100, seeds=None):
    """
    Rank the features of one fitted linear estimator by permutation importance.

    Only features whose mean permutation importance is strictly positive are
    kept; they are ordered from most to least important and the table is cut
    to at most `num_importance` rows.

    Parameters
    ----------
    est : fitted estimator exposing `coef_`; the first row of `coef_` is
        reported next to each feature.
    X : pandas.DataFrame of features; its column labels become the
        'variables' column.
    y : target passed through to `permutation_importance`.
    scoring : scorer name forwarded to `permutation_importance`.
    num_importance : maximum number of rows to return.
    n_repeats : number of shuffles per feature.
    seeds : forwarded as `random_state` so the shuffles are reproducible.

    Returns
    -------
    pandas.DataFrame with columns 'variables', 'coef' and 'importance'.
    """
    result = permutation_importance(est, X, y, n_repeats=n_repeats, scoring=scoring, random_state=seeds)
    mean_scores = result.importances_mean

    # Keep only features whose mean importance is strictly positive.
    keep = mean_scores > 0
    kept_scores = mean_scores[keep]

    # Positions (within the kept subset) sorted by descending importance.
    order = kept_scores.argsort()[::-1]
    top = slice(None, num_importance)

    return pd.DataFrame({
        'variables': X.columns[keep][order][top],
        'coef': est.coef_[0][keep][order][top],
        'importance': kept_scores[order][top],
    })

def linear_importance_dataframe(est, X, y, scoring='neg_log_loss', num_importance=100000, n_repeats=100, seeds=None):
    """
    Build one side-by-side importance table for the Sphere, Worm and Vesicle estimators.

    For phase i (0 = Sphere, 1 = Worm, 2 = Vesicle) the estimator `est[i]` is
    evaluated on the columns of `X` followed by the first i columns of `y`,
    with column i of `y` as the target, as in a classifier chain where each
    link sees the preceding labels as extra features. Each phase is ranked
    with `individual_importance_dataframe` and the three tables are joined
    column-wise.

    Parameters
    ----------
    est : indexable of fitted estimators; `est[0]`, `est[1]`, `est[2]` are used.
    X : pandas.DataFrame of features.
    y : pandas.DataFrame of labels, one column per phase in the order
        Sphere, Worm, Vesicle (extra trailing columns are ignored).
    scoring, num_importance, n_repeats, seeds : forwarded unchanged to
        `individual_importance_dataframe`.

    Returns
    -------
    pandas.DataFrame with a two-level column index ('Phase', 'Property'):
    for each phase the columns 'variables', 'coef' and 'importance'. Phases
    with fewer rows than the longest one are padded with NaN by the
    column-wise concat.
    """
    upper_columns = ['Sphere', 'Worm', 'Vesicle']
    lower_columns = ['variables', 'coef', 'importance']

    # Features followed by labels, so that earlier labels can serve as inputs
    # to the later estimators.
    chained = pd.concat([X, y], axis=1)
    n_features = X.shape[1]

    per_phase = []
    for i in range(len(upper_columns)):
        # Estimator i sees X plus the i labels that precede its own target.
        # (Equivalent to the former hard-coded `-4 + i` offset when y has four
        # columns, without depending on that count.)
        features_i = chained.iloc[:, :n_features + i]
        # Take the target from the `y` argument, not from a notebook global,
        # and do not rebind the X / y parameters inside the loop.
        target_i = y.iloc[:, i]
        per_phase.append(
            individual_importance_dataframe(
                est=est[i], X=features_i, y=target_i, scoring=scoring,
                num_importance=num_importance, n_repeats=n_repeats, seeds=seeds,
            )
        )

    # Single column-wise concat instead of growing the frame inside the loop.
    df = pd.concat(per_phase, axis=1)
    df.columns = pd.MultiIndex.from_product([upper_columns, lower_columns], names=['Phase', 'Property'])
    return df

Show the full importance table for the Sphere, Worm, and Vesicle phases.

In [9]:
# Full table: num_importance=100000 is the function's own default (effectively no
# row cap); 500 shuffles per feature with a fixed seed for reproducibility.
linear_importance_dataframe(linear_pcc.fitted_, data.abbrev_x1, data.y, num_importance=100000, seeds=1000, n_repeats=500)



Phase,Sphere,Sphere,Sphere,Worm,Worm,Worm,Vesicle,Vesicle,Vesicle
Property,variables,coef,importance,variables,coef,importance,variables,coef,importance
0,mw_tot_cre,0.000781,8.26313,mw_tot_cre,-0.000436,6.320292,mw_tot_cre,-0.000936,7.715791
1,mv_tot_cre,-0.050703,4.039665,mv_tot_cre,0.025036,1.631529,mv_tot_cre,0.080782,7.336668
2,dp_cre,-0.056522,3.82661,dp_cre,0.027473,1.490471,dp_cre,0.041079,3.312259
3,apol_cna,0.257623,1.199842,mw_tot_cna,-0.000258,0.1590352,mw_tot_cna,-0.000785,1.013335
4,psa_cna,-0.037091,0.2475773,apol_cna,-0.062277,0.1376481,mv_tot_cna,0.080923,0.5745564
5,dp_cna,0.049359,0.2380387,mv_tot_cna,0.024094,0.07833423,apol_cna,-0.128724,0.3902514
6,mv_tot_cna,0.040055,0.1922278,conc,0.072983,0.04419836,psa_cna,0.039046,0.3075598
7,psa_cre,-0.20601,0.1491745,mw_cna,0.005097,0.03448851,sphere,-0.697121,0.05157315
8,conc,-0.12756,0.07881694,psa_cre,0.055504,0.02518712,clogp_cna,-0.194776,0.03248889
9,mw_tot_cna,-0.000103,0.02148267,psa_cna,0.007698,0.01870047,psa_cre,0.086631,0.0270051


In [5]:
# Top-5 variables per phase. n_repeats is not passed here, so the function default
# (100) applies; the importance scores therefore differ slightly from the
# 500-repeat table above even though the seed is the same.
top5_linear_dataframe = linear_importance_dataframe(linear_pcc.fitted_, data.abbrev_x1, data.y, num_importance=5, seeds=1000)
# Persist the table as CSV under OUTPUTPATH.
top5_linear_dataframe.to_csv(os.path.join(OUTPUTPATH, 'Top5_linear_dataframe.csv'))
# Bare last expression so the notebook renders the table.
top5_linear_dataframe



Phase,Sphere,Sphere,Sphere,Worm,Worm,Worm,Vesicle,Vesicle,Vesicle
Property,variables,coef,importance,variables,coef,importance,variables,coef,importance
0,mw_tot_cre,0.000781,8.209574,mw_tot_cre,-0.000436,6.328502,mw_tot_cre,-0.000936,7.671937
1,mv_tot_cre,-0.050703,4.05984,mv_tot_cre,0.025036,1.611345,mv_tot_cre,0.080782,7.356431
2,dp_cre,-0.056522,3.846369,dp_cre,0.027473,1.473559,dp_cre,0.041079,3.33412
3,apol_cna,0.257623,1.206764,mw_tot_cna,-0.000258,0.157105,mw_tot_cna,-0.000785,1.012136
4,psa_cna,-0.037091,0.248166,apol_cna,-0.062277,0.137773,mv_tot_cna,0.080923,0.577537
