In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import os

from common import OUTPUTPATH
from models import linear_pcc
import data

In [2]:
import warnings
# Suppress all warnings for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides any unrelated warning
# (deprecations, convergence problems, ...); consider narrowing it by category or message.
warnings.filterwarnings('ignore') # Per the author, the warnings seen here all concern feature names and carry no useful information.

In [3]:
# Fit the imported linear_pcc model on the x1 features and the label frame.
# NOTE(review): STATE is not referenced by any cell visible in this notebook;
# confirm whether it was meant to seed the permutation importance computed below.
STATE = np.random.RandomState(seed=1000)
linear_pcc.fit(data.x1, data.y)

<modules.multilabel.ProbabilisticClassifierChain at 0x7fc16618b520>

In [4]:
# NOTE(review): this module-level value is not read by any visible cell; the functions
# below take their own num_importance argument, whose default (100000) shows all variables.
num_importance = 100
# permutation_importance is used by individual_importance_dataframe below.
from sklearn.inspection import permutation_importance

def individual_importance_dataframe(est, X, y, scoring='neg_log_loss', num_importance=100000, n_repeats=30, random_state=None):
    """
    Rank the features of one fitted estimator by permutation importance.

    Only features whose mean permutation importance is strictly positive are
    kept. They are ordered from most to least important and truncated to the
    first ``num_importance`` rows.

    Parameters
    ----------
    est : fitted estimator exposing ``coef_``; row 0 of ``coef_`` is reported.
    X : pandas.DataFrame
        Features; the column labels populate the ``variables`` column.
    y : target, forwarded unchanged to ``permutation_importance``.
    scoring : scorer forwarded to ``permutation_importance``.
    num_importance : int
        Maximum number of rows in the result.
    n_repeats : int
        Number of shuffles per feature, forwarded to ``permutation_importance``.
    random_state : int, RandomState instance or None
        Forwarded to ``permutation_importance``. The default (None) keeps the
        previous unseeded behaviour, where repeated calls give slightly
        different importances; pass a fixed value for reproducible results.

    Returns
    -------
    pandas.DataFrame
        Columns ``variables``, ``coef`` and ``importance``, one row per kept feature.
    """
    result = permutation_importance(est, X, y, n_repeats=n_repeats, scoring=scoring, random_state=random_state)
    mean_importance = np.asarray(result.importances_mean)

    # Flat integer positions (rather than the tuple np.where returns) index
    # numpy arrays and the pandas column Index in exactly the same way.
    positive = np.flatnonzero(mean_importance > 0)
    # Descending by importance, then cut to the requested number of rows.
    order = positive[mean_importance[positive].argsort()[::-1]][:num_importance]

    return pd.DataFrame({
        'variables': X.columns[order],
        'coef': np.asarray(est.coef_[0])[order],
        'importance': mean_importance[order],
    })

def linear_importance_dataframe(est, X, y, scoring = 'neg_log_loss', num_importance=100000, n_repeats=30):
    """
    Build one side-by-side importance table for the Sphere, Worm and Vesicle estimators.

    For phase ``i`` (0 = Sphere, 1 = Worm, 2 = Vesicle) the estimator ``est[i]``
    is scored on the columns of ``X`` followed by the first ``i`` columns of
    ``y``, against column ``i`` of ``y``. Each per-phase table comes from
    ``individual_importance_dataframe`` and the three are placed next to each
    other under a two-level column index.

    Parameters
    ----------
    est : indexable of fitted estimators; ``est[0]``, ``est[1]``, ``est[2]`` are used.
    X : pandas.DataFrame
        Feature columns.
    y : pandas.DataFrame
        Label columns; at least three are required. The labels are taken from
        this argument (not from the module-level ``data`` object), so the
        function scores whatever the caller passes in.
    scoring, num_importance, n_repeats :
        Forwarded to ``individual_importance_dataframe``.

    Returns
    -------
    pandas.DataFrame
        Columns are a MultiIndex (``Phase`` x ``Property``) with properties
        ``variables``, ``coef`` and ``importance`` for each phase.
    """
    phases = ['Sphere', 'Worm', 'Vesicle']
    properties = ['variables', 'coef', 'importance']

    n_features = X.shape[1]
    features_and_labels = pd.concat([X, y], axis=1)

    per_phase = []
    for i in range(len(phases)):
        # Features plus the i preceding label columns. For a four-column y this
        # is the same slice as the former hard-coded ``[:, :-4 + i]``.
        phase_X = features_and_labels.iloc[:, :n_features + i]
        phase_y = y.iloc[:, i]
        per_phase.append(
            individual_importance_dataframe(
                est=est[i], X=phase_X, y=phase_y,
                scoring=scoring, num_importance=num_importance, n_repeats=n_repeats,
            )
        )

    # Single concat instead of growing the frame inside the loop.
    df = pd.concat(per_phase, axis=1)
    df.columns = pd.MultiIndex.from_product([phases, properties], names=['Phase', 'Property'])
    return df

Show the full importance table for the Sphere, Worm, and Vesicle phases.

In [5]:
# Full ranking: the row cap of 100000 is far above the number of rows shown below,
# so every variable with positive importance is listed for each phase.
linear_importance_dataframe(linear_pcc.fitted_, data.abbrev_x1, data.y, num_importance=100000)

Phase,Sphere,Sphere,Sphere,Worm,Worm,Worm,Vesicle,Vesicle,Vesicle
Property,variables,coef,importance,variables,coef,importance,variables,coef,importance
0,mw_tot_cre,0.00036,4.530841,mw_tot_cre,-0.000574,7.791975,mw_tot_cre,-0.000205,2.408438
1,dp_cre,-0.037014,2.449948,mv_tot_cre,0.042427,2.757101,mv_tot_cre,0.013374,1.145326
2,apol_cna,0.312548,1.509042,dp_cre,0.025709,1.306496,dp_cre,0.015333,1.068779
3,mv_tot_cna,0.119659,1.23258,mw_tot_cna,-0.000572,0.5879808,mw_tot_cna,-0.000156,0.05955554
4,mv_tot_cre,-0.016181,1.098329,apol_cna,-0.127326,0.3950927,psa_cna,0.009302,0.03392087
5,mw_tot_cna,-0.000747,0.6492821,mv_tot_cna,0.063264,0.3682914,mw_cna,-0.003252,0.01271223
6,dp_cna,0.06677,0.3841466,psa_cre,0.150546,0.08810649,temp,0.013737,0.007609308
7,psa_cna,-0.044898,0.3429419,psa_cna,0.018629,0.08518754,apol_cna,-0.012416,0.007114367
8,psa_cre,-0.280656,0.2545343,mw_cna,0.008702,0.08445428,mv_tot_cna,0.00431,0.004491646
9,conc,-0.137321,0.08263919,conc,0.122434,0.08033545,conc,0.011238,0.00411241


In [7]:
# Keep only the five highest-ranked variables per phase, save the table as CSV
# under OUTPUTPATH, then display it.
# NOTE(review): permutation importance is recomputed by this call, which is why the
# importance values differ slightly from the full table shown earlier.
top5_linear_dataframe = linear_importance_dataframe(linear_pcc.fitted_, data.abbrev_x1, data.y, num_importance=5)
top5_linear_dataframe.to_csv(os.path.join(OUTPUTPATH, 'Top5_linear_dataframe.csv'))
top5_linear_dataframe


Phase,Sphere,Sphere,Sphere,Worm,Worm,Worm,Vesicle,Vesicle,Vesicle
Property,variables,coef,importance,variables,coef,importance,variables,coef,importance
0,mw_tot_cre,0.00036,4.537523,mw_tot_cre,-0.000574,7.613551,mw_tot_cre,-0.000205,2.356357
1,dp_cre,-0.037014,2.450072,mv_tot_cre,0.042427,2.738487,mv_tot_cre,0.013374,1.165152
2,apol_cna,0.312548,1.496282,dp_cre,0.025709,1.295499,dp_cre,0.015333,1.083557
3,mv_tot_cna,0.119659,1.217303,mw_tot_cna,-0.000572,0.590799,mw_tot_cna,-0.000156,0.062601
4,mv_tot_cre,-0.016181,1.099023,apol_cna,-0.127326,0.397351,psa_cna,0.009302,0.028074
