In [1]:
import numpy as np
import random
import pandas as pd
from scipy import stats
import os

from common import OUTPUTPATH, set_seed
from models import linear_pcc
import data

In [2]:
# import warnings
# warnings.filterwarnings('ignore') # Warnings all come from non-feature names which are useless.

In [3]:
# Fit the estimator used by the rest of the notebook.
# set_seed is called before fitting, presumably to make the fit repeatable
# (its implementation lives in `common` and is not shown here — confirm which RNGs it seeds).
set_seed(1000)
# Last expression of the cell, so the object returned by fit() is what the notebook displays.
linear_pcc.fit(data.x1, data.y)

<modules.multilabel.ProbabilisticClassifierChain at 0x7fd7d54d5970>

In [4]:
# Number of variables to list.
# NOTE(review): this module-level value is not referenced by the visible cells; the
# functions defined in this cell take their own `num_importance` argument, whose
# default (100000) is large enough to effectively show all variables.
num_importance = 100
from sklearn.inspection import permutation_importance

def individual_importance_dataframe(est, X, y, scoring='neg_log_loss', num_importance=100000, n_repeats=30, random_state=None):
    """
    Rank the columns of ``X`` by permutation importance for a fitted linear estimator.

    Only variables whose mean permutation importance is strictly positive are
    kept. They are sorted from most to least important and reported together
    with the matching coefficient taken from the first row of ``est.coef_``.

    Parameters
    ----------
    est : fitted estimator exposing ``coef_`` (indexed as ``est.coef_[0]``).
    X : pandas.DataFrame
        Feature matrix; its column labels become the ``variables`` column.
    y : target passed unchanged to ``permutation_importance``.
    scoring : scorer forwarded to ``permutation_importance``.
    num_importance : int
        Maximum number of rows to return.
    n_repeats : int
        Number of permutations per feature.
    random_state : int, RandomState instance or None
        Forwarded to ``permutation_importance``. ``None`` (the default) keeps
        the previous behaviour; pass an int to make the ranking repeatable
        independently of any global seeding.

    Returns
    -------
    pandas.DataFrame
        Columns ``variables``, ``coef`` and ``importance``, at most
        ``num_importance`` rows, ordered by decreasing importance.
    """
    result = permutation_importance(
        est, X, y, n_repeats=n_repeats, scoring=scoring, random_state=random_state
    )
    mean_importance = result.importances_mean

    # Positions of the features that help the score at all (mean importance > 0).
    positive = np.flatnonzero(mean_importance > 0)
    # One index vector, in decreasing-importance order, reused for every column
    # below so names, coefficients and scores cannot get out of step.
    order = positive[mean_importance[positive].argsort()[::-1]]
    top = order[:num_importance]

    return pd.DataFrame({
        'variables': X.columns[top],
        'coef': est.coef_[0][top],
        'importance': mean_importance[top],
    })

def linear_importance_dataframe(est, X, y, scoring='neg_log_loss', num_importance=100000, n_repeats=30):
    """
    Build one side-by-side importance table for the Sphere, Worm and Vesicle phases.

    For each of the three phases, ``individual_importance_dataframe`` is run on
    the i-th estimator in ``est`` and the three resulting tables are placed
    next to each other under a two-level (Phase, Property) column index.

    Parameters
    ----------
    est : indexable collection of fitted estimators; ``est[0..2]`` are used.
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.DataFrame
        Label matrix; column ``i`` is the target for phase ``i``.
    scoring, num_importance, n_repeats :
        Forwarded unchanged to ``individual_importance_dataframe``.

    Returns
    -------
    pandas.DataFrame
        Nine columns: (Sphere | Worm | Vesicle) x (variables, coef, importance).
    """
    phases = ['Sphere', 'Worm', 'Vesicle']
    properties = ['variables', 'coef', 'importance']

    # Features followed by labels, so that a column slice can hand each
    # estimator the features plus some of the label columns.
    stacked = pd.concat([X, y], axis=1)

    per_phase = []
    for i in range(len(phases)):
        # Drops the last (4 - i) columns of the stacked frame.
        # NOTE(review): presumably y has four label columns, so this keeps all
        # features plus the first i labels for chain step i — confirm.
        stop = -4 + i
        per_phase.append(
            individual_importance_dataframe(
                est=est[i],
                X=stacked.iloc[:, :stop],
                # Take the target from the `y` argument (the previous code read
                # the global `data.y` and silently ignored this parameter).
                y=y.iloc[:, i],
                scoring=scoring,
                num_importance=num_importance,
                n_repeats=n_repeats,
            )
        )

    # Single concat; rows are aligned on the default integer index, so phases
    # with fewer positive-importance variables are padded with NaN.
    df = pd.concat(per_phase, axis=1)
    df.columns = pd.MultiIndex.from_product([phases, properties], names=['Phase', 'Property'])
    return df

Show the importance dataframe for the Sphere, Worm, and Vesicle phases (top 5 variables per phase).

In [5]:
# Rank the variables for each phase and keep the five most important ones.
top5_linear_dataframe = linear_importance_dataframe(
    est=linear_pcc.fitted_,
    X=data.abbrev_x1,
    y=data.y,
    num_importance=5,
)

# Persist the table next to the other outputs, then display it as the cell result.
top5_linear_dataframe.to_csv(
    os.path.join(OUTPUTPATH, 'Top5_linear_dataframe.csv')
)
top5_linear_dataframe




Phase,Sphere,Sphere,Sphere,Worm,Worm,Worm,Vesicle,Vesicle,Vesicle
Property,variables,coef,importance,variables,coef,importance,variables,coef,importance
0,mw_tot_cre,0.000788,8.270657,mw_tot_cre,-0.000438,6.369069,mw_tot_cre,-0.000204,2.322912
1,mv_tot_cre,-0.051806,4.149974,mv_tot_cre,0.025935,1.668712,mv_tot_cre,0.013202,1.139139
2,dp_cre,-0.056634,3.854761,dp_cre,0.026868,1.428273,dp_cre,0.015386,1.079762
3,apol_cna,0.260236,1.242725,mw_tot_cna,-0.000237,0.139921,mw_tot_cna,-0.000158,0.061568
4,psa_cna,-0.047116,0.361828,apol_cna,-0.057969,0.119575,psa_cna,0.009236,0.031803
