In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import os

from common import OUTPUTPATH
from models import linear_pcc
import data

In [2]:
# estimators here
STATE = np.random.RandomState(seed=1000)
linear_pcc.fit(data.x1, data.y)

<modules.multilabel.ProbabilisticClassifierChain at 0x7f82bd986b80>

In [3]:
num_importance = 100 # defult values show all variables

def individual_importance_dataframe(est, X, y, alpha=0.05, num_importance=100000):
    """
    This algorithm use linear estimators and get the importance variables where p < 0.05
    To easy comparison, we set importance score: 1- p
    """
    params = np.append(est.intercept_, est.coef_)

    newX = np.append(np.ones((len(X),1)), X, axis=1)
    predictions = est.predict(X)
    MSE = (sum((y-predictions)**2))/(len(newX)-len(newX[0]))
    # get variance, std, critical values (ts_b)
    var_b = MSE*(np.linalg.inv(np.dot(newX.T,newX)).diagonal())
    sd_b = np.sqrt(var_b)
    ts_b = params/ sd_b
    # based on t distribution
    pvalues =[2*(1-stats.t.cdf(np.abs(i),(len(newX)-1))) for i in ts_b][1:]
    # set order
    sorted_idx = np.array(pvalues).argsort()
    order = list(X.columns[sorted_idx])
    pvalues = np.array(pvalues)[sorted_idx]
    coef = est.coef_[0][sorted_idx]
    # format result
    important_variables = [order[i] for i in range(len(pvalues)) if pvalues[i] < alpha]
    importance_scores = 1.0-np.array([each for each in pvalues if each < alpha])
    coef = [coef[i] for i in range(len(pvalues)) if pvalues[i] < alpha]

    df = pd.DataFrame({'variables': important_variables[:num_importance], 'coef': coef[:num_importance], 'importance': importance_scores[:num_importance]})
    return df

def linear_importance_dataframe(est, X, y, alpha=0.05, num_importance=100000):
    """
    Combine all morphologies plots together
    """
    data1 = pd.concat([X, y], axis=1)
    cnt = None
    for i in range(3):
        col_indx = -4 + i

        estimator, X, y = est[i], data1.iloc[:, :col_indx], data.y.iloc[:, i]
        temp_df = individual_importance_dataframe(est=estimator, X=X, y=y, alpha=alpha, num_importance=num_importance)
        if not cnt: 
            df = temp_df
            cnt = 1
        else:
            df = pd.concat([df, temp_df], axis=1)

    upper_columns = ['Sphere', 'Worm', 'Vesicle']
    lower_columns = ['variables', 'coef', 'importance']
    df.columns = pd.MultiIndex.from_product([upper_columns, lower_columns], names=['Phase', 'Property'])
    return df

show all importance dataframe for Sphere, Worm, Vesicle

In [4]:
linear_importance_dataframe(linear_pcc.fitted_, data.x1, data.y, alpha=0.05, num_importance=100000)



Phase,Sphere,Sphere,Sphere,Worm,Worm,Worm,Vesicle,Vesicle,Vesicle
Property,variables,coef,importance,variables,coef,importance,variables,coef,importance
0,conc,-0.1072,1.0,mon_corona_apol,-0.065693,1.0,core_mv_total,0.013208,1.0
1,mon_corona_apol,0.231271,1.0,conc,0.075894,1.0,dp_core,0.01538,1.0
2,core_mv_total,-0.046841,1.0,core_mv_total,0.024149,1.0,core_mw_total,-0.000204,1.0
3,mon_corona_psa,-0.043151,1.0,core_mw_total,-0.00042,1.0,mon_corona_psa,0.009357,0.999162
4,dp_corona,0.044611,1.0,dp_core,0.026126,1.0,temp,0.013764,0.997077
5,core_mw_total,0.000738,1.0,mon_core_psa,0.061203,0.999998,conc,0.011315,0.982817
6,dp_core,-0.054974,1.0,temp,0.0199,0.99999,,,
7,mon_core_psa,-0.169942,1.0,mon_corona_psa,0.010289,0.999856,,,
8,corona_mv_total,0.044544,0.975052,,,,,,


In [6]:
top5_linear_dataframe = linear_importance_dataframe(linear_pcc.fitted_, data.x1, data.y, alpha=0.05, num_importance=5)
top5_linear_dataframe.to_csv(os.path.join(OUTPUTPATH, 'Top5_linear_dataframe.csv'))
top5_linear_dataframe




Phase,Sphere,Sphere,Sphere,Worm,Worm,Worm,Vesicle,Vesicle,Vesicle
Property,variables,coef,importance,variables,coef,importance,variables,coef,importance
0,conc,-0.1072,1.0,mon_corona_apol,-0.065693,1.0,core_mv_total,0.013208,1.0
1,mon_corona_apol,0.231271,1.0,conc,0.075894,1.0,dp_core,0.01538,1.0
2,core_mv_total,-0.046841,1.0,core_mv_total,0.024149,1.0,core_mw_total,-0.000204,1.0
3,mon_corona_psa,-0.043151,1.0,core_mw_total,-0.00042,1.0,mon_corona_psa,0.009357,0.999162
4,dp_corona,0.044611,1.0,dp_core,0.026126,1.0,temp,0.013764,0.997077
