In [206]:
import pandas as pd
import numpy as np
import os

# Directory scanned for per-sample marker tables. Only files whose name contains
# 'genefpkm' are read, and each must provide an 'entropy' column.
BASEDIR = 'output/nmla/selected_markers/'

# n_genes value attached to each matching file, paired purely by position in the
# directory listing.
# NOTE(review): os.listdir() makes no ordering guarantee, so this pairing is only
# right if the listing order matches the order these numbers were written in --
# confirm, or key the limits by file name.
n_gene_limits = [16, 9, 23, 8, 32, 5, 13, 18, 23, 23]

marker_files = [f for f in os.listdir(BASEDIR) if 'genefpkm' in f]

# Fail early with a readable message instead of crashing later on a None frame
# or part-way through the loop.
if not marker_files:
    raise ValueError("no 'genefpkm' files found in " + BASEDIR)
if len(marker_files) > len(n_gene_limits):
    raise IndexError('found %d genefpkm files but only %d n_gene_limits'
                     % (len(marker_files), len(n_gene_limits)))

# One single-row frame per file; concatenated once after the loop rather than
# re-copying the growing frame on every iteration.
rows = []

for i, f in enumerate(marker_files):

    k = pd.read_csv(os.path.join(BASEDIR, f))['entropy']
    n = len(k)
    # Summary statistics of the entropy column followed by its quartiles.
    k = pd.concat([k.apply([np.mean, np.std, np.min, np.max, np.sum]), k.quantile([.25, .5, .75])])
    k = pd.DataFrame(k).T
    k['total'] = n
    k['n_genes'] = n_gene_limits[i]
    k.index = [i]
    rows.append(k)

r = pd.concat(rows, axis=0)

r.to_csv('output/entropy_x_n_genes.csv', sep=',', index=False)

# Fraction of each file's rows that n_genes represents.
r['n_genes_p'] = r['n_genes'] / r['total']

r.to_csv('output/entropy_x_n_genes_p.csv', sep=',', index=False)

# Drop the absolute count so that n_genes_p is the last column of r; the
# regression cell below slices predictors/target by column position.
del r['n_genes']

r

Unnamed: 0,mean,std,amin,amax,sum,0.25,0.5,0.75,total,n_genes_p
0,7.198611,2.140553,2.09282,9.342075,388.724979,5.925859,7.980587,8.933924,54,0.296296
1,7.434641,2.326484,1.484325,9.293554,341.993504,6.279832,8.77599,9.180374,46,0.195652
2,7.613964,2.033209,2.066915,9.33985,373.084215,6.882964,8.465006,9.127235,49,0.469388
3,7.189776,2.202679,2.078221,9.258142,287.591044,5.075708,8.165678,9.050086,40,0.2
4,6.838792,2.075481,2.069935,9.343442,519.74822,5.389278,7.438123,8.595636,76,0.421053
5,6.745862,2.109032,2.050079,9.234073,256.342761,4.985128,7.236467,8.508441,38,0.131579
6,7.304833,2.016242,2.056682,9.345661,401.765808,6.519645,8.132474,8.895768,55,0.236364
7,7.263958,2.207309,2.047496,9.352298,559.324786,5.829776,8.092054,9.0931,77,0.233766
8,7.294024,2.197726,2.053801,9.268244,408.465356,5.760328,8.206773,9.065738,56,0.410714
9,7.451194,2.040893,2.078658,9.358904,357.657332,6.832891,8.289996,8.977701,48,0.479167


In [207]:
from sklearn.linear_model import LinearRegression

# Linear regression of the last column of r (n_genes_p) on all the columns
# before it (the entropy summary statistics and 'total').
model = LinearRegression()

model.fit(X=r.values[:, :-1], y=r.values[:, -1])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [208]:
# Fitted parameters as one flat list of Python floats: the intercept first, then
# the coefficients in the same order as the predictor columns. Built directly
# from the model instead of formatting the numbers into a string and eval()-ing
# it, which pushed every value through its text representation for no benefit.
[float(model.intercept_)] + [float(s) for s in model.coef_]

[-0.8848630975791922,
 4.600451633673629,
 3.615872144716864,
 1.1838667374310157,
 -0.7000817701614678,
 -0.018821646628496263,
 -0.2313025930161946,
 0.2103729053368249,
 -4.060631004415739,
 0.14208357407079072]

In [209]:
# Predict n_genes_p for the same rows the model was fitted on (all columns of r
# except the last), to compare against the target column.
model.predict(X=r.values[:, :-1])

array([0.2962963 , 0.19565217, 0.46938776, 0.2       , 0.42105263,
       0.13157895, 0.23636364, 0.23376623, 0.41071429, 0.47916667])

In [212]:
def select_k_top_markers(entropies, betas=None):
    """Predict how many top markers to keep from a set of entropy values.

    Nine summary statistics of ``entropies`` -- mean, sample standard deviation
    (ddof=1), min, max, sum, the 25th/50th/75th percentiles and the number of
    values -- are prefixed with 1.0 for the intercept and combined linearly with
    ``betas``. The linear prediction is multiplied by the number of values and
    rounded to the nearest integer.

    Parameters
    ----------
    entropies : array-like
        Entropy values (list, tuple, numpy array of any shape, or pandas
        Series). The input is flattened to one dimension.
    betas : array-like of 10 floats, optional
        Intercept followed by the nine coefficients, in the statistic order
        listed above. Defaults to the hard-coded values below, which are the
        intercept and coefficients printed by the regression earlier in this
        notebook.

    Returns
    -------
    int
        The predicted count, limited to the range ``0 .. len(entropies)``.

    Raises
    ------
    ValueError
        If fewer than two entropy values are given (the sample standard
        deviation is undefined) or ``betas`` does not hold exactly 10 values.
    """
    # Accept any array-like, not only Series / ndarray, and flatten it.
    values = np.asarray(entropies, dtype=float).reshape([-1])
    n_values = len(values)

    if n_values < 2:
        raise ValueError(
            'select_k_top_markers needs at least 2 entropy values, got %d' % n_values)

    if betas is None:
        betas = [-0.8848630975791922,
                 4.600451633673629,
                 3.615872144716864,
                 1.1838667374310157,
                 -0.7000817701614678,
                 -0.018821646628496263,
                 -0.2313025930161946,
                 0.2103729053368249,
                 -4.060631004415739,
                 0.14208357407079072]

    betas = np.asarray(betas, dtype=float).reshape([-1])
    if betas.shape[0] != 10:
        raise ValueError('betas must hold 10 values, got %d' % betas.shape[0])

    # Order must match betas: intercept, mean, std, min, max, sum, q25, q50, q75, n.
    params = np.array([
        1.0,
        np.mean(values),
        # ddof=1: sample standard deviation; presumably chosen to match the
        # 'std' column the coefficients were fitted on -- confirm.
        np.std(values, ddof=1),
        np.min(values),
        np.max(values),
        np.sum(values),
        np.quantile(values, q=.25),
        np.quantile(values, q=.50),
        np.quantile(values, q=.75),
        n_values,
    ])

    k = int(round(n_values * params.dot(betas), 0))

    # The linear model is unbounded; a count below 0 or above the number of
    # available values is not meaningful, so limit it to the valid range.
    return min(max(k, 0), n_values)

# Apply select_k_top_markers to the 'entropy' column of every 'genefpkm' file in
# BASEDIR, in directory-listing order, and show the predicted counts.
result = []

for f in os.listdir(BASEDIR):
    if 'genefpkm' not in f:
        continue
    k = pd.read_csv(os.path.join(BASEDIR, f))['entropy']
    result.append(select_k_top_markers(k))

result

[16, 9, 23, 8, 32, 5, 13, 18, 23, 23]