In [1]:
from datasets import load_dataset
import numpy as np
import pandas as pd
import polars as pl
from sklearn.metrics import average_precision_score
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
metadata = pd.read_csv("Borzoi_metadata.csv")
metadata

Unnamed: 0,name,file,clip,clip_soft,scale,sum_stat,strand_pair,description,assay,sample
0,CNhs10608+,/home/drk/tillage/datasets/human/cage/fantom/C...,768,384,1.00,sum,1,CAGE:Clontech Human Universal Reference Total ...,CAGE,"Clontech Human Universal Reference Total RNA, ..."
1,CNhs10608-,/home/drk/tillage/datasets/human/cage/fantom/C...,768,384,1.00,sum,0,CAGE:Clontech Human Universal Reference Total ...,CAGE,"Clontech Human Universal Reference Total RNA, ..."
2,CNhs10610+,/home/drk/tillage/datasets/human/cage/fantom/C...,768,384,1.00,sum,3,CAGE:SABiosciences XpressRef Human Universal T...,CAGE,SABiosciences XpressRef Human Universal Total ...
3,CNhs10610-,/home/drk/tillage/datasets/human/cage/fantom/C...,768,384,1.00,sum,2,CAGE:SABiosciences XpressRef Human Universal T...,CAGE,SABiosciences XpressRef Human Universal Total ...
4,CNhs10612+,/home/drk/tillage/datasets/human/cage/fantom/C...,768,384,1.00,sum,5,CAGE:Universal RNA - Human Normal Tissues Bioc...,CAGE,"Universal RNA - Human Normal Tissues Biochain,..."
...,...,...,...,...,...,...,...,...,...,...
7606,GTEX-13FTX-1026-SM-5J2O5.1,/home/drk/tillage/datasets/human/rna/recount3/...,768,384,0.01,sum_sqrt,7606,RNA:uterus,RNA,uterus
7607,GTEX-1MA7W-1526-SM-DHXKS.1,/home/drk/tillage/datasets/human/rna/recount3/...,768,384,0.01,sum_sqrt,7607,RNA:uterus,RNA,uterus
7608,GTEX-11EMC-1926-SM-5A5JU.1,/home/drk/tillage/datasets/human/rna/recount3/...,768,384,0.01,sum_sqrt,7608,RNA:vagina,RNA,vagina
7609,GTEX-12WSB-2426-SM-5EGJC.1,/home/drk/tillage/datasets/human/rna/recount3/...,768,384,0.01,sum_sqrt,7609,RNA:vagina,RNA,vagina


In [15]:
# should aggregate
len(metadata.description.unique()) / len(metadata)

0.7548285376428853

In [32]:
metadata.description.value_counts()

description
RNA:K562                                                                                                  23
RNA:GM12878                                                                                               22
RNA:H1                                                                                                    20
RNA:HepG2                                                                                                 13
RNA:HFFc6                                                                                                 12
                                                                                                          ..
CHIP:ZNF175:HEK293 eGFP- genetically modified using site-specific recombination originated from HEK293     1
CHIP:ZMYM3:K562                                                                                            1
CHIP:H3K36me3:gastrocnemius medialis female adult (53 years)                                               1
CHIP:ZN

In [3]:
metadata.assay.value_counts()

assay
CHIP     3886
RNA      1543
CAGE     1276
DNASE     674
ATAC      232
Name: count, dtype: int64

In [40]:
metadata["group"] = metadata.apply(lambda x: x.assay if x.assay != "CHIP" else ":".join(x.description.split(":")[:2]), axis=1)
metadata["group"].value_counts()

group
DNASE               674
CAGE                638
CHIP:H3K4me3        338
CHIP:H3K36me3       269
CHIP:H3K27me3       260
                   ... 
CHIP:eGFP-IKZF3       1
CHIP:eGFP-MZF1        1
CHIP:eGFP-ZNF213      1
CHIP:eGFP-ZNF777      1
CHIP:eGFP-ZNF660      1
Name: count, Length: 791, dtype: int64

In [4]:
dataset_path = "gonzalobenegas/siraj_gwas_highpip"
V = load_dataset(dataset_path, split="test").to_pandas()
V

Unnamed: 0,chrom,pos,ref,alt,label
0,1,3080038,T,C,True
1,1,3774964,A,G,True
2,1,6616131,C,T,False
3,1,7665224,C,A,False
4,1,8407293,G,A,False
...,...,...,...,...,...
1778,22,47019717,G,T,False
1779,22,47990921,C,T,True
1780,22,50007172,T,C,False
1781,22,50190508,G,A,True


In [5]:
features = "Borzoi_L2"
df = pd.read_parquet(f"https://huggingface.co/datasets/{dataset_path}/resolve/main/features/{features}.parquet")
feature_cols = df.columns
V = pd.concat([V, df], axis=1)
V

Unnamed: 0,chrom,pos,ref,alt,label,CNhs10608+,CNhs10608-,CNhs10610+,CNhs10610-,CNhs10612+,...,GTEX-1KD5A-1826-SM-DHXJI.1,GTEX-1MA7X-1526-SM-DHXJF.1,GTEX-13FH7-0126-SM-5KLZ1.1,GTEX-15CHQ-0826-SM-69LOT.1,GTEX-1J1OQ-0526-SM-A9G2P.1,GTEX-13FTX-1026-SM-5J2O5.1,GTEX-1MA7W-1526-SM-DHXKS.1,GTEX-11EMC-1926-SM-5A5JU.1,GTEX-12WSB-2426-SM-5EGJC.1,GTEX-W5WG-1026-SM-4LMIF.1
0,1,3080038,T,C,True,0.030109,0.029550,0.010730,0.012588,0.011650,...,0.049622,0.052124,0.025927,0.029505,0.025276,0.019794,0.023859,0.008613,0.010459,0.016822
1,1,3774964,A,G,True,0.072772,0.053760,0.072442,0.046519,0.071386,...,0.126223,0.128096,0.103943,0.103876,0.099199,0.063892,0.079416,0.089966,0.092519,0.061029
2,1,6616131,C,T,False,0.039855,0.075309,0.022525,0.077093,0.023479,...,0.063204,0.067226,0.035611,0.034379,0.036854,0.031151,0.035389,0.033019,0.037485,0.029799
3,1,7665224,C,A,False,0.056595,0.071691,0.022921,0.040726,0.025973,...,0.068021,0.071216,0.044499,0.048247,0.045753,0.042644,0.072322,0.019573,0.024010,0.043794
4,1,8407293,G,A,False,0.028987,0.053539,0.022211,0.033712,0.023465,...,0.105460,0.100969,0.087398,0.084599,0.072838,0.064153,0.064326,0.070928,0.069089,0.063530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1778,22,47019717,G,T,False,0.045336,0.026805,0.024453,0.013941,0.027590,...,0.012302,0.013002,0.004905,0.004438,0.004185,0.003371,0.003603,0.006942,0.006107,0.003389
1779,22,47990921,C,T,True,0.008861,0.008165,0.006038,0.005140,0.006826,...,0.020491,0.029477,0.003468,0.002416,0.001717,0.001461,0.001972,0.001721,0.002095,0.001490
1780,22,50007172,T,C,False,0.044697,0.044838,0.025284,0.037314,0.026339,...,0.178772,0.204259,0.080355,0.090092,0.073444,0.058524,0.077468,0.050964,0.051430,0.065829
1781,22,50190508,G,A,True,0.575140,0.755803,0.534374,0.680176,0.537903,...,0.359001,0.396114,0.394123,0.370354,0.343617,0.341672,0.413581,0.326287,0.351261,0.322992


In [6]:
res = []
for c in tqdm(feature_cols):
    res.append([c, average_precision_score(V["label"], V[c])])
res = pd.DataFrame(res, columns=["feature", "AUPRC"]).sort_values("AUPRC", ascending=False)
res

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7611/7611 [00:13<00:00, 549.73it/s]


Unnamed: 0,feature,AUPRC
1298,ENCFF082SFS,0.740755
2023,kai156,0.738540
2107,kai31,0.737819
1281,ENCFF996AEF,0.736121
1317,ENCFF678LXL,0.735339
...,...,...
5253,ENCFF105NEW,0.498805
5594,ENCFF078WGV,0.497996
3829,ENCFF031GKU,0.494251
5582,ENCFF683WTT,0.493248


In [7]:
res = res.merge(metadata, left_on="feature", right_on="name")
res

Unnamed: 0,feature,AUPRC,name,file,clip,clip_soft,scale,sum_stat,strand_pair,description,assay,sample
0,ENCFF082SFS,0.740755,ENCFF082SFS,/home/drk/tillage/datasets/human/dnase/encode/...,128,32,2.00,mean,1298,DNASE:heart male adult (27 years) and male adu...,DNASE,heart male adult (27 years) and male adult (35...
1,kai156,0.738540,kai156,/home/drk/tillage/datasets/human/atac/catlas/k...,384,96,0.01,sum,2023,ATAC:Fibro Liver Adrenal,ATAC,Fibro Liver Adrenal
2,kai31,0.737819,kai31,/home/drk/tillage/datasets/human/atac/catlas/k...,384,96,0.01,sum,2107,ATAC:Endocardial,ATAC,Endocardial
3,ENCFF996AEF,0.736121,ENCFF996AEF,/home/drk/tillage/datasets/human/dnase/encode/...,128,32,2.00,mean,1281,DNASE:GM03348 genetically modified using trans...,DNASE,GM03348 genetically modified using transductio...
4,ENCFF678LXL,0.735339,ENCFF678LXL,/home/drk/tillage/datasets/human/dnase/encode/...,128,32,2.00,mean,1317,DNASE:CD14-positive monocyte female,DNASE,CD14-positive monocyte female
...,...,...,...,...,...,...,...,...,...,...,...,...
7606,ENCFF105NEW,0.498805,ENCFF105NEW,/home/drk/tillage/datasets/human/chip/encode/E...,128,32,2.00,mean,5253,CHIP:EP300:suprapubic skin male adult (54 years),CHIP,EP300:suprapubic skin male adult (54 years)
7607,ENCFF078WGV,0.497996,ENCFF078WGV,/home/drk/tillage/datasets/human/chip/encode/E...,128,32,2.00,mean,5594,CHIP:EP300:esophagus muscularis mucosa female ...,CHIP,EP300:esophagus muscularis mucosa female adult...
7608,ENCFF031GKU,0.494251,ENCFF031GKU,/home/drk/tillage/datasets/human/chip/encode/E...,128,32,2.00,mean,3829,CHIP:EP300:suprapubic skin female adult (51 year),CHIP,EP300:suprapubic skin female adult (51 year)
7609,ENCFF683WTT,0.493248,ENCFF683WTT,/home/drk/tillage/datasets/human/chip/encode/E...,128,32,2.00,mean,5582,CHIP:POLR2A:lower leg skin female adult (53 ye...,CHIP,POLR2A:lower leg skin female adult (53 years)


In [9]:
res[["AUPRC", "description"]].head()

Unnamed: 0,AUPRC,description
0,0.740755,DNASE:heart male adult (27 years) and male adu...
1,0.73854,ATAC:Fibro Liver Adrenal
2,0.737819,ATAC:Endocardial
3,0.736121,DNASE:GM03348 genetically modified using trans...
4,0.735339,DNASE:CD14-positive monocyte female


In [8]:
res.groupby("assay").AUPRC.mean().sort_values(ascending=False)

assay
DNASE    0.708207
ATAC     0.685740
CAGE     0.685703
RNA      0.668443
CHIP     0.638618
Name: AUPRC, dtype: float64

In [19]:
res.groupby("description").AUPRC.mean().sort_values(ascending=False)

description
DNASE:lung female embryo (76 days)                               0.734845
DNASE:stomach female embryo (98 days)                            0.733138
DNASE:heart male adult (27 years) and male adult (35 years)      0.732975
DNASE:femur female embryo (98 days)                              0.732596
DNASE:hepatocyte                                                 0.731510
                                                                   ...   
CHIP:EP300:suprapubic skin female adult (51 year)                0.523921
CHIP:H2BK15ac:H9                                                 0.522652
CHIP:EP300:esophagus muscularis mucosa female adult (51 year)    0.522403
CHIP:EP300:subcutaneous adipose tissue male adult (37 years)     0.517747
CHIP:EP300:breast epithelium male adult (54 years)               0.511211
Name: AUPRC, Length: 4925, dtype: float64

In [20]:
res.groupby("description").AUPRC.mean().sort_values(ascending=False).head(50)

description
DNASE:lung female embryo (76 days)                                                                    0.734845
DNASE:stomach female embryo (98 days)                                                                 0.733138
DNASE:heart male adult (27 years) and male adult (35 years)                                           0.732975
DNASE:femur female embryo (98 days)                                                                   0.732596
DNASE:hepatocyte                                                                                      0.731510
DNASE:spleen embryo (112 days)                                                                        0.730772
DNASE:right lung female embryo (105 days)                                                             0.730133
DNASE:NB4                                                                                             0.729599
DNASE:left lung male embryo (115 days)                                                              

In [23]:
res_chip = res[res.assay=="CHIP"]
res_chip

Unnamed: 0,feature,AUPRC,name,file,clip,scale,sum_stat,description,assay,sample
10,ENCFF579CVA,0.728819,ENCFF579CVA,/home/drk/tillage/datasets/human/chip/encode/E...,32,2,mean,CHIP:H3K4me3:subcutaneous abdominal adipose ti...,CHIP,H3K4me3:subcutaneous abdominal adipose tissue ...
14,ENCFF493FQG,0.728070,ENCFF493FQG,/home/drk/tillage/datasets/human/chip/encode/E...,32,2,mean,CHIP:H3K4me3:NB4,CHIP,H3K4me3:NB4
43,ENCFF035NGT,0.723262,ENCFF035NGT,/home/drk/tillage/datasets/human/chip/encode/E...,32,2,mean,CHIP:H3K27ac:right lobe of liver female adult ...,CHIP,H3K27ac:right lobe of liver female adult (53 y...
50,ENCFF870RAN,0.722389,ENCFF870RAN,/home/drk/tillage/datasets/human/chip/encode/E...,32,2,mean,CHIP:H3K27ac:upper lobe of left lung female ad...,CHIP,H3K27ac:upper lobe of left lung female adult (...
52,ENCFF024VVX,0.722357,ENCFF024VVX,/home/drk/tillage/datasets/human/chip/encode/E...,32,2,mean,CHIP:H3K27ac:adrenal gland female adult (53 ye...,CHIP,H3K27ac:adrenal gland female adult (53 years)
...,...,...,...,...,...,...,...,...,...,...
5308,ENCFF031GKU,0.523921,ENCFF031GKU,/home/drk/tillage/datasets/human/chip/encode/E...,32,2,mean,CHIP:EP300:suprapubic skin female adult (51 year),CHIP,EP300:suprapubic skin female adult (51 year)
5309,ENCFF149ZYU,0.522652,ENCFF149ZYU,/home/drk/tillage/datasets/human/chip/encode/E...,32,2,mean,CHIP:H2BK15ac:H9,CHIP,H2BK15ac:H9
5310,ENCFF078WGV,0.522403,ENCFF078WGV,/home/drk/tillage/datasets/human/chip/encode/E...,32,2,mean,CHIP:EP300:esophagus muscularis mucosa female ...,CHIP,EP300:esophagus muscularis mucosa female adult...
5311,ENCFF331CAB,0.517747,ENCFF331CAB,/home/drk/tillage/datasets/human/chip/encode/E...,32,2,mean,CHIP:EP300:subcutaneous adipose tissue male ad...,CHIP,EP300:subcutaneous adipose tissue male adult (...


In [28]:
res_chip["mark"] = res_chip.description.str.split(":").str[1]
res_chip.groupby("mark").AUPRC.mean().sort_values(ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res_chip["mark"] = res_chip.description.str.split(":").str[1]


mark
RBFOX2            0.703089
3xFLAG-SAP130     0.696222
H3K27Ac           0.694529
3xFLAG-GATAD2A    0.692816
eGFP-PRDM10       0.691565
                    ...   
H2AK9ac           0.576007
H3K23me2          0.574407
H3K23ac           0.572393
3xFLAG-ZBED5      0.563453
PLRG1             0.560066
Name: AUPRC, Length: 788, dtype: float64

In [29]:
res_chip.groupby("mark").AUPRC.mean().sort_values(ascending=False).head(50)

mark
RBFOX2            0.703089
3xFLAG-SAP130     0.696222
H3K27Ac           0.694529
3xFLAG-GATAD2A    0.692816
eGFP-PRDM10       0.691565
NR2F2             0.690090
eGFP-GLIS1        0.689921
3xFLAG-ZGPAT      0.688471
eGFP-ZIC2         0.688342
eGFP-ZNF335       0.688163
RELB              0.688001
eGFP-ZNF341       0.687746
E2F6              0.686270
eGFP-NR2C2        0.686173
3xFLAG-KAT8       0.684598
3xFLAG-TEAD1      0.683932
3xFLAG-GABPB1     0.683761
HNRNPL            0.683642
eGFP-ATF1         0.683586
eGFP-VEZF1        0.683105
TBX21             0.682782
hBMAL1            0.682746
3xFLAG-SOX5       0.682499
3xFLAG-RARA       0.682466
3xFLAG-THAP11     0.682191
3xFLAG-ARID4B     0.681679
3xFLAG-ELF3       0.681007
3xFLAG-RXRB       0.680269
eGFP-ZNF600       0.680027
GABPB1            0.679494
H3K27ac           0.678991
H3K4me1           0.678871
eGFP-PBX2         0.678795
3xFLAG-HMG20A     0.678435
3xFLAG-FOXA3      0.677840
eGFP-ZBTB8A       0.677812
AGO2              0.677

In [32]:
res_chip_H3K4me3 = res_chip[res_chip.mark=="H3K4me3"]
res_chip_H3K4me3

Unnamed: 0,feature,AUPRC,name,file,clip,scale,sum_stat,description,assay,sample,mark
10,ENCFF579CVA,0.728819,ENCFF579CVA,/home/drk/tillage/datasets/human/chip/encode/E...,32,2,mean,CHIP:H3K4me3:subcutaneous abdominal adipose ti...,CHIP,H3K4me3:subcutaneous abdominal adipose tissue ...,H3K4me3
14,ENCFF493FQG,0.728070,ENCFF493FQG,/home/drk/tillage/datasets/human/chip/encode/E...,32,2,mean,CHIP:H3K4me3:NB4,CHIP,H3K4me3:NB4,H3K4me3
100,ENCFF535CDG,0.719474,ENCFF535CDG,/home/drk/tillage/datasets/human/chip/encode/E...,32,2,mean,CHIP:H3K4me3:subcutaneous abdominal adipose ti...,CHIP,H3K4me3:subcutaneous abdominal adipose tissue ...,H3K4me3
131,ENCFF179GFG,0.718162,ENCFF179GFG,/home/drk/tillage/datasets/human/chip/encode/E...,32,2,mean,CHIP:H3K4me3:subcutaneous abdominal adipose ti...,CHIP,H3K4me3:subcutaneous abdominal adipose tissue ...,H3K4me3
156,ENCFF327QYA,0.717199,ENCFF327QYA,/home/drk/tillage/datasets/human/chip/encode/E...,32,2,mean,CHIP:H3K4me3:stomach smooth muscle female adul...,CHIP,H3K4me3:stomach smooth muscle female adult (84...,H3K4me3
...,...,...,...,...,...,...,...,...,...,...,...
5184,ENCFF293ASZ,0.572609,ENCFF293ASZ,/home/drk/tillage/datasets/human/chip/encode/E...,32,2,mean,CHIP:H3K4me3:lung female adult (30 years),CHIP,H3K4me3:lung female adult (30 years),H3K4me3
5187,ENCFF015ZVX,0.571867,ENCFF015ZVX,/home/drk/tillage/datasets/human/chip/encode/E...,32,2,mean,CHIP:H3K4me3:neural stem progenitor cell origi...,CHIP,H3K4me3:neural stem progenitor cell originated...,H3K4me3
5214,ENCFF312ENS,0.567943,ENCFF312ENS,/home/drk/tillage/datasets/human/chip/encode/E...,32,2,mean,CHIP:H3K4me3:peripheral blood mononuclear cell...,CHIP,H3K4me3:peripheral blood mononuclear cell male...,H3K4me3
5248,ENCFF748XLQ,0.558304,ENCFF748XLQ,/home/drk/tillage/datasets/human/chip/encode/E...,32,2,mean,CHIP:H3K4me3:mesendoderm originated from H1-hESC,CHIP,H3K4me3:mesendoderm originated from H1-hESC,H3K4me3


In [26]:
res_chip.description.values.tolist()

['CHIP:H3K4me3:subcutaneous abdominal adipose tissue nuclear fraction female adult (81 year)',
 'CHIP:H3K4me3:NB4',
 'CHIP:H3K27ac:right lobe of liver female adult (53 years)',
 'CHIP:H3K27ac:upper lobe of left lung female adult (53 years)',
 'CHIP:H3K27ac:adrenal gland female adult (53 years)',
 'CHIP:H3K27ac:liver male adult (31 year)',
 'CHIP:H3K27ac:stomach female adult (53 years)',
 'CHIP:H3K27ac:liver female adult (25 years)',
 'CHIP:H3K4me1:subcutaneous abdominal adipose tissue nuclear fraction female adult (59 years)',
 'CHIP:H3K27ac:body of pancreas male adult (37 years)',
 'CHIP:H3K4me3:subcutaneous abdominal adipose tissue nuclear fraction female adult (49 years)',
 'CHIP:H3K4me1:subcutaneous abdominal adipose tissue nuclear fraction female adult (81 year)',
 'CHIP:H3K27ac:spleen female adult (30 years)',
 'CHIP:H3K4me2:skeletal muscle satellite cell female adult originated from mesodermal cell',
 'CHIP:H3K4me1:skeletal muscle tissue female adult (72 years)',
 'CHIP:H3K4me3:

## Aggregate features

In [10]:
assays = metadata.assay.unique()
assays

array(['CAGE', 'DNASE', 'ATAC', 'CHIP', 'RNA'], dtype=object)

In [11]:
features2 = []
for assay in assays:
    for norm_ord in [2]:#[1, 2, np.inf]:
        f = f"{assay}_L{norm_ord}"
        V[f] = np.linalg.norm(df.loc[:, (metadata.assay==assay).values], ord=norm_ord, axis=1)
        features2.append(f)
V

Unnamed: 0,chrom,pos,ref,alt,label,CNhs10608+,CNhs10608-,CNhs10610+,CNhs10610-,CNhs10612+,...,GTEX-13FTX-1026-SM-5J2O5.1,GTEX-1MA7W-1526-SM-DHXKS.1,GTEX-11EMC-1926-SM-5A5JU.1,GTEX-12WSB-2426-SM-5EGJC.1,GTEX-W5WG-1026-SM-4LMIF.1,CAGE_L2,DNASE_L2,ATAC_L2,CHIP_L2,RNA_L2
0,1,3080038,T,C,True,0.030109,0.029550,0.010730,0.012588,0.011650,...,0.019794,0.023859,0.008613,0.010459,0.016822,0.351265,1.697177,0.840852,9.752006,1.061627
1,1,3774964,A,G,True,0.072772,0.053760,0.072442,0.046519,0.071386,...,0.063892,0.079416,0.089966,0.092519,0.061029,1.266690,5.528660,1.696545,17.641531,7.284476
2,1,6616131,C,T,False,0.039855,0.075309,0.022525,0.077093,0.023479,...,0.031151,0.035389,0.033019,0.037485,0.029799,1.525708,1.005307,0.503357,7.281210,1.125815
3,1,7665224,C,A,False,0.056595,0.071691,0.022921,0.040726,0.025973,...,0.042644,0.072322,0.019573,0.024010,0.043794,0.924361,0.630737,0.575892,5.621067,1.535668
4,1,8407293,G,A,False,0.028987,0.053539,0.022211,0.033712,0.023465,...,0.064153,0.064326,0.070928,0.069089,0.063530,0.690484,2.025270,0.813058,9.269770,2.847146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1778,22,47019717,G,T,False,0.045336,0.026805,0.024453,0.013941,0.027590,...,0.003371,0.003603,0.006942,0.006107,0.003389,0.864672,0.247854,0.245694,3.306631,0.376174
1779,22,47990921,C,T,True,0.008861,0.008165,0.006038,0.005140,0.006826,...,0.001461,0.001972,0.001721,0.002095,0.001490,0.248439,1.651675,0.573875,5.801227,0.208426
1780,22,50007172,T,C,False,0.044697,0.044838,0.025284,0.037314,0.026339,...,0.058524,0.077468,0.050964,0.051430,0.065829,0.827551,3.918920,1.756246,16.996849,2.460153
1781,22,50190508,G,A,True,0.575140,0.755803,0.534374,0.680176,0.537903,...,0.341672,0.413581,0.326287,0.351261,0.322992,21.049221,21.130545,4.802217,61.971668,15.439353


In [12]:
res2 = []
for c in features2:
    res2.append([c, average_precision_score(V["label"], V[c])])
res2 = pd.DataFrame(res2, columns=["feature", "AUPRC"]).sort_values("AUPRC", ascending=False)
res2

Unnamed: 0,feature,AUPRC
1,DNASE_L2,0.741049
0,CAGE_L2,0.724758
2,ATAC_L2,0.719748
3,CHIP_L2,0.70757
4,RNA_L2,0.696393


In [22]:
from sklearn.linear_model import LogisticRegression

X = V[features2]
y = V["label"]

clf = LogisticRegression(max_iter=1000).fit(X, y)
clf

In [30]:
for i in range(len(features2)):
    print(features2[i], clf.coef_[0][i])

CAGE_L2 0.063178946142026
DNASE_L2 0.17981244745307717
ATAC_L2 0.36467354646624706
CHIP_L2 -0.015772988378906724
RNA_L2 0.0972015211855454


In [31]:
pred = clf.predict_proba(X)[:, 1]
average_precision_score(y, pred)

0.7469741248990237