In [28]:
%reset

import pandas as pd
import argparse
import numpy as np
import pickle

In [71]:
version = 'v3'
genome = 'mm10'
infile_chip = f"../resources/experimentList_{version}.tab"
infile_tfs = f"../resources/{genome}/TF_list.txt"
geneid_genename_synonym_table = f"../genome/{genome}/{genome}_ENSID_Genename_synonyms.txt.gz"
synonym_genename_dict = f"../genome/{genome}/GeneName_Synonyms_dict.pkl"
th_reads = 1000
th_mapped_reads = 0.1
th_duplicates = .95
th_peaks = 1
th_exp_per_tf = 128

# get experiment table
chip = pd.read_csv(infile_chip,sep='\t',header=None,usecols=[0,1,2,3,4,5,6,7,8],index_col=0)
chip.columns = ['genome','antigen_class','antigen','celltype_class','celltype','celltype_description','QC','title']

# get only TFs and others from genome
chip = chip[ (chip.genome == genome) & (chip.antigen_class=='TFs and others') ]

# parse QC column add to chip table
QC = pd.DataFrame([[float(n) for n in qc.split(',')] for qc in chip.QC],columns=['n_reads','f_mapped','f_duplicates','n_peaks'],index=chip.index)
QC.iloc[:,1] /= 100
QC.iloc[:,2] /= 100
chip = pd.concat([chip,QC],axis=1)

# get peaks per unique mapped reads
chip.loc[:,'n_peaks_per_unique_mapped_reads'] = chip.n_peaks/(chip.f_mapped*chip.n_reads*(1-chip.f_duplicates))
n_tot = chip.shape[0]
print(f'{genome}: {n_tot} experiments')

# apply thresholds
idx_out = list( chip[(chip['n_reads']     < th_reads) | 
                     (chip['f_mapped']    < th_mapped_reads) |
                     (chip['f_duplicates']> th_duplicates) |
                     (chip['n_peaks']     < th_peaks) ].index )

# For TFs with more than th experiments, keep the th with the highest n_peaks_per_unique_mapped_reads
exp_per_tf = chip.groupby('antigen')['antigen'].aggregate('count')
for tf in exp_per_tf.loc[exp_per_tf > th_exp_per_tf].index:
    idx_out.extend( list( chip.loc[chip.antigen==tf].sort_values('n_peaks_per_unique_mapped_reads',ascending=False).index[th_exp_per_tf:] ) )
chip.drop(idx_out,inplace=True)

print(f'{genome}: {chip.shape[0]/n_tot} passed QC')

print( 'Ar' in np.sort(chip.antigen) )




# load GeneID GeneName Synonym table gene name list
Gene_id_name_syn = pd.read_csv(geneid_genename_synonym_table,sep='\t')
GeneName = set(Gene_id_name_syn['Gene name'])

Gene_id_name_syn.replace({np.nan:'None'},inplace=True)
Gene_id_name_syn.drop(Gene_id_name_syn.index[Gene_id_name_syn['Gene Synonym']=="None"],inplace=True)
Gene_id_name_syn.drop_duplicates(inplace=True)
Gene_id_name_syn.drop( Gene_id_name_syn.loc[Gene_id_name_syn['Gene Synonym'] == Gene_id_name_syn['Gene name']].index, inplace=True)

print( 'Ar' in GeneName )


# load synonym to gene name dictionary
with open(synonym_genename_dict, 'rb') as f:
    Synonym_2_GeneName = pickle.load(f)

# get antigen gene names to change
to_rename = []
not_found = []
for g in chip.antigen.unique():
    if g in GeneName:
        continue
    else:
        if g in Synonym_2_GeneName.keys():
            to_rename.append([g,Synonym_2_GeneName[g]])
        else:
            not_found.append(g)
to_rename = dict(zip(np.array(to_rename)[:,0],np.array(to_rename)[:,1]))

# change gene names
for g in to_rename.keys():
    idx = chip[ chip.antigen==g ].index
    for i in idx:
        chip.at[i,'antigen'] = to_rename[chip.at[i,'antigen']]

# get TF list
with open(infile_tfs,'r') as f:
    TFs = [tf.strip() for tf in f.readlines()]
TFs = np.array(TFs)

# get tf names to change
to_rename = []
not_found = []
for g in TFs:
    if g in GeneName:
        continue
    else:
        if g in Synonym_2_GeneName.keys():
            to_rename.append([g,Synonym_2_GeneName[g]])
        else:
            not_found.append(g)
to_rename = dict(zip(np.array(to_rename)[:,0],np.array(to_rename)[:,1]))

# change tf names
for g in to_rename.keys():
    i = np.where( TFs == g )[0]
    TFs[i] = to_rename[g]


# Keep only antigens that are in TF list
tf_id = []
tf_out = []
for id in chip.index:
    antigen = chip.at[id,'antigen']
    # if antigen is in TF list add id
    if antigen in TFs:
        tf_id.append(id)
    # if antigen is a synonym of 
    elif antigen in Gene_id_name_syn['Gene Synonym'].values:
        print(antigen)
        gene_name = Gene_id_name_syn.loc[ Gene_id_name_syn['Gene Synonym']==antigen, 'Gene name'].values
        if any([g in TFs for g in gene_name]):
            tf_id.append(id)
    else:
        tf_out.append(antigen)

# print kept ratio
print(f'{genome}: {len(tf_id)/chip.shape[0]} in TF list')
chip = chip.loc[tf_id,:]
print(f'{genome}: {len(chip.antigen.unique())} unique TF')

print( 'Ar' in np.sort(chip.antigen) )

mm10: 23006 experiments
mm10: 0.6816917325914978 passed QC
True
True
Grip1
Grip1
Grip1
Grip1
Grip1
Grip1
Wdr11
Wdr11
Wdr11
Htt
Htt
Htt
Htt
Htt
Htt
Htt
Mpnd
Mpnd
Htt
Htt
mm10: 0.8666071542434484 in TF list
mm10: 712 unique TF
True
