# Import Packages and Read in Dataframes

Files are taken directly from the ChIP-Atlas site. 

In [1]:
import os,sys
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
from meta_utils import *
from analysis_utils import *

antigen_df = pd.read_csv('chip_atlas_antigen_list.csv')
celltype_df = pd.read_csv('chip_atlas_celltype_list.csv')
file_df = pd.read_csv('chip_atlas_file_list.csv')
exp_df = pd.read_csv('chip_atlas_experiment_list.csv')#.set_index('Experimental ID').rename_axis(None)
#exp_df.to_pickle("./exp_df.pkl")

## Extract Metadata from Experimental Dataframe

Read in the experimental dataframe from ChIP-Atlas site and parse through the `Meta data` column to find entries not curated by ChIP-Atlas creators for each experiment. 

This results in a new dataframe `exp_meta_df` that is indexied by `exp_ID` and contains 

In [2]:
new_df = exp_df
new_df = new_df.set_index('Experimental ID')

exp_list = []
for idx,row in new_df.iterrows():
    
    meta_dict = []
    try: 
        meta_list = row['Meta data'].strip().split('||')
        for att in meta_list: 
            tup = att.strip().split('=')
            if len(tup) == 1:
                tup = ['misc', str(tup[0])]
            elif len(tup) > 2:
                tup[1:] = [' '.join(tup[1:])]
            meta_dict.append(tuple(tup))
    
    # No Metadata Available
    except:
        pass
    
    meta_dict.append(('exp_ID', idx))        
    exp_list.append(dict(meta_dict))
    
exp_meta_df = pd.DataFrame(exp_list).set_index('exp_ID')

# Create Dataframe w/ Original Attributes and Parsed Attributes

This will be saved as `full_exp_df`. 

In [3]:
full_exp_df = exp_meta_df.join(exp_df)
full_exp_df.shape

(65499, 1369)

# Parse through Redundant Data in Metadata Field


See `analysis_utils.py` for helper functions used. Metadata included descriptors that contain overlapping content. Went through manually to correct these and incorporate them into the same field for celltype, tissue type, biomaterial provider, and sex. 

In [4]:
# Cell Types
relevant = ['cell-line','cell line','implated cell line/type',
            'derived cell line','cell  line','cell line background','cell_line', 'celll line',
            'host cell line','cell line/vendor','parental cell line', 'cell line/clone','cell lines','cell _line_name']
tissue = ['cell line sournce','cell line/type','cell line origin','cell line source','cell lineage','cell line type']

keys = ['cell','line']

print('Cell Type Reundancy')
cell_df = extract_meta_columns(keys, full_exp_df, col_name='cell_type', relevant=relevant)

# Tissue Types
relevant = ['tissue','tissue/cell','tissue source/type','tissue of origin',
            'tissue-type','tissue_type','tissue/cell type','tissue type','tissue source',
           'source tissue','tissue origin','tissues']

keys = ['tissue']

print('Tissue Type Reundancy')
tissue_df = extract_meta_columns(keys, full_exp_df, col_name='tissue_type', relevant=relevant)

# Biomaterial Provider
keys = ['provider']

relevant = ['biomaterial_provider','chip antibody provider','chip_antibody_provider',
            'antibody vendor/provider','sample_provider','antibody provider']

print('Provider Reundancy')
provider_df = extract_meta_columns(keys, full_exp_df, col_name='provider', relevant=relevant)

# Sex
keys = ['sex']
relevant = ['age and sex', 'age sex','sex','sex type','patient sex','cell sex']

print('Sex Redundancy')
sex_df = extract_meta_columns(keys, full_exp_df, col_name='sex', relevant=relevant)

Cell Type Reundancy
Num multi-rows: 10
Total experiments:  20012
Tissue Type Reundancy
Num multi-rows: 22
Total experiments:  13814
Provider Reundancy
Num multi-rows: 4
Total experiments:  4204
Sex Redundancy
Num multi-rows: 4
Total experiments:  10082


## Combine Parsed Redundant Attributes together into a final metadata dataframe

This includes batch information, source of experiment, tissue, age, treatment, genotype, source lab, age, gender, and health state. 

In [19]:
final_list = ['BATCH','source_name','tissue','age','treatment','genotype'
              ,'lab','age','Sex','gender','donor gender','health state']

final_df = full_exp_df[final_list]

val = pd.merge(final_df,cell_df,left_index=True, right_index=True, how='outer')
val = pd.merge(val,tissue_df,left_index=True, right_index=True, how='outer')
val = pd.merge(val,provider_df,left_index=True, right_index=True, how='outer')
final_df = pd.merge(val,sex_df,left_index=True, right_index=True, how='outer')
final_df = final_df.drop(['Sex','sex','gender','donor gender'],axis=1)
sex_df = combine_attributes(filter_attributes(final_df, ['sex','Sex','gender', 'donor gender']), combined_col_name = 'sex')

final_df = pd.merge(final_df,sex_df,left_index=True, right_index=True, how='outer')

KeyError: 'sex'

In [None]:
save = False

if save: 
    final_df.to_pickle("./exp_metadata_df.pkl")
#     exp_meta_mat = final_df.values
#     exp_meta_X = final_df.index.values
#     exp_meta_Y = final_df.columns.values

# Output Attributes to a Text File
# with open("attributes.txt", 'w') as f:
#     f.write("\n".join(sorted(list(full_exp_df))))

688  experiments found for: 
 H. sapiens  -->  Blood  -->  Lymphoblastoid cell line


['BATCH',
 'source_name',
 'tissue',
 'age',
 'treatment',
 'genotype',
 'lab',
 'age',
 'Sex',
 'gender',
 'donor gender',
 'health state',
 'cell_type',
 'tissue_type',
 'provider',
 'sex_x',
 'sex_y']

In [89]:
cell_type = 'Monocytes-CD14+'

e1 = pd.read_pickle('exp_metadata_df.pkl')
fin = pd.merge(exp_df.set_index('Experimental ID'),e1,left_index=True, right_index=True, how='outer')

exp_list = get_experiment_list(cell_type=cell_type)
lympho_df = fin.loc[exp_list]
#lympho_df.to_csv('GM12878.csv')
lympho_df[lympho_df['Meta data'].str.contains("disease")]

121  experiments found for: 
 H. sapiens  -->  Blood  -->  Monocytes-CD14+


Unnamed: 0,Genome assembly,Antigen class,Antigen,Cell type class,Cell type,Cell type description,Processing logs,Title,Meta data,BATCH,...,age,treatment,genotype,lab,age.1,health state,cell_type,tissue_type,provider,sex
SRX461539,hg19,Histone,H3K27me3,Blood,Monocytes-CD14+,Tissue=monocytes|Lineage=mesoderm|Description=...,"40033914,96.1,84.3,284",GSM1320312: ChIP-seq analysis of H3K27me3 in h...,source_name=CD14++ CD16- monocytes from blood ...,,...,,,,,,,,,,
SRX461540,hg19,Histone,H3K4me3,Blood,Monocytes-CD14+,Tissue=monocytes|Lineage=mesoderm|Description=...,"56660855,97.0,83.9,27151",GSM1320313: ChIP-seq analysis of H3K4me3 in h...,source_name=CD14++ CD16- monocytes from blood ...,,...,,,,,,,,,,
SRX461541,hg19,Histone,H3K9ac,Blood,Monocytes-CD14+,Tissue=monocytes|Lineage=mesoderm|Description=...,"57322386,89.8,88.9,1976",GSM1320314: ChIP-seq analysis of H3K9Ac in hu...,source_name=CD14++ CD16- monocytes from blood ...,,...,,,,,,,,,,
SRX461542,hg19,Input control,Input control,Blood,Monocytes-CD14+,Tissue=monocytes|Lineage=mesoderm|Description=...,"37363754,98.2,44.4,2519",GSM1320315: Sequencing of input DNA from human...,source_name=CD14++ CD16- monocytes from blood ...,,...,,,,,,,,,,
SRX461543,hg19,Histone,H3K27me3,Blood,Monocytes-CD14+,Tissue=monocytes|Lineage=mesoderm|Description=...,"49270540,89.2,72.3,920",GSM1320316: ChIP-seq analysis of H3K27me3 in h...,source_name=CD14++ CD16- monocytes from blood ...,,...,,,,,,,,,,
SRX461544,hg19,Histone,H3K4me3,Blood,Monocytes-CD14+,Tissue=monocytes|Lineage=mesoderm|Description=...,"67680186,89.5,90.7,22502",GSM1320317: ChIP-seq analysis of H3K4me3 in h...,source_name=CD14++ CD16- monocytes from blood ...,,...,,,,,,,,,,
SRX461545,hg19,Histone,H3K9ac,Blood,Monocytes-CD14+,Tissue=monocytes|Lineage=mesoderm|Description=...,"45011336,85.3,90.5,296",GSM1320318: ChIP-seq analysis of H3K9Ac in hu...,source_name=CD14++ CD16- monocytes from blood ...,,...,,,,,,,,,,
SRX461546,hg19,Input control,Input control,Blood,Monocytes-CD14+,Tissue=monocytes|Lineage=mesoderm|Description=...,"53812444,97.7,12.6,1398",GSM1320319: Sequencing of input DNA from human...,source_name=CD14++ CD16- monocytes from blood ...,,...,,,,,,,,,,
SRX461547,hg19,Histone,H3K27me3,Blood,Monocytes-CD14+,Tissue=monocytes|Lineage=mesoderm|Description=...,"54928877,89.4,88.7,697",GSM1320320: ChIP-seq analysis of H3K27me3 in h...,source_name=CD14++ CD16- monocytes from blood ...,,...,,,,,,,,,,
SRX461548,hg19,Histone,H3K4me3,Blood,Monocytes-CD14+,Tissue=monocytes|Lineage=mesoderm|Description=...,"43890155,96.7,74.5,28511",GSM1320321: ChIP-seq analysis of H3K4me3 in h...,source_name=CD14++ CD16- monocytes from blood ...,,...,,,,,,,,,,


In [90]:
lympho_df['Meta data'][0]

'MOLECULE=genomic DNA || DISEASE=None || BIOMATERIAL_PROVIDER=FHCRC HEIMFELD || BIOMATERIAL_TYPE=Primary Cell || CELL_TYPE=CD14 Primary Cells || MARKERS=CD14+ || DONOR_ID=RO 01679 || DONOR_AGE=year 21 || DONOR_HEALTH_STATUS=NA || DONOR_SEX=Male || DONOR_ETHNICITY=Caucasian || PASSAGE_IF_EXPANDED=NA || sample_term_id=CL_0001054'

(688, 22)