In [1]:
import datetime

class Timer:
    """A simple wall-clock timer built on ``datetime.datetime.now()``.

    Typical use::

        timer = Timer()
        timer.start()
        ...
        print(timer.elapsed())   # time since start()
        timer.split()
        ...
        print(timer.unsplit())   # time since split()
        print(timer.stop())      # total time since start()

    All reporting methods return a string made of ``message`` followed by the
    value, so the result can be printed directly.
    """

    def __init__(self):
        # Timestamps are kept in private attributes. Storing them as
        # ``self.start`` / ``self.stop`` would replace the methods of the same
        # name on the instance and make a second start()/stop() call fail.
        self._started_at = None
        self._stopped_at = None
        self.split_start = None

    def _require_started(self):
        """Return the start timestamp, or raise if start() was never called."""
        if self._started_at is None:
            raise RuntimeError("Timer.start() must be called first")
        return self._started_at

    def start(self):
        """Starts (or restarts) the timer.  Returns the start timestamp"""
        self._started_at = datetime.datetime.now()
        return self._started_at

    def stop(self, message="Total: "):
        """Stops the timer.  Returns ``message`` + the time elapsed since start.

        Raises:
            RuntimeError: if start() has not been called.
        """
        started_at = self._require_started()
        self._stopped_at = datetime.datetime.now()
        return message + str(self._stopped_at - started_at)

    def now(self, message="Now: "):
        """Returns ``message`` + the current time"""
        # The message already carries its own separator (as in the other
        # methods); appending another ": " produced "Now: : <time>".
        return message + str(datetime.datetime.now())

    def elapsed(self, message="Elapsed: "):
        """Returns ``message`` + the time elapsed since start was called.

        Raises:
            RuntimeError: if start() has not been called.
        """
        return message + str(datetime.datetime.now() - self._require_started())

    def split(self, message="Split started at: "):
        """Starts a split timer.  Returns ``message`` + the split start time"""
        self.split_start = datetime.datetime.now()
        return message + str(self.split_start)

    def unsplit(self, message="Unsplit: "):
        """Returns ``message`` + the time elapsed since split was called.

        The split is left running, so unsplit() may be called repeatedly.

        Raises:
            RuntimeError: if split() has not been called.
        """
        if self.split_start is None:
            raise RuntimeError("Timer.split() must be called first")
        return message + str(datetime.datetime.now() - self.split_start)


In [3]:
import pandas as pd
import sqlite3
import os
from numpy import random

from pandas.io import sql

# Benchmark input: 1,000,000 rows x 2 float columns ('A', 'B') drawn with
# numpy's randn (unseeded, so the values differ between runs).
df = pd.DataFrame(
    random.randn(1000000, 2),
    columns=['A', 'B'],
)

def test_sql_write(df):
    """Write ``df`` to table ``test_table`` in a fresh SQLite file.

    Any existing ``tmp/test.sql`` is removed first so every call starts from an
    empty database. The connection is closed even when the write fails, so a
    failed run does not leave the file locked.
    """
    if os.path.exists('tmp/test.sql'):
        os.remove('tmp/test.sql')
    sql_db = sqlite3.connect('tmp/test.sql')
    try:
        df.to_sql(name='test_table', con=sql_db)
    finally:
        sql_db.close()

def test_sql_read():
    """Read every row of ``test_table`` from ``tmp/test.sql`` and discard it.

    The result is thrown away on purpose: the function exists only to be
    timed. The connection is closed even when the query fails.
    """
    sql_db = sqlite3.connect('tmp/test.sql')
    try:
        pd.read_sql_query("select * from test_table", sql_db)
    finally:
        sql_db.close()

def test_hdf_fixed_write(df):
    """Write ``df`` under key ``test`` to ``tmp/test_fixed.hdf``.

    No ``format`` is given, so pandas' default HDF layout is used; ``mode='w'``
    overwrites any existing file.
    """
    # `key` is passed by keyword: positional use is deprecated in pandas 2.1
    # and keyword-only from pandas 3.0.
    df.to_hdf('tmp/test_fixed.hdf', key='test', mode='w')

def test_hdf_fixed_read():
    """Load the object stored under key ``test`` in ``tmp/test_fixed.hdf``.

    The loaded frame is discarded; the function exists only to be timed.
    """
    pd.read_hdf('tmp/test_fixed.hdf', key='test')

def test_hdf_fixed_write_compress(df):
    """Write ``df`` under key ``test`` to ``tmp/test_fixed_compress.hdf`` using blosc.

    ``mode='w'`` overwrites any existing file.
    """
    # complevel is set explicitly: per the pandas documentation a level of
    # 0 or None disables compression, so `complib` alone would silently write
    # an uncompressed file on current pandas and the benchmark would measure
    # nothing. NOTE(review): older pandas releases appear to have implied the
    # maximum level when only `complib` was given; 9 is chosen to match that.
    # `key` is passed by keyword (positional use is deprecated in pandas 2.1).
    df.to_hdf(
        'tmp/test_fixed_compress.hdf',
        key='test',
        mode='w',
        complib='blosc',
        complevel=9,
    )

def test_hdf_fixed_read_compress():
    """Load the object stored under key ``test`` in ``tmp/test_fixed_compress.hdf``.

    The loaded frame is discarded; the function exists only to be timed.
    """
    pd.read_hdf('tmp/test_fixed_compress.hdf', key='test')

def test_hdf_table_write(df):
    """Write ``df`` under key ``test`` to ``tmp/test_table.hdf`` in ``table`` format.

    ``mode='w'`` overwrites any existing file.
    """
    # `key` is passed by keyword: positional use is deprecated in pandas 2.1
    # and keyword-only from pandas 3.0.
    df.to_hdf('tmp/test_table.hdf', key='test', mode='w', format='table')
    
def test_hdf_table_read():
    """Load the object stored under key ``test`` in ``tmp/test_table.hdf``.

    The loaded frame is discarded; the function exists only to be timed.
    """
    pd.read_hdf('tmp/test_table.hdf', key='test')

def test_hdf_table_write_compress(df):
    """Write ``df`` under key ``test`` to ``tmp/test_table_compress.hdf`` (table format, blosc).

    ``mode='w'`` overwrites any existing file.
    """
    # complevel is set explicitly: per the pandas documentation a level of
    # 0 or None disables compression, so `complib` alone would silently write
    # an uncompressed file on current pandas. NOTE(review): older pandas
    # releases appear to have implied the maximum level when only `complib`
    # was given; 9 is chosen to match that.
    # `key` is passed by keyword (positional use is deprecated in pandas 2.1).
    df.to_hdf(
        'tmp/test_table_compress.hdf',
        key='test',
        mode='w',
        complib='blosc',
        complevel=9,
        format='table',
    )

def test_hdf_table_read_compress():
    """Load the object stored under key ``test`` in ``tmp/test_table_compress.hdf``.

    The loaded frame is discarded; the function exists only to be timed.
    """
    pd.read_hdf('tmp/test_table_compress.hdf', key='test')

def test_csv_write(df):
    """Dump ``df`` (index included) to ``tmp/test.csv``, overwriting the file."""
    target = 'tmp/test.csv'
    df.to_csv(target, mode='w')

def test_csv_read():
    """Parse ``tmp/test.csv`` using its first column as the index.

    The parsed frame is discarded; the function exists only to be timed.
    """
    source = 'tmp/test.csv'
    pd.read_csv(source, index_col=0)
    


    

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 2 columns):
A    1000000 non-null float64
B    1000000 non-null float64
dtypes: float64(2)
memory usage: 15.3 MB


In [5]:
%timeit test_sql_write(df)
%timeit test_hdf_fixed_write(df)
%timeit test_hdf_fixed_write_compress(df)
%timeit test_hdf_table_write(df)
%timeit test_hdf_table_write_compress(df)
%timeit test_csv_write(df)

1 loop, best of 3: 4.54 s per loop
10 loops, best of 3: 25.7 ms per loop
10 loops, best of 3: 105 ms per loop
1 loop, best of 3: 420 ms per loop
1 loop, best of 3: 550 ms per loop
1 loop, best of 3: 4.78 s per loop


In [6]:
%timeit test_sql_read()
%timeit test_hdf_fixed_read()
%timeit test_hdf_fixed_read_compress()
%timeit test_hdf_table_read()
%timeit test_hdf_table_read_compress()
%timeit test_csv_read()

1 loop, best of 3: 1.72 s per loop
100 loops, best of 3: 11.3 ms per loop
10 loops, best of 3: 22.6 ms per loop
10 loops, best of 3: 20.7 ms per loop
10 loops, best of 3: 35.9 ms per loop
1 loop, best of 3: 743 ms per loop


In [7]:
### FIXED format
timer = Timer()
timer.start()
# NOTE(review): absolute, machine-specific path; this cell only runs on the
# author's machine unless the file is made available at this location.
file = "/Users/manuel/development/thesis/overlap/filtered_hg19DNase_H3K27ac_FANTOM_overlapped.csv"
# DataFrame.from_csv was removed in pandas 1.0. It promoted the first column
# to the index, which the following reset_index(level=0) immediately undid,
# so a plain read_csv yields the same frame in one step.
df = pd.read_csv(file, sep="\t")
print("Loaded csv file to dataframe in", timer.elapsed())
timer.split()
# `key` is passed by keyword: positional use is deprecated in pandas 2.1.
df.to_hdf('encode_fantom.hdf', key='encode_fantom', mode='w')
print("Saved dataframe to hdf fixed table in", timer.unsplit())
timer.split()
returned = pd.read_hdf('encode_fantom.hdf', 'encode_fantom')
print("Loaded hdf table in", timer.unsplit())

Loaded csv file to dataframe in Elapsed: 0:00:09.236567
Saved dataframe to hdf fixed table in Unsplit: 0:00:03.845392
Loaded hdf table in Unsplit: 0:00:02.297161


In [8]:
### TABLE format (queries are allowed)
timer = Timer()
timer.start()
# NOTE(review): absolute, machine-specific path; this cell only runs on the
# author's machine unless the file is made available at this location.
file = "/Users/manuel/development/thesis/overlap/filtered_hg19DNase_H3K27ac_FANTOM_overlapped.csv"
# DataFrame.from_csv was removed in pandas 1.0. It promoted the first column
# to the index, which the following reset_index(level=0) immediately undid,
# so a plain read_csv yields the same frame in one step.
df = pd.read_csv(file, sep="\t")
print("Loaded csv file to dataframe in", timer.elapsed())
timer.split()
# data_columns makes 'biosample_term_name' usable in `where=` queries.
# `key` is passed by keyword: positional use is deprecated in pandas 2.1.
df.to_hdf(
    'encode_fantom_table.hdf',
    key='encode_fantom',
    mode='w',
    format='table',
    data_columns=['biosample_term_name'],
)
print("Saved dataframe to hdf table in", timer.unsplit())
timer.split()
returned = pd.read_hdf('encode_fantom_table.hdf', 'encode_fantom')
print("Loaded hdf table in", timer.unsplit())
returned.info()

Loaded csv file to dataframe in Elapsed: 0:00:08.862346
Saved dataframe to hdf table in Unsplit: 0:00:35.124727


KeyboardInterrupt: 

In [9]:
# Time a filtered read from the table-format store: the `where` expression is
# handed to read_hdf, which returns the rows whose biosample_term_name is
# "placenta".
timer = Timer()
timer.start()
placenta_df = pd.read_hdf('encode_fantom_table.hdf','encode_fantom',where='biosample_term_name == "placenta"')
print("Query on hdf table by biosample_term_name == \"placenta\" in", timer.elapsed())
# Last expression of the cell: its return value (the total-time string) is
# what the notebook displays as the cell output.
timer.stop()

Query on hdf table by biosample_term_name == "placenta" in Elapsed: 0:00:00.732214


'Total: 0:00:00.732335'

In [10]:
placenta_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38752 entries, 36794 to 75545
Data columns (total 27 columns):
chrom                  38752 non-null object
start                  38752 non-null int64
end                    38752 non-null int64
name                   38752 non-null object
score                  38752 non-null int64
strand                 38752 non-null object
size                   38752 non-null int64
method                 38752 non-null object
description            38752 non-null object
assembly               38752 non-null object
biosample_type         38752 non-null object
biosample_term_id      38752 non-null object
biosample_term_name    38752 non-null object
developmental_slims    38752 non-null object
system_slims           38752 non-null object
organ_slims            38752 non-null object
encyclopedia           38752 non-null object
FA_chrom               38752 non-null object
FA_start               38752 non-null int64
FA_end                 38752 non-null

In [249]:
import numpy as np
from pandas import HDFStore,DataFrame
# create (or open) an hdf5 file and opens in append mode
hdf = HDFStore('tmp/storage.h5')

In [250]:
df = DataFrame(np.random.rand(5,3), columns=('A','B','C'))

In [251]:
hdf.put('d1', df, format='table', data_columns=True)

In [277]:
hdf.close()
hdf = HDFStore('tmp/storage.h5')

In [253]:
hdf['d1']

Unnamed: 0,A,B,C
0,0.17708,0.848086,0.66934
1,0.739926,0.49105,0.285055
2,0.644054,0.840723,0.604718
3,0.087187,0.126708,0.228043
4,0.300771,0.554273,0.723586


In [278]:
hdf


<class 'pandas.io.pytables.HDFStore'>
File path: tmp/storage.h5
/another            frame_table  (typ->appendable,nrows->5,ncols->4,indexers->[index])            
/d1                 frame_table  (typ->appendable,nrows->5,ncols->3,indexers->[index],dc->[A,B,C])
/df_new             frame        (shape->[10,2])                                                  
/with_df            frame        (shape->[10,2])                                                  

In [257]:
df_new = DataFrame(np.random.rand(10,2), columns=('col1','col2'))

In [258]:
hdf.put('df_new', df_new, format='fixed')

In [264]:
hdf.close()

In [268]:
df_new.to_hdf('tmp/storage.h5', 'with_df', mode='a', format='fixed')

In [272]:
df_new_2 = DataFrame(np.random.rand(5,4), columns=('cA','cB', 'cC', 'cD'))

In [273]:
df_new_2

Unnamed: 0,cA,cB,cC,cD
0,0.143243,0.411492,0.447196,0.091126
1,0.709138,0.914033,0.140924,0.383184
2,0.914045,0.518004,0.913474,0.823266
3,0.790527,0.85307,0.26529,0.411477
4,0.640967,0.823737,0.296935,0.934472


In [274]:
df_new_2.to_hdf('tmp/storage.h5', 'another', mode='a', format='table')

In [276]:
hdf.close()

In [10]:
hdf.append('d1', DataFrame(np.random.rand(5,3), 
           columns=('A','B','C')), 
           format='table', data_columns=True)

In [19]:
df = hdf['d1']

In [21]:
df.reset_index(drop=True, inplace=True)

In [22]:
df

Unnamed: 0,A,B,C
0,0.794711,0.482078,0.696966
1,0.883016,0.769182,0.971613
2,0.44037,0.303744,0.677723
3,0.284244,0.675687,0.038991
4,0.077473,0.902328,0.291444
5,0.822127,0.432078,0.813824
6,0.504365,0.572067,0.011665
7,0.64699,0.12355,0.831861
8,0.842512,0.874672,0.860057
9,0.403208,0.535579,0.730168


In [23]:
hdf.put('d1', df, format='table', data_columns=True)

In [24]:
hdf['d1']

Unnamed: 0,A,B,C
0,0.794711,0.482078,0.696966
1,0.883016,0.769182,0.971613
2,0.44037,0.303744,0.677723
3,0.284244,0.675687,0.038991
4,0.077473,0.902328,0.291444
5,0.822127,0.432078,0.813824
6,0.504365,0.572067,0.011665
7,0.64699,0.12355,0.831861
8,0.842512,0.874672,0.860057
9,0.403208,0.535579,0.730168


In [26]:
hdf.select('d1', 'A > 0.5')

Unnamed: 0,A,B,C
0,0.794711,0.482078,0.696966
1,0.883016,0.769182,0.971613
5,0.822127,0.432078,0.813824
6,0.504365,0.572067,0.011665
7,0.64699,0.12355,0.831861
8,0.842512,0.874672,0.860057


In [282]:
hdf.close()

In [283]:
hdf.is_open

False

In [42]:
import pandas as pd
df1 = pd.read_hdf('tmp/storage.h5', 'd1', where='A > 0.5')

In [43]:
df1

Unnamed: 0,A,B,C
0,0.794711,0.482078,0.696966
1,0.883016,0.769182,0.971613
5,0.822127,0.432078,0.813824
6,0.504365,0.572067,0.011665
7,0.64699,0.12355,0.831861
8,0.842512,0.874672,0.860057


In [44]:
df1['D'] = 'blah'

In [45]:
df1

Unnamed: 0,A,B,C,D
0,0.794711,0.482078,0.696966,blah
1,0.883016,0.769182,0.971613,blah
5,0.822127,0.432078,0.813824,blah
6,0.504365,0.572067,0.011665,blah
7,0.64699,0.12355,0.831861,blah
8,0.842512,0.874672,0.860057,blah


In [47]:
import pandas as pd
import numpy as np
df_prova = pd.DataFrame(np.random.rand(5,3), columns=('A','B','C'))
df_copy = df_prova.copy()

In [48]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# ignore_index=True renumbers the rows 0..n-1, which is what the former
# reset_index(inplace=True, drop=True) did after the append.
df_prova = pd.concat(
    [df_prova, pd.DataFrame(np.random.rand(5,3), columns=('A','B','C'))],
    ignore_index=True,
)

In [49]:
df_prova

Unnamed: 0,A,B,C
0,0.818263,0.275533,0.597588
1,0.99448,0.464801,0.174817
2,0.209144,0.529241,0.87572
3,0.693763,0.143788,0.227387
4,0.32678,0.480611,0.098374
5,0.815444,0.808666,0.31903
6,0.741134,0.702763,0.425576
7,0.017374,0.072014,0.112435
8,0.829995,0.402769,0.853363
9,0.396224,0.649548,0.724502


In [50]:
df_copy

Unnamed: 0,A,B,C
0,0.818263,0.275533,0.597588
1,0.99448,0.464801,0.174817
2,0.209144,0.529241,0.87572
3,0.693763,0.143788,0.227387
4,0.32678,0.480611,0.098374


In [52]:
term = "A way    in some way"
term.replace(" ", "_")

'A_way____in_some_way'

## Store BED files using HDF5

In [71]:
import pybedtools
from pybedtools import BedTool
a = pybedtools.example_bedtool('a.bed')
hdf = HDFStore('tmp/beds.h5')

hdf.put('a', a.to_dataframe())

In [72]:
a_df = hdf.select('a')
a_df

Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,1,100,feature1,0,+
1,chr1,100,200,feature2,0,+
2,chr1,150,500,feature3,0,-
3,chr1,900,950,feature4,0,+


In [74]:
a = BedTool().from_dataframe(a_df)

In [75]:
a.to_dataframe()

Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,1,100,feature1,0,+
1,chr1,100,200,feature2,0,+
2,chr1,150,500,feature3,0,-
3,chr1,900,950,feature4,0,+


In [107]:
# NOTE(review): absolute, machine-specific path; this cell only runs on the
# author's machine unless the file is made available at this location.
file = "/Users/manuel/development/thesis/overlap/filtered_hg19DNase_H3K27ac_FANTOM_overlapped.csv"
# DataFrame.from_csv was removed in pandas 1.0. It promoted the first column
# to the index, which the following reset_index(level=0) immediately undid,
# so a plain read_csv yields the same frame in one step.
df = pd.read_csv(file, sep="\t")

# Concatenate the two name columns into a single identifier per overlap row.
df['ovlp_name'] = df['name'] + df['FA_name']

In [112]:
overlaps_only_df = df.query('FA_ovlp_pct > 0')[['chrom', 'start', 'end', 'ovlp_name', 'score', 'strand']]

In [119]:
len(overlaps_only_df)

212041

In [113]:
encode_ovlp_fantom_bed = BedTool().from_dataframe(overlaps_only_df)

In [120]:
encode_ovlp_fantom_bed.count()

212041

In [115]:
after_some_operations_df = encode_ovlp_fantom_bed.to_dataframe()

In [116]:
# Re-attach the metadata columns of `df` to the frame that came back from
# BedTool: the BED `name` field holds the `ovlp_name` value, so it serves as
# the join key. A left join keeps every row of after_some_operations_df.
after_some_operations_and_metadata_df = pd.merge(
    after_some_operations_df,
    df[[
        'name',
        'size',
        'method',
        'description',
        'assembly',
        'biosample_type',
        'biosample_term_id',
        'biosample_term_name',
        'developmental_slims',
        'system_slims',
        'organ_slims',
        'encyclopedia',
        'FA_chrom',
        'FA_start',
        'FA_end',
        'FA_name',
        'FA_score',
        'FA_size',
        'FA_method',
        'FA_ovlp_len',
        'FA_ovlp_pct',
        'FA_encyclopedia',
        'ovlp_name',
    ]],
    left_on='name',
    right_on='ovlp_name',
    how='left',
)

In [121]:
len(after_some_operations_and_metadata_df)

212041

In [122]:
len(df.query('FA_ovlp_pct > 0'))

212041

In [123]:
after_some_operations_and_metadata_df

Unnamed: 0,chrom,start,end,name_x,score,strand,name_y,size,method,description,...,FA_start,FA_end,FA_name,FA_score,FA_size,FA_method,FA_ovlp_len,FA_ovlp_pct,FA_encyclopedia,ovlp_name
0,chr10,3892558,3895911,ENCODE.3.ENCFF778PVS.6FANTOM.5.PERMISSIVE.3965,1,.,ENCODE.3.ENCFF778PVS.6,3353,DNase_H3K27ac,Enhancer-like regions using DNase and H3K27ac ...,...,3893365,3894190,FANTOM.5.PERMISSIVE.3965,693,825,CAGE_TCs,825,24.604831,FANTOM,ENCODE.3.ENCFF778PVS.6FANTOM.5.PERMISSIVE.3965
1,chr3,5062817,5068862,ENCODE.3.ENCFF778PVS.8FANTOM.5.PERMISSIVE.26422,1,.,ENCODE.3.ENCFF778PVS.8,6045,DNase_H3K27ac,Enhancer-like regions using DNase and H3K27ac ...,...,5067974,5068590,FANTOM.5.PERMISSIVE.26422,256,616,CAGE_TCs,616,10.190240,FANTOM,ENCODE.3.ENCFF778PVS.8FANTOM.5.PERMISSIVE.26422
2,chr8,126230865,126234434,ENCODE.3.ENCFF778PVS.9FANTOM.5.PERMISSIVE.39851,1,.,ENCODE.3.ENCFF778PVS.9,3569,DNase_H3K27ac,Enhancer-like regions using DNase and H3K27ac ...,...,126231490,126231859,FANTOM.5.PERMISSIVE.39851,140,369,CAGE_TCs,369,10.339031,FANTOM,ENCODE.3.ENCFF778PVS.9FANTOM.5.PERMISSIVE.39851
3,chr8,126230865,126234434,ENCODE.3.ENCFF778PVS.9FANTOM.5.PERMISSIVE.39852,1,.,ENCODE.3.ENCFF778PVS.9,3569,DNase_H3K27ac,Enhancer-like regions using DNase and H3K27ac ...,...,126232175,126232753,FANTOM.5.PERMISSIVE.39852,291,578,CAGE_TCs,578,16.195013,FANTOM,ENCODE.3.ENCFF778PVS.9FANTOM.5.PERMISSIVE.39852
4,chr11,94479303,94485487,ENCODE.3.ENCFF778PVS.12FANTOM.5.PERMISSIVE.7393,1,.,ENCODE.3.ENCFF778PVS.12,6184,DNase_H3K27ac,Enhancer-like regions using DNase and H3K27ac ...,...,94480491,94481212,FANTOM.5.PERMISSIVE.7393,13,721,CAGE_TCs,721,11.659120,FANTOM,ENCODE.3.ENCFF778PVS.12FANTOM.5.PERMISSIVE.7393
5,chr10,3845854,3855339,ENCODE.3.ENCFF778PVS.15FANTOM.5.PERMISSIVE.3954,1,.,ENCODE.3.ENCFF778PVS.15,9485,DNase_H3K27ac,Enhancer-like regions using DNase and H3K27ac ...,...,3848037,3849536,FANTOM.5.PERMISSIVE.3954,339,1499,CAGE_TCs,1499,15.803901,FANTOM,ENCODE.3.ENCFF778PVS.15FANTOM.5.PERMISSIVE.3954
6,chr6,159268701,159276266,ENCODE.3.ENCFF778PVS.21FANTOM.5.PERMISSIVE.35853,1,.,ENCODE.3.ENCFF778PVS.21,7565,DNase_H3K27ac,Enhancer-like regions using DNase and H3K27ac ...,...,159273925,159274893,FANTOM.5.PERMISSIVE.35853,807,968,CAGE_TCs,968,12.795770,FANTOM,ENCODE.3.ENCFF778PVS.21FANTOM.5.PERMISSIVE.35853
7,chr1,145113555,145115561,ENCODE.3.ENCFF778PVS.25FANTOM.5.PERMISSIVE.1955,1,.,ENCODE.3.ENCFF778PVS.25,2006,DNase_H3K27ac,Enhancer-like regions using DNase and H3K27ac ...,...,145113904,145114998,FANTOM.5.PERMISSIVE.1955,2794,1094,CAGE_TCs,1094,54.536391,FANTOM,ENCODE.3.ENCFF778PVS.25FANTOM.5.PERMISSIVE.1955
8,chr1,235132364,235135272,ENCODE.3.ENCFF778PVS.35FANTOM.5.PERMISSIVE.3671,1,.,ENCODE.3.ENCFF778PVS.35,2908,DNase_H3K27ac,Enhancer-like regions using DNase and H3K27ac ...,...,235133014,235133422,FANTOM.5.PERMISSIVE.3671,135,408,CAGE_TCs,408,14.030261,FANTOM,ENCODE.3.ENCFF778PVS.35FANTOM.5.PERMISSIVE.3671
9,chr1,235132364,235135272,ENCODE.3.ENCFF778PVS.35FANTOM.5.PERMISSIVE.3672,1,.,ENCODE.3.ENCFF778PVS.35,2908,DNase_H3K27ac,Enhancer-like regions using DNase and H3K27ac ...,...,235134086,235134438,FANTOM.5.PERMISSIVE.3672,45,352,CAGE_TCs,352,12.104539,FANTOM,ENCODE.3.ENCFF778PVS.35FANTOM.5.PERMISSIVE.3672


In [124]:
len(df)

1801781

In [125]:
df.drop_duplicates(inplace=True)

In [126]:
len(df)

1801781

In [141]:
store = pd.HDFStore('tmp/storage.h5')

In [142]:
store.keys()

['/d1']

In [143]:
'd1' in store

True

In [147]:
store.is_open

False

In [146]:
store.close()

In [148]:
store = pd.HDFStore('tmp/cane.h5')

In [245]:
store.close()

In [246]:
store = pd.HDFStore('../storage/downloads.hdf')

In [247]:
store.keys()

[]

In [237]:
meta = store['encode_metadata']

KeyError: 'No object named encode_metadata in the file'

In [226]:
meta.reset_index(inplace=True, drop=True)

In [227]:
meta

Unnamed: 0,@id,@type,accession,aliases,alternate_accessions,annotation_type,assembly,award,biosample_synonyms,biosample_term_id,...,related_files,revoked_files,schema_version,status,submitted_by,superseded_by,supersedes,system_slims,targets,uuid
0,/annotations/ENCSR615QPD/,"[Annotation, FileSet, Dataset, Item]",ENCSR615QPD,[zhiping-weng:v3-enhancer-like-histone-only-EN...,[],enhancer-like regions,mm10-minimal,/awards/U41HG007000/,"[iecur, jecur, Entire liver]",UBERON:0002107,...,[],[],10,released,/users/ef309f6b-c671-44c8-a33d-3850bd14fe63/,[],[],"[digestive system, endocrine system]",[],8f0ff07d-a14e-4f1d-9a10-9d78160695ef
1,/annotations/ENCSR195RZJ/,"[Annotation, FileSet, Dataset, Item]",ENCSR195RZJ,[zhiping-weng:v3-enhancer-like-histone-only-EN...,[],enhancer-like regions,mm10-minimal,/awards/U41HG007000/,[],UBERON:0001049,...,[],[],10,released,/users/ef309f6b-c671-44c8-a33d-3850bd14fe63/,[],[],[],[],6a51ee05-aeaf-4ff6-985d-e0614f5cba0c
2,/annotations/ENCSR530VHP/,"[Annotation, FileSet, Dataset, Item]",ENCSR530VHP,[zhiping-weng:v3-enhancer-like-histone-only-EN...,[],enhancer-like regions,mm10-minimal,/awards/U41HG007000/,"[iecur, jecur, Entire liver]",UBERON:0002107,...,[],[],10,released,/users/ef309f6b-c671-44c8-a33d-3850bd14fe63/,[],[],"[digestive system, endocrine system]",[],957f04ae-56a3-433a-ba74-217b3d3206cf
3,/annotations/ENCSR519ZWO/,"[Annotation, FileSet, Dataset, Item]",ENCSR519ZWO,[zhiping-weng:v3-enhancer-like-dnase-histone-E...,[],enhancer-like regions,mm10-minimal,/awards/U41HG007000/,"[Entire midbrain, mesencephalon]",UBERON:0001891,...,[],[],10,released,/users/ef309f6b-c671-44c8-a33d-3850bd14fe63/,[],[],[central nervous system],[],dc79331c-8182-4eca-bc38-e0685e5fa5f0
4,/annotations/ENCSR161NGP/,"[Annotation, FileSet, Dataset, Item]",ENCSR161NGP,[zhiping-weng:v3-enhancer-like-histone-only-EN...,[],enhancer-like regions,mm10-minimal,/awards/U41HG007000/,"[Entire midbrain, mesencephalon]",UBERON:0001891,...,[],[],10,released,/users/ef309f6b-c671-44c8-a33d-3850bd14fe63/,[],[],[central nervous system],[],035898f4-52f8-4d2e-91d1-9d4c4ae54598
5,/annotations/ENCSR148VNG/,"[Annotation, FileSet, Dataset, Item]",ENCSR148VNG,[zhiping-weng:v3-enhancer-like-histone-only-EN...,[],enhancer-like regions,mm10-minimal,/awards/U41HG007000/,"[Entire prosencephalon, prosencephalon]",UBERON:0001890,...,[],[],10,released,/users/ef309f6b-c671-44c8-a33d-3850bd14fe63/,[],[],[central nervous system],[],e11ce9cb-2d61-49aa-ad12-916e5cca881f
6,/annotations/ENCSR522DOY/,"[Annotation, FileSet, Dataset, Item]",ENCSR522DOY,[zhiping-weng:v3-enhancer-like-dnase-only-ENCS...,[],enhancer-like regions,hg19,/awards/ENCODE/,[],CL:0002586,...,[],[],10,released,/users/ef309f6b-c671-44c8-a33d-3850bd14fe63/,[],[],[sensory system],[],f1016833-b259-4cee-9e70-43ce41c6045b
7,/annotations/ENCSR800JIB/,"[Annotation, FileSet, Dataset, Item]",ENCSR800JIB,[zhiping-weng:v3-enhancer-like-dnase-only-ENCS...,[],enhancer-like regions,hg19,/awards/ENCODE/,[],EFO:0001247,...,[],[],10,released,/users/ef309f6b-c671-44c8-a33d-3850bd14fe63/,[],[],[],[],d5bac420-7957-467f-8604-52b967b32a1d
8,/annotations/ENCSR724FFK/,"[Annotation, FileSet, Dataset, Item]",ENCSR724FFK,[zhiping-weng:v3-enhancer-like-dnase-only-ENCS...,[],enhancer-like regions,hg19,/awards/ENCODE/,"[Th1 T lymphocyte, Th1 CD4+ T cell, Th1 cell, ...",CL:0000545,...,[],[],10,released,/users/ef309f6b-c671-44c8-a33d-3850bd14fe63/,[],[],[immune system],[],60f0b95a-a8e2-458a-8ec5-d60b9b56af86
9,/annotations/ENCSR569UNI/,"[Annotation, FileSet, Dataset, Item]",ENCSR569UNI,[zhiping-weng:v3-enhancer-like-dnase-only-ENCS...,[],enhancer-like regions,hg19,/awards/ENCODE/,[HTB186],EFO:0005698,...,[],[],10,released,/users/ef309f6b-c671-44c8-a33d-3850bd14fe63/,[],[],[],[],c3a8c858-25df-4bc1-9401-b6b337656060


In [220]:
meta.query('imported == False')

Unnamed: 0,@id,@type,accession,aliases,alternate_accessions,annotation_type,assembly,award,biosample_synonyms,biosample_term_id,...,related_files,revoked_files,schema_version,status,submitted_by,superseded_by,supersedes,system_slims,targets,uuid
0,/annotations/ENCSR615QPD/,"[Annotation, FileSet, Dataset, Item]",ENCSR615QPD,[zhiping-weng:v3-enhancer-like-histone-only-EN...,[],enhancer-like regions,mm10-minimal,/awards/U41HG007000/,"[iecur, jecur, Entire liver]",UBERON:0002107,...,[],[],10,released,/users/ef309f6b-c671-44c8-a33d-3850bd14fe63/,[],[],"[digestive system, endocrine system]",[],8f0ff07d-a14e-4f1d-9a10-9d78160695ef
1,/annotations/ENCSR195RZJ/,"[Annotation, FileSet, Dataset, Item]",ENCSR195RZJ,[zhiping-weng:v3-enhancer-like-histone-only-EN...,[],enhancer-like regions,mm10-minimal,/awards/U41HG007000/,[],UBERON:0001049,...,[],[],10,released,/users/ef309f6b-c671-44c8-a33d-3850bd14fe63/,[],[],[],[],6a51ee05-aeaf-4ff6-985d-e0614f5cba0c
2,/annotations/ENCSR530VHP/,"[Annotation, FileSet, Dataset, Item]",ENCSR530VHP,[zhiping-weng:v3-enhancer-like-histone-only-EN...,[],enhancer-like regions,mm10-minimal,/awards/U41HG007000/,"[iecur, jecur, Entire liver]",UBERON:0002107,...,[],[],10,released,/users/ef309f6b-c671-44c8-a33d-3850bd14fe63/,[],[],"[digestive system, endocrine system]",[],957f04ae-56a3-433a-ba74-217b3d3206cf
3,/annotations/ENCSR519ZWO/,"[Annotation, FileSet, Dataset, Item]",ENCSR519ZWO,[zhiping-weng:v3-enhancer-like-dnase-histone-E...,[],enhancer-like regions,mm10-minimal,/awards/U41HG007000/,"[Entire midbrain, mesencephalon]",UBERON:0001891,...,[],[],10,released,/users/ef309f6b-c671-44c8-a33d-3850bd14fe63/,[],[],[central nervous system],[],dc79331c-8182-4eca-bc38-e0685e5fa5f0
4,/annotations/ENCSR161NGP/,"[Annotation, FileSet, Dataset, Item]",ENCSR161NGP,[zhiping-weng:v3-enhancer-like-histone-only-EN...,[],enhancer-like regions,mm10-minimal,/awards/U41HG007000/,"[Entire midbrain, mesencephalon]",UBERON:0001891,...,[],[],10,released,/users/ef309f6b-c671-44c8-a33d-3850bd14fe63/,[],[],[central nervous system],[],035898f4-52f8-4d2e-91d1-9d4c4ae54598
5,/annotations/ENCSR148VNG/,"[Annotation, FileSet, Dataset, Item]",ENCSR148VNG,[zhiping-weng:v3-enhancer-like-histone-only-EN...,[],enhancer-like regions,mm10-minimal,/awards/U41HG007000/,"[Entire prosencephalon, prosencephalon]",UBERON:0001890,...,[],[],10,released,/users/ef309f6b-c671-44c8-a33d-3850bd14fe63/,[],[],[central nervous system],[],e11ce9cb-2d61-49aa-ad12-916e5cca881f
6,/annotations/ENCSR522DOY/,"[Annotation, FileSet, Dataset, Item]",ENCSR522DOY,[zhiping-weng:v3-enhancer-like-dnase-only-ENCS...,[],enhancer-like regions,hg19,/awards/ENCODE/,[],CL:0002586,...,[],[],10,released,/users/ef309f6b-c671-44c8-a33d-3850bd14fe63/,[],[],[sensory system],[],f1016833-b259-4cee-9e70-43ce41c6045b
7,/annotations/ENCSR800JIB/,"[Annotation, FileSet, Dataset, Item]",ENCSR800JIB,[zhiping-weng:v3-enhancer-like-dnase-only-ENCS...,[],enhancer-like regions,hg19,/awards/ENCODE/,[],EFO:0001247,...,[],[],10,released,/users/ef309f6b-c671-44c8-a33d-3850bd14fe63/,[],[],[],[],d5bac420-7957-467f-8604-52b967b32a1d
8,/annotations/ENCSR724FFK/,"[Annotation, FileSet, Dataset, Item]",ENCSR724FFK,[zhiping-weng:v3-enhancer-like-dnase-only-ENCS...,[],enhancer-like regions,hg19,/awards/ENCODE/,"[Th1 T lymphocyte, Th1 CD4+ T cell, Th1 cell, ...",CL:0000545,...,[],[],10,released,/users/ef309f6b-c671-44c8-a33d-3850bd14fe63/,[],[],[immune system],[],60f0b95a-a8e2-458a-8ec5-d60b9b56af86
9,/annotations/ENCSR569UNI/,"[Annotation, FileSet, Dataset, Item]",ENCSR569UNI,[zhiping-weng:v3-enhancer-like-dnase-only-ENCS...,[],enhancer-like regions,hg19,/awards/ENCODE/,[HTB186],EFO:0005698,...,[],[],10,released,/users/ef309f6b-c671-44c8-a33d-3850bd14fe63/,[],[],[],[],c3a8c858-25df-4bc1-9401-b6b337656060


In [202]:
meta.T

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,0.10,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19
@id,/annotations/ENCSR615QPD/,/annotations/ENCSR195RZJ/,/annotations/ENCSR530VHP/,/annotations/ENCSR519ZWO/,/annotations/ENCSR161NGP/,/annotations/ENCSR148VNG/,/annotations/ENCSR522DOY/,/annotations/ENCSR800JIB/,/annotations/ENCSR724FFK/,/annotations/ENCSR569UNI/,...,/annotations/ENCSR645JVZ/,/annotations/ENCSR506QZM/,/annotations/ENCSR840KOW/,/annotations/ENCSR614XSX/,/annotations/ENCSR618QHY/,/annotations/ENCSR495KQL/,/annotations/ENCSR456SQX/,/annotations/ENCSR936YWW/,/annotations/ENCSR226GOY/,/annotations/ENCSR136OLS/
@type,"[Annotation, FileSet, Dataset, Item]","[Annotation, FileSet, Dataset, Item]","[Annotation, FileSet, Dataset, Item]","[Annotation, FileSet, Dataset, Item]","[Annotation, FileSet, Dataset, Item]","[Annotation, FileSet, Dataset, Item]","[Annotation, FileSet, Dataset, Item]","[Annotation, FileSet, Dataset, Item]","[Annotation, FileSet, Dataset, Item]","[Annotation, FileSet, Dataset, Item]",...,"[Annotation, FileSet, Dataset, Item]","[Annotation, FileSet, Dataset, Item]","[Annotation, FileSet, Dataset, Item]","[Annotation, FileSet, Dataset, Item]","[Annotation, FileSet, Dataset, Item]","[Annotation, FileSet, Dataset, Item]","[Annotation, FileSet, Dataset, Item]","[Annotation, FileSet, Dataset, Item]","[Annotation, FileSet, Dataset, Item]","[Annotation, FileSet, Dataset, Item]"
accession,ENCSR615QPD,ENCSR195RZJ,ENCSR530VHP,ENCSR519ZWO,ENCSR161NGP,ENCSR148VNG,ENCSR522DOY,ENCSR800JIB,ENCSR724FFK,ENCSR569UNI,...,ENCSR645JVZ,ENCSR506QZM,ENCSR840KOW,ENCSR614XSX,ENCSR618QHY,ENCSR495KQL,ENCSR456SQX,ENCSR936YWW,ENCSR226GOY,ENCSR136OLS
aliases,[zhiping-weng:v3-enhancer-like-histone-only-EN...,[zhiping-weng:v3-enhancer-like-histone-only-EN...,[zhiping-weng:v3-enhancer-like-histone-only-EN...,[zhiping-weng:v3-enhancer-like-dnase-histone-E...,[zhiping-weng:v3-enhancer-like-histone-only-EN...,[zhiping-weng:v3-enhancer-like-histone-only-EN...,[zhiping-weng:v3-enhancer-like-dnase-only-ENCS...,[zhiping-weng:v3-enhancer-like-dnase-only-ENCS...,[zhiping-weng:v3-enhancer-like-dnase-only-ENCS...,[zhiping-weng:v3-enhancer-like-dnase-only-ENCS...,...,[zhiping-weng:v3-enhancer-like-dnase-only-ENCS...,[zhiping-weng:v3-enhancer-like-dnase-only-ENCS...,[zhiping-weng:v3-enhancer-like-dnase-histone-E...,[zhiping-weng:v3-enhancer-like-dnase-only-ENCS...,[zhiping-weng:v3-enhancer-like-dnase-only-ENCS...,[zhiping-weng:v3-enhancer-like-histone-only-E115],[zhiping-weng:v3-enhancer-like-histone-only-E096],[zhiping-weng:v3-enhancer-like-histone-only-E119],[zhiping-weng:v3-enhancer-like-histone-only-E080],[zhiping-weng:v3-enhancer-like-histone-only-E007]
alternate_accessions,[],[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
annotation_type,enhancer-like regions,enhancer-like regions,enhancer-like regions,enhancer-like regions,enhancer-like regions,enhancer-like regions,enhancer-like regions,enhancer-like regions,enhancer-like regions,enhancer-like regions,...,enhancer-like regions,enhancer-like regions,enhancer-like regions,enhancer-like regions,enhancer-like regions,enhancer-like regions,enhancer-like regions,enhancer-like regions,enhancer-like regions,enhancer-like regions
assembly,mm10-minimal,mm10-minimal,mm10-minimal,mm10-minimal,mm10-minimal,mm10-minimal,hg19,hg19,hg19,hg19,...,hg19,hg19,hg19,hg19,hg19,hg19,hg19,hg19,hg19,hg19
award,/awards/U41HG007000/,/awards/U41HG007000/,/awards/U41HG007000/,/awards/U41HG007000/,/awards/U41HG007000/,/awards/U41HG007000/,/awards/ENCODE/,/awards/ENCODE/,/awards/ENCODE/,/awards/ENCODE/,...,/awards/ENCODE/,/awards/ENCODE/,/awards/ENCODE/,/awards/ENCODE/,/awards/ENCODE/,/awards/U41HG007000/,/awards/U41HG007000/,/awards/U41HG007000/,/awards/U41HG007000/,/awards/U41HG007000/
biosample_synonyms,"[iecur, jecur, Entire liver]",[],"[iecur, jecur, Entire liver]","[Entire midbrain, mesencephalon]","[Entire midbrain, mesencephalon]","[Entire prosencephalon, prosencephalon]",[],[],"[Th1 T lymphocyte, Th1 CD4+ T cell, Th1 cell, ...",[HTB186],...,"[CRL-1932, 786_0, RCC7860, RCC_7860, RCC 7860]",[],[astrocytic glia],"[encephalon, synganglion, Entire brain]","[Entire spinal cord, spinal medulla, medulla s...",[ACC525],"[Entire lung, pulmo]","[breast epithelial cell, mammary epithelial cell]","[glandula adrenalis, glandula suprarenalis]","[neural stem cell, neural stem progenitor cell..."
biosample_term_id,UBERON:0002107,UBERON:0001049,UBERON:0002107,UBERON:0001891,UBERON:0001891,UBERON:0001890,CL:0002586,EFO:0001247,CL:0000545,EFO:0005698,...,EFO:0005707,CL:0002555,CL:0000127,UBERON:0000955,UBERON:0002240,EFO:0007074,UBERON:0002048,CL:0002327,UBERON:0002369,CL:0000047


In [59]:
store.close()

In [57]:
store = pd.HDFStore('../storage/encode_staging.hdf')

In [58]:
store.keys()

[]

In [65]:
store = pd.HDFStore('../storage/stats.hdf')

In [66]:
store.keys()

['/encode_fantom_reldist', '/encode_fantom_tests']

In [70]:
store['encode_fantom_reldist'].head()

Unnamed: 0,encyclopedia,biosample_name,ovlp_encyclopedia,encyclopedia_size,ovlp_encyclopedia_size,reldist,ovlp_count,ovlp_fraction
0,ENCODE,adrenal gland,FANTOM,38634.0,43011.0,0.0,2618.0,0.068
1,ENCODE,adrenal gland,FANTOM,38634.0,43011.0,0.01,1333.0,0.035
2,ENCODE,adrenal gland,FANTOM,38634.0,43011.0,0.02,1127.0,0.029
3,ENCODE,adrenal gland,FANTOM,38634.0,43011.0,0.03,962.0,0.025
4,ENCODE,adrenal gland,FANTOM,38634.0,43011.0,0.04,892.0,0.023


In [68]:
store['encode_fantom_tests'].head()

Unnamed: 0,encyclopedia,biosample_name,ovlp_encyclopedia,encyclopedia_size,ovlp_encyclopedia_size,min_ovlp,ovlp_count,z_random,z_shuffled,fisher_right_p,jaccard
0,ENCODE,adrenal gland,FANTOM,38634.0,43011.0,0.1,3935.0,159.284426,137.501569,1.0,0.017648
1,ENCODE,adrenal gland,FANTOM,38634.0,43011.0,0.2,1627.0,79.69203,95.488257,2.0708000000000003e-28,0.008181
2,ENCODE,adrenal gland,FANTOM,38634.0,43011.0,0.3,817.0,114.272055,84.685343,1.0,0.004265
3,ENCODE,adrenal gland,FANTOM,38634.0,43011.0,0.4,430.0,109.081765,65.376154,1.0,0.002139
4,ENCODE,adrenal gland,FANTOM,38634.0,43011.0,0.5,285.0,97.524247,42.011767,1.0,0.001264


In [64]:
store.close()

In [3]:
import pandas as pd
import numpy as np

In [11]:
df = pd.DataFrame(np.random.rand(5,4), columns=('A','B','C', 'E'))

In [12]:
df

Unnamed: 0,A,B,C,E
0,0.635898,0.790049,0.215816,0.76305
1,0.125801,0.557713,0.711374,0.927946
2,0.973146,0.450443,0.00286,0.317759
3,0.286271,0.469885,0.472969,0.552272
4,0.371127,0.155284,0.942506,0.773598


In [13]:
df2 = pd.DataFrame(np.random.rand(5,4), columns=('A','B','C','D'))

In [14]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Columns present in only one of the frames ('D' / 'E') are filled with NaN,
# and the original row labels are kept (0-4 appears twice).
pd.concat([df2, df])

Unnamed: 0,A,B,C,D,E
0,0.667414,0.639236,0.201143,0.742717,
1,0.920193,0.410856,0.454657,0.235716,
2,0.056874,0.845272,0.05676,0.037293,
3,0.045872,0.74464,0.920737,0.381974,
4,0.807417,0.84201,0.970083,0.335893,
0,0.635898,0.790049,0.215816,,0.76305
1,0.125801,0.557713,0.711374,,0.927946
2,0.973146,0.450443,0.00286,,0.317759
3,0.286271,0.469885,0.472969,,0.552272
4,0.371127,0.155284,0.942506,,0.773598
