# Speed test reading in data files

## I know `np.loadtxt` can be really slow. I want to test out a better method.
- Python lists (`np.loadtxt` in a loop): ~1 minute
- Stacking NumPy arrays (`np.loadtxt` + `np.hstack`): ~1 minute
- pandas (`pd.read_table`): ~12 sec
- Julia: ~16 sec
- HDF5: super fast

In [17]:
import pandas as pd
import numpy as np
from glob import glob
from astropy.table import Table

In [6]:
# Preview a single data file with explicit column names to check the layout
# before timing the bulk readers.
# NOTE(review): with comment='#', header=0 refers to the first *non-comment*
# line, which is then replaced by `names`. If the file's own header is a '#'
# comment line, this drops the first data row — confirm against the file.
test_file = "../../../Data/sim_data/z2.45/spec_xHeII1_015_mt_line0000.dat"
names = [
    "velocity [km/s]",
    "tau_HILya",
    "tau_HeI584",
    "tau_HeIILya",
    "nHI [cm^-3]",
    "nHeII [cm^-3]",
    "Delta_b",
    "T [K]",
    "xpos [Mpc/h]",
    "vpec [km/sec]",
]
pd.read_table(test_file, comment='#', delimiter=' ', names=names, header=0).head(5)

Unnamed: 0,velocity [km/s],tau_HILya,tau_HeI584,tau_HeIILya,nHI [cm^-3],nHeII [cm^-3],Delta_b,T [K],xpos [Mpc/h],vpec [km/sec]
0,0.8441,0.112,0.003089,2.868,8.913e-12,8.913e-10,3.001,17210,7567,-87.03
1,1.688,0.1155,0.003236,3.005,9.094e-12,9.094e-10,3.004,17210,7575,-86.82
2,2.532,0.1189,0.003373,3.131,9.271e-12,9.271e-10,3.005,17190,7583,-86.39
3,3.377,0.1224,0.003496,3.246,9.583e-12,9.583e-10,3.057,17160,7592,-85.82
4,4.221,0.1258,0.003606,3.348,1e-11,1e-09,3.157,17110,7600,-85.24


In [12]:
# Collect every data file matching the pattern. glob() returns matches in
# arbitrary, filesystem-dependent order, so sort them to keep the read (and
# concatenation) order reproducible between runs.
file_list = sorted(glob("../../../Data/sim_data/z2.45/spec_xHeII1_015_mt_line*.dat"))
def py_read_files(file_list):
    """Read each file with ``np.loadtxt`` and gather its second column in a list.

    Parameters
    ----------
    file_list : iterable
        Paths of text files readable by ``np.loadtxt``; each must unpack
        into exactly ten columns.

    Returns
    -------
    list
        One entry per file, in input order, holding that file's second
        column (index 1).
    """
    def second_column(path):
        # The ten-way unpack is deliberate: a file with a different number
        # of columns raises ValueError instead of being read silently.
        _vel, tau, _c2, _c3, _nhi, _c5, _c6, _c7, _c8, _c9 = np.loadtxt(path, unpack=True)
        return tau

    return [second_column(path) for path in file_list]

def np_read_files(file_list):
    """Read each file with ``np.loadtxt`` and stack the second columns.

    Parameters
    ----------
    file_list : iterable
        Paths of text files readable by ``np.loadtxt``; each must unpack
        into exactly ten columns.

    Returns
    -------
    numpy.ndarray
        1-D array holding the second column (index 1) of every file,
        concatenated in input order. Empty if ``file_list`` is empty.
    """
    # Seed with an empty float array so an empty input still yields an array.
    pieces = [np.zeros(0)]
    for f in file_list:
        # Ten-way unpack keeps the column-count check: a malformed file raises.
        _v, t, _c1, _c2, _n, _c4, _c5, _c6, _c7, _c8 = np.loadtxt(f, unpack=True)
        pieces.append(t)
    # np.hstack returns a new array rather than growing one in place, so stack
    # once at the end and return the result.
    return np.hstack(pieces)

def _pd_read_second_column(f):
    """Read one file with pandas and return its second column as an array.

    Parameters
    ----------
    f : str
        Path of a space-delimited text file; lines starting with '#' are
        skipped.

    Returns
    -------
    numpy.ndarray
        Values of the column at position 1.
    """
    df = pd.read_table(f, header=None, comment='#', delimiter=' ')
    # iloc[:, 1] is the whole second column; df.values[1] would be a row.
    return df.iloc[:, 1].values


# np.vectorize is a convenience wrapper around a Python-level loop, not a
# speed-up. otypes=[object] lets every output element hold a whole per-file
# array, removes the need for a probing call on the first element, and makes
# an empty input valid. The result is an object array with one column array
# per input path; use np.hstack(list(result)) to flatten it.
pd_vec_read_files = np.vectorize(_pd_read_second_column, otypes=[object])
    

def pd_read_files(infile):
    """Read each file with pandas and concatenate the second columns.

    Parameters
    ----------
    infile : iterable
        Paths of space-delimited text files; lines starting with '#' are
        skipped.

    Returns
    -------
    numpy.ndarray
        1-D array holding the column at position 1 of every file,
        concatenated in input order. Empty if ``infile`` is empty.
    """
    columns = []
    for f in infile:
        df = pd.read_table(f, header=None, comment='#', delimiter=' ')
        # iloc[:, 1] is the whole second column; df.values[1] would be a row.
        columns.append(df.iloc[:, 1].values)
    if not columns:
        # np.concatenate needs at least one array.
        return np.zeros(0)
    # Concatenate once and return the result; np.hstack/concatenate never
    # modify their inputs in place.
    return np.concatenate(columns)

def quick_pd_read(files):
    """Read every file with pandas and concatenate the frames row-wise.

    Parameters
    ----------
    files : iterable
        Paths of space-delimited text files; lines starting with '#' are
        skipped.

    Returns
    -------
    pandas.DataFrame
        All rows of all files in input order. Each file keeps its own row
        index, so index labels repeat across files. An empty DataFrame is
        returned when ``files`` is empty.
    """
    # Iterate the argument, not a notebook-level global, so the function
    # reads exactly the files it was given.
    frames = [pd.read_table(f, header=None, comment='#', delimiter=' ') for f in files]
    if not frames:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame()
    return pd.concat(frames)

In [15]:
# Benchmark the np.loadtxt reader that collects columns in a Python list.
%timeit py_read_files(file_list)

1 loops, best of 3: 1min 7s per loop


In [14]:
# Benchmark the pandas reader that loops over the files one by one.
%timeit pd_read_files(file_list)

1 loops, best of 3: 12.3 s per loop


In [10]:
# Benchmark the np.vectorize-wrapped pandas reader.
%timeit pd_vec_read_files(file_list)

1 loops, best of 3: 12.6 s per loop


In [13]:
# Benchmark reading with pandas and joining the frames with pd.concat.
%timeit quick_pd_read(file_list)

1 loops, best of 3: 12.5 s per loop
