In [1]:
from functools import reduce
import pickle
import pandas as pd
import os
import re
import time

In [2]:
def make_fset(x):
    """Return the first two space-separated tokens of ``x`` as a frozenset.

    Because the result is a frozenset, ``'A B'`` and ``'B A'`` produce equal,
    hashable values. Any tokens after the second are ignored.

    Parameters
    ----------
    x : str
        String that is split on single spaces.

    Returns
    -------
    frozenset
        Set built from the first and second tokens.

    Raises
    ------
    IndexError
        If ``x`` contains no space (fewer than two tokens).
    """
    tokens = x.split(' ')
    # Index explicitly (rather than slicing) so a one-token input still raises.
    return frozenset((tokens[0], tokens[1]))

In [3]:
def read_files(data_dir, pickle_files=False):
    """Load every feature matrix found in ``data_dir`` and re-key it by pair.

    Each file is loaded into a DataFrame, a ``frozen_pair`` column is built by
    applying ``make_fset`` to every value of its ``ID`` column, and the ``ID``
    column is then dropped.

    Parameters
    ----------
    data_dir : str
        Directory scanned with ``os.listdir``. A trailing separator is optional.
    pickle_files : bool, default False
        If True, only entries matching the pattern ``'.*.pkl'`` are loaded, using
        ``pickle.load``. If False, every directory entry is read with
        ``pd.read_csv``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per file, in ``os.listdir`` order.

    Raises
    ------
    KeyError
        If a loaded frame has no ``ID`` column.
    """
    fmat_list = []
    t0 = time.time()
    if pickle_files:
        # The pattern is kept exactly as get_left_join_idx uses it so that list
        # positions stay aligned between the two functions.
        # NOTE(review): the '.' is unescaped and the pattern is unanchored at the
        # end, so names such as 'x.pkl.bak' also match -- confirm that is intended.
        flist = [f for f in os.listdir(data_dir) if re.match('.*.pkl', f)]
    else:
        flist = os.listdir(data_dir)

    for f in flist:
        # Progress line is printed per file, inside the loop where ``f`` is bound.
        print(f'Reading {f} ...')
        path = os.path.join(data_dir, f)
        if pickle_files:
            # SECURITY: pickle.load can execute arbitrary code; only point this
            # at directories whose contents are trusted.
            with open(path, 'rb') as handle:
                df = pickle.load(handle)
        else:
            df = pd.read_csv(path)
        df['frozen_pair'] = [make_fset(i) for i in df['ID']]
        df.drop(['ID'], axis=1, inplace=True)
        fmat_list.append(df)
    print(f'Total time to read in & format all files: {time.time() - t0} seconds')
    return fmat_list

In [4]:
def get_left_join_idx(data_dir, pickle_files, left_file):
    """Return the position of ``left_file`` within the listing of ``data_dir``.

    Parameters
    ----------
    data_dir : str
        Directory listed with ``os.listdir``.
    pickle_files : bool
        If True, the listing is first restricted to names matching ``'.*.pkl'``;
        otherwise every directory entry is considered.
    left_file : str
        File name to look up.

    Returns
    -------
    int
        Index of ``left_file`` in the (optionally filtered) listing.

    Raises
    ------
    ValueError
        If ``left_file`` is not in the listing.
    """
    candidates = os.listdir(data_dir)
    if pickle_files:
        candidates = [name for name in candidates if re.match('.*.pkl', name)]
    return candidates.index(left_file)

In [5]:
def build_fmat(fmat_list, join_type='outer', left_index=None):
    """Merge a list of feature matrices on their ``frozen_pair`` column.

    Neither ``fmat_list`` nor the DataFrames it contains are modified.

    Parameters
    ----------
    fmat_list : list of pandas.DataFrame
        Frames to merge; each must have a ``frozen_pair`` column.
    join_type : {'outer', 'left'}, default 'outer'
        ``'outer'`` merges all frames pairwise with an outer join.
        ``'left'`` left-joins every other frame onto ``fmat_list[left_index]``.
    left_index : int, optional
        Position of the left-hand frame; required when ``join_type='left'``.

    Returns
    -------
    pandas.DataFrame
        Merged matrix with missing values replaced by 0. For ``'outer'`` the
        ``frozen_pair`` column is kept; for ``'left'`` it is dropped.

    Raises
    ------
    ValueError
        If ``join_type`` is not ``'outer'``/``'left'``, if ``fmat_list`` is empty,
        or if ``join_type='left'`` is requested without ``left_index``.
    IndexError
        If ``left_index`` is out of range.
    """
    # Validate up front: previously an invalid join type only printed a message
    # and then crashed on the unbound ``fmat`` at the return statement.
    if join_type not in ('outer', 'left'):
        raise ValueError("Invalid join type specified; only 'outer' and 'left' accepted.")
    if len(fmat_list) == 0:
        raise ValueError('fmat_list is empty; there is nothing to merge.')
    if join_type == 'left' and left_index is None:
        raise ValueError("left_index must be given when join_type='left'.")

    print('Merging features matrices ...')
    t0 = time.time()
    if join_type == 'outer':
        fmat = reduce(lambda x, y: pd.merge(x, y, on='frozen_pair', how='outer'), fmat_list)
        # Non in-place so a single-element list does not get its frame mutated.
        fmat = fmat.fillna(0)
    else:
        # Pop from a shallow copy so the caller's list keeps all of its frames.
        others = list(fmat_list)
        fmat = others.pop(left_index)
        for df in others:
            fmat = fmat.merge(df, how=join_type, on=['frozen_pair'])
        # Filling once after all merges gives the same values as filling after
        # each merge, because the join key itself is never filled.
        # NOTE(review): the left join drops 'frozen_pair' while the outer join
        # keeps it; preserved as-is for callers -- confirm this is intended.
        fmat = fmat.fillna(0).drop(columns=['frozen_pair'])
    print(f'Total time to merge feature matrices: {time.time() - t0} seconds')
    return fmat

In [6]:
def write_fmat(fmat, outfile_name):
    """Write ``fmat`` to disk twice: as a pickle and as a CSV.

    Parameters
    ----------
    fmat : pandas.DataFrame
        Matrix to persist.
    outfile_name : str
        Path of the CSV output. The pickle is written to the same path with
        ``'.pkl'`` appended.

    Returns
    -------
    None
        The function only writes files and prints progress/timing lines.
    """
    print(f"Writing full results to {outfile_name} & {outfile_name+'.pkl'} ... ")
    t0 = time.time()
    pickle_path = outfile_name+'.pkl'
    # Pickle first, then CSV -- same order as announced in the message above.
    fmat.to_pickle(pickle_path)
    # index=False keeps the DataFrame's row index out of the CSV.
    fmat.to_csv(outfile_name, index=False)
    print(f'Total time to write out final merged feature matrix: {time.time() - t0} seconds')

In [7]:
# Directory holding the per-source feature matrices. The trailing slash is
# required here because outfile below is built by plain string concatenation.
data_dir = '../ppi_ml/data/features/'
# File name looked up by get_left_join_idx to locate the left-hand matrix.
left_file = 'featmat_allexps_p3c2.pkl'
# Output path stem; the CSV is written to this path and the pickle to this
# path + '.pkl'.
# NOTE(review): this places 'featmat_final.pkl' inside data_dir, where a later
# pickle listing of that directory would pick it up as an input -- confirm
# whether outputs should live in a separate directory.
outfile = data_dir+'featmat_final'

In [8]:
# Load every pickled feature matrix found in data_dir (one DataFrame per file).
fmat_list = read_files(data_dir, pickle_files=True)
# Position of left_file in the same pickle listing, intended for a left join.
# NOTE(review): left_idx is not passed to build_fmat in the cells shown below.
left_idx = get_left_join_idx(data_dir, pickle_files=True, left_file=left_file)

Reading featmat_plants.pkl ...


KeyboardInterrupt: 

In [None]:
# Merge all loaded matrices; with no join_type given, build_fmat uses 'outer'.
test_df = build_fmat(fmat_list)

In [None]:
# Display one of the loaded frames for inspection.
# NOTE(review): the index 10 is hard-coded and raises IndexError when fewer
# than 11 files were read; consider .head() to keep the output small.
fmat_list[10]

## Development code below here

In [None]:
# File names in reverse-sorted order, consumed by the read loop in the next cell.
# NOTE(review): fmat_files is not defined in any cell shown in this notebook, so
# this raises NameError on a fresh kernel -- confirm where it should come from.
flist = sorted(fmat_files, reverse=True)

In [None]:
# Development version of the pickle branch of read_files: load each file named
# in flist, add a 'frozen_pair' column derived from 'ID', and drop 'ID'.
fmat_list = []
t0 = time.time()
for f in flist:
    print(f'Reading {f} ...')
    # SECURITY: pickle.load can execute arbitrary code; trusted files only.
    with open(data_dir+f, 'rb') as handle:
        df = pickle.load(handle)
    # The file handle is no longer needed once the frame is in memory.
    df['frozen_pair'] = list(map(make_fset, df['ID']))
    df = df.drop(columns=['ID'])
    fmat_list.append(df)
print(f'Total time to read in & format all files: {time.time() - t0} seconds')

In [None]:
# Development version of the left-join branch of build_fmat: load the left-hand
# matrix, key it by 'frozen_pair', then left-join every frame in fmat_list.
# NOTE(review): lmat_file is not defined in any cell shown in this notebook (the
# config cell defines left_file) -- confirm which name is intended.
t0 = time.time()
# SECURITY: pickle.load can execute arbitrary code; trusted files only.
with open(data_dir+lmat_file, 'rb') as handle:
    lmat = pickle.load(handle)
# Unlike read_files, the 'ID' column is kept on the left-hand matrix here.
lmat['frozen_pair'] = [make_fset(i) for i in lmat['ID']]
for df in fmat_list:
    # Pairs absent from df produce NaN after the join; replace them with 0.
    lmat = lmat.merge(df, how='left', on=['frozen_pair']).fillna(0)
lmat = lmat.drop(columns=['frozen_pair'])
print(f'Total time to merge featmats: {time.time() - t0} seconds')

In [None]:
# Persist the merged matrix as a pickle (outfile + '.pkl') and as a CSV (outfile).
# These are the same two writes that write_fmat performs, minus its progress output.
lmat.to_pickle(outfile+'.pkl')
# index=False keeps the DataFrame's row index out of the CSV.
lmat.to_csv(outfile, index=False)