In [1]:
import pickle
import pandas as pd
import os
import re
import time
from functools import reduce

In [2]:
data_dir = '../ppi_ml/data/calc_feats/'
f_exp = 'cmds/explist.txt'
expset = set([line.strip().split(' ')[0] for line in open(data_dir+f_exp, 'r')])
explist = list(expset)

In [4]:
explist.append('all')
explist

['amorphea', 'tsar', 'excavate', 'viridiplantae', 'all']

In [7]:
for exp in explist:
    feat_dfs = []
    t0 = time.time()
    feat_files = [f for f in os.listdir(data_dir) if re.match(f'{exp}.*pkl', f)]
    for f in feat_files:
        print(f'Reading in features from {f} ...')
        ncol = f.replace('.pkl','')
        with open(data_dir+f, 'rb') as handle:
            fdf = pickle.load(handle)
        fdf.columns.values[2] = ncol
        feat_dfs.append(fdf)
    print(f'Total time to read in all files: {time.time() - t0} seconds')
    
    t0 = time.time()
    print('Merging all experiments ...')
    df_merged = reduce(lambda x, y: pd.merge(x, y), feat_dfs)
    df_merged.fillna(0, inplace=True)
    score_cols = df_merged.columns.tolist()[2:]
    df_merged['ID'] = df_merged.ID1.str.cat(df_merged.ID2, sep=' ')
    df_merged.drop(['ID1','ID2'], axis=1, inplace=True)
    df_merged = df_merged[['ID'] + score_cols]
    print(f'Total time to complete merge & format: {time.time() - t0} seconds')
    
    t0 = time.time()
    outfile = f'../ppi_ml/data/featmats/featmap_{exp}_norm'
    print(f'Writing output to {outfile} ...')
    df_merged.to_csv(outfile, index=False, na_rep=0)
    df_merged.to_pickle(outfile+'.pkl')
    print(df_merged.head())
    print(f'Total time to write joined feats: {time.time() - t0} seconds')

Reading in features from amorphea.filtdollo.norm.150p.covariance.feat.pkl ...
Reading in features from amorphea.filtdollo.norm.150p.spearmanR_weighted.feat.pkl ...
Reading in features from amorphea.filtdollo.norm.150p.euclidean.feat.pkl ...
Reading in features from amorphea.filtdollo.norm.150p.pearsonR.feat.pkl ...
Reading in features from amorphea.filtdollo.norm.150p.braycurtis.feat.pkl ...
Reading in features from amorphea.filtdollo.norm.150p.spearmanR.feat.pkl ...
Total time to read in all files: 6.464269638061523 seconds
Merging all experiments ...
           ID1          ID2  amorphea.filtdollo.norm.150p.covariance.feat  \
0  ENOG502QPI8  ENOG502QPIA                                 -2.903842e-05   
1  ENOG502QPI8  ENOG502QPJC                                 -7.860326e-07   
2  ENOG502QPI8  ENOG502QPJG                                 -1.965081e-07   
3  ENOG502QPI8  ENOG502QPJI                                 -8.400723e-06   
4  ENOG502QPI8  ENOG502QPJV                             

### Make file patterns match old fmats:

In [8]:
!mv ../ppi_ml/data/featmats/featmap_viridiplantae_norm ../ppi_ml/data/featmats/featmap_plants_norm
!mv ../ppi_ml/data/featmats/featmap_viridiplantae_norm.pkl ../ppi_ml/data/featmats/featmap_plants_norm.pkl
!mv ../ppi_ml/data/featmats/featmap_amorphea_norm ../ppi_ml/data/featmats/featmap_animals_norm
!mv ../ppi_ml/data/featmats/featmap_amorphea_norm.pkl ../ppi_ml/data/featmats/featmap_animals_norm.pkl
!mv ../ppi_ml/data/featmats/featmap_all_norm ../ppi_ml/data/featmats/featmap_allconcat_norm
!mv ../ppi_ml/data/featmats/featmap_all_norm.pkl ../ppi_ml/data/featmats/featmap_allconcat_norm.pkl