This assumes that the pathway-preprocessing notebook has already been run

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Create-edge-lists" data-toc-modified-id="Create-edge-lists-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Create edge lists</a></span><ul class="toc-item"><li><span><a href="#WGS" data-toc-modified-id="WGS-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>WGS</a></span></li><li><span><a href="#16S" data-toc-modified-id="16S-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>16S</a></span></li></ul></li><li><span><a href="#RNAseq" data-toc-modified-id="RNAseq-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>RNAseq</a></span></li></ul></div>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dask
import dask.dataframe as dd
from dask.distributed import Client, progress
%matplotlib inline

#client = Client(memory_limit='500GB')
client = Client(threads_per_worker=2, n_workers=60, memory_limit="500GB")

In [2]:
rna_compounds = pd.read_csv('../results/RNA_compounds.csv', index_col=0)
wgs_compounds = pd.read_csv('../results/WGS_compounds.csv', index_col=0)
amp_compounds = pd.read_csv('../results/16S_compounds.csv', index_col=0)

In [3]:
def edge_f(df):
    compounds = df['Compound'].values
    p = df['-log(pvalue)']
    edges = []
    for i in range(len(compounds)):
        for j in range(i):
            edges.append((compounds[i], compounds[j]))
    return set(edges)

# Create edge lists

## WGS

In [4]:
wgs_compounds = pd.read_csv('../results/WGS_compounds.csv', index_col=0)
wgs_compounds = wgs_compounds[['Pathway', 'Compound', '-log(pvalue)']]
wgs_compounds_dd = dd.from_pandas(wgs_compounds, npartitions=10000)
wgs_futures = wgs_compounds_dd.groupby('Pathway').apply(edge_f)
wgs_compound_edges = wgs_futures.compute()
wgs_list = set()
for i in range(len(wgs_compound_edges)):
    if len(wgs_compound_edges.values[i]) > 0:
        wgs_list |= wgs_compound_edges.values[i]
wgs_edges = pd.DataFrame(list(wgs_list))
wgs_edges.columns = ['src', 'dest']
wgs_edges.to_parquet('../results/wgs_compound_edges.parquet')

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  wgs_futures = wgs_compounds_dd.groupby('Pathway').apply(edge_f)


## 16S 

In [5]:
amp_compounds = pd.read_csv('../results/16S_compounds.csv', index_col=0)
amp_compounds = amp_compounds[['Pathway', 'Compound', '-log(pvalue)']]
amp_compounds_dd = dd.from_pandas(amp_compounds, npartitions=10000)
amp_futures = amp_compounds_dd.groupby('Pathway').apply(edge_f)
amp_compound_edges = amp_futures.compute()

amp_list = set()
for i in range(len(amp_compound_edges)):
    if len(amp_compound_edges.values[i]) > 0:
        amp_list |= amp_compound_edges.values[i]
                
amp_edges = pd.DataFrame(list(amp_list))
amp_edges.columns = ['src', 'dest']

amp_edges.to_parquet('../results/amp_compound_edges.parquet')

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  amp_futures = amp_compounds_dd.groupby('Pathway').apply(edge_f)


# RNAseq

In [6]:
rna_compounds = pd.read_csv('../results/RNA_compounds.csv', index_col=0)
rna_compounds = rna_compounds[['Pathway', 'Compound', '-log(pvalue)']]
rna_compounds_dd = dd.from_pandas(rna_compounds, npartitions=10000)
rna_futures = rna_compounds_dd.groupby('Pathway').apply(edge_f)
rna_compound_edges = rna_futures.compute()

rna_list = set()
for i in range(len(rna_compound_edges)):
    if len(rna_compound_edges.values[i]) > 0:
        rna_list |= rna_compound_edges.values[i]
        
rna_edges = pd.DataFrame(list(rna_list))
rna_edges.columns = ['src', 'dest']
rna_edges.to_parquet('../results/rna_compound_edges.parquet')

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  rna_futures = rna_compounds_dd.groupby('Pathway').apply(edge_f)


In [7]:
client.close()