In [112]:
import numpy as np
import pandas as pd
from itertools import combinations
import multiprocessing
import os

from utils import flatten

## Set up. 
dataDir = '/gpfs/commons/groups/gursoy_lab/ajoglekar/Projects/2023_03_01_multiwayInteractions/2023_03_01_v0_dataGathering/v0_hypergraphSimulations/getMultiwayInteractions_fromBPChains/'

ModuleNotFoundError: No module named 'utils'

In [158]:
def increaseIncDF_binSize(df,binSize):
    """This collapses consective bins in an incidence DF
    to reduce dimensions. This also means that some multiway
    interactions are now collapsed and those edges will be
    pruned"""
    result = []
    names = []
    for i in range(0,len(df) - binSize + 1,binSize):
        summed_value = df.loc[i:i+binSize-1,:].sum()
        summed_value[summed_value > 0] = 1
        names.append(f"Bin{i}:{i+binSize-1}")
        result.append(summed_value)
    result_df = pd.DataFrame(result,index = names)
    result_df = result_df.loc[:,result_df.sum() >= 2]
    return(result_df)

def dfToDict(df,result_dict):
    """Takes in an incidence DF and converts to 
    a dictionary of hyperedges"""
    for col in df.columns:
        twoWayRatio = float(col.split("_")[1])
        indices = df.index[df[col] == 1].tolist()
        key = '_'.join(indices)

        tmpList = result_dict.get(key, [])
        if tmpList:
            tmpList[0] += 1
            tmpList[1].append(twoWayRatio)
        else:
            tmpList = [1,[twoWayRatio]]
        result_dict[key] = tmpList
    return [result_dict]

class IncDFCreator:
    """Replacing this function: makeIncDF_fromChainDists"""
    def __init__(self, numProcesses, prim_cutoff, sec_cutoff, offDiagLim):
        self.numProcesses = numProcesses
        self.prim_cutoff = prim_cutoff
        self.sec_cutoff = sec_cutoff
        self.offDiagLim = offDiagLim

    def preprocessMat(self, chainMat):
        """Get upper triangular matrix and number of rows"""
        nrow = chainMat.shape[0]
        chainMat_triu = np.triu(chainMat, k=1)
        return chainMat_triu, nrow
    
    def assessMultiway(self, slice):
        """How many 2-way contacts fall below secondary cutoff"""
        total = np.count_nonzero(slice)
        passThresh = np.count_nonzero(slice[slice < self.sec_cutoff])
        ratio = passThresh / total
        return(ratio)
        
    def perRow(self, args):
        """Per row of upper tri matrix, get potential list of
        neighbors that fulfill distance criteria. Take iterative
        n choose k subsets of the matrix if the elements fall below 
        primary cutoff. If all elements fall 
        below the secondary distance cutoff needed to make it interact, then
        report that as a hyperedge"""
        chainMat_triu, row_ix = args
        columns_to_add = []
        ratioVec = []
        self.nrow = chainMat_triu.shape[0]
        vec = chainMat_triu[row_ix, row_ix + self.offDiagLim:]
        #condition1 = (0 < vec)
        condition2 = (vec < self.prim_cutoff)
        possNeighbors = [row_ix + self.offDiagLim + index for index in np.where(condition2)[0]]
        possNeighbors.insert(0, row_ix)
        if possNeighbors:
            for ix in range(2, len(possNeighbors)):
                for comb in combinations(possNeighbors, ix):
                    d = chainMat_triu[np.ix_(comb, comb)]
                    ratioUnderThresh = self.assessMultiway(d)
                    if ratioUnderThresh >= 0.5:
                        new_column = np.zeros(self.nrow)
                        new_column[list(comb)] = 1
                        columns_to_add.append(new_column)
                        ratioVec.append(ratioUnderThresh)
        if columns_to_add:
            return [columns_to_add,ratioVec]
        else:
            return None

    def mp(self, chainMat_triu, nrow):
        """Define multiprocessing pool"""
        pool = multiprocessing.Pool(self.numProcesses)
        argument_pairs = [(chainMat_triu, row_ix) for row_ix in range(nrow)]
        results = pool.map(self.perRow, argument_pairs)
        return results

    def makeIncDF_fromChainDists_mp(self, chainMat):
        """Run the dist matrix --> incidence DF in a parallelized
        fashion"""
        chainMat_triu, nrow = self.preprocessMat(chainMat)
        results = self.mp(chainMat_triu, nrow)
        filtered_results = [arr[0] for arr in results if arr is not None]
        ratioVec = flatten([vec[1] for vec in results if vec is not None])
        df = pd.DataFrame(np.concatenate(filtered_results)).T
        return [df,ratioVec]

    def makeIncDF_fromChainDists_single(self, chainMat):
        """Run the dist matrix --> incidence DF in a single-threaded
        fashion"""
        chainMat_triu, nrow = self.preprocessMat(chainMat)
        res = []
        rV = []
        for i in range(nrow):
            args = (chainMat_triu, i)
            row_result = self.perRow(args)
            if row_result is not None:
                res.append(row_result[0])
                rV.append(row_result[1])
        filtered_results = [arr for arr in res if arr is not None]
        df = pd.DataFrame(np.concatenate(filtered_results)).T
        ratioVec = flatten(rV)
        return [df, ratioVec]

In [152]:
numProcesses = 4
prim_cutoff = 500
sec_cutoff = 550
offDiagLim = 3

creator = IncDFCreator(numProcesses, prim_cutoff, sec_cutoff, offDiagLim)

In [25]:
exMat = np.loadtxt(f'{dataDir}chains_500_10000_1500_1681171613/chain_dist_5292.txt')
# exChain = creator.makeIncDF_fromChainDists_mp(exMat)

In [154]:
def flatten(l):
    ## flatten lists
    return [item for sublist in l for item in sublist]

A = creator.makeIncDF_fromChainDists_mp(exMat)
df = A[0]
ratios = A[1]


In [130]:
seventy = [index for index, value in enumerate(ratios) if value > 0.7]
len(seventy)

15672

In [167]:
df.index = ["Bin"+str(i) for i in range(500)]

In [168]:
inc_dict = {}
inc_dict = dfToDict(df,inc_dict)

In [157]:
bInc_by5 = increaseIncDF_binSize(df,5)

In [160]:
bInc_dict = {}
bInc_dict = dfToDict(bInc_by5,bInc_dict)

In [163]:
bInc_dict

{'Bin0:4_Bin5:9': 4,
 'Bin0:4_Bin370:374': 4,
 'Bin0:4_Bin5:9_Bin370:374': 5,
 'Bin0:4_Bin10:14': 3,
 'Bin0:4_Bin30:34': 3,
 'Bin5:9_Bin10:14': 4,
 'Bin10:14_Bin370:374': 18,
 'Bin10:14_Bin30:34': 74,
 'Bin0:4_Bin5:9_Bin10:14': 3,
 'Bin0:4_Bin5:9_Bin30:34': 3,
 'Bin0:4_Bin10:14_Bin30:34': 9,
 'Bin0:4_Bin10:14_Bin370:374': 9,
 'Bin0:4_Bin30:34_Bin370:374': 9,
 'Bin5:9_Bin10:14_Bin370:374': 8,
 'Bin5:9_Bin10:14_Bin30:34': 6,
 'Bin10:14_Bin30:34_Bin370:374': 51,
 'Bin0:4_Bin5:9_Bin10:14_Bin30:34': 9,
 'Bin0:4_Bin5:9_Bin10:14_Bin370:374': 9,
 'Bin0:4_Bin5:9_Bin30:34_Bin370:374': 8,
 'Bin0:4_Bin10:14_Bin30:34_Bin370:374': 27,
 'Bin5:9_Bin10:14_Bin30:34_Bin370:374': 10,
 'Bin0:4_Bin5:9_Bin10:14_Bin30:34_Bin370:374': 26,
 'Bin5:9_Bin30:34': 1,
 'Bin5:9_Bin40:44': 3,
 'Bin5:9_Bin45:49': 19,
 'Bin40:44_Bin45:49': 27,
 'Bin5:9_Bin30:34_Bin40:44': 1,
 'Bin5:9_Bin30:34_Bin45:49': 1,
 'Bin5:9_Bin40:44_Bin45:49': 18,
 'Bin5:9_Bin15:19': 8,
 'Bin5:9_Bin35:39': 1,
 'Bin5:9_Bin385:389': 2,
 'Bin10:14_B

In [170]:
roi = 'Bin0:4_Bin5:9_Bin10:14_Bin30:34_Bin370:374'
ids = roi.split("_")

### this is convoluted but it works
sevenWay1 = np.where(bInc_by5.sum() == 5)[0].tolist()
print(len(sevenWay1))
actualColname = [bInc_by5.columns[s] for s in sevenWay1]
subset_binnedDF = bInc_by5[actualColname]
print(subset_binnedDF.shape)
subset_binnedDF2 = subset_binnedDF.loc[ids]
print(subset_binnedDF2.shape)
sevenWay2 = np.where(subset_binnedDF2.sum() == 5)[0].tolist()
print(len(sevenWay2))
actualColname2 = [subset_binnedDF2.columns[s] for s in sevenWay2]
subset_binnedDF2[actualColname2]

15206
(100, 15206)
(5, 15206)
26


Unnamed: 0,121,122,123,124,127,128,129,130,164,165,...,173,174,175,176,187,188,189,190,191,192
Bin0:4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Bin5:9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Bin10:14,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Bin30:34,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Bin370:374,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [171]:
roi_iDF = df[actualColname2]
roi_iDF.loc[~(roi_iDF==0).all(axis=1)]

Unnamed: 0,121,122,123,124,127,128,129,130,164,165,...,173,174,175,176,187,188,189,190,191,192
Bin3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Bin9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Bin10,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
Bin11,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
Bin33,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
Bin34,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
Bin373,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
Bin374,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [181]:
newCols = [str(bInc_by5.columns[i])+"_"+str(round(ratios[i],2)) 
           for i in range(bInc_by5.shape[1])]
bInc_by5.columns = newCols

In [184]:
float(bInc_by5.columns[0].split("_")[1])

1.0

In [247]:
def dfToDict(df,result_dict):
    """Takes in an incidence DF and converts to 
    a dictionary of hyperedges"""
    for col in df.columns:
        twoWayRatio = float(col.split("_")[1])
        indices = df.index[df[col] == 1].tolist()
        key = '_'.join(indices)

        tmpList = result_dict.get(key, [])
        if tmpList:
            tmpList[0] += 1
            tmpList[1].append(twoWayRatio)
        else:
            tmpList = [1,[twoWayRatio]]
        result_dict[key] = tmpList
    return [result_dict]

In [248]:
from collections import defaultdict
ratioDict = defaultdict(list)

In [259]:
bInc_dict = defaultdict()
bInc_dict = dfToDict(bInc_by5,bInc_dict)

In [263]:
print(bInc_dict[0]['Bin0:4_Bin5:9'])
# print(bInc_dict['Bin5:9_Bin10:14_Bin370:374'])
# print(bInc_dict['Bin0:4_Bin10:14_Bin30:34_Bin370:374'])

[4, [1.0, 1.0, 1.0, 0.67]]


In [255]:
bInc_dict

[defaultdict(list,
             {'Bin0:4_Bin5:9': [4, [1.0, 1.0, 1.0, 0.67]],
              'Bin0:4_Bin370:374': [4, [1.0, 1.0, 1.0, 0.67]],
              'Bin0:4_Bin5:9_Bin370:374': [5, [1.0, 0.67, 1.0, 1.0, 0.5]],
              'Bin0:4_Bin10:14': [3, [1.0, 1.0, 0.67]],
              'Bin0:4_Bin30:34': [3, [1.0, 1.0, 1.0]],
              'Bin5:9_Bin10:14': [4, [1.0, 1.0, 0.67, 0.83]],
              'Bin10:14_Bin370:374': [18,
               [1.0,
                1.0,
                1.0,
                0.67,
                0.67,
                0.67,
                1.0,
                0.5,
                0.61,
                0.53,
                1.0,
                1.0,
                1.0,
                0.67,
                0.9,
                0.5,
                1.0,
                0.64]],
              'Bin10:14_Bin30:34': [74,
               [1.0,
                1.0,
                0.67,
                0.67,
                0.67,
                0.67,
            

In [233]:
a = defaultdict(list)
a["A"] = [5,[0.67, 1.0, 1.0, 1.0, 0.5]]
a["A"][0]

a["A"][0] += 1

x = a.get("B",[])
x.append([3,[0.8]])
a["B"] = x
a

defaultdict(list, {'A': [6, [0.67, 1.0, 1.0, 1.0, 0.5]], 'B': [[3, [0.8]]]})

In [None]:
## Make multiple incDFs ? Or somehow save the ratio in a vector
## Make binned DF first
## Then dict
## Then extend and combine