In [None]:
#import hypernetx as hnx
import numpy as np
import pandas as pd
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import random
#from PyComplexHeatmap import *

import sys
sys.path.append('/gpfs/commons/groups/gursoy_lab/ajoglekar/Projects/2023_03_01_multiwayInteractions/v0.analysis/scripts/pythonScripts/functions/')
from incidenceToProjection import makeHiC_fromInc
#from chains import makeIncDF_fromChainDists
from chains import RealHiC
from utils import flatten

## Set up: root directory containing the chain-distance output folders used below.
dataDir = '/gpfs/commons/groups/gursoy_lab/ajoglekar/Projects/2023_03_01_multiwayInteractions/2023_03_01_v0_dataGathering/v0_hypergraphSimulations/getMultiwayInteractions_fromBPChains/'


In [None]:
## Read in example file
exMat1 = np.loadtxt(f'{dataDir}chains_500_10000_1500_1696950861/chain_dist_281.txt')
## Read in a second example file from the same run directory
exMat2 = np.loadtxt(f'{dataDir}chains_500_10000_1500_1696950861/chain_dist_38.txt')

# Row count of the first example matrix; `cutoff` is interpolated into a plot title in a later cell.
nrow = exMat1.shape[0]
cutoff = 500

In [None]:
# Same run directory as the two example files above.
chainDir = f'{dataDir}chains_500_10000_1500_1696950861/'
num_files = 300
# RealHiC is imported from the project-local `chains` module; its behaviour is not visible in this notebook.
binaryInit = RealHiC(chainDir,num_files)
# NOTE(review): presumably binarises the matrix of chain 281 — confirm against `chains.RealHiC`.
binMat = binaryInit.distMatToBinary(281)

# Heatmap of the raw matrix loaded from chain_dist_281.txt.
# NOTE(review): the colorbar label 'balanced' looks carried over from a Hi-C plot — confirm it is intended.
plt.figure(figsize=(6, 4))
im = plt.imshow(exMat1, cmap="YlGnBu")
plt.colorbar(im, fraction=0.046, pad=0.04, label='balanced');
plt.title("Distance matrix heatmap")
plt.show()

In [None]:
def averageNChains(seed,sampleSize,matching_files):
    """Average the matrices stored in a random sample of files.

    Parameters
    ----------
    seed
        Passed to ``random.seed`` so the sample is reproducible. Note that
        this reseeds the global ``random`` module.
    sampleSize : int
        Number of files drawn without replacement from ``matching_files``.
    matching_files : sequence of str
        Paths readable by ``np.loadtxt``; all matrices must share one shape.

    Returns
    -------
    numpy.ndarray or None
        Element-wise mean of the sampled matrices, or ``None`` when
        ``sampleSize`` is 0 (nothing is loaded).

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when ``sampleSize`` is negative or larger
        than ``len(matching_files)``.
    """
    random.seed(seed)
    mat = None
    for path in random.sample(matching_files, sampleSize):
        # Divide before accumulating so `mat` is the mean once the loop ends.
        scaled = np.loadtxt(path) / sampleSize
        if mat is None:
            mat = scaled
        else:
            mat += scaled
    # The function is self-contained: it neither reads nor mutates any
    # notebook-level list, so it can be called from a fresh kernel.
    return mat

In [None]:
# Collect every chain-distance file of one run directory.
# NOTE(review): glob.glob returns paths in arbitrary order, so the seeded sample below is only
# reproducible if the directory listing order is stable — consider sorting the list first.
distPattern = "chain_dist_*.txt"
matching_files = glob.glob(f'{dataDir}chains_500_10000_1500_1681171613/{distPattern}')
random.seed(10)
sampleSize = 4

avgMat = None
dfs = []
chainDFs = []

# Load `sampleSize` randomly chosen matrices, keep each one, accumulate their element-wise mean,
# and build one incidence DataFrame per matrix.
for f in random.sample(matching_files,sampleSize):
    oneChain = np.loadtxt(f)
    chainDFs.append(oneChain)
    if avgMat is None:
        avgMat = (oneChain / sampleSize)
    else:
        avgMat += (oneChain / sampleSize)
    # NOTE(review): the import of makeIncDF_fromChainDists is commented out in the first cell,
    # so this name is undefined on a fresh kernel — confirm where it should come from.
    chainDF = makeIncDF_fromChainDists(oneChain)
    print(chainDF.shape)
    dfs.append(chainDF)

# Place the per-chain incidence DataFrames side by side (column-wise concatenation).
combined_incDF = pd.concat(dfs,axis=1)
combined_incDF.shape

In [None]:
# Heatmap of the averaged matrix.
# NOTE(review): the title does not mention that this is an average, and the 'balanced'
# colorbar label looks carried over from a Hi-C plot — confirm both.
plt.figure(figsize=(6, 4))
im = plt.imshow(avgMat, cmap="YlGnBu")
plt.colorbar(im, fraction=0.046, pad=0.04, label='balanced');
plt.title("Distance matrix heatmap")
plt.show()

In [None]:

# Draw one heatmap per sampled chain on a two-column grid. The number of rows is derived
# from how many matrices were loaded, so changing `sampleSize` no longer breaks the layout
# (a fixed 2x2 grid raises once a fifth panel is requested).
n_cols = 2
n_rows = max(1, -(-len(chainDFs) // n_cols))  # ceiling division without importing math
plt.figure(figsize=(8, 4 * n_rows))  # 8x8 for the four-chain case

for i, matrix in enumerate(chainDFs):
    plt.subplot(n_rows, n_cols, i + 1)
    im = plt.imshow(matrix, cmap="YlGnBu")
    plt.colorbar(im, fraction=0.05, pad=0.04)
    plt.title(f"Chain {i+1}")

plt.tight_layout()
plt.show()

In [None]:
# Compare the top-left 4x4 corner of each sampled matrix with the same corner of their average.
# The explicit indices 0-3 assume exactly four matrices were sampled.
print(chainDFs[0][0:4,0:4])
print(chainDFs[1][0:4,0:4])
print(chainDFs[2][0:4,0:4])
print(chainDFs[3][0:4,0:4])
print(avgMat[0:4,0:4])


In [None]:
# NOTE(review): this rebinds `oneChain`, which held a raw matrix inside the sampling loop, to the
# result of makeIncDF_fromChainDists on the averaged matrix; that helper's import is commented out
# in the first cell.
oneChain = makeIncDF_fromChainDists(avgMat)
oneChain.shape

In [None]:
from matplotlib.colors import BoundaryNorm, ListedColormap

# Two-colour map: values in [0, 0.5) use the first colour, values in [0.5, 1] the second.
my_colors = ["lightyellow",'darkblue']
my_cmap = ListedColormap(my_colors)
bounds = [0, 0.5, 1]
my_norm = BoundaryNorm(bounds, ncolors=len(my_colors))

plt.figure(figsize=(6, 7))
im = sns.heatmap(combined_incDF, cmap=my_cmap,norm = my_norm)
plt.title("Incidence DF")
plt.show()

In [None]:
## Convert incidence matrix to 2d hiC matrix
def makeHiC_fromInc(incDF):
    """Project an incidence DataFrame onto a square bin-by-bin count matrix.

    Parameters
    ----------
    incDF : pandas.DataFrame
        Rows are bins, columns are reads; a cell equal to 1 means the read
        contains that bin. Any other value is treated as "not a member".

    Returns
    -------
    pandas.DataFrame
        Float matrix indexed and labelled by ``incDF.index`` on both axes.
        Entry ``(i, j)`` is the number of reads containing both bin ``i`` and
        bin ``j``; the diagonal holds the number of reads containing each bin.
    """
    binIDs = list(incDF.index)
    # Membership indicator per (bin, read). Its Gram matrix counts, for every
    # pair of bins, the reads they share. This is the same tally as
    # incrementing the diagonal once per member and both off-diagonal cells
    # once per pair, but does not rely on chained-indexing assignment.
    membership = (incDF.to_numpy() == 1).astype(float)
    counts = membership @ membership.T
    return pd.DataFrame(counts, index=binIDs, columns=binIDs)

In [None]:
# NOTE(review): `df` is not assigned by any notebook-level statement above this cell, so this call
# presumably relied on a variable from a since-removed cell — confirm which incidence DataFrame
# (e.g. combined_incDF) was intended.
hic_mat = makeHiC_fromInc(df)

In [None]:
from matplotlib.colors import LogNorm
# Projection matrix on a log colour scale with explicit limits 0.05 and 100.
plt.figure(figsize=(6, 4))
im = plt.imshow(hic_mat, cmap="YlOrRd",norm = LogNorm(vmax=100, vmin = 0.05))
plt.colorbar(im, fraction=0.046, pad=0.04, label='balanced');
plt.title(f"Projection matrix: threshold = {cutoff}")
plt.show()

In [None]:
#chain_dir = f'{dataDir}chains_500_10000_1500_1681171613/'
chain_dir = f'{dataDir}chains_500_10000_1500_1696950861/'
num_files = 1000
# RealHiC is project-local; NOTE(review): presumably aggregates the first `num_files` distance
# files of `chain_dir` into one matrix — confirm against `chains.RealHiC`.
hic_processor = RealHiC(chain_dir,num_files)
hic1k = hic_processor.distFilesToRealHiC()

In [None]:
from matplotlib.colors import LogNorm
# Same plot style as the projection matrix, but with LogNorm limits taken from the data.
plt.figure(figsize=(6, 4))
im = plt.imshow(hic1k, cmap="YlOrRd",norm = LogNorm())
plt.colorbar(im, fraction=0.046, pad=0.04, label='balanced');
plt.title(f"Real HiC matrix")
plt.show()

In [None]:
# Binary view of the first example matrix: 1 where the entry is below 500, else 0.
# The literal 500 equals the `cutoff` value set near the top of the notebook.
np.where(exMat1 < 500, 1, 0)

In [None]:
# Column sums of `df`. NOTE(review): `df` is undefined at notebook level (see the projection cell).
card = df.sum()

In [None]:
# Generate a histogram
plt.figure(figsize=(6,4))
plt.hist(card, bins='auto')

# Add labels and title
plt.xlabel('Cardinality')
plt.ylabel('Frequency')
plt.title('Histogram of cardinality from chain')

# Display the histogram
plt.show()

In [None]:
# Build two averaged matrices from independent five-file samples (seeds 101 and 102) and
# convert each one to an incidence DataFrame.
# NOTE(review): the import of makeIncDF_fromChainDists is commented out in the first cell.
avgMat1 = averageNChains(seed=101,sampleSize=5,matching_files = matching_files)
chains1 = makeIncDF_fromChainDists(avgMat1)
print(chains1.shape)

avgMat2 = averageNChains(seed=102,sampleSize=5,matching_files = matching_files)
chains2 = makeIncDF_fromChainDists(avgMat2)
print(chains2.shape)

In [None]:
# Side-by-side visual check of the two averaged matrices.
plt.figure(figsize=(6, 4))
im = plt.imshow(avgMat1, cmap="YlGnBu")
plt.colorbar(im, fraction=0.046, pad=0.04, label='balanced');
plt.title("Avg Mat - 1")
plt.show()

plt.figure(figsize=(6, 4))
im = plt.imshow(avgMat2, cmap="YlGnBu")
plt.colorbar(im, fraction=0.046, pad=0.04, label='balanced');
plt.title("Avg Mat - 2")
plt.show()

In [None]:
# NOTE(review): increaseIncDF_binSize is only defined in a later cell of this notebook, so this
# cell fails on a fresh top-to-bottom run.
chains1_bin5 = increaseIncDF_binSize(chains1,5)
chains2_bin5 = increaseIncDF_binSize(chains2,5)

In [None]:
# NOTE(review): `import hypernetx as hnx` is commented out in the first cell, so `hnx` is undefined
# on a fresh kernel. The unbinned chains1/chains2 are used here, not the *_bin5 versions.
H1 = hnx.Hypergraph.from_incidence_dataframe(chains1)
H2 = hnx.Hypergraph.from_incidence_dataframe(chains2)

In [None]:
# Represent every hyperedge as a tuple of its members so that edges can be compared as set elements.
h1_nodes = []
for e in H1.edges:
    h1_nodes.append(tuple(H1.edges[e]))

h2_nodes = []
for e in H2.edges:
    h2_nodes.append(tuple(H2.edges[e]))

In [None]:
# Hyperedges present in both hypergraphs, followed by the edge count of each hypergraph.
# Tuple equality is order-sensitive; NOTE(review): confirm members are listed in a consistent order.
common_elements = set(h1_nodes).intersection(set(h2_nodes))
print(len(common_elements),len(h1_nodes),len(h2_nodes))

In [None]:
# Displays the entire set of shared hyperedges.
common_elements

## SCRATCHPAD 

In [None]:
# Rebinds chains1/chains2 to incidence DataFrames built from the two single example matrices.
# NOTE(review): the second argument 500 presumably is the distance cutoff — confirm against the
# helper, whose import is commented out in the first cell.
chains1 = makeIncDF_fromChainDists(exMat1,500)
chains2 = makeIncDF_fromChainDists(exMat2,500)

In [None]:
# Coarsen to bins of 5 rows, then keep only the columns whose column sum is at least 2.
# NOTE(review): increaseIncDF_binSize is defined in a later cell of this notebook.
chains1by5 = increaseIncDF_binSize(chains1,5)
chains1by5 = chains1by5.loc[:,chains1by5.sum() >= 2]

In [None]:
# Same coarsening and filtering for the second example.
chains2by5 = increaseIncDF_binSize(chains2,5)
chains2by5 = chains2by5.loc[:,chains2by5.sum() >= 2]

In [None]:
def increaseIncDF_binSize(df,binSize):
    """Coarsen an incidence DataFrame by merging consecutive rows into bins.

    Parameters
    ----------
    df : pandas.DataFrame
        Incidence matrix. Rows are selected with label slices
        ``start:start+binSize-1`` via ``.loc``.
        NOTE(review): this assumes the index is the default 0..n-1 integer
        range — confirm for callers that pass a filtered frame.
    binSize : int
        Number of consecutive rows merged into one output row; must be >= 1.

    Returns
    -------
    pandas.DataFrame
        One row per complete bin, named ``"Bin{start}:{end}"`` with an
        inclusive end. A column's value is 1 when the summed rows are positive;
        otherwise the (non-positive) sum is kept unchanged. Trailing rows that
        do not fill a complete bin are dropped.

    Raises
    ------
    ValueError
        If ``binSize`` is smaller than 1.
    """
    if binSize < 1:
        raise ValueError("binSize must be a positive integer")

    result = []
    names = []
    # The "+ 1" keeps the final complete bin when len(df) is an exact multiple
    # of binSize; without it that bin is silently dropped.
    for start in range(0, len(df) - binSize + 1, binSize):
        end = start + binSize - 1
        summed_value = df.loc[start:end, :].sum()
        summed_value[summed_value > 0] = 1
        names.append(f"Bin{start}:{end}")
        result.append(summed_value)
    return pd.DataFrame(result, index=names)

In [None]:
def dfToDict(df,result_dict):
    """Count identical columns of an incidence DataFrame by their member set.

    For each column, the index labels of the rows equal to 1 are joined with
    ``'_'`` to form a key, and ``result_dict[key]`` is incremented by one.

    Parameters
    ----------
    df : pandas.DataFrame
        Incidence matrix. Index labels are converted with ``str`` before
        joining, so non-string labels (e.g. integers) are accepted.
    result_dict : dict
        Accumulator mapping key -> count. It is updated in place, so counts
        can be accumulated over several DataFrames.

    Returns
    -------
    dict
        The same ``result_dict`` object that was passed in.
    """
    for _, column in df.items():
        members = df.index[(column == 1).to_numpy()]
        # A column with no entry equal to 1 yields the empty-string key.
        key = '_'.join(map(str, members))
        result_dict[key] = result_dict.get(key, 0) + 1
    return result_dict

In [None]:
def dictToDF(hpDict):
    """Expand a ``{'a_b_c': count}`` dict into a 0/1 incidence DataFrame.

    Parameters
    ----------
    hpDict : dict
        Maps ``'_'``-joined row labels to a count.

    Returns
    -------
    pandas.DataFrame
        One row per distinct label, in order of first appearance across the
        keys (so the layout is reproducible between runs), and one column per
        dict entry named ``"Read{n}:{count}"`` where ``n`` is the entry's
        1-based position. A cell is 1 when the label occurs in that entry's
        key, otherwise 0. An empty ``hpDict`` yields an empty DataFrame.
    """
    split_keys = [key.split('_') for key in hpDict]
    if not split_keys:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()

    # dict.fromkeys de-duplicates while keeping first-seen order; a plain set
    # would make the row order depend on string hashing.
    indices = list(dict.fromkeys(label for labels in split_keys for label in labels))

    columns = []
    colnames = []
    for counter, (labels, value) in enumerate(zip(split_keys, hpDict.values()), start=1):
        column = pd.Series(0, index=indices)  # start from an all-zero column
        column[labels] = 1
        colnames.append(f"Read{counter}:{value}")
        columns.append(column)

    df = pd.concat(columns, axis=1)
    df.columns = colnames
    return df

In [None]:
# Count the distinct columns of each binned example separately. A fresh dict is created before
# each call because dfToDict is handed the dict as its accumulator and returns it.
emptyDict = {}
chains1_dict = dfToDict(chains1by5,emptyDict)
emptyDict = {}
chains2_dict = dfToDict(chains2by5,emptyDict)


In [None]:
# Rebuild an incidence DataFrame from the counted dict and display its shape.
chains1_new = dictToDF(chains1_dict)
chains1_new.shape

In [None]:
def constructFullDict(listOfDFs):
    """Merge the column counts of several incidence DataFrames into one dict.

    Each DataFrame in ``listOfDFs`` is passed to ``dfToDict`` together with
    the running dict, and the value returned by that call becomes the new
    running dict. An empty list yields an empty dict.
    """
    edge_counts = {}
    for incidence in listOfDFs:
        edge_counts = dfToDict(incidence, edge_counts)
    return edge_counts

In [None]:
# NOTE(review): `result_dict` is only ever a local variable inside functions in this notebook and
# is not assigned at notebook level — presumably it was meant to be the return value of
# constructFullDict([...]); confirm and assign it explicitly.
fullDF = dictToDF(result_dict)

In [None]:
fullDF.shape

In [None]:
# Displays the whole DataFrame.
fullDF

## Trying to make things more efficient

In [1]:
import numpy as np
import pandas as pd
import argparse

import sys
sys.path.append('/gpfs/commons/groups/gursoy_lab/ajoglekar/Projects/2023_03_01_multiwayInteractions/v0.analysis/scripts/pythonScripts/functions/')
from incidenceToProjection import makeHiC_fromInc
from chains import IncDFCreator, increaseIncDF_binSize

In [2]:
# Data root and run sub-directory; `inputDir` already ends with '/'.
dataDir = '/gpfs/commons/groups/gursoy_lab/ajoglekar/Projects/2023_03_01_multiwayInteractions/2023_03_01_v0_dataGathering/v0_hypergraphSimulations/getMultiwayInteractions_fromBPChains/'
inputDir = 'chains_500_10000_1500_1681171613/'

# Arguments handed to IncDFCreator below.
# NOTE(review): their exact meaning is defined in the project-local `chains` module — confirm there.
prim_cutoff = 500
sec_cutoff = 550
numProcesses = 4
offDiagLim = 3

In [3]:
fileNum = 5
# The path contains '//' because inputDir ends with '/' and the f-string adds another one.
exMat = np.loadtxt(f'{dataDir}{inputDir}/chain_dist_{fileNum}.txt')

In [4]:
creator = IncDFCreator(numProcesses, prim_cutoff, sec_cutoff, offDiagLim)
exChain = creator.makeIncDF_fromChainDists_mp(exMat)

In [28]:
# Re-runs the same pipeline for file 7, overwriting `fileNum`, `exMat` and `exChain`.
fileNum = 7
exMat = np.loadtxt(f'{dataDir}{inputDir}/chain_dist_{fileNum}.txt')
exChain = creator.makeIncDF_fromChainDists_mp(exMat)

In [7]:
exChain.to_parquet(f'{dataDir}/tmp0.pq',compression="gzip")

In [30]:
exChain.to_hdf(f'{dataDir}tmp.h5', key='df7', mode='a')

In [12]:
exChain.to_pickle(f'{dataDir}/tmp0.pkl')

In [11]:
a1 = pd.read_parquet(f'{dataDir}/tmp0.pq')

In [13]:
a2 = pd.read_pickle(f'{dataDir}/tmp0.pkl')

In [33]:
a3 = pd.read_hdf(f'{dataDir}tmp.h5',key='7')

In [35]:
def constructFullDict(numFiles):
    """Accumulate column counts over numbered pickled incidence DataFrames.

    For ``ix`` in ``1..numFiles`` the file
    ``{dataDir}/{inputDir}/binConcatInc_{offDiagDist}_600_750_{ix}.pkl`` is
    read (``dataDir``, ``inputDir`` and ``offDiagDist`` are notebook-level
    variables) and folded into the running dict with ``dfToDict``. Paths that
    are not existing files are skipped without any message.

    Returns
    -------
    tuple
        ``(result_dict, numEdges)``: the accumulated dict, and a list holding
        the dict's size after each file that was actually read.
    """
    edgeCounts = {}
    sizeAfterEachFile = []
    for ix in range(1, numFiles + 1):
        filePath = f'{dataDir}/{inputDir}/binConcatInc_{offDiagDist}_600_750_{ix}.pkl'
        if not os.path.isfile(filePath):
            continue
        # Unpickling executes arbitrary code; only read files this project wrote.
        edgeCounts = dfToDict(pd.read_pickle(filePath), edgeCounts)
        sizeAfterEachFile.append(len(edgeCounts))
    return (edgeCounts, sizeAfterEachFile)

In [42]:
import numpy as np
import pandas as pd
from itertools import combinations
import random
import os.path
import pickle

import sys
sys.path.append('/gpfs/commons/groups/gursoy_lab/ajoglekar/Projects/2023_03_01_multiwayInteractions/v0.analysis/scripts/pythonScripts/functions/')
from chains import dfToDict

# Rebinds `inputDir` to a different sub-directory than the earlier cells used. `offDiagDist` is
# interpolated into the file names built by constructFullDict.
inputDir = 'chains_10k_500_projectionMtxOutput/'
offDiagDist = 3

In [43]:
# Accumulate over files 1..5. The `dfToDict` imported from `chains` in this cell group replaces
# any earlier notebook-level definition of that name.
hpEdges, numEdges = constructFullDict(5)

In [44]:
def constructFullDict_h5(numFiles):
    """Accumulate column counts over DataFrames stored in one HDF5 file.

    The file ``{dataDir}/{inputDir}/binConcatInc_3_500_550.h5`` (built from
    the notebook-level ``dataDir`` and ``inputDir``) is opened once per index;
    for ``ix`` in ``1..numFiles`` the frame stored under key ``df{ix}`` is read
    and folded into the running dict with ``dfToDict``. When the path is not
    an existing file, the iteration is skipped without any message.

    Returns
    -------
    tuple
        ``(result_dict, numEdges)``: the accumulated dict, and a list holding
        the dict's size after each frame that was read.
    """
    edgeCounts = {}
    sizeAfterEachFrame = []
    for ix in range(1, numFiles + 1):
        filePath = f'{dataDir}/{inputDir}/binConcatInc_3_500_550.h5'
        if not os.path.isfile(filePath):
            continue
        frame = pd.read_hdf(filePath, key=f'df{ix}')
        edgeCounts = dfToDict(frame, edgeCounts)
        sizeAfterEachFrame.append(len(edgeCounts))
    return (edgeCounts, sizeAfterEachFrame)