In [193]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import pickle
from collections import Counter

In [97]:
## get intra-chromosomal genes

dataDir = '/gpfs/commons/groups/gursoy_lab/ajoglekar/Projects/2023_03_01_multiwayInteractions/2023_03_01_v0_dataGathering/v1_poreC_explore/'
chromSizes = pd.read_csv(f'{dataDir}hg38.chromSizes',sep="\t", names = ['chr','size']).set_index('chr')['size'].to_dict()
readConcatemersWClosestGene = f'{dataDir}NlaIII_GM12878_output_byChr/NlaIII_GM12878_chr19.gz'
colnames = ["chr","start","end","readID","readLen","readQual",
"geneChr","geneStart","geneEnd","strand","geneID","bioType","geneName","dist","ID"]

fullBed = pd.read_csv(readConcatemersWClosestGene,sep = "\t",names = colnames)

In [98]:
chr19 = fullBed[fullBed['chr']=="chr19"]
binSize = 5*10**5
chrBins = [x for x in range(0,chromSizes['chr19']+binSize,binSize)]
chr19_binned = pd.cut(chr19['start'],bins = chrBins, labels = ["Bin_"+str(i+1) for i in range(len(chrBins)-1)]).rename("binID")
chr19_wBinID = chr19.merge(chr19_binned,left_index=True,right_index=True)
chr19_wBinID.shape

(9069080, 16)

In [99]:
random.seed(100)
randomReadIdx = random.sample(list(chr19_wBinID['ID']),200000)
subsetDF = chr19_wBinID[chr19_wBinID['ID'].isin(randomReadIdx)]
subsetDF.shape

(686620, 16)

In [100]:
chr19IncDF = pd.pivot_table(data=subsetDF[['ID', 'binID']], index='binID', columns='ID', aggfunc=lambda x: 1, fill_value=0)
chr19IncDF.shape

(114, 194392)

In [187]:
colSums_chr19IncDF = chr19IncDF.sum()
intraChrom19 = chr19IncDF.loc[:, colSums_chr19IncDF >=2]

In [188]:
def getCardProbs(incDF):
    cardVec = incDF.sum()
    cardFreq = Counter(cardVec)
    total = sum(cardFreq.values())
    probCard = {key: (value/total) for key, value in cardFreq.items()}
    return(probCard)

In [162]:
def getLenDistrPerCard(incDF,card):
    readIx = incDF.columns[incDF.sum() == card]
    concatemerLen = []
    for ix in readIx:
        binList = list(incDF[ix])
        ixFirst = binList.index(1)
        ixLast = (len(binList) - 1) - binList[::-1].index(1)    ## Index base 0
        cL = ixLast - ixFirst + 1
        concatemerLen.append(cL)   ## Difference w.r.t. base 0 gives 1 less so add
    return(concatemerLen)

In [169]:
def plotHistOfLenGivenCard(lenDist,card):
    fig, ax = plt.subplots(figsize =(5, 4))
    ax.hist(lenDist,density = True,bins = binSpecs)
    plt.title(f"Length | Card for c= {card}")
    plt.show()
    return()

In [164]:
def getFreqPerCard(lenDist):
    lenCounts=Counter(lenDist)
    #plotHistOfLenGivenCard(lenDist,card)
    total = sum(lenCounts.values())
    freqLen = {key: (value/total) for key, value in lenCounts.items()}
    return(freqLen)

In [195]:
cardProbs = getCardProbs(chr19IncDF)

In [198]:
freqLenPerCard = dict()
for card in cardProbs.keys():
    lenDist = getLenDistrPerCard(chr19IncDF,card)
    freqLen = getFreqPerCard(lenDist)
    freqLenPerCard[card] = freqLen


In [196]:
print(cardProbs[1])
print(cardProbs[2])
print(cardProbs[3])

0.4668504876743899
0.21803880818140664
0.1314405942631384


In [192]:
print(freqLenPerCard[3][3])
print(freqLenPerCard[3][5])
print(freqLenPerCard[3][6])
print(freqLenPerCard[3][7])

0.1401902078196548
0.05060467300692732
0.037258815701929474
0.03377558608273649


In [200]:
# Save nested dictionary to a pickle file
with open(f'{dataDir}../v0_hypergraphSimulations/freqLensGivenCard.pkl', 'wb') as pickle_file:
    pickle.dump(freqLenPerCard, pickle_file)