# Required libraries

In [1]:
import pandas as pd
import numpy as np
import random

import seaborn as sns
import matplotlib.pyplot as plt

import HiCtoolbox
from scipy import sparse

# 1) Save all contact maps in CSV format to speed up data generation
##### required space: 17.6 MB



# 
## Path for folder containing all HiC file

The folder should contain the 25 kb HiC files, named as follows: `chrX_25kb.RAWobserved` (where `X` is the chromosome name).

In [2]:
# Folder holding the raw intrachromosomal HiC files (one per chromosome).
HiCpath = r"dataforstudent\HiC\GM12878\25kb_resolution_intrachromosomal"

## Path for destination folder

In [9]:
# Folder in which the per-chromosome contact-map CSV files are stored.
folder = "Chromosomes"

Leopold Caron's script to generate HIC matrices.

### Do not run the cell below if files have already been generated

In [None]:
# For every chromosome (1..22 and X): load the raw HiC triples, bin them,
# normalise the binned map and save it as a dense, space-delimited text file
# named "Chrom_<c>.csv" inside `folder`.
R = 25000      # bin size handed to HiCtoolbox.bin2d
alpha = 0.227  # exponent applied element-wise to the normalised map

for c in [x if x < 23 else "X" for x in range(1, 24)]:
    print("Chrom {}".format(c))
    # Fix: the path variable defined above is `HiCpath`; the previous spelling
    # `Hicpath` raised a NameError on a fresh kernel.
    HiCfilename = HiCpath + '\\chr{}_25kb.RAWobserved'.format(c)

    # Build matrix
    A = np.loadtxt(HiCfilename)
    A = np.int_(A)
    # Append the (col, row, value) version of every triple so both triangles
    # of the map are filled (build array at bp resolution).
    # NOTE(review): diagonal triples are appended a second time as well, and
    # sparse formats sum duplicate coordinates -- confirm this is intended.
    A = np.concatenate((A, np.transpose(np.array([A[:, 1], A[:, 0], A[:, 2]]))), axis=0)
    A = sparse.coo_matrix((A[:, 2], (A[:, 0], A[:, 1])))
    binned_map = HiCtoolbox.bin2d(A, R, R)  # !become csr sparse array
    LENTEST = np.shape(A)[0]
    del A  # keep space

    # FILTER
    # Bins whose total contact count lies within 1.5 standard deviations of
    # the mean. NOTE(review): `binsaved` is computed here but is not applied
    # to the map that gets saved below.
    sumHicmat = np.sum(binned_map, 0)
    mini = np.mean(sumHicmat) - np.std(sumHicmat) * 1.5  # min value of filtering
    maxi = np.mean(sumHicmat) + np.std(sumHicmat) * 1.5  # max value of filtering
    binsaved = np.where(np.logical_and(mini < sumHicmat, sumHicmat < maxi))  # coord of bin to save

    # Normalise with HiCtoolbox.SCN, then apply the alpha exponent.
    contact_map = HiCtoolbox.SCN(binned_map.copy())
    contact_map = np.asarray(contact_map) ** alpha  # now we are not sparse at all

    # "\\Chrom" is the same string the old "\Chrom" produced, without relying
    # on an invalid escape sequence.
    np.savetxt(folder + "\\Chrom_{}.csv".format(c), contact_map, delimiter=" ")

print("Done!")
    

# 2) Generates windows (33*33) and labels
##### required space: 267 Mb

Extracts all label 1 and label 2 windows and, for each such pair, one randomly chosen label 0 window.

## Data file's path

In [6]:
# Supplied input tables; both are read as space-separated text files.
arrH_file = r'data\annotated_position.csv'  # annotated positions (chr / start / end)
lab0 = r'data\labelzero.csv'                # label-0 positions (chr / pos)

## Destination folder

In [7]:
# Root folder for the generated windows; files are written into its
# sub-folders "0", "1" and "2" (one per label).
data_path = 'generated_windows'

# window generation
Create the numpy matrix for each position with label:
0: not a tad border
1: start of a tad
2: end of a tad

In [10]:
# Cut 33x33 windows out of the saved contact maps: one centred on each TAD
# start (label 1), one on each TAD end (label 2) and one on a randomly picked
# label-0 position, and write each window to its own text file.
print("Read data...")
R=25000                       #Resolution

# Annotated TAD positions, every column kept as a string.
x = pd.read_csv(arrH_file, sep=" ", dtype=str)
x0 = pd.read_csv(lab0, sep=" ")

half_window = 16                      # bins on each side of the centre bin
window_shape = (2 * half_window + 1,) * 2   # (33, 33)

# NOTE(review): only chromosomes 1..21 are processed, as before; maps for
# chromosome 22 and X exist in `folder` but are skipped -- confirm intent.
for c in range(1, 22):
    print("Read chromosome " + str(c) )
    # header=None: the map was written without a header line, so the first
    # row is data. Without it, row 0 was consumed as column names and every
    # row index was shifted by one relative to the columns.
    mat = np.array(pd.read_csv(folder + "\\Chrom_{}.csv".format(c), sep=" ", header=None))

    tads = x[x.chr == "chr" + str(c)]
    arr1 = np.array(tads["start"])
    arr2 = np.array(tads["end"])
    arr0 = np.array(x0[x0.chr == c]["pos"])
    np.random.shuffle(arr0)

    # One label-0 position is consumed per TAD; previously running out of
    # them raised an uncaught IndexError.
    if len(arr0) < len(arr1):
        print("Chromosome {}: only {} label-0 positions for {} TADs".format(
            c, len(arr0), len(arr1)))

    cpt = 0
    for start, end, neg in zip(arr1, arr2, arr0):
        #label 1 / label 2
        c1rd1 = int(start)
        c1rd2 = int(end)
        len1 = c1rd2 - c1rd1

        #label 0
        c0rd1 = int(neg)

        centres = (c0rd1, c1rd1, c1rd2)
        # A centre closer than half_window to the origin would give a
        # negative slice start, which wraps around instead of failing.
        if min(centres) < half_window:
            continue

        n1, p1, p2 = (
            mat[p - half_window:p + half_window + 1,
                p - half_window:p + half_window + 1]
            for p in centres
        )

        # Slices that run past the matrix edge are silently truncated, so
        # keep the triple only when all three windows are complete.
        if all(w.shape == window_shape for w in (n1, p1, p2)):
            np.savetxt(data_path + "\\0\\chrom_{}_{}_{}.csv".format(c,c0rd1,0), n1)
            np.savetxt(data_path + "\\1\\chrom_{}_{}_{}.csv".format(c,c1rd1,len1), p1)
            np.savetxt(data_path + "\\2\\chrom_{}_{}_{}.csv".format(c,c1rd2,len1), p2)
            cpt += 3
    print("Chromosome {}: {} windows".format(c, cpt))




Read Arrowhead file...
Read chromosome 1
Chromosome 1: 596 windows
Read chromosome 2
Chromosome 2: 550 windows
Read chromosome 3
Chromosome 3: 482 windows
Read chromosome 4
Chromosome 4: 368 windows
Read chromosome 5
Chromosome 5: 428 windows
Read chromosome 6
Chromosome 6: 442 windows
Read chromosome 7
Chromosome 7: 362 windows
Read chromosome 8
Chromosome 8: 326 windows
Read chromosome 9
Chromosome 9: 194 windows
Read chromosome 10
Chromosome 10: 358 windows
Read chromosome 11
Chromosome 11: 346 windows
Read chromosome 12
Chromosome 12: 344 windows
Read chromosome 13
Chromosome 13: 184 windows
Read chromosome 14
Chromosome 14: 218 windows
Read chromosome 15
Chromosome 15: 268 windows
Read chromosome 16
Chromosome 16: 196 windows
Read chromosome 17
Chromosome 17: 182 windows
Read chromosome 18
Chromosome 18: 212 windows
Read chromosome 19
Chromosome 19: 172 windows
Read chromosome 20
Chromosome 20: 210 windows
Read chromosome 21
Chromosome 21: 94 windows


Create the dataframe needed for the data generator

In [11]:
# Build the table consumed by the data generator: one row per window centre
# with columns chr / pos / class / length, shuffled and split 70 % / 30 %
# into "gentrain_data.txt" and "gentest_data.txt".
dat1 = pd.read_csv(arrH_file, sep = " ")
dat0 = pd.read_csv(lab0, sep = " ")

dat1["count"] = dat1["end"] - dat1["start"]          # TAD length
dat1["chr"] = dat1["chr"].str.replace("chr","")      # "chr1" -> "1"

# Every TAD contributes two rows: its start (class 1) and its end (class 2).
starts = pd.DataFrame({'chr': dat1["chr"], 'pos': dat1["start"],
                       'class': 1, 'length': dat1["count"]})
ends = pd.DataFrame({'chr': dat1["chr"], 'pos': dat1["end"],
                     'class': 2, 'length': dat1["count"]})
dat_full = pd.concat((starts, ends), axis=0, ignore_index=True)

dat0["class"] = 0
dat0["length"] = 0

# Draw as many class-0 rows as there are boundary rows, capped at the number
# available. The previous frac=len(dat_full)/len(dat0) raised ValueError when
# that ratio exceeded 1 and ZeroDivisionError when dat0 was empty.
n_negative = min(len(dat_full), len(dat0))
dat_full = pd.concat((dat_full, dat0.sample(n=n_negative)), axis=0)

sample_dat = dat_full.sample(frac=1)                 # shuffle all rows
frac = int(len(sample_dat)*0.7)                      # number of training rows
sample_dat[:frac].to_csv("gentrain_data.txt",index=False)
sample_dat[frac:].to_csv("gentest_data.txt",index=False)