In [1]:
#MIT License
#
#Copyright (c) 2021 Pierre Michel Joubert
#
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#
#The above copyright notice and this permission notice shall be included in all
#copies or substantial portions of the Software.
#
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#SOFTWARE.
import csv
import numpy as np

In [2]:
# locations of gene deletions in guy11 genome
# (read below as tab-separated columns: scaffold name, start, end)
region_file = 'uniq.all_small_deletions.bed'
# list of non-lowq eccdnas from all samples in guy11
# (only the first three tab-separated columns of this file are read below)
ecc_file = 'noduplicates.all.ecc_caller_out.details.nolowq.txt'

In [3]:
# read in
def _read_first_three_columns(path):
    """Return [col0, col1, col2] (all strings) for every non-blank line of a TSV file.

    Blank lines (for which csv.reader yields an empty row) are skipped instead
    of crashing on ``row[0]``. A non-blank row with fewer than three columns
    still raises IndexError, so malformed input is not silently accepted.
    """
    with open(path, newline = '') as handle:
        return [[row[0], row[1], row[2]]
                for row in csv.reader(handle, delimiter = '\t')
                if row]

# deletion regions: [scaffold name, start (int), end (int)]
regions = [[scaffold, int(start), int(end)]
           for scaffold, start, end in _read_first_three_columns(region_file)]

# eccDNAs: [scaffold name, start, end]; coordinates are kept as strings here
# and converted to int at comparison time
eccs = _read_first_three_columns(ecc_file)

In [4]:
## index and numpify for speed up
# One bucket per scaffold; the number of buckets (56) is hard-coded.
eccs_indexed = [[] for i in range(56)]
for ecc in eccs:
    # characters 10-11 of the scaffold name are parsed as a 1-based scaffold
    # number and turned into a 0-based bucket index
    scaffold_num = int(ecc[0][10:12])-1
    eccs_indexed[scaffold_num].append(ecc)
# reshape(-1, 3) keeps every array two-dimensional: a scaffold without any
# eccDNA would otherwise become a 1-D array of shape (0,), and column slicing
# such as arr[:,1] raises IndexError on it. Non-empty buckets already have
# shape (n, 3), so the reshape is a no-op for them.
eccs_arrays = [np.array(bucket, dtype=object).reshape(-1, 3)
               for bucket in eccs_indexed]

In [5]:
# check for overlaps between eccdnas and regions
# A region "overlaps" an eccDNA here when both the eccDNA start and its end lie
# within `tolerance` bases of the region's start and end respectively.
regions_with_overlap = []
tolerance = 10 # tolerance is important here
for region in regions:
    start_region = region[1]
    end_region = region[2]
    # same scaffold-name slicing as used when the eccDNAs were indexed
    eccs_for_scaffold = eccs_arrays[int(region[0][10:12])-1]
    # A scaffold without any eccDNA gives an empty array (1-D when built from
    # an empty list), which the column slicing below cannot index. Nothing can
    # match on such a scaffold, so skip it instead of raising IndexError.
    if eccs_for_scaffold.size == 0:
        continue
    # magic numpy comparison command with tolerance
    # (rtol=0 makes the comparison purely absolute: |ecc - region| <= tolerance)
    start_matches = np.isclose((eccs_for_scaffold[:,1]).astype(int), start_region, atol=tolerance, rtol=0)
    end_matches = np.isclose((eccs_for_scaffold[:,2]).astype(int), end_region, atol=tolerance, rtol=0)
    ecc_matches = eccs_for_scaffold[np.logical_and(start_matches, end_matches)]
    if np.shape(ecc_matches)[0] > 0:
        regions_with_overlap.append(ecc_matches)
        print(ecc_matches)
        print(region)

In [6]:
# total number of deletion regions read from region_file
print(len(regions))

257


In [7]:
# number of regions for which at least one eccDNA matched both start and end within `tolerance`
print(len(regions_with_overlap))

0
