In [1]:
from Bio import SeqIO, SeqRecord
from Bio.Seq import Seq

In [2]:
import re

# Chromosome names accepted for probe placement: chr1-chr22, chrX, chrY and chrM.
# Every alternative is spelled out so that strings which merely look similar
# ("chr", "chr0", "chr99", "chr1X", scaffold names, ...) are rejected by the
# pattern instead of failing later when they are used as an assembly key.
allowed_chromosomes = r"^chr(?:[1-9]|1[0-9]|2[0-2]|X|Y|M)$"

In [3]:
def load_genecode_chromosomes(filename):
    """Load every record of a FASTA file into a dictionary keyed by record name.

    Parameters
    ----------
    filename : str
        Path of the FASTA file to parse; it is handed to ``SeqIO.parse``.

    Returns
    -------
    dict
        Maps ``record.name`` to the parsed record. If two records share a
        name, the later one replaces the earlier one.
    """
    # Parse the file the caller asked for rather than a fixed path, so the
    # argument is actually honoured.
    return {record.name: record for record in SeqIO.parse(filename, "fasta")}

In [4]:
# Reference assembly records keyed by record name; used below as the
# `genecode_assembly` argument when placing probes.
all_chromosomes = load_genecode_chromosomes("annotations/GRCh37.primary_assembly.genome.fa")

In [5]:
# Coordinate conventions -- subtle, and easy to get off by one.
# Genomic coordinates are 1-based and inclusive on both ends.
# Python slices are 0-based, left-inclusive and right-exclusive.
# Shifting to 0-based turns (left, right) into (left - 1, right - 1).
# Making the right end exclusive adds 1 back: (right - 1) + 1 = right.
# So the genomic interval [left, right] is the Python slice [left - 1:right].

def genome_to_python(left, right):
    """Convert a 1-based inclusive genomic interval into Python slice bounds.

    Returns ``(left - 1, right)``: the start moves down by one to become
    0-based, while the inclusive end already equals the exclusive slice stop.
    """
    slice_start = left - 1
    slice_stop = right
    return slice_start, slice_stop

def python_to_genome(left, right):
    """Convert Python slice bounds into a 1-based inclusive genomic interval.

    Returns ``(left + 1, right)``: the start moves up by one to become
    1-based, while the exclusive slice stop already equals the inclusive end.
    """
    first_base = left + 1
    last_base = right
    return first_base, last_base

In [6]:
# Module-level placeholders; nothing in the visible part of the notebook reads them.
large_seqG = small_seqG = None
def find_coords_in_window(large_seq, small_seq, window_left, window_right):
    """Find ``small_seq`` inside a genomic window of ``large_seq``.

    ``window_left`` and ``window_right`` are 1-based inclusive genomic
    coordinates delimiting the part of ``large_seq`` that is searched.

    Returns the 1-based inclusive genomic coordinates ``(left, right)`` of the
    first occurrence of ``small_seq`` within that window.

    Raises ``IndexError`` when ``small_seq`` does not occur in the window.
    """
    start, stop = genome_to_python(window_left, window_right)
    window = large_seq[start:stop]
    hit = window.find(small_seq)
    if hit < 0:
        raise IndexError("small_seq not in large_seq")
    # The first base of the window sits at genomic position window_left, so an
    # offset of `hit` inside the window is genomic position window_left + hit.
    first_base = window_left + hit
    last_base = first_base + len(small_seq) - 1
    return first_base, last_base

In [7]:
# Round-trip self-check: locate a known subsequence through a genomic window,
# convert the genomic answer back to slice bounds, and make sure slicing with
# those bounds gives back exactly the subsequence that was searched for.
small_seq = Seq("CACAGCCACACTGCCGTCTACGGCC")
large_seq = Seq("ATTCGCACACAGCCACACTGCCGTCTACGGCCACCGCTC")
left, right = genome_to_python(
    *find_coords_in_window(large_seq, small_seq, 2, 37)
)
assert large_seq[left:right] == small_seq

In [8]:
# Probe metadata table read by probe_level_coords (rows are split on whitespace
# there, and the first row supplies the column names).
filename_in = "probesets/all_meta.csv"

In [9]:
def probe_into_plus(seq, strand):
    """Return ``seq`` reverse-complemented when ``strand`` is "+", else unchanged.

    ``seq`` must provide ``reverse_complement()``; for any other ``strand``
    value the very same object is handed back.

    NOTE(review): presumably a probe annotated "+" is stored as the complement
    of the plus-strand sequence, which is why that case is flipped -- confirm
    against the array annotation.
    """
    return seq.reverse_complement() if strand == "+" else seq

In [10]:
# Running count of probes on an accepted chromosome and strand whose sequence
# could not be found inside their annotated window.
count_mismatch = 0
def compute_probe_coord(seq, left_pos, right_pos, chrom, strand, headers, genecode_assembly):
    """Locate one probe inside its annotated genomic window.

    Parameters
    ----------
    seq : str
        Probe sequence as stored in the input table; it is reversed before use.
    left_pos, right_pos : int
        1-based inclusive genomic coordinates of the window to search.
    chrom : str
        Chromosome name; must match ``allowed_chromosomes``.
    strand : str
        "+" or "-"; any other value makes the probe be skipped.
    headers
        Unused; kept so existing callers keep working.
    genecode_assembly : mapping
        Chromosome name -> record whose ``.seq`` is the chromosome sequence.

    Returns
    -------
    tuple or None
        ``(seq, left_python, right_python)`` where ``seq`` is the reoriented
        probe and the two bounds slice it out of the chromosome sequence, or
        ``None`` when the chromosome/strand is not accepted or the probe is not
        found in its window (the latter also increments ``count_mismatch``).

    Raises
    ------
    KeyError
        If ``chrom`` is accepted by the pattern but absent from
        ``genecode_assembly``.

    NOTE(review): the returned bounds follow the Python slice convention
    (0-based start, exclusive end), yet the caller writes them under the
    "genocode_left"/"genecode_right" columns -- confirm that is intended.
    """
    global count_mismatch
    if not (re.match(allowed_chromosomes, chrom) and strand in ("+", "-")):
        return None

    # Affymetrix keeps their probe sequences in 3' -> 5'!
    seq = probe_into_plus(Seq(seq[::-1]), strand)
    large_sequence = genecode_assembly[chrom].seq

    try:
        # A single search both decides whether the probe is present and yields
        # its position, so the window is sliced and scanned only once.
        left, right = find_coords_in_window(large_sequence, seq, left_pos, right_pos)
    except IndexError:
        count_mismatch += 1
        return None

    left_python, right_python = genome_to_python(left, right)
    # Sanity check: slicing the chromosome with the reported bounds must give
    # back the probe sequence.
    assert large_sequence[left_python:right_python] == seq
    return seq, left_python, right_python

In [11]:
def probe_level_coords(filename_in, genecode_assembly, filename_out):
    """Place every probe of a metadata table on the assembly and write the hits.

    Parameters
    ----------
    filename_in : str
        Whitespace-delimited table whose first non-blank row names the columns.
        The columns "seq", "left_pos", "right_pos", "chromosome", "strand",
        "x" and "y" are required.
    genecode_assembly : mapping
        Passed through to ``compute_probe_coord``.
    filename_out : str
        Tab-separated output file (overwritten). One row is written per probe
        for which ``compute_probe_coord`` returns a result.

    Rows whose ``left_pos``/``right_pos`` are not integers are skipped, as are
    blank lines.

    Raises
    ------
    KeyError
        If a required column is missing from the header row.
    """
    wanted_columns = ["seq", "left_pos", "right_pos", "chromosome", "strand", "x", "y"]
    headers = None
    column_indexes = None
    with open(filename_out, "w") as f_out:
        # Column names are part of the output format and are written as-is.
        print("seq5to3plus", "chrom", "genocode_left", "genecode_right", "strand", "x", "y", sep="\t", file=f_out)
        with open(filename_in) as f_in:
            for raw_line in f_in:
                # split() without arguments already drops surrounding whitespace.
                fields = raw_line.split()
                if not fields:
                    continue
                if headers is None:
                    headers = {name: position for position, name in enumerate(fields)}
                    # Resolve the wanted columns once instead of on every row.
                    column_indexes = [headers[name] for name in wanted_columns]
                    continue

                seq, left_pos, right_pos, chrom, strand, x, y = [fields[index] for index in column_indexes]
                try:
                    left_pos = int(left_pos)
                    right_pos = int(right_pos)
                except ValueError:
                    # Coordinates that are not integers: nothing to place.
                    continue

                result = compute_probe_coord(seq, left_pos, right_pos, chrom, strand, headers, genecode_assembly)
                if result is not None:
                    seq, probe_left, probe_right = result
                    print(seq, chrom, probe_left, probe_right, strand, x, y, sep="\t", file=f_out)

In [12]:
# Run the placement over the whole metadata table; results are written to the
# tab-separated file named here and misses accumulate in `count_mismatch`.
probe_level_coords(filename_in, all_chromosomes, "probesets/probesets_correct_coords.csv")

In [14]:
# Number of probes on an accepted chromosome and strand whose sequence was not
# found inside the annotated window.
print(count_mismatch)

60006
