# Getting started

This notebook illustrates the task at hand. Let's start by extracting the allele names from the genotypes.

In principle, alleles in the genotype string should be separated by spaces. Some will be wrong, but that's fine for now.

In [14]:
import pandas
import re

# Set of every distinct, lower-cased allele name found in the genotype column
allele_names = set()
data = pandas.read_csv('../data/strains.tsv', sep='\t')

# Empty genotypes are read as NaN (a float). Replace them with '' BEFORE casting to str:
# a bare astype(str) would turn NaN into the literal text 'nan', which would then be
# collected below as if it were an allele name.
data['genotype'] = data['genotype'].fillna('').astype(str)

for genotype in data.genotype:
    # str.split() with no argument splits on runs of whitespace and never yields the
    # empty strings that leading/trailing whitespace (or an empty genotype) would
    # otherwise add to the set
    allele_names.update(allele.lower() for allele in genotype.split())



We now want to get an idea of the different ways in which alleles have been written. Some of the inconsistencies will be the same for many genes, so we will replace the name of the gene in each allele with the word `GENE`.

In [15]:
# Read all gene names and identifiers

# Containers filled by add_gene_name()
systematic_ids = set()   # identifiers shaped like 'SP<something>.<digits>' with an optional trailing 'c'
gene_names = set()       # names shaped like three lowercase letters followed by digits
other = set()            # everything that fits neither convention

def add_gene_name(gene_name):
    """Store `gene_name` in the set that corresponds to its naming convention.

    The WHOLE string has to follow the convention (re.fullmatch). With a
    prefix-only re.match, strings such as 'cdc2-33' would be filed as gene
    names, and the optional trailing 'c' of the systematic-id pattern would
    have no effect.

    Args:
        gene_name (str): name or identifier to classify.

    Returns:
        None. One of `gene_names`, `systematic_ids` or `other` is updated.
    """
    if re.fullmatch(r'[a-z]{3}\d+', gene_name) is not None:
        gene_names.add(gene_name)
    elif re.fullmatch(r'SP.+\.\d+c?', gene_name) is not None:
        systematic_ids.add(gene_name)
    else:
        other.add(gene_name)

with open('../data/gene_IDs_names.tsv') as ins:
    # Skip the first line (presumably a header row; confirm against the file)
    ins.readline()
    for line in ins:
        columns = line.strip().split('\t')
        # The first column always exists (split never returns an empty list);
        # the second one is registered only when the line has it
        for single_name in columns[:2]:
            add_gene_name(single_name)
        # The third column, when present, may hold several comma-separated names.
        # Splitting a value without commas simply yields that value unchanged.
        if len(columns) > 2:
            for listed_name in columns[2].split(','):
                add_gene_name(listed_name)

# There are some gene synonyms that fall out of the naming conventions. For now we can leave them outside
# print(other)


Now let's replace the gene names in the allele names with `GENE` and store the results in a list. Then let's see which patterns occur most often in that list.

In [16]:
def _replace_gene_references(allele, known_gene_names, known_systematic_ids):
    """Return `allele` with every recognised gene reference replaced by 'GENE'.

    Args:
        allele (str): lower-cased allele text.
        known_gene_names (set): gene names, e.g. 'cdc2'.
        known_systematic_ids (set): LOWER-CASED systematic ids, e.g. 'spac1002.01c'.

    Returns:
        str: the allele text with known gene references masked.
    """
    def masker(known):
        # re.sub calls this once per match, so only the matched span is replaced.
        # A findall + str.replace approach would also rewrite the same characters
        # inside longer names (replacing 'cdc2' turns 'cdc25' into 'GENE5').
        def mask(match):
            candidate = match.group(0)
            return 'GENE' if candidate in known else candidate
        return mask

    # Systematic ids go first so that a known id is masked as a whole before the
    # gene-name pattern can match a fragment inside it. The id body is restricted
    # to letters and digits so one match cannot run across several names.
    allele = re.sub(r'sp[a-z0-9]+\.\d+c?', masker(known_systematic_ids), allele)
    return re.sub(r'[a-z]{3}\d+', masker(known_gene_names), allele)


# The allele names were lower-cased when they were extracted, whereas systematic ids
# are stored starting with upper-case 'SP'. Compare them in lower case, otherwise no
# systematic id is ever found in an allele.
systematic_ids_lower = {identifier.lower() for identifier in systematic_ids}

alleles_with_replaced_name = [
    _replace_gene_references(allele, gene_names, systematic_ids_lower)
    for allele in allele_names
]


We can also replace the resistance markers with `MARKER` and the fluorescent tags with `TAG`.

In [17]:
markers = ['kanr','kanmx6','kanmx4','kanmx','hygr','hphr','natr','kan','natmx6']
tags = ['gfp','mcherry']

# Rewrite every entry of the list in place. Needles are applied in list order, which
# matters for the markers: the longer 'kanmx6' / 'kanmx4' / 'kanmx' / 'kanr' must be
# consumed before the bare 'kan'. str.replace is a no-op when the needle is absent,
# so no membership check is needed.
for position, allele in enumerate(alleles_with_replaced_name):
    for marker in markers:
        allele = allele.replace(marker, 'MARKER')
    for tag in tags:
        allele = allele.replace(tag, 'TAG')
    alleles_with_replaced_name[position] = allele

In [18]:
from collections import Counter

counted = Counter(alleles_with_replaced_name)
# most_common() without an argument returns every (pattern, count) pair,
# ordered from the most to the least frequent
result = counted.most_common()

# Write one "<pattern> <count>" line per entry. The encoding is explicit so the
# output does not depend on the platform default, which may be unable to encode
# non-ASCII characters present in the allele text.
with open('dummy00.txt', 'w', encoding='utf-8') as out:
    for pattern, count in result:
        out.write(f'{pattern} {count}\n')
