# Finding neighbours in the genomes

In [7]:
import os
from iteration_utilities import deepflatten, Iterable
import pandas as pd
from Bio import SeqIO
from collections import defaultdict

wdir = "711_genomes"
interesting_cogs = set([
    "COG2426",
    "COG4769",
    "COG4720",
    "COG2456",
    "COG4708",
    "COG4035",
    "COG4042",
    "COG4039",
    "COG4036",
    "COG4078",
    "COG0650",
    "COG2245",
    "COG4059",
    "COG4060",
    "COG4061",
    "COG3872",
    "COG1284",
    "COG1967",
    "COG4657",
    "COG4660",
    "COG4025",
    "COG1811",
    "COG3086",
    "COG2155",
    "COG3601",
    "COG3371",
    "COG2431",
    "COG3815",
    "COG2383",
    "COG1006",
    "COG4089",
    "COG1347",
    "COG1814",
    "COG4512",
    "COG0428",
    "COG5522",
    "COG1268",
    "COG2209",
    "COG2237",
    "COG2212",
    "COG3859" 
])  # specify cogs


def get_location(description):
    """be careful: cannot resolve around 1st nucleotide"""
    res = dict()
    res["strand"], res["loc"] = description.split("|")[4:6]
    res["strand"] = int(res["strand"])
    loc = res["loc"].split("..")
    try:
        l1 = int(loc[0])
    except ValueError:
        l1 = int(loc[0][1:])
    try:
        l2 = int(loc[-1])
    except ValueError:
        l2 = int(loc[-1][1:])
    res["loc"] = (l1, l2)
    return res
        


def get_operons(seqio_handler, max_dist=80):
    """accepts sorted by coordinates genes in fasta format. 5 - strand, 6 - coordinates"""
    operons_list = [[]]
    current_operon = -1
    old_location = location = {"strand": 0, "loc":(0, 0)}
    for seq in seqio_handler:
        location = get_location(seq.description)
        if location["strand"] == old_location["strand"] and location["loc"][0] - old_location["loc"][1] < max_dist:
            operons_list[current_operon].append(seq)
            # if location["loc"][0] - old_location["loc"][1] < 0:  # check bypassing genome's first nucleotide
                # print("Warning! 1 nucleotide passed.")
        else:
            current_operon += 1
            operons_list.append([])
            operons_list[current_operon].append(seq)
        old_location = location
    return operons_list


def gi_to_cog(df, gi):
    try:
        series = df.loc[gi][1]
        if isinstance(series, str):
            return [series]
        else:
            return list(series)
    except KeyError:
        return 0

In [10]:
res_dict = defaultdict(lambda: defaultdict(int))
total_operons = defaultdict(int)
all_closest_neighbours = defaultdict(list) # only for insteresting cog
for folder in filter(lambda x: os.path.isdir(wdir+"/"+x), os.listdir(wdir)):
    print("Processing {}...".format(folder))
    operons = get_operons(SeqIO.parse("{}/{}/p_{}.fasta".format(wdir, folder, folder), "fasta"))  # get grouped sequences
    try:
        gi_to_cogs = pd.read_table("{}/{}/c_{}.txt".format(wdir, folder, folder), index_col=0, header=None) # table of gis
    except:
        gi_to_cogs = pd.DataFrame({0:[1]})  # if table is empty
    # Seq to COGs
    cog_operons = []
    for t in operons:
        cog_operons.append([])
        for x in t:
            xcogs = gi_to_cog(gi_to_cogs, int(x.id.split("|")[1]))
            if xcogs != 0:
                for xcog in xcogs:
                    if xcog in interesting_cogs:
                        if xcog not in all_closest_neighbours.keys():
                            all_closest_neighbours[xcog] = t[:]
                        else:
                            all_closest_neighbours[xcog] += t
            cog_operons[-1].append(xcog)
    # constructing table as dict (count collocations of every pair of COGs)
    for operon in cog_operons:
        cogs_in_operon = set(deepflatten(operon, ignore=str))
        for pair in Iterable(cogs_in_operon).combinations(2):
            res_dict[pair[0]][pair[1]] += 1
            # symmetric pair
            res_dict[pair[1]][pair[0]] += 1
            # addd one operon to counter
        for x in cogs_in_operon:
            total_operons[x] += 1
# [index cog] needs [column cog] coefficient -> 1
for icog in res_dict:
    for ccog in res_dict[icog]:
        res_dict[icog][ccog] /= total_operons[icog]

Processing NC_011061...
Processing NC_017845...


  


Processing NC_015704...
Processing NC_014006...
Processing NC_009727...
Processing NC_019775...
Processing NC_001857...
Processing NC_013211...
Processing NC_005232...
Processing NC_004347...
Processing NC_021236...
Processing NC_019761...
Processing NC_011963...
Processing NC_000964...
Processing NC_014205...
Processing NC_013515...
Processing NC_000907...
Processing NC_010843...
Processing NC_013210...
Processing NC_009350...
Processing NC_020516...
Processing NC_014914...
Processing NC_007617...
Processing NC_023138...
Processing NC_008686...
Processing NC_005877...
Processing NC_019683...
Processing NC_015638...
Processing NC_009953...
Processing NC_008536...
Processing NC_020130...
Processing NC_002695...
Processing NC_017249...
Processing NC_008789...
Processing NC_002932...
Processing NC_014926...
Processing NC_007427...
Processing NC_006139...
Processing NC_013203...
Processing NC_012439...
Processing NC_014355...
Processing NC_008260...
Processing NC_020909...
Processing NC_01

In [11]:
for key in all_closest_neighbours:
    SeqIO.write(all_closest_neighbours[key], "{}.fasta".format(key), "fasta")
res_df = pd.DataFrame(res_dict)
res_df.fillna(0, inplace=True)
res_df.to_csv("conditional.csv") # output raw data

In [12]:
# name cogs
cog_names = pd.read_table("cognames2003-2014.tab", index_col=0)
res_dft = res_df.join(cog_names["name"])
# print best results for chosen COGs
for cog in interesting_cogs:
    try:
        res_dft.sort_values(cog, ascending=False).head(20)[["name", cog]].to_csv("besthits_{}.csv".format(cog))
    except KeyError:
        print("No hits for {}".format(cog))
# print best results for chosen COGs (reversed hits)
res_dft = res_df.T
res_dft = res_dft.join(cog_names["name"])
for cog in interesting_cogs:
    try:
        res_dft.sort_values(cog, ascending=False).head(20)[["name", cog]].to_csv("rbesthits_{}.csv".format(cog))
    except KeyError:
        print("No hits for {}".format(cog))

  


No hits for COG4042
No hits for COG4035
No hits for COG1347
No hits for COG2209
No hits for COG4025
No hits for COG0650
No hits for COG4042
No hits for COG4035
No hits for COG1347
No hits for COG2209
No hits for COG4025
No hits for COG0650


In [12]:
set(res_df.columns) & set(interesting_cogs)

{'COG0636',
 'COG1563',
 'COG1784',
 'COG1822',
 'COG1906',
 'COG1967',
 'COG2245',
 'COG2426',
 'COG2456',
 'COG4036',
 'COG4039',
 'COG4059',
 'COG4060',
 'COG4061',
 'COG4078',
 'COG4720',
 'COG4769'}