In [78]:
import os
import matplotlib.pyplot as plt 

In [35]:
# Root folder of the input data; each species has its own sub-folder.
parent = 'new-data/'

# Per species: the sub-folder name and the numeric prefix of its file names.
species1, species1_num = 'Campylobacter coli 76339', '1367491'
species2, species2_num = 'Candidatus Koribacter versatilis Ellin345', '204669'

# Suffixes appended to the numeric prefix to build the two file names.
info_path, link_path = '.protein.info.v11.5.txt', '.protein.links.v11.5.txt'

In [100]:
# process info files

def _read_protein_info(path):
    """Parse one protein info file into lookup tables.

    The first line is treated as a header and skipped. Every other non-blank
    line is split on whitespace: field 0 is used as the protein identifier,
    field 1 as the protein name, and fields 3 onwards (re-joined with single
    spaces) as the function description.

    Returns:
        A tuple ``(name_to_id, id_to_name, descriptions)`` where
        ``descriptions`` is a list of ``(name, description)`` pairs in file
        order (duplicates are kept so the caller can merge them).

    Raises:
        OSError: if the file cannot be opened.
        IndexError: if a non-blank row has fewer than two fields.
    """
    # NOTE: the rows are deliberately not stored in a variable called
    # `input`; doing so at notebook scope would hide the built-in input().
    with open(path) as f:
        rows = f.read().splitlines()[1:]

    name_to_id = {}
    id_to_name = {}
    descriptions = []
    for row in rows:
        fields = row.split()
        if not fields:
            # tolerate blank lines instead of failing with an IndexError
            continue
        protein_id, name = fields[0], fields[1]
        name_to_id[name] = protein_id
        id_to_name[protein_id] = name
        descriptions.append((name, ' '.join(fields[3:])))
    return name_to_id, id_to_name, descriptions


species1_dict, species1_dict_rev, species1_functions = _read_protein_info(
    parent + species1 + '/' + species1_num + info_path)
species2_dict, species2_dict_rev, species2_functions = _read_protein_info(
    parent + species2 + '/' + species2_num + info_path)

# protein names known for each species
species1_proteins = set(species1_dict)
species2_proteins = set(species2_dict)

# protein name -> function description, merged over both species
protein_functions = {}
for name, description in species1_functions:
    # within species 1 the last description seen for a name wins
    protein_functions[name] = description
for name, description in species2_functions:
    if name not in protein_functions:
        protein_functions[name] = description
    elif protein_functions[name] != description:
        # keep both texts when the stored description differs
        protein_functions[name] += '\nALTERNATIVELY, ' + description

# find proteins that are in both species PPIN
intersect = species1_proteins.intersection(species2_proteins)

In [58]:
# graph data structure: https://www.bogotobogo.com/python/python_graph_data_structures.php
class Vertex:
    """A graph node identified by `id`, with weighted links to other vertices."""

    def __init__(self, node):
        # `adjacent` maps each neighbouring Vertex object to the edge weight.
        self.adjacent = {}
        self.id = node

    def __str__(self):
        """Return the vertex id rendered as a string."""
        return str(self.id)

    def get_id(self):
        """Return the id this vertex was created with."""
        return self.id

    def get_neighbors(self):
        """Return a view of the neighbouring Vertex objects."""
        return self.adjacent.keys()

    def get_weight(self, neighbor):
        """Return the weight stored for `neighbor` (KeyError if not linked)."""
        return self.adjacent[neighbor]

    def add_neighbor(self, neighbor, weight=0):
        """Link this vertex to `neighbor`, overwriting any previous weight."""
        self.adjacent[neighbor] = weight

class Graph:
    """Undirected weighted graph stored as a dict of node id -> Vertex."""

    def __init__(self):
        self.vert_dict = {}    # node id -> Vertex
        self.num_vertices = 0  # number of vertices created via add_vertex

    def __iter__(self):
        """Iterate over the Vertex objects (not their ids)."""
        return iter(self.vert_dict.values())

    def add_vertex(self, node):
        """Return the vertex for `node`, creating it on first use.

        An id that is already present is returned as-is; previously this
        replaced the stored vertex (dropping all of its edges) and counted
        it a second time in `num_vertices`.
        """
        if node in self.vert_dict:
            return self.vert_dict[node]
        self.num_vertices = self.num_vertices + 1
        new_vertex = Vertex(node)
        self.vert_dict[node] = new_vertex
        return new_vertex

    def get_vertex(self, n):
        """Return the vertex with id `n`, or None when it is unknown."""
        return self.vert_dict.get(n)

    def add_edge(self, frm, to, cost = 0):
        """Connect `frm` and `to` in both directions with weight `cost`.

        Missing endpoints are created; an existing edge has its weight
        overwritten.
        """
        source = self.add_vertex(frm)
        target = self.add_vertex(to)

        source.add_neighbor(target, cost)
        target.add_neighbor(source, cost)

    def get_vertices(self):
        """Return a view of all node ids."""
        return self.vert_dict.keys()

In [59]:
# read graph files

def _load_graph(path):
    """Build a Graph from one protein links file.

    The first line is treated as a header and skipped. Every other non-blank
    line is split on whitespace: fields 0 and 1 are the two endpoint ids and
    field 2 is parsed as the integer edge weight.

    Raises:
        OSError: if the file cannot be opened.
        IndexError / ValueError: if a non-blank row has fewer than three
            fields or a non-integer weight.
    """
    # NOTE: the rows are deliberately not stored in a variable called
    # `input`; doing so at notebook scope would hide the built-in input().
    with open(path) as f:
        rows = f.read().splitlines()[1:]

    graph = Graph()
    for row in rows:
        fields = row.split()
        if not fields:
            # tolerate blank lines instead of failing with an IndexError
            continue
        # Graph.add_edge stores the link on both endpoints, so one call per
        # row is enough to get an undirected edge.
        graph.add_edge(fields[0], fields[1], int(fields[2]))
    return graph


species1_graph = _load_graph(parent + species1 + '/' + species1_num + link_path)
species2_graph = _load_graph(parent + species2 + '/' + species2_num + link_path)

In [83]:
# for each protein, compare connections

def _neighbors_of(vertex):
    """Return the neighbours of `vertex`, or an empty tuple for None.

    Graph.get_vertex returns None for an id that never appeared in the links
    file, i.e. a protein without any recorded interaction.
    """
    return vertex.get_neighbors() if vertex is not None else ()


# protein name -> shared-neighbour count divided by the mean size of the two
# neighbourhoods
fracs = {}
for protein in intersect:
    print('Investigating protein ' + protein)
    species1_vert = species1_graph.get_vertex(species1_dict[protein])
    species2_vert = species2_graph.get_vertex(species2_dict[protein])
    print('\tFound ' + str(species1_vert) + ' and ' + str(species2_vert))

    species1_ngh = _neighbors_of(species1_vert)
    species2_ngh = _neighbors_of(species2_vert)
    print('\tSize of neighborhood for species 1: ' + str(len(species1_ngh)))
    print('\tSize of neighborhood for species 2: ' + str(len(species2_ngh)))

    # translate species-specific ids to protein names so that the two
    # neighbourhoods can be compared
    s1_ngh_set = {species1_dict_rev[x.id] for x in species1_ngh}
    s2_ngh_set = {species2_dict_rev[x.id] for x in species2_ngh}

    ngh_intersect = s1_ngh_set.intersection(s2_ngh_set)
    print('\tSize of intersection of neighbors: ' + str(len(ngh_intersect)))
    mean_size = (len(species1_ngh) + len(species2_ngh)) / 2
    # two empty neighbourhoods share nothing: report 0.0 rather than divide by zero
    frac = len(ngh_intersect) / mean_size if mean_size else 0.0
    print('\tFraction of intersection: ' + str(frac))

    fracs[protein] = frac

print(fracs)

Investigating protein thiL
	Found 1367491.BN865_06830 and 204669.Acid345_4694
	Size of neighborhood for species 1: 101
	Size of neighborhood for species 2: 218
	Size of intersection of neighbors: 21
	Fraction of intersection: 0.13166144200626959
Investigating protein clpP
	Found 1367491.BN865_02180 and 204669.Acid345_1560
	Size of neighborhood for species 1: 255
	Size of neighborhood for species 2: 515
	Size of intersection of neighbors: 115
	Fraction of intersection: 0.2987012987012987
Investigating protein ddl
	Found 1367491.BN865_13210 and 204669.Acid345_4592
	Size of neighborhood for species 1: 193
	Size of neighborhood for species 2: 299
	Size of intersection of neighbors: 67
	Fraction of intersection: 0.27235772357723576
Investigating protein secE
	Found 1367491.BN865_16040c and 204669.Acid345_4682
	Size of neighborhood for species 1: 134
	Size of neighborhood for species 2: 710
	Size of intersection of neighbors: 95
	Fraction of intersection: 0.22511848341232227
Investigating pr

In [101]:
# keep only the proteins whose overlap fraction is above 0.4
filtered = {name: frac for name, frac in fracs.items() if frac > 0.4}

# plt.bar(filtered.keys(), filtered.values())
print(filtered.keys())
# list each retained protein followed by its tab-indented function text
for name in filtered:
    print(name, '\t' + protein_functions[name], sep='\n')

dict_keys(['efp', 'rplF', 'rimP', 'rplX', 'rplC', 'prfA', 'rpsT', 'rpmC', 'tsf', 'trmD', 'rplB', 'secY', 'rplI', 'rpmB', 'rpmI', 'rplV', 'rpsJ', 'hisS', 'rpmF', 'rplO', 'rplL', 'rplM', 'rpsF', 'glyQ', 'rpsH', 'rplS', 'frr', 'rpsK', 'ileS', 'rplP', 'rplT', 'rplR', 'rplN', 'rplY', 'pyrH', 'rpmG', 'lysS', 'rpsQ', 'infB', 'rplJ', 'rpsS', 'rbfA', 'rpsO', 'rpsG', 'smpB', 'pheS', 'rpsC', 'infC', 'purD', 'aspS', 'rpsR', 'rpmE', 'fusA', 'prfB', 'rplW', 'rpsM', 'rpsE', 'rplD', 'rplQ', 'rpmH', 'nusG', 'rplK', 'rplE', 'rplA', 'rpmA', 'rpsP', 'infA', 'ychF'])
efp
	Involved in peptide bond synthesis. Stimulates efficient translation and peptide-bond synthesis on native or reconstituted 70S ribosomes in vitro. Probably functions indirectly by altering the affinity of the ribosome for aminoacyl-tRNA, thus increasing their reactivity as acceptors for peptidyl transferase
rplF
	This protein binds to the 23S rRNA, and is important in its secondary structure. It is located near the subunit interface in th