Added junit tests and refactored STRING Loader to be more modular and easier for testing.

Current test coverage is 60%.
vrynkov committed Jul 18, 2019
1 parent 2eec8e3 commit f9a814c
Showing 2 changed files with 174 additions and 56 deletions.
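In outline, the refactor replaces parameter threading with instance state: the ensembl_ids dictionary (plus the two duplicate-tracking dictionaries) moves from locals inside run() into attributes created in __init__, so helpers such as _get_name_rep_alias() and create_output_tsv_file() can be called, and unit-tested, one at a time. A minimal sketch of the before/after pattern, assuming nothing beyond what the diff below shows (method bodies are illustrative, not the full class):

    class NDExSTRINGLoader(object):
        def __init__(self, args):
            self._args = args
            # shared state lives on the instance instead of as locals in run()
            self.ensembl_ids = {}
            self.duplicate_display_names = {}
            self.duplicate_uniprot_ids = {}

        def _get_name_rep_alias(self, ensembl_protein_id):
            # reads self.ensembl_ids rather than receiving the dict as a parameter
            return self.ensembl_ids[ensembl_protein_id]

A test can now inject fixture state directly (loader.ensembl_ids = {...}) and exercise a single method, which is exactly what the new tests in tests/test_ndexloadstring.py do.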
113 changes: 62 additions & 51 deletions ndexstringloader/ndexloadstring.py
@@ -201,6 +201,10 @@ def __init__(self, args):
self._output_tsv_file_name = os.path.join(self._datadir, '9606.protein.links.tsv')
self._cx_network = os.path.join(self._datadir, '9606.protein.links.cx')

+ self.ensembl_ids = {}
+ self.duplicate_display_names = {}
+ self.duplicate_uniprot_ids = {}


def _parse_config(self):
"""
@@ -262,8 +266,8 @@ def _unpack_STRING_files(self):
self._unzip(self._names_file + '.gz')
self._unzip(self._uniprot_file + '.gz')

- def _get_name_rep_alias(self, ensembl_protein_id, ensembl_ids):
- name_rep_alias = ensembl_ids[ensembl_protein_id]
+ def _get_name_rep_alias(self, ensembl_protein_id):
+ name_rep_alias = self.ensembl_ids[ensembl_protein_id]
use_ensembl_id_for_represents = False

if name_rep_alias['display_name'] is None:
@@ -308,7 +312,7 @@ def check_if_edge_is_duplicate(self, edge_key_1, edge_key_2, edges, combined_sco
return is_duplicate


- def create_output_tsv_file(self, ensembl_ids):
+ def create_output_tsv_file(self):

# generate output tsv file
output_file = self._output_tsv_file_name
@@ -345,8 +349,8 @@ def create_output_tsv_file(self, ensembl_ids):
dup_count += 1
continue

- name_rep_alias_1 = self._get_name_rep_alias(protein1, ensembl_ids)
- name_rep_alias_2 = self._get_name_rep_alias(protein2, ensembl_ids)
+ name_rep_alias_1 = self._get_name_rep_alias(protein1)
+ name_rep_alias_2 = self._get_name_rep_alias(protein2)

tsv_string = name_rep_alias_1 + '\t' + name_rep_alias_2 + '\t' + \
'\t'.join(x for x in columns_in_row[2:])
@@ -367,34 +371,21 @@ def _check_if_data_dir_exists(self):

return data_dir_existed

- def run(self):
- """
- Runs content loading for NDEx STRING Content Loader
- :param theargs:
- :return:
- """
- self._parse_config()
- self._load_style_template()
-
- data_dir_existed = self._check_if_data_dir_exists()
-
- if self._args.skipdownload is False or data_dir_existed is False:
- self._download_STRING_files()
- self._unpack_STRING_files()
+ def _get_headers_headers_of_links_file(self):
+ headers = None

- ensembl_ids = {}
- duplicate_display_names = {}
- duplicate_uniprot_ids = {}
+ with open(self._full_file_name, 'r') as f:
+ d_reader = csv.DictReader(f)
+ headers = ((d_reader.fieldnames)[0]).split()
+
+ return headers


- logger.info('\nLoading {} for reading...'.format(self._full_file_name))
-
- with open(self._full_file_name, 'r') as f:
- d_reader = csv.DictReader(f)
- headers = ((d_reader.fieldnames)[0]).split()
- logger.debug('{} loaded\n'.format(self._full_file_name))
+ def _init_ensembl_ids(self):
+
+ headers = self._get_headers_headers_of_links_file()

logger.debug('Preparing a dictionary of Ensembl Ids ...')

@@ -404,12 +395,32 @@ def run(self):
df.drop_duplicates(subset=headers[i], keep='first', inplace=True)

for index, row in df.iterrows():
- ensembl_ids[row[headers[i]]] = {}
- ensembl_ids[row[headers[i]]]['display_name'] = None
- ensembl_ids[row[headers[i]]]['alias'] = None
- ensembl_ids[row[headers[i]]]['represents'] = None
+ self.ensembl_ids[row[headers[i]]] = {}
+ self.ensembl_ids[row[headers[i]]]['display_name'] = None
+ self.ensembl_ids[row[headers[i]]]['alias'] = None
+ self.ensembl_ids[row[headers[i]]]['represents'] = None
+
+ logger.info('Found {:,} unique Ensembl Ids in {}\n'.format(len(self.ensembl_ids), self._full_file_name))


+ def run(self):
+ """
+ Runs content loading for NDEx STRING Content Loader
+ :param theargs:
+ :return:
+ """
+ self._parse_config()
+ self._load_style_template()
+
+ data_dir_existed = self._check_if_data_dir_exists()
+
+ if self._args.skipdownload is False or data_dir_existed is False:
+ self._download_STRING_files()
+ self._unpack_STRING_files()
+
+ self._init_ensembl_ids()
- logger.info('Found {:,} unique Ensembl Ids in {}\n'.format(len(ensembl_ids), self._full_file_name))

#populate name - 4.display name -> becomes name

@@ -424,19 +435,19 @@ def run(self):
ensembl_id = columns_in_row[2]
display_name = columns_in_row[1]

- if ensembl_id in ensembl_ids:
+ if ensembl_id in self.ensembl_ids:

- if (ensembl_ids[ensembl_id]['display_name'] is None):
- ensembl_ids[ensembl_id]['display_name'] = display_name
+ if (self.ensembl_ids[ensembl_id]['display_name'] is None):
+ self.ensembl_ids[ensembl_id]['display_name'] = display_name

- elif display_name != ensembl_ids[ensembl_id]['display_name']:
+ elif display_name != self.ensembl_ids[ensembl_id]['display_name']:
# duplicate: we found entries in human.name_2_string.tsv where the same Ensembl Id maps to
# multiple display names. This should never happen, though.
- if ensembl_id not in duplicate_display_names:
- duplicate_display_names[ensembl_id] = []
- duplicate_display_names[ensembl_id].append(ensembl_ids[ensembl_id]['display_name'])
+ if ensembl_id not in self.duplicate_display_names:
+ self.duplicate_display_names[ensembl_id] = []
+ self.duplicate_display_names[ensembl_id].append(self.ensembl_ids[ensembl_id]['display_name'])

- duplicate_display_names[ensembl_id].append(display_name)
+ self.duplicate_display_names[ensembl_id].append(display_name)

row_count = row_count + 1;

@@ -456,9 +467,9 @@ def run(self):
ensembl_id = columns_in_row[2]
ncbi_gene_id = columns_in_row[1]

- if ensembl_id in ensembl_ids:
+ if ensembl_id in self.ensembl_ids:

- if (ensembl_ids[ensembl_id]['alias'] is None):
+ if (self.ensembl_ids[ensembl_id]['alias'] is None):

ensembl_alias = 'ensembl:' + ensembl_id.split('.')[1]

@@ -472,7 +483,7 @@
else:
alias_string = ncbi_gene_id_split[0] + ensembl_alias

- ensembl_ids[ensembl_id]['alias'] = alias_string
+ self.ensembl_ids[ensembl_id]['alias'] = alias_string

else:
pass
@@ -493,26 +504,26 @@ def run(self):
ensembl_id = columns_in_row[2]
uniprot_id = columns_in_row[1].split('|')[0]

- if ensembl_id in ensembl_ids:
+ if ensembl_id in self.ensembl_ids:

- if (ensembl_ids[ensembl_id]['represents'] is None):
- ensembl_ids[ensembl_id]['represents'] = 'uniprot:' + uniprot_id
+ if (self.ensembl_ids[ensembl_id]['represents'] is None):
+ self.ensembl_ids[ensembl_id]['represents'] = 'uniprot:' + uniprot_id

- elif uniprot_id != ensembl_ids[ensembl_id]['represents']:
+ elif uniprot_id != self.ensembl_ids[ensembl_id]['represents']:
# duplicate: we found entries in human.uniprot_2_string.tsv where the same Ensembl Id maps to
# multiple uniprot ids.
- if ensembl_id not in duplicate_uniprot_ids:
- duplicate_uniprot_ids[ensembl_id] = []
- duplicate_uniprot_ids[ensembl_id].append(ensembl_ids[ensembl_id]['represents'])
+ if ensembl_id not in self.duplicate_uniprot_ids:
+ self.duplicate_uniprot_ids[ensembl_id] = []
+ self.duplicate_uniprot_ids[ensembl_id].append(self.ensembl_ids[ensembl_id]['represents'])

- duplicate_uniprot_ids[ensembl_id].append(uniprot_id)
+ self.duplicate_uniprot_ids[ensembl_id].append(uniprot_id)

row_count = row_count + 1;

logger.debug('Populated {:,} represents from {}\n'.format(row_count, self._uniprot_file))


- self.create_output_tsv_file(ensembl_ids)
+ self.create_output_tsv_file()

return 0

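One detail of the new _get_headers_headers_of_links_file() worth spelling out: the STRING links file is space-delimited, while csv.DictReader defaults to the comma delimiter, so the whole header line arrives as a single field name and the trailing .split() is what recovers the individual column names. A self-contained illustration of that behavior (the sample rows here are made up):

    import csv
    import io

    # With the default comma delimiter, the entire space-separated header
    # line becomes one field name; .split() breaks it into column names.
    sample = 'protein1 protein2 combined_score\n' \
             '9606.ENSP00000000233 9606.ENSP00000272298 490\n'
    reader = csv.DictReader(io.StringIO(sample))
    headers = reader.fieldnames[0].split()
    print(headers)  # ['protein1', 'protein2', 'combined_score']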
117 changes: 112 additions & 5 deletions tests/test_ndexloadstring.py
@@ -51,7 +51,7 @@ def setUp(self):


def tearDown(self):
"""Tear down test fixtures, if any."""
"""Remove temp directory created by setUp"""
if os.path.exists(self._args['datadir']):
shutil.rmtree(self._args['datadir'])

@@ -129,6 +129,7 @@ def test_0020_remove_duplicate_edges(self):

try:
string_loader = NDExSTRINGLoader(self._args)
+ string_loader.__setattr__('ensembl_ids', ensembl_ids)

file_with_duplicates = os.path.join(temp_dir, string_loader._full_file_name)

@@ -148,7 +149,7 @@
index += 1

# generate tsv file without duplicates
- string_loader.create_output_tsv_file(ensembl_ids)
+ string_loader.create_output_tsv_file()


# records that should be in the new file after calling create_output_tsv_file
@@ -197,6 +198,7 @@ def test_0030_exception_on_duplicate_edge_with_different_scores(self):

try:
string_loader = NDExSTRINGLoader(self._args)
+ string_loader.__setattr__('ensembl_ids', ensembl_ids)

file_with_duplicates = os.path.join(temp_dir, string_loader._full_file_name)

@@ -217,7 +219,7 @@
index += 1

with self.assertRaises(ValueError):
- string_loader.create_output_tsv_file(ensembl_ids)
+ string_loader.create_output_tsv_file()

finally:
shutil.rmtree(temp_dir)
@@ -392,6 +394,7 @@ def test_0110_load_style_template(self):
self.assertDictEqual(style_template_actual.__dict__, style_template_expected.__dict__)


+ #@unittest.skip("skip it now - uncomment later")
def test_0120_download_and_unzip(self):

entrez_url = \
@@ -410,10 +413,9 @@
self.assertTrue(os.path.exists(local_downloaded_file_name_unzipped))



+ #@unittest.skip("skip it now - uncomment later")
def test_0130_download_and_unzip_STRING_files(self):


loader = NDExSTRINGLoader(self._args)

loader._download_STRING_files()
@@ -442,6 +444,111 @@ def test_0130_download_and_unzip_STRING_files(self):
self.assertTrue(os.path.exists(uniprot_file))


+ def test_0140_get_headers_headers_of_links_file(self):
+ header = [
+ 'protein1',
+ 'protein2',
+ 'neighborhood',
+ 'neighborhood_transferred',
+ 'fusion',
+ 'cooccurence',
+ 'homology',
+ 'coexpression',
+ 'coexpression_transferred',
+ 'experiments',
+ 'experiments_transferred',
+ 'database',
+ 'database_transferred',
+ 'textmining',
+ 'textmining_transferred',
+ 'combined_score'
+ ]
+
+ header_str = ' '.join(header)
+
+ temp_dir = self._args['datadir']
+ tempfile = os.path.join(temp_dir, '__temp_link_file__.txt')
+
+ with open(tempfile, 'w') as f:
+ f.write(header_str + '\n')
+ f.flush()
+
+ loader = NDExSTRINGLoader(self._args)
+ loader.__setattr__('_full_file_name', tempfile)
+
+ header_actual = loader._get_headers_headers_of_links_file()
+
+ self.assertEqual(header, header_actual)


+ def test_0150_init_ensembl_ids(self):
+ header = [
+ 'protein1',
+ 'protein2',
+ 'neighborhood',
+ 'neighborhood_transferred',
+ 'fusion',
+ 'cooccurence',
+ 'homology',
+ 'coexpression',
+ 'coexpression_transferred',
+ 'experiments',
+ 'experiments_transferred',
+ 'database',
+ 'database_transferred',
+ 'textmining',
+ 'textmining_transferred',
+ 'combined_score'
+ ]
+ content = [
+ '9606.ENSP00000000233 9606.ENSP00000272298 0 0 0 332 0 0 62 0 181 0 0 0 125 490',
+ '9606.ENSP00000000233 9606.ENSP00000253401 0 0 0 0 0 0 0 0 186 0 0 0 56 198',
+ '9606.ENSP00000000233 9606.ENSP00000401445 0 0 0 0 0 0 0 0 160 0 0 0 0 159',
+ '9606.ENSP00000000233 9606.ENSP00000418915 0 0 0 0 0 0 61 0 158 0 0 542 0 606',
+ '9606.ENSP00000000233 9606.ENSP00000327801 0 0 0 0 0 69 61 0 78 0 0 0 89 167',
+ '9606.ENSP00000000233 9606.ENSP00000466298 0 0 0 0 0 141 0 0 131 0 0 0 98 267',
+ '9606.ENSP00000000233 9606.ENSP00000232564 0 0 0 0 0 0 62 0 171 0 0 0 56 201',
+ '9606.ENSP00000000233 9606.ENSP00000393379 0 0 0 0 0 0 61 0 131 0 0 0 43 150',
+ '9606.ENSP00000000233 9606.ENSP00000371253 0 0 0 0 0 0 61 0 0 0 0 0 224 240',
+ '9606.ENSP00000000233 9606.ENSP00000373713 0 0 0 0 0 0 63 0 63 0 0 0 237 271'
+ ]
+
+ ensembl_ids_expected = {
+ '9606.ENSP00000000233': { 'display_name': None, 'alias': None, 'represents': None },
+ '9606.ENSP00000272298': { 'display_name': None, 'alias': None, 'represents': None },
+ '9606.ENSP00000253401': { 'display_name': None, 'alias': None, 'represents': None },
+ '9606.ENSP00000401445': { 'display_name': None, 'alias': None, 'represents': None },
+ '9606.ENSP00000418915': { 'display_name': None, 'alias': None, 'represents': None },
+ '9606.ENSP00000327801': { 'display_name': None, 'alias': None, 'represents': None },
+ '9606.ENSP00000466298': { 'display_name': None, 'alias': None, 'represents': None },
+ '9606.ENSP00000232564': { 'display_name': None, 'alias': None, 'represents': None },
+ '9606.ENSP00000393379': { 'display_name': None, 'alias': None, 'represents': None },
+ '9606.ENSP00000371253': { 'display_name': None, 'alias': None, 'represents': None },
+ '9606.ENSP00000373713': { 'display_name': None, 'alias': None, 'represents': None }
+ }
+
+ header_str = ' '.join(header)
+
+ temp_dir = self._args['datadir']
+ tempfile = os.path.join(temp_dir, '__temp_link_file__.txt')
+
+ with open(tempfile, 'w') as f:
+ f.write(header_str + '\n')
+ for c in content:
+ f.write(c + '\n')
+ f.flush()
+
+ loader = NDExSTRINGLoader(self._args)
+ loader.__setattr__('_full_file_name', tempfile)
+
+ loader._init_ensembl_ids()
+
+ ensembl_ids_actual = loader.__getattribute__(('ensembl_ids'))
+
+ self.assertEqual(ensembl_ids_expected, ensembl_ids_actual)
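A side note on the fixture-injection style in the tests above: loader.__setattr__('_full_file_name', tempfile) is just the explicit spelling of ordinary attribute assignment, and plain loader._full_file_name = tempfile behaves identically; the injection only works because the refactor moved this state onto the instance. A generic sketch of the equivalence (Stub is a stand-in, not the real loader):

    class Stub:
        pass

    s = Stub()
    s.__setattr__('_full_file_name', '/tmp/links.txt')  # the spelling the tests use
    s._full_file_name = '/tmp/links.txt'                # equivalent plain assignment
    assert s._full_file_name == '/tmp/links.txt'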
