This resolves UD-467 (for ndexstringloader, hardcode the following network attributes instead of pulling them from the style file) and UD-482 (for the STRING loader, check if the specified working directory exists).
vrynkov committed Jul 12, 2019
1 parent 95e5db6 commit faeed1d
Showing 2 changed files with 92 additions and 86 deletions.
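Before the line-by-line diff, a condensed sketch of the UD-467 change may help: the STRING source URLs and working-file names move out of the configuration profile (previously read in _parse_config) and into NDExSTRINGLoader.__init__ as hardcoded values. The sketch reuses the URLs and filenames from the diff below, but the simplified constructor is illustrative; the real __init__ takes the parsed command-line args and sets several other members.

import os

class NDExSTRINGLoader:
    """Condensed sketch of the loader; the real class defines much more."""

    def __init__(self, datadir):
        self._datadir = datadir
        # Source URLs are now fixed in code rather than read from the config profile
        self._protein_links_url = \
            'https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz'
        self._names_file_url = \
            'https://string-db.org/mapping_files/STRING_display_names/human.name_2_string.tsv.gz'
        # Working files are likewise derived from the data directory
        self._full_file_name = os.path.join(self._datadir, '9606.protein.links.full.v11.0.txt')
        self._output_tsv_file_name = os.path.join(self._datadir, '9606.protein.links.tsv')
        self._cx_network = os.path.join(self._datadir, '9606.protein.links.cx')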
77 changes: 41 additions & 36 deletions ndexstringloader/ndexloadstring.py
@@ -182,6 +182,25 @@ def __init__(self, args):
"combined_score"
]

self._protein_links_url = \
'https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz'

self._names_file_url = \
'https://string-db.org/mapping_files/STRING_display_names/human.name_2_string.tsv.gz'

self._entrez_ids_file_url = \
'https://stringdb-static.org/mapping_files/entrez/human.entrez_2_string.2018.tsv.gz'

self._uniprot_ids_file_url = \
'https://string-db.org/mapping_files/uniprot/human.uniprot_2_string.2018.tsv.gz'

self._full_file_name = os.path.join(self._datadir, '9606.protein.links.full.v11.0.txt')
self._entrez_file = os.path.join(self._datadir, 'entrez_2_string.2018.tsv')
self._names_file = os.path.join(self._datadir, 'name_2_string.tsv')
self._uniprot_file = os.path.join(self._datadir, 'uniprot_2_string.2018.tsv')
self._output_tsv_file_name = os.path.join(self._datadir, '9606.protein.links.tsv')
self._cx_network = os.path.join(self._datadir, '9606.protein.links.cx')


def _parse_config(self):
"""
@@ -194,17 +213,6 @@ def _parse_config(self):
self._pass = con.get(self._profile, NDExUtilConfig.PASSWORD)
self._server = con.get(self._profile, NDExUtilConfig.SERVER)

self._protein_links_url = con.get(self._profile, 'ProteinLinksFile')
self._names_file_url = con.get(self._profile, 'NamesFile')
self._entrez_ids_file_url = con.get(self._profile, 'EntrezIdsFile')
self._uniprot_ids_file_url = con.get(self._profile, 'UniprotIdsFile')

self._full_file_name = os.path.join(self._datadir, con.get(self._profile, 'full_file_name'))
self._entrez_file = os.path.join(self._datadir, con.get(self._profile, 'entrez_file'))
self._names_file = os.path.join(self._datadir, con.get(self._profile, 'names_file'))
self._uniprot_file = os.path.join(self._datadir, con.get(self._profile, 'uniprot_file'))
self._output_tsv_file_name = os.path.join(self._datadir, con.get(self._profile, 'output_tsv_file_name'))


def _load_style_template(self):
"""
@@ -300,9 +308,10 @@ def check_if_edge_is_duplicate(self, edge_key_1, edge_key_2, edges, combined_sco
return is_duplicate


def create_output_tsv_file(self, output_file, input_file, ensembl_ids):
def create_output_tsv_file(self, ensembl_ids):

# generate output tsv file
output_file = self._output_tsv_file_name
logger.debug('Creating target {} file...'.format(output_file))


@@ -318,7 +327,7 @@ def create_output_tsv_file(self, output_file, input_file, ensembl_ids):

edges = {}

with open(input_file, 'r') as f_f:
with open(self._full_file_name, 'r') as f_f:
next(f_f)
for line in f_f:
columns_in_row = line.split(' ')
@@ -348,6 +357,11 @@ def create_output_tsv_file(self, output_file, input_file, ensembl_ids):
logger.debug('Created {} ({:,} lines) \n'.format(output_file, row_count))
logger.debug('{:,} duplicate rows detected \n'.format(dup_count))


def _check_if_data_dir_exists(self):
if not os.path.exists(self._datadir):
os.makedirs(self._datadir)

def run(self):
"""
Runs content loading for NDEx STRING Content Loader
@@ -357,6 +371,8 @@
self._parse_config()
self._load_style_template()

self._check_if_data_dir_exists()

if self._args.skipdownload is False:
self._download_STRING_files()
self._unpack_STRING_files()
@@ -491,7 +507,7 @@ def run(self):
logger.debug('Populated {:,} represents from {}\n'.format(row_count, self._uniprot_file))


self.create_output_tsv_file(self._output_tsv_file_name, self._full_file_name, ensembl_ids)
self.create_output_tsv_file(ensembl_ids)

return 0

@@ -538,14 +554,12 @@ def _init_network_attributes(self):


def _generate_CX_file(self, network_attributes):
file_name = self._output_tsv_file_name
new_cx_file = file_name + '.cx'

logger.debug('generating CX file for network {}...'.format(network_attributes['name']))

with open(file_name, 'r') as tsvfile:
with open(self._output_tsv_file_name, 'r') as tsvfile:

with open(new_cx_file, "w") as out:
with open(self._cx_network, "w") as out:
loader = StreamTSVLoader(self._load_plan, self._template)

loader.write_cx_network(tsvfile, out,
@@ -564,16 +578,15 @@ def _generate_CX_file(self, network_attributes):
])

logger.debug('CX file for network {} generated\n'.format(network_attributes['name']))
return new_cx_file


def _load_or_update_network_on_server(self, new_cx_file, network_name, network_id):

def _load_or_update_network_on_server(self, network_name, network_id):

logger.debug('updating network {} on server {} for user {}...'.format(network_name,
self._server,
self._user))
with open(new_cx_file, 'br') as network_out:

with open(self._cx_network, 'br') as network_out:
try:
if network_id is None:
self._ndex.save_cx_stream_as_new_network(network_out)
@@ -624,16 +637,15 @@ def load_to_NDEx(self):
if self.create_ndex_connection() is None:
return 2


network_attributes = self._init_network_attributes()

cx_file_name = self._generate_CX_file(network_attributes)
self._generate_CX_file(network_attributes)

network_name = network_attributes['name']

network_id = self.get_network_uuid(network_name)

self._load_or_update_network_on_server(cx_file_name, network_name, network_id)
self._load_or_update_network_on_server(network_name, network_id)



@@ -659,16 +671,7 @@ def main(args):
{user} = <NDEx username>
{password} = <NDEx password>
{server} = <NDEx server(omit http) ie public.ndexbio.org>
ProteinLinksFile = https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
NamesFile = https://string-db.org/mapping_files/STRING_display_names/human.name_2_string.tsv.gz
EntrezIdsFile = https://stringdb-static.org/mapping_files/entrez/human.entrez_2_string.2018.tsv.gz
UniprotIdsFile = https://string-db.org/mapping_files/uniprot/human.uniprot_2_string.2018.tsv.gz
full_file_name = 9606.protein.links.full.v11.0.txt
entrez_file = human.entrez_2_string.2018.tsv
names_file = human.name_2_string.tsv
uniprot_file = human.uniprot_2_string.2018.tsv
output_tsv_file_name = 9606.protein.links.full.v11.0.tsv.txt
output_hi_conf_tsv_file_name = 9606.protein.links.full.v11.0.hi_conf.tsv.txt
""".format(confname=NDExUtilConfig.CONFIG_FILE,
user=NDExUtilConfig.USER,
@@ -686,7 +689,9 @@ def main(args):
loader.load_to_NDEx()
return 0
except Exception as e:
logger.exception('Caught exception')
#sys.tracebacklimit = 1
print("\n{}: {}".format(type(e).__name__, e))
logger.exception(e)
return 2
finally:
logging.shutdown()
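The UD-482 half of the commit is small enough to restate on its own: run() now calls _check_if_data_dir_exists() before any files are downloaded, creating the working directory if it is missing. A minimal self-contained sketch follows; the wrapper class is illustrative, and os.makedirs(self._datadir, exist_ok=True) would be an equivalent one-liner on Python 3.2+.

import os

class DirGuardDemo:
    """Illustrative stand-in for NDExSTRINGLoader."""

    def __init__(self, datadir):
        self._datadir = datadir

    def _check_if_data_dir_exists(self):
        # Create the data directory if it does not already exist,
        # mirroring the method added in this commit.
        if not os.path.exists(self._datadir):
            os.makedirs(self._datadir)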
101 changes: 51 additions & 50 deletions tests/test_ndexloadstring.py
@@ -47,36 +47,37 @@ def setUp(self):
self._args = dotdict(self._args)



def tearDown(self):
"""Tear down test fixtures, if any."""
if os.path.exists(self._args['datadir']):
shutil.rmtree(self._args['datadir'])

@unittest.skip("skip it now - will add later")
#@unittest.skip("skip it now - uncomment later")
def test_0010_parse_config(self):

temp_dir = self._args['datadir']
try:
p = Param()
p.profile = 'test_conf_section'
conf = os.path.join(temp_dir, 'temp.conf')
p.conf = conf
self._args['profile'] = 'test_conf_section'
self._args['conf'] = os.path.join(temp_dir, 'temp.conf')

with open(conf, 'w') as f:
f.write('[' + p.profile + ']' + '\n')
with open(self._args['conf'], 'w') as f:
f.write('[' + self._args['profile'] + ']' + '\n')
f.write(NDExUtilConfig.USER + ' = aaa\n')
f.write(NDExUtilConfig.PASSWORD + ' = bbb\n')
f.write(NDExUtilConfig.SERVER + ' = dev.ndexbio.org\n')
f.flush()

loader = NDExSTRINGLoader(p)
loader = NDExSTRINGLoader(self._args)
loader._parse_config()
self.assertEqual('aaa', loader._user)
self.assertEqual('bbb', loader._pass)
self.assertEqual('dev.ndexbio.org', loader._server)
finally:
shutil.rmtree(temp_dir)

@unittest.skip("skip it now - uncomment later")

#@unittest.skip("skip it now - uncomment later")
def test_0020_remove_duplicate_edges(self):

# some duplicate records in the same format as in STRING 9606.protein.links.full.v11.0.txt
@@ -122,41 +123,29 @@ def test_0020_remove_duplicate_edges(self):
}

temp_dir = self._args['datadir']
temp_file = 'tmp.txt'
temp_file_1 = 'tmp1.txt'

try:
f = os.path.join(temp_dir, temp_file)

self._full_name_file = f

self._output_tsv_file_name = os.path.join(temp_dir, temp_file_1)

string_loader = NDExSTRINGLoader(self._args)

f = os.path.join(temp_dir, temp_file)
file_with_duplicates = os.path.join(temp_dir, string_loader._full_file_name)

# create file with duplicate records
with open(f, 'w') as o_f:
with open(file_with_duplicates, 'w') as o_f:
o_f.write('header line' + '\n') # the first line is header; don't care what its content in this test
for line in duplicate_records:
o_f.write(line + '\n')
o_f.flush()

# validate that the file with duplicate records was written fine
with open(f, 'r') as i_f:
with open(file_with_duplicates, 'r') as i_f:
next(i_f) # skip header
index = 0
for line in i_f:
self.assertEqual(line.rstrip(), duplicate_records[index])
index += 1


temp_file_1 = 'tmp1.txt'
f_no_duplicates = os.path.join(temp_dir, temp_file_1)

# now, generate a new file without duplicates
string_loader = NDExSTRINGLoader(self._args)
string_loader.create_output_tsv_file(f_no_duplicates, f, ensembl_ids)
# generate tsv file without duplicates
string_loader.create_output_tsv_file(ensembl_ids)


# records that should be in the new file after calling create_output_tsv_file
@@ -167,7 +156,7 @@ def test_0020_remove_duplicate_edges(self):
]

# open the newly-generated file and validate that all records are unique
with open(f_no_duplicates, 'r') as i_f:
with open(string_loader._output_tsv_file_name, 'r') as i_f:
index = 0
next(i_f) # skip header
for line in i_f:
@@ -177,9 +166,9 @@ def test_0030_exception_on_duplicate_edge_with_different_scores(self):
finally:
shutil.rmtree(temp_dir)

@unittest.skip("skip it now - uncomment later")
def test_0030_exception_on_duplicate_edge_with_different_scores(self):

#@unittest.skip("skip it now - uncomment later")
def test_0030_exception_on_duplicate_edge_with_different_scores(self):

# some duplicate records in the same format as in STRING 9606.protein.links.full.v11.0.txt
duplicate_records = [
@@ -199,45 +188,33 @@ def test_0030_exception_on_duplicate_edge_with_different_scores(self):
}
}

for i in range(0, 2):

for i in range(0, 2):
temp_dir = self._args['datadir']
temp_file = 'tmp.txt'
temp_file_1 = 'tmp1.txt'

try:
f = os.path.join(temp_dir, temp_file)

self._full_name_file = f

self._output_tsv_file_name = os.path.join(temp_dir, temp_file_1)
string_loader = NDExSTRINGLoader(self._args)

f = os.path.join(temp_dir, temp_file)
file_with_duplicates = os.path.join(temp_dir, string_loader._full_file_name)

# create file with duplicate records
with open(f, 'w') as o_f:
o_f.write('header line' + '\n') # the first line is header; don't care what its content in this test
with open(file_with_duplicates, 'w') as o_f:
o_f.write(
'header line' + '\n') # the first line is header; don't care what its content in this test
for line in duplicate_records:
o_f.write(line + '\n')
o_f.flush()

# validate that the file with duplicate records was written fine
with open(f, 'r') as i_f:
with open(file_with_duplicates, 'r') as i_f:
next(i_f) # skip header
index = 0
for line in i_f:
self.assertEqual(line.rstrip(), duplicate_records[index])
index += 1


temp_file_1 = 'tmp1.txt'
f_no_duplicates = os.path.join(temp_dir, temp_file_1)

# now, generate a new file without duplicates
string_loader = NDExSTRINGLoader(self._args)

with self.assertRaises(ValueError):
string_loader.create_output_tsv_file(f_no_duplicates, f, ensembl_ids)
string_loader.create_output_tsv_file(ensembl_ids)

finally:
shutil.rmtree(temp_dir)
Expand All @@ -250,6 +227,7 @@ def test_0030_exception_on_duplicate_edge_with_different_scores(self):
]


#@unittest.skip("skip it now - uncomment later")
def test_0040_init_network_atributes(self):
net_attributes = {}

@@ -295,3 +273,26 @@ def test_0040_init_network_atributes(self):
network_attributes = loader._init_network_attributes()

self.assertDictEqual(net_attributes, network_attributes, 'unexpected network properties')


#@unittest.skip("skip it now - uncomment later")
def test_0050_check_if_data_dir_exists(self):

self._args['datadir'] = '__temp_dir_for_testing__'
absolute_path = os.path.abspath(self._args['datadir'])

if os.path.exists(absolute_path):
os.rmdir(absolute_path)

loader = NDExSTRINGLoader(self._args)

# _check_if_data_dir_exists will create dir if it doesn't exist
loader._check_if_data_dir_exists()
self.assertTrue(os.path.exists(absolute_path))

os.rmdir(absolute_path)
self.assertFalse(os.path.exists(absolute_path))
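One practical consequence of UD-467 shows up in the updated main() docstring above: the configuration profile now needs only the NDEx credentials, since the URL and filename entries were removed. A minimal example profile (the section name and values are placeholders):

[dev]
user = <NDEx username>
password = <NDEx password>
server = public.ndexbio.org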



