This resolves UD-467 (for ndexstringloader, hardcode the following network attributes instead of pulling them from the style file) and UD-482 (for the STRING loader, check if the specified working directory exists).
vrynkov committed Jul 12, 2019
1 parent 95e5db6 commit faeed1d
Showing 2 changed files with 92 additions and 86 deletions.
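Before the line-by-line diff, a condensed sketch of the UD-467 change may help: the STRING source URLs and working-file names move out of the configuration profile (previously read in _parse_config) and into NDExSTRINGLoader.__init__ as hardcoded values. The sketch reuses the URLs and filenames from the diff below, but the simplified constructor is illustrative; the real __init__ takes the parsed command-line args and sets several other members.

import os

class NDExSTRINGLoader:
    """Condensed sketch of the loader; the real class defines much more."""

    def __init__(self, datadir):
        self._datadir = datadir
        # Source URLs are now fixed in code rather than read from the config profile
        self._protein_links_url = \
            'https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz'
        self._names_file_url = \
            'https://string-db.org/mapping_files/STRING_display_names/human.name_2_string.tsv.gz'
        # Working files are likewise derived from the data directory
        self._full_file_name = os.path.join(self._datadir, '9606.protein.links.full.v11.0.txt')
        self._output_tsv_file_name = os.path.join(self._datadir, '9606.protein.links.tsv')
        self._cx_network = os.path.join(self._datadir, '9606.protein.links.cx')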
77 changes: 41 additions & 36 deletions ndexstringloader/ndexloadstring.py
@@ -182,6 +182,25 @@ def __init__(self, args):
"combined_score"
]

self._protein_links_url = \
'https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz'

self._names_file_url = \
'https://string-db.org/mapping_files/STRING_display_names/human.name_2_string.tsv.gz'

self._entrez_ids_file_url = \
'https://stringdb-static.org/mapping_files/entrez/human.entrez_2_string.2018.tsv.gz'

self._uniprot_ids_file_url = \
'https://string-db.org/mapping_files/uniprot/human.uniprot_2_string.2018.tsv.gz'

self._full_file_name = os.path.join(self._datadir, '9606.protein.links.full.v11.0.txt')
self._entrez_file = os.path.join(self._datadir, 'entrez_2_string.2018.tsv')
self._names_file = os.path.join(self._datadir, 'name_2_string.tsv')
self._uniprot_file = os.path.join(self._datadir, 'uniprot_2_string.2018.tsv')
self._output_tsv_file_name = os.path.join(self._datadir, '9606.protein.links.tsv')
self._cx_network = os.path.join(self._datadir, '9606.protein.links.cx')


def _parse_config(self):
"""
@@ -194,17 +213,6 @@ def _parse_config(self):
self._pass = con.get(self._profile, NDExUtilConfig.PASSWORD)
self._server = con.get(self._profile, NDExUtilConfig.SERVER)

self._protein_links_url = con.get(self._profile, 'ProteinLinksFile')
self._names_file_url = con.get(self._profile, 'NamesFile')
self._entrez_ids_file_url = con.get(self._profile, 'EntrezIdsFile')
self._uniprot_ids_file_url = con.get(self._profile, 'UniprotIdsFile')

self._full_file_name = os.path.join(self._datadir, con.get(self._profile, 'full_file_name'))
self._entrez_file = os.path.join(self._datadir, con.get(self._profile, 'entrez_file'))
self._names_file = os.path.join(self._datadir, con.get(self._profile, 'names_file'))
self._uniprot_file = os.path.join(self._datadir, con.get(self._profile, 'uniprot_file'))
self._output_tsv_file_name = os.path.join(self._datadir, con.get(self._profile, 'output_tsv_file_name'))


def _load_style_template(self):
"""
@@ -300,9 +308,10 @@ def check_if_edge_is_duplicate(self, edge_key_1, edge_key_2, edges, combined_sco
return is_duplicate


def create_output_tsv_file(self, output_file, input_file, ensembl_ids):
def create_output_tsv_file(self, ensembl_ids):

# generate output tsv file
output_file = self._output_tsv_file_name
logger.debug('Creating target {} file...'.format(output_file))


@@ -318,7 +327,7 @@ def create_output_tsv_file(self, output_file, input_file, ensembl_ids):

edges = {}

with open(input_file, 'r') as f_f:
with open(self._full_file_name, 'r') as f_f:
next(f_f)
for line in f_f:
columns_in_row = line.split(' ')
@@ -348,6 +357,11 @@ def create_output_tsv_file(self, output_file, input_file, ensembl_ids):
logger.debug('Created {} ({:,} lines) \n'.format(output_file, row_count))
logger.debug('{:,} duplicate rows detected \n'.format(dup_count))


def _check_if_data_dir_exists(self):
if not os.path.exists(self._datadir):
os.makedirs(self._datadir)

def run(self):
"""
Runs content loading for NDEx STRING Content Loader
@@ -357,6 +371,8 @@
self._parse_config()
self._load_style_template()

self._check_if_data_dir_exists()

if self._args.skipdownload is False:
self._download_STRING_files()
self._unpack_STRING_files()
@@ -491,7 +507,7 @@ def run(self):
logger.debug('Populated {:,} represents from {}\n'.format(row_count, self._uniprot_file))


self.create_output_tsv_file(self._output_tsv_file_name, self._full_file_name, ensembl_ids)
self.create_output_tsv_file(ensembl_ids)

return 0

@@ -538,14 +554,12 @@ def _init_network_attributes(self):


def _generate_CX_file(self, network_attributes):
file_name = self._output_tsv_file_name
new_cx_file = file_name + '.cx'

logger.debug('generating CX file for network {}...'.format(network_attributes['name']))

with open(file_name, 'r') as tsvfile:
with open(self._output_tsv_file_name, 'r') as tsvfile:

with open(new_cx_file, "w") as out:
with open(self._cx_network, "w") as out:
loader = StreamTSVLoader(self._load_plan, self._template)

loader.write_cx_network(tsvfile, out,
@@ -564,16 +578,15 @@ def _generate_CX_file(self, network_attributes):
])

logger.debug('CX file for network {} generated\n'.format(network_attributes['name']))
return new_cx_file


def _load_or_update_network_on_server(self, new_cx_file, network_name, network_id):

def _load_or_update_network_on_server(self, network_name, network_id):

logger.debug('updating network {} on server {} for user {}...'.format(network_name,
self._server,
self._user))
with open(new_cx_file, 'br') as network_out:

with open(self._cx_network, 'br') as network_out:
try:
if network_id is None:
self._ndex.save_cx_stream_as_new_network(network_out)
@@ -624,16 +637,15 @@ def load_to_NDEx(self):
if self.create_ndex_connection() is None:
return 2


network_attributes = self._init_network_attributes()

cx_file_name = self._generate_CX_file(network_attributes)
self._generate_CX_file(network_attributes)

network_name = network_attributes['name']

network_id = self.get_network_uuid(network_name)

self._load_or_update_network_on_server(cx_file_name, network_name, network_id)
self._load_or_update_network_on_server(network_name, network_id)



@@ -659,16 +671,7 @@ def main(args):
{user} = <NDEx username>
{password} = <NDEx password>
{server} = <NDEx server(omit http) ie public.ndexbio.org>
ProteinLinksFile = https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
NamesFile = https://string-db.org/mapping_files/STRING_display_names/human.name_2_string.tsv.gz
EntrezIdsFile = https://stringdb-static.org/mapping_files/entrez/human.entrez_2_string.2018.tsv.gz
UniprotIdsFile = https://string-db.org/mapping_files/uniprot/human.uniprot_2_string.2018.tsv.gz
full_file_name = 9606.protein.links.full.v11.0.txt
entrez_file = human.entrez_2_string.2018.tsv
names_file = human.name_2_string.tsv
uniprot_file = human.uniprot_2_string.2018.tsv
output_tsv_file_name = 9606.protein.links.full.v11.0.tsv.txt
output_hi_conf_tsv_file_name = 9606.protein.links.full.v11.0.hi_conf.tsv.txt
""".format(confname=NDExUtilConfig.CONFIG_FILE,
user=NDExUtilConfig.USER,
@@ -686,7 +689,9 @@ def main(args):
loader.load_to_NDEx()
return 0
except Exception as e:
logger.exception('Caught exception')
#sys.tracebacklimit = 1
print("\n{}: {}".format(type(e).__name__, e))
logger.exception(e)
return 2
finally:
logging.shutdown()
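The UD-482 half of the commit is small enough to restate on its own: run() now calls _check_if_data_dir_exists() before any files are downloaded, creating the working directory if it is missing. A minimal self-contained sketch follows; the wrapper class is illustrative, and os.makedirs(self._datadir, exist_ok=True) would be an equivalent one-liner on Python 3.2+.

import os

class DirGuardDemo:
    """Illustrative stand-in for NDExSTRINGLoader."""

    def __init__(self, datadir):
        self._datadir = datadir

    def _check_if_data_dir_exists(self):
        # Create the data directory if it does not already exist,
        # mirroring the method added in this commit.
        if not os.path.exists(self._datadir):
            os.makedirs(self._datadir)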
101 changes: 51 additions & 50 deletions tests/test_ndexloadstring.py
@@ -47,36 +47,37 @@ def setUp(self):
self._args = dotdict(self._args)



def tearDown(self):
"""Tear down test fixtures, if any."""
if os.path.exists(self._args['datadir']):
shutil.rmtree(self._args['datadir'])

@unittest.skip("skip it now - will add later")
#@unittest.skip("skip it now - uncomment later")
def test_0010_parse_config(self):

temp_dir = self._args['datadir']
try:
p = Param()
p.profile = 'test_conf_section'
conf = os.path.join(temp_dir, 'temp.conf')
p.conf = conf
self._args['profile'] = 'test_conf_section'
self._args['conf'] = os.path.join(temp_dir, 'temp.conf')

with open(conf, 'w') as f:
f.write('[' + p.profile + ']' + '\n')
with open(self._args['conf'], 'w') as f:
f.write('[' + self._args['profile'] + ']' + '\n')
f.write(NDExUtilConfig.USER + ' = aaa\n')
f.write(NDExUtilConfig.PASSWORD + ' = bbb\n')
f.write(NDExUtilConfig.SERVER + ' = dev.ndexbio.org\n')
f.flush()

loader = NDExSTRINGLoader(p)
loader = NDExSTRINGLoader(self._args)
loader._parse_config()
self.assertEqual('aaa', loader._user)
self.assertEqual('bbb', loader._pass)
self.assertEqual('dev.ndexbio.org', loader._server)
finally:
shutil.rmtree(temp_dir)

@unittest.skip("skip it now - uncomment later")

#@unittest.skip("skip it now - uncomment later")
def test_0020_remove_duplicate_edges(self):

# some duplicate records in the same format as in STRING 9606.protein.links.full.v11.0.txt
@@ -122,41 +123,29 @@ def test_0020_remove_duplicate_edges(self):
}

temp_dir = self._args['datadir']
temp_file = 'tmp.txt'
temp_file_1 = 'tmp1.txt'

try:
f = os.path.join(temp_dir, temp_file)

self._full_name_file = f

self._output_tsv_file_name = os.path.join(temp_dir, temp_file_1)

string_loader = NDExSTRINGLoader(self._args)

f = os.path.join(temp_dir, temp_file)
file_with_duplicates = os.path.join(temp_dir, string_loader._full_file_name)

# create file with duplicate records
with open(f, 'w') as o_f:
with open(file_with_duplicates, 'w') as o_f:
o_f.write('header line' + '\n') # the first line is header; don't care what its content in this test
for line in duplicate_records:
o_f.write(line + '\n')
o_f.flush()

# validate that the file with duplicate records was written fine
with open(f, 'r') as i_f:
with open(file_with_duplicates, 'r') as i_f:
next(i_f) # skip header
index = 0
for line in i_f:
self.assertEqual(line.rstrip(), duplicate_records[index])
index += 1


temp_file_1 = 'tmp1.txt'
f_no_duplicates = os.path.join(temp_dir, temp_file_1)

# now, generate a new file without duplicates
string_loader = NDExSTRINGLoader(self._args)
string_loader.create_output_tsv_file(f_no_duplicates, f, ensembl_ids)
# generate tsv file without duplicates
string_loader.create_output_tsv_file(ensembl_ids)


# records that should be in the new file after calling create_output_tsv_file
@@ -167,7 +156,7 @@ def test_0020_remove_duplicate_edges(self):
]

# open the newly-generated file and validate that all records are unique
with open(f_no_duplicates, 'r') as i_f:
with open(string_loader._output_tsv_file_name, 'r') as i_f:
index = 0
next(i_f) # skip header
for line in i_f:
@@ -177,9 +166,9 @@ def test_0030_exception_on_duplicate_edge_with_different_scores(self):
finally:
shutil.rmtree(temp_dir)

@unittest.skip("skip it now - uncomment later")
def test_0030_exception_on_duplicate_edge_with_different_scores(self):

#@unittest.skip("skip it now - uncomment later")
def test_0030_exception_on_duplicate_edge_with_different_scores(self):

# some duplicate records in the same format as in STRING 9606.protein.links.full.v11.0.txt
duplicate_records = [
@@ -199,45 +188,33 @@ def test_0030_exception_on_duplicate_edge_with_different_scores(self):
}
}

for i in range(0, 2):

for i in range(0, 2):
temp_dir = self._args['datadir']
temp_file = 'tmp.txt'
temp_file_1 = 'tmp1.txt'

try:
f = os.path.join(temp_dir, temp_file)

self._full_name_file = f

self._output_tsv_file_name = os.path.join(temp_dir, temp_file_1)
string_loader = NDExSTRINGLoader(self._args)

f = os.path.join(temp_dir, temp_file)
file_with_duplicates = os.path.join(temp_dir, string_loader._full_file_name)

# create file with duplicate records
with open(f, 'w') as o_f:
o_f.write('header line' + '\n') # the first line is header; don't care what its content in this test
with open(file_with_duplicates, 'w') as o_f:
o_f.write(
'header line' + '\n') # the first line is header; don't care what its content in this test
for line in duplicate_records:
o_f.write(line + '\n')
o_f.flush()

# validate that the file with duplicate records was written fine
with open(f, 'r') as i_f:
with open(file_with_duplicates, 'r') as i_f:
next(i_f) # skip header
index = 0
for line in i_f:
self.assertEqual(line.rstrip(), duplicate_records[index])
index += 1


temp_file_1 = 'tmp1.txt'
f_no_duplicates = os.path.join(temp_dir, temp_file_1)

# now, generate a new file without duplicates
string_loader = NDExSTRINGLoader(self._args)

with self.assertRaises(ValueError):
string_loader.create_output_tsv_file(f_no_duplicates, f, ensembl_ids)
string_loader.create_output_tsv_file(ensembl_ids)

finally:
shutil.rmtree(temp_dir)
Expand All @@ -250,6 +227,7 @@ def test_0030_exception_on_duplicate_edge_with_different_scores(self):
]


#@unittest.skip("skip it now - uncomment later")
def test_0040_init_network_atributes(self):
net_attributes = {}

@@ -295,3 +273,26 @@ def test_0040_init_network_atributes(self):
network_attributes = loader._init_network_attributes()

self.assertDictEqual(net_attributes, network_attributes, 'unexpected network properties')


#@unittest.skip("skip it now - uncomment later")
def test_0050_check_if_data_dir_exists(self):

self._args['datadir'] = '__temp_dir_for_testing__'
absolute_path = os.path.abspath(self._args['datadir'])

if os.path.exists(absolute_path):
os.rmdir(absolute_path)

loader = NDExSTRINGLoader(self._args)

# _check_if_data_dir_exists will create dir if it doesn't exist
loader._check_if_data_dir_exists()
self.assertTrue(os.path.exists(absolute_path))

os.rmdir(absolute_path)
self.assertFalse(os.path.exists(absolute_path))
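One practical consequence of UD-467 shows up in the updated main() docstring above: the configuration profile now needs only the NDEx credentials, since the URL and filename entries were removed. A minimal example profile (the section name and values are placeholders):

[dev]
user = <NDEx username>
password = <NDEx password>
server = public.ndexbio.org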



