Added junit tests and refactored STRING Loader to be more modular and easier for testing.

Current test coverage is 60%.
vrynkov committed Jul 18, 2019
1 parent 2eec8e3 commit f9a814c
Showing 2 changed files with 174 additions and 56 deletions.
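In outline, the refactor replaces parameter threading with instance state: the ensembl_ids dictionary (plus the two duplicate-tracking dictionaries) moves from locals inside run() into attributes created in __init__, so helpers such as _get_name_rep_alias() and create_output_tsv_file() can be called, and unit-tested, one at a time. A minimal sketch of the before/after pattern, assuming nothing beyond what the diff below shows (method bodies are illustrative, not the full class):

    class NDExSTRINGLoader(object):
        def __init__(self, args):
            self._args = args
            # shared state lives on the instance instead of as locals in run()
            self.ensembl_ids = {}
            self.duplicate_display_names = {}
            self.duplicate_uniprot_ids = {}

        def _get_name_rep_alias(self, ensembl_protein_id):
            # reads self.ensembl_ids rather than receiving the dict as a parameter
            return self.ensembl_ids[ensembl_protein_id]

A test can now inject fixture state directly (loader.ensembl_ids = {...}) and exercise a single method, which is exactly what the new tests in tests/test_ndexloadstring.py do.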
113 changes: 62 additions & 51 deletions ndexstringloader/ndexloadstring.py
@@ -201,6 +201,10 @@ def __init__(self, args):
self._output_tsv_file_name = os.path.join(self._datadir, '9606.protein.links.tsv')
self._cx_network = os.path.join(self._datadir, '9606.protein.links.cx')

+ self.ensembl_ids = {}
+ self.duplicate_display_names = {}
+ self.duplicate_uniprot_ids = {}


def _parse_config(self):
"""
@@ -262,8 +266,8 @@ def _unpack_STRING_files(self):
self._unzip(self._names_file + '.gz')
self._unzip(self._uniprot_file + '.gz')

- def _get_name_rep_alias(self, ensembl_protein_id, ensembl_ids):
- name_rep_alias = ensembl_ids[ensembl_protein_id]
+ def _get_name_rep_alias(self, ensembl_protein_id):
+ name_rep_alias = self.ensembl_ids[ensembl_protein_id]
use_ensembl_id_for_represents = False

if name_rep_alias['display_name'] is None:
@@ -308,7 +312,7 @@ def check_if_edge_is_duplicate(self, edge_key_1, edge_key_2, edges, combined_sco
return is_duplicate


- def create_output_tsv_file(self, ensembl_ids):
+ def create_output_tsv_file(self):

# generate output tsv file
output_file = self._output_tsv_file_name
@@ -345,8 +349,8 @@ def create_output_tsv_file(self, ensembl_ids):
dup_count += 1
continue

- name_rep_alias_1 = self._get_name_rep_alias(protein1, ensembl_ids)
- name_rep_alias_2 = self._get_name_rep_alias(protein2, ensembl_ids)
+ name_rep_alias_1 = self._get_name_rep_alias(protein1)
+ name_rep_alias_2 = self._get_name_rep_alias(protein2)

tsv_string = name_rep_alias_1 + '\t' + name_rep_alias_2 + '\t' + \
'\t'.join(x for x in columns_in_row[2:])
@@ -367,34 +371,21 @@ def _check_if_data_dir_exists(self):

return data_dir_existed

- def run(self):
- """
- Runs content loading for NDEx STRING Content Loader
- :param theargs:
- :return:
- """
- self._parse_config()
- self._load_style_template()
-
- data_dir_existed = self._check_if_data_dir_exists()
-
- if self._args.skipdownload is False or data_dir_existed is False:
- self._download_STRING_files()
- self._unpack_STRING_files()
+ def _get_headers_headers_of_links_file(self):
+ headers = None

- ensembl_ids = {}
- duplicate_display_names = {}
- duplicate_uniprot_ids = {}
+ with open(self._full_file_name, 'r') as f:
+ d_reader = csv.DictReader(f)
+ headers = ((d_reader.fieldnames)[0]).split()
+
+ return headers


- logger.info('\nLoading {} for reading...'.format(self._full_file_name))
-
- with open(self._full_file_name, 'r') as f:
- d_reader = csv.DictReader(f)
- headers = ((d_reader.fieldnames)[0]).split()
- logger.debug('{} loaded\n'.format(self._full_file_name))
+ def _init_ensembl_ids(self):
+
+ headers = self._get_headers_headers_of_links_file()

logger.debug('Preparing a dictionary of Ensembl Ids ...')

@@ -404,12 +395,32 @@ def run(self):
df.drop_duplicates(subset=headers[i], keep='first', inplace=True)

for index, row in df.iterrows():
- ensembl_ids[row[headers[i]]] = {}
- ensembl_ids[row[headers[i]]]['display_name'] = None
- ensembl_ids[row[headers[i]]]['alias'] = None
- ensembl_ids[row[headers[i]]]['represents'] = None
+ self.ensembl_ids[row[headers[i]]] = {}
+ self.ensembl_ids[row[headers[i]]]['display_name'] = None
+ self.ensembl_ids[row[headers[i]]]['alias'] = None
+ self.ensembl_ids[row[headers[i]]]['represents'] = None
+
+ logger.info('Found {:,} unique Ensembl Ids in {}\n'.format(len(self.ensembl_ids), self._full_file_name))


+ def run(self):
+ """
+ Runs content loading for NDEx STRING Content Loader
+ :param theargs:
+ :return:
+ """
+ self._parse_config()
+ self._load_style_template()
+
+ data_dir_existed = self._check_if_data_dir_exists()
+
+ if self._args.skipdownload is False or data_dir_existed is False:
+ self._download_STRING_files()
+ self._unpack_STRING_files()
+
+ self._init_ensembl_ids()
- logger.info('Found {:,} unique Ensembl Ids in {}\n'.format(len(ensembl_ids), self._full_file_name))

#populate name - 4.display name -> becomes name

@@ -424,19 +435,19 @@ def run(self):
ensembl_id = columns_in_row[2]
display_name = columns_in_row[1]

- if ensembl_id in ensembl_ids:
+ if ensembl_id in self.ensembl_ids:

- if (ensembl_ids[ensembl_id]['display_name'] is None):
- ensembl_ids[ensembl_id]['display_name'] = display_name
+ if (self.ensembl_ids[ensembl_id]['display_name'] is None):
+ self.ensembl_ids[ensembl_id]['display_name'] = display_name

- elif display_name != ensembl_ids[ensembl_id]['display_name']:
+ elif display_name != self.ensembl_ids[ensembl_id]['display_name']:
# duplicate: we found entries in human.name_2_string.tsv where the same Ensembl Id maps to
# multiple display names. This should never happen, though.
- if ensembl_id not in duplicate_display_names:
- duplicate_display_names[ensembl_id] = []
- duplicate_display_names[ensembl_id].append(ensembl_ids[ensembl_id]['display_name'])
+ if ensembl_id not in self.duplicate_display_names:
+ self.duplicate_display_names[ensembl_id] = []
+ self.duplicate_display_names[ensembl_id].append(self.ensembl_ids[ensembl_id]['display_name'])

- duplicate_display_names[ensembl_id].append(display_name)
+ self.duplicate_display_names[ensembl_id].append(display_name)

row_count = row_count + 1;

@@ -456,9 +467,9 @@ def run(self):
ensembl_id = columns_in_row[2]
ncbi_gene_id = columns_in_row[1]

- if ensembl_id in ensembl_ids:
+ if ensembl_id in self.ensembl_ids:

- if (ensembl_ids[ensembl_id]['alias'] is None):
+ if (self.ensembl_ids[ensembl_id]['alias'] is None):

ensembl_alias = 'ensembl:' + ensembl_id.split('.')[1]

@@ -472,7 +483,7 @@
else:
alias_string = ncbi_gene_id_split[0] + ensembl_alias

- ensembl_ids[ensembl_id]['alias'] = alias_string
+ self.ensembl_ids[ensembl_id]['alias'] = alias_string

else:
pass
@@ -493,26 +504,26 @@ def run(self):
ensembl_id = columns_in_row[2]
uniprot_id = columns_in_row[1].split('|')[0]

- if ensembl_id in ensembl_ids:
+ if ensembl_id in self.ensembl_ids:

- if (ensembl_ids[ensembl_id]['represents'] is None):
- ensembl_ids[ensembl_id]['represents'] = 'uniprot:' + uniprot_id
+ if (self.ensembl_ids[ensembl_id]['represents'] is None):
+ self.ensembl_ids[ensembl_id]['represents'] = 'uniprot:' + uniprot_id

- elif uniprot_id != ensembl_ids[ensembl_id]['represents']:
+ elif uniprot_id != self.ensembl_ids[ensembl_id]['represents']:
# duplicate: we found entries in human.uniprot_2_string.tsv where the same Ensembl Id maps to
# multiple uniprot ids.
- if ensembl_id not in duplicate_uniprot_ids:
- duplicate_uniprot_ids[ensembl_id] = []
- duplicate_uniprot_ids[ensembl_id].append(ensembl_ids[ensembl_id]['represents'])
+ if ensembl_id not in self.duplicate_uniprot_ids:
+ self.duplicate_uniprot_ids[ensembl_id] = []
+ self.duplicate_uniprot_ids[ensembl_id].append(self.ensembl_ids[ensembl_id]['represents'])

- duplicate_uniprot_ids[ensembl_id].append(uniprot_id)
+ self.duplicate_uniprot_ids[ensembl_id].append(uniprot_id)

row_count = row_count + 1;

logger.debug('Populated {:,} represents from {}\n'.format(row_count, self._uniprot_file))


- self.create_output_tsv_file(ensembl_ids)
+ self.create_output_tsv_file()

return 0

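One detail of the new _get_headers_headers_of_links_file() worth spelling out: the STRING links file is space-delimited, while csv.DictReader defaults to the comma delimiter, so the whole header line arrives as a single field name and the trailing .split() is what recovers the individual column names. A self-contained illustration of that behavior (the sample rows here are made up):

    import csv
    import io

    # With the default comma delimiter, the entire space-separated header
    # line becomes one field name; .split() breaks it into column names.
    sample = 'protein1 protein2 combined_score\n' \
             '9606.ENSP00000000233 9606.ENSP00000272298 490\n'
    reader = csv.DictReader(io.StringIO(sample))
    headers = reader.fieldnames[0].split()
    print(headers)  # ['protein1', 'protein2', 'combined_score']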
117 changes: 112 additions & 5 deletions tests/test_ndexloadstring.py
@@ -51,7 +51,7 @@ def setUp(self):


def tearDown(self):
"""Tear down test fixtures, if any."""
"""Remove temp directory created by setUp"""
if os.path.exists(self._args['datadir']):
shutil.rmtree(self._args['datadir'])

@@ -129,6 +129,7 @@ def test_0020_remove_duplicate_edges(self):

try:
string_loader = NDExSTRINGLoader(self._args)
+ string_loader.__setattr__('ensembl_ids', ensembl_ids)

file_with_duplicates = os.path.join(temp_dir, string_loader._full_file_name)

@@ -148,7 +149,7 @@
index += 1

# generate tsv file without duplicates
- string_loader.create_output_tsv_file(ensembl_ids)
+ string_loader.create_output_tsv_file()


# records that should be in the new file after calling create_output_tsv_file
@@ -197,6 +198,7 @@ def test_0030_exception_on_duplicate_edge_with_different_scores(self):

try:
string_loader = NDExSTRINGLoader(self._args)
+ string_loader.__setattr__('ensembl_ids', ensembl_ids)

file_with_duplicates = os.path.join(temp_dir, string_loader._full_file_name)

@@ -217,7 +219,7 @@
index += 1

with self.assertRaises(ValueError):
- string_loader.create_output_tsv_file(ensembl_ids)
+ string_loader.create_output_tsv_file()

finally:
shutil.rmtree(temp_dir)
@@ -392,6 +394,7 @@ def test_0110_load_style_template(self):
self.assertDictEqual(style_template_actual.__dict__, style_template_expected.__dict__)


+ #@unittest.skip("skip it now - uncomment later")
def test_0120_download_and_unzip(self):

entrez_url = \
@@ -410,10 +413,9 @@
self.assertTrue(os.path.exists(local_downloaded_file_name_unzipped))



+ #@unittest.skip("skip it now - uncomment later")
def test_0130_download_and_unzip_STRING_files(self):


loader = NDExSTRINGLoader(self._args)

loader._download_STRING_files()
@@ -442,6 +444,111 @@ def test_0130_download_and_unzip_STRING_files(self):
self.assertTrue(os.path.exists(uniprot_file))


+ def test_0140_get_headers_headers_of_links_file(self):
+ header = [
+ 'protein1',
+ 'protein2',
+ 'neighborhood',
+ 'neighborhood_transferred',
+ 'fusion',
+ 'cooccurence',
+ 'homology',
+ 'coexpression',
+ 'coexpression_transferred',
+ 'experiments',
+ 'experiments_transferred',
+ 'database',
+ 'database_transferred',
+ 'textmining',
+ 'textmining_transferred',
+ 'combined_score'
+ ]
+
+ header_str = ' '.join(header)
+
+ temp_dir = self._args['datadir']
+ tempfile = os.path.join(temp_dir, '__temp_link_file__.txt')
+
+ with open(tempfile, 'w') as f:
+ f.write(header_str + '\n')
+ f.flush()
+
+ loader = NDExSTRINGLoader(self._args)
+ loader.__setattr__('_full_file_name', tempfile)
+
+ header_actual = loader._get_headers_headers_of_links_file()
+
+ self.assertEqual(header, header_actual)


+ def test_0150_init_ensembl_ids(self):
+ header = [
+ 'protein1',
+ 'protein2',
+ 'neighborhood',
+ 'neighborhood_transferred',
+ 'fusion',
+ 'cooccurence',
+ 'homology',
+ 'coexpression',
+ 'coexpression_transferred',
+ 'experiments',
+ 'experiments_transferred',
+ 'database',
+ 'database_transferred',
+ 'textmining',
+ 'textmining_transferred',
+ 'combined_score'
+ ]
+ content = [
+ '9606.ENSP00000000233 9606.ENSP00000272298 0 0 0 332 0 0 62 0 181 0 0 0 125 490',
+ '9606.ENSP00000000233 9606.ENSP00000253401 0 0 0 0 0 0 0 0 186 0 0 0 56 198',
+ '9606.ENSP00000000233 9606.ENSP00000401445 0 0 0 0 0 0 0 0 160 0 0 0 0 159',
+ '9606.ENSP00000000233 9606.ENSP00000418915 0 0 0 0 0 0 61 0 158 0 0 542 0 606',
+ '9606.ENSP00000000233 9606.ENSP00000327801 0 0 0 0 0 69 61 0 78 0 0 0 89 167',
+ '9606.ENSP00000000233 9606.ENSP00000466298 0 0 0 0 0 141 0 0 131 0 0 0 98 267',
+ '9606.ENSP00000000233 9606.ENSP00000232564 0 0 0 0 0 0 62 0 171 0 0 0 56 201',
+ '9606.ENSP00000000233 9606.ENSP00000393379 0 0 0 0 0 0 61 0 131 0 0 0 43 150',
+ '9606.ENSP00000000233 9606.ENSP00000371253 0 0 0 0 0 0 61 0 0 0 0 0 224 240',
+ '9606.ENSP00000000233 9606.ENSP00000373713 0 0 0 0 0 0 63 0 63 0 0 0 237 271'
+ ]
+
+ ensembl_ids_expected = {
+ '9606.ENSP00000000233': { 'display_name': None, 'alias': None, 'represents': None },
+ '9606.ENSP00000272298': { 'display_name': None, 'alias': None, 'represents': None },
+ '9606.ENSP00000253401': { 'display_name': None, 'alias': None, 'represents': None },
+ '9606.ENSP00000401445': { 'display_name': None, 'alias': None, 'represents': None },
+ '9606.ENSP00000418915': { 'display_name': None, 'alias': None, 'represents': None },
+ '9606.ENSP00000327801': { 'display_name': None, 'alias': None, 'represents': None },
+ '9606.ENSP00000466298': { 'display_name': None, 'alias': None, 'represents': None },
+ '9606.ENSP00000232564': { 'display_name': None, 'alias': None, 'represents': None },
+ '9606.ENSP00000393379': { 'display_name': None, 'alias': None, 'represents': None },
+ '9606.ENSP00000371253': { 'display_name': None, 'alias': None, 'represents': None },
+ '9606.ENSP00000373713': { 'display_name': None, 'alias': None, 'represents': None }
+ }
+
+ header_str = ' '.join(header)
+
+ temp_dir = self._args['datadir']
+ tempfile = os.path.join(temp_dir, '__temp_link_file__.txt')
+
+ with open(tempfile, 'w') as f:
+ f.write(header_str + '\n')
+ for c in content:
+ f.write(c + '\n')
+ f.flush()
+
+ loader = NDExSTRINGLoader(self._args)
+ loader.__setattr__('_full_file_name', tempfile)
+
+ loader._init_ensembl_ids()
+
+ ensembl_ids_actual = loader.__getattribute__(('ensembl_ids'))
+
+ self.assertEqual(ensembl_ids_expected, ensembl_ids_actual)
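A side note on the fixture-injection style in the tests above: loader.__setattr__('_full_file_name', tempfile) is just the explicit spelling of ordinary attribute assignment, and plain loader._full_file_name = tempfile behaves identically; the injection only works because the refactor moved this state onto the instance. A generic sketch of the equivalence (Stub is a stand-in, not the real loader):

    class Stub:
        pass

    s = Stub()
    s.__setattr__('_full_file_name', '/tmp/links.txt')  # the spelling the tests use
    s._full_file_name = '/tmp/links.txt'                # equivalent plain assignment
    assert s._full_file_name == '/tmp/links.txt'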
