diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 982fa8a97156fe..bf2ae89e3eb59a 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -735,7 +735,7 @@ I/O - :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`) - :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`) - :func:`read_sas` will correctly parse sas7bdat files with many columns (:issue:`22628`) - +- :func:`read_sas` will correctly parse sas7bdat files whose data pages also have bit 7 set in the page type (so the page type is 128 + 256 = 384) (:issue:`16615`) Plotting ^^^^^^^^ diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 8395dccc1e7444..a5bfd5866a2613 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -375,7 +375,7 @@ cdef class Parser(object): if done: return True return False - elif self.current_page_type == page_data_type: + elif self.current_page_type & page_data_type == page_data_type: self.process_byte_array_with_data( bit_offset + subheader_pointers_offset + self.current_row_on_page_index * self.row_length, diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 5efc6483df4fa9..b5fd62959c8194 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -301,8 +301,10 @@ def _process_page_meta(self): pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types if self._current_page_type in pt: self._process_page_metadata() - return ((self._current_page_type in [256] + const.page_mix_types) or - (self._current_page_data_subheader_pointers != [])) + is_data_page = self._current_page_type & const.page_data_type + is_mix_page = self._current_page_type in const.page_mix_types + return (is_data_page or is_mix_page + or self._current_page_data_subheader_pointers != []) def _read_page_header(self): bit_offset = self._page_bit_offset @@ -644,11 +646,13 @@ def _read_next_page(self): 
self._page_length)) self._read_page_header() - if self._current_page_type == const.page_meta_type: + page_type = self._current_page_type + if page_type == const.page_meta_type: self._process_page_metadata() - pt = [const.page_meta_type, const.page_data_type] - pt += [const.page_mix_types] - if self._current_page_type not in pt: + + is_data_page = page_type & const.page_data_type + pt = [const.page_meta_type] + const.page_mix_types + if not is_data_page and self._current_page_type not in pt: return self._read_next_page() return False diff --git a/pandas/tests/io/sas/data/load_log.sas7bdat b/pandas/tests/io/sas/data/load_log.sas7bdat new file mode 100644 index 00000000000000..dc78925471baf4 Binary files /dev/null and b/pandas/tests/io/sas/data/load_log.sas7bdat differ diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 5608e19fa32246..77794b2f8a4209 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -184,6 +184,7 @@ def test_date_time(datapath): def test_many_columns(datapath): + # Test for looking for column information in more places (PR #22628) fname = datapath("io", "sas", "data", "many_columns.sas7bdat") df = pd.read_sas(fname, encoding='latin-1') fname = datapath("io", "sas", "data", "many_columns.csv") @@ -199,6 +200,13 @@ def test_many_columns(datapath): tm.assert_frame_equal(df, df0) +def test_inconsistent_number_of_rows(datapath): + # Regression test for issue #16615. (PR #22628) + fname = datapath("io", "sas", "data", "load_log.sas7bdat") + df = pd.read_sas(fname, encoding='latin-1') + assert len(df) == 2097 + + def test_zero_variables(datapath): # Check if the SAS file has zero variables (PR #18184) fname = datapath("io", "sas", "data", "zero_variables.sas7bdat")