Skip to content

Commit

Permalink
BUG: Fix parsing of sas7bdat files with odd data pages (#16615)
Browse files Browse the repository at this point in the history
SAS can apparently generate data pages that have bit 7 (128) set in
the page type.
It seems that the presence of bit 8 (256) determines whether a page is
a data page or not. So treat a page as a data page if bit 8 is set,
and ignore the lower bits.
  • Loading branch information
troels committed Sep 9, 2018
1 parent 2baa169 commit f05ebc8
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 8 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.24.0.txt
Expand Up @@ -735,7 +735,7 @@ I/O
- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
- :func:`read_sas` will correctly parse sas7bdat files with many columns (:issue:`22628`)

- :func:`read_sas` will correctly parse sas7bdat files whose data pages also have bit 7 set in the page type (so the page type is 128 + 256 = 384) (:issue:`16615`)

Plotting
^^^^^^^^
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/sas/sas.pyx
Expand Up @@ -375,7 +375,7 @@ cdef class Parser(object):
if done:
return True
return False
elif self.current_page_type == page_data_type:
elif self.current_page_type & page_data_type == page_data_type:
self.process_byte_array_with_data(
bit_offset + subheader_pointers_offset +
self.current_row_on_page_index * self.row_length,
Expand Down
16 changes: 10 additions & 6 deletions pandas/io/sas/sas7bdat.py
Expand Up @@ -301,8 +301,10 @@ def _process_page_meta(self):
pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
if self._current_page_type in pt:
self._process_page_metadata()
return ((self._current_page_type in [256] + const.page_mix_types) or
(self._current_page_data_subheader_pointers != []))
is_data_page = self._current_page_type & const.page_data_type
is_mix_page = self._current_page_type in const.page_mix_types
return (is_data_page or is_mix_page
or self._current_page_data_subheader_pointers != [])

def _read_page_header(self):
bit_offset = self._page_bit_offset
Expand Down Expand Up @@ -644,11 +646,13 @@ def _read_next_page(self):
self._page_length))

self._read_page_header()
if self._current_page_type == const.page_meta_type:
page_type = self._current_page_type
if page_type == const.page_meta_type:
self._process_page_metadata()
pt = [const.page_meta_type, const.page_data_type]
pt += [const.page_mix_types]
if self._current_page_type not in pt:

is_data_page = page_type & const.page_data_type
pt = [const.page_meta_type] + const.page_mix_types
if not is_data_page and self._current_page_type not in pt:
return self._read_next_page()

return False
Expand Down
Binary file added pandas/tests/io/sas/data/load_log.sas7bdat
Binary file not shown.
8 changes: 8 additions & 0 deletions pandas/tests/io/sas/test_sas7bdat.py
Expand Up @@ -184,6 +184,7 @@ def test_date_time(datapath):


def test_many_columns(datapath):
# Test for looking for column information in more places (PR #22628)
fname = datapath("io", "sas", "data", "many_columns.sas7bdat")
df = pd.read_sas(fname, encoding='latin-1')
fname = datapath("io", "sas", "data", "many_columns.csv")
Expand All @@ -199,6 +200,13 @@ def test_many_columns(datapath):
tm.assert_frame_equal(df, df0)


def test_inconsistent_number_of_rows(datapath):
    """Reading ``load_log.sas7bdat`` must yield a frame of exactly 2097 rows.

    Regression test for issue #16615. (PR #22628)
    """
    sas_file = datapath("io", "sas", "data", "load_log.sas7bdat")
    frame = pd.read_sas(sas_file, encoding='latin-1')
    n_rows = len(frame)
    assert n_rows == 2097


def test_zero_variables(datapath):
# Check if the SAS file has zero variables (PR #18184)
fname = datapath("io", "sas", "data", "zero_variables.sas7bdat")
Expand Down

0 comments on commit f05ebc8

Please sign in to comment.