Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Some sas7bdat files with many columns are not parseable by read_sas #22628

Merged
merged 2 commits into from Sep 18, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.24.0.txt
Expand Up @@ -742,6 +742,8 @@ I/O
- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`)
- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`)
- :func:`read_sas()` will correctly parse sas7bdat files whose data page types also have bit 7 set (so the page type is 128 + 256 = 384) (:issue:`16615`)

Plotting
^^^^^^^^
Expand Down
10 changes: 5 additions & 5 deletions pandas/io/sas/sas.pyx
Expand Up @@ -244,8 +244,8 @@ cdef class Parser(object):
self.parser = parser
self.header_length = self.parser.header_length
self.column_count = parser.column_count
self.lengths = parser._column_data_lengths
self.offsets = parser._column_data_offsets
self.lengths = parser.column_data_lengths()
self.offsets = parser.column_data_offsets()
self.byte_chunk = parser._byte_chunk
self.string_chunk = parser._string_chunk
self.row_length = parser.row_length
Expand All @@ -257,7 +257,7 @@ cdef class Parser(object):
# page indicators
self.update_next_page()

column_types = parser.column_types
column_types = parser.column_types()

# map column types
for j in range(self.column_count):
Expand Down Expand Up @@ -375,7 +375,7 @@ cdef class Parser(object):
if done:
return True
return False
elif self.current_page_type == page_data_type:
elif self.current_page_type & page_data_type == page_data_type:
self.process_byte_array_with_data(
bit_offset + subheader_pointers_offset +
self.current_row_on_page_index * self.row_length,
Expand Down Expand Up @@ -437,7 +437,7 @@ cdef class Parser(object):
elif column_types[j] == column_type_string:
# string
string_chunk[js, current_row] = np.array(source[start:(
start + lngt)]).tostring().rstrip()
start + lngt)]).tostring().rstrip(b"\x00 ")
troels marked this conversation as resolved.
Show resolved Hide resolved
js += 1

self.current_row_on_page_index += 1
Expand Down
61 changes: 35 additions & 26 deletions pandas/io/sas/sas7bdat.py
Expand Up @@ -82,14 +82,15 @@ def __init__(self, path_or_buf, index=None, convert_dates=True,
self.compression = ""
self.column_names_strings = []
self.column_names = []
self.column_types = []
self.column_formats = []
self.columns = []

self._current_page_data_subheader_pointers = []
self._cached_page = None
self._column_data_lengths = []
self._column_data_offsets = []
self._column_types = []

self._current_row_in_file_index = 0
self._current_row_on_page_index = 0
self._current_row_in_file_index = 0
Expand All @@ -102,6 +103,19 @@ def __init__(self, path_or_buf, index=None, convert_dates=True,
self._get_properties()
self._parse_metadata()

def column_data_lengths(self):
    """Expose the per-column data lengths as a numpy int64 array."""
    lengths = self._column_data_lengths
    return np.asarray(lengths, dtype=np.int64)

def column_data_offsets(self):
    """Expose the per-column data offsets as a numpy int64 array."""
    offsets = self._column_data_offsets
    return np.asarray(offsets, dtype=np.int64)

def column_types(self):
    """Return the column types as a numpy character array.

    Each entry is b's' (string column) or b'd' (double column).
    """
    ctypes = self._column_types
    return np.asarray(ctypes, dtype=np.dtype("S1"))

def close(self):
try:
self.handle.close()
Expand Down Expand Up @@ -287,8 +301,10 @@ def _process_page_meta(self):
pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
if self._current_page_type in pt:
self._process_page_metadata()
return ((self._current_page_type in [256] + const.page_mix_types) or
(self._current_page_data_subheader_pointers is not None))
is_data_page = self._current_page_type & const.page_data_type
is_mix_page = self._current_page_type in const.page_mix_types
return (is_data_page or is_mix_page
or self._current_page_data_subheader_pointers != [])

def _read_page_header(self):
bit_offset = self._page_bit_offset
Expand Down Expand Up @@ -503,12 +519,6 @@ def _process_columnattributes_subheader(self, offset, length):
int_len = self._int_length
column_attributes_vectors_count = (
length - 2 * int_len - 12) // (int_len + 8)
self.column_types = np.empty(
column_attributes_vectors_count, dtype=np.dtype('S1'))
self._column_data_lengths = np.empty(
column_attributes_vectors_count, dtype=np.int64)
self._column_data_offsets = np.empty(
column_attributes_vectors_count, dtype=np.int64)
for i in range(column_attributes_vectors_count):
col_data_offset = (offset + int_len +
const.column_data_offset_offset +
Expand All @@ -520,16 +530,13 @@ def _process_columnattributes_subheader(self, offset, length):
const.column_type_offset + i * (int_len + 8))

x = self._read_int(col_data_offset, int_len)
self._column_data_offsets[i] = x
self._column_data_offsets.append(x)

x = self._read_int(col_data_len, const.column_data_length_length)
self._column_data_lengths[i] = x
self._column_data_lengths.append(x)

x = self._read_int(col_types, const.column_type_length)
if x == 1:
self.column_types[i] = b'd'
else:
self.column_types[i] = b's'
self._column_types.append(b'd' if x == 1 else b's')

def _process_columnlist_subheader(self, offset, length):
# unknown purpose
Expand Down Expand Up @@ -586,7 +593,7 @@ def _process_format_subheader(self, offset, length):
col.name = self.column_names[current_column_number]
col.label = column_label
col.format = column_format
col.ctype = self.column_types[current_column_number]
col.ctype = self._column_types[current_column_number]
col.length = self._column_data_lengths[current_column_number]

self.column_formats.append(column_format)
Expand All @@ -599,7 +606,7 @@ def read(self, nrows=None):
elif nrows is None:
nrows = self.row_count

if len(self.column_types) == 0:
if len(self._column_types) == 0:
self.close()
raise EmptyDataError("No columns to parse from file")

Expand All @@ -610,8 +617,8 @@ def read(self, nrows=None):
if nrows > m:
nrows = m

nd = (self.column_types == b'd').sum()
ns = (self.column_types == b's').sum()
nd = self._column_types.count(b'd')
ns = self._column_types.count(b's')

self._string_chunk = np.empty((ns, nrows), dtype=np.object)
self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8)
Expand Down Expand Up @@ -639,11 +646,13 @@ def _read_next_page(self):
self._page_length))

self._read_page_header()
if self._current_page_type == const.page_meta_type:
page_type = self._current_page_type
if page_type == const.page_meta_type:
self._process_page_metadata()
pt = [const.page_meta_type, const.page_data_type]
pt += [const.page_mix_types]
if self._current_page_type not in pt:

is_data_page = page_type & const.page_data_type
pt = [const.page_meta_type] + const.page_mix_types
if not is_data_page and self._current_page_type not in pt:
return self._read_next_page()

return False
Expand All @@ -660,7 +669,7 @@ def _chunk_to_dataframe(self):

name = self.column_names[j]

if self.column_types[j] == b'd':
if self._column_types[j] == b'd':
rslt[name] = self._byte_chunk[jb, :].view(
dtype=self.byte_order + 'd')
rslt[name] = np.asarray(rslt[name], dtype=np.float64)
Expand All @@ -674,7 +683,7 @@ def _chunk_to_dataframe(self):
rslt[name] = pd.to_datetime(rslt[name], unit=unit,
origin="1960-01-01")
jb += 1
elif self.column_types[j] == b's':
elif self._column_types[j] == b's':
rslt[name] = self._string_chunk[js, :]
if self.convert_text and (self.encoding is not None):
rslt[name] = rslt[name].str.decode(
Expand All @@ -686,6 +695,6 @@ def _chunk_to_dataframe(self):
else:
self.close()
raise ValueError("unknown column type %s" %
self.column_types[j])
self._column_types[j])

return rslt
Binary file added pandas/tests/io/sas/data/load_log.sas7bdat
Binary file not shown.
4 changes: 4 additions & 0 deletions pandas/tests/io/sas/data/many_columns.csv
@@ -0,0 +1,4 @@
DATASRC,PDDOCID,age,agegt89,ASSESSA,ASSESS1,ASSESS3,ASSESS4,ASSESS5,ASSESS6,ASSESS7,week,BECK,conf1,conf2,conf3,demo3,demo4,demo5,demo6,demo7,demo11a,demo11b,demo11c,demo11d,derm1b,derm2,derm3,derm4,derm5a,derm5b,derm7,derm7a,derm7b,derm8,derm9,ECG3,ecgrtxt,ecgrhr,ecgrpr,ecgrqrs,ecgrqrsaxis,ecgrqt,ecgrqtc,ecgrrep,ecgrtime,mmse1,mmse2,mmse3,mmse4,mmse5,mmse6,mmse7,mmse8,mmse9,mmse10,mmse11,mmse12,mmse13,mmse14,mmse15,mmse16,mmse17,mmse18,mmse19,mmse20,mmse,mmsescor,mrf1,mrf2,mrf3,mrf4,mrf5,mrf6,mrf7,mrf8,mrf9,mrf10,mrf11,mrf12,mrf13,nvitl1s,nvitl1d,nvitl1r,nvitl2s,nvitl2d,nvitl2r,nvitl3s,nvitl3d,nvitl3r,nvitl4s,nvitl4d,nvitl4r,nvitl5,nvitl1,nvitl2,nvitl3,nvitl4,phys1,phys1a,phys14,phys15a,phys15b,phys15c,phys15d,phys16a,phys16b,phys16c,phys16d,phys17a,phys17b,phys17c,phys17d,phys18a,phys18b,phys18c,phys18d,phys19a,phys19b,phys20,phys22,phys24,phys26,phys28,PREG1,PREG2,updrsa,updrs1,updrs2,updrs3,updrs4,updrs5a,updrs6a,updrs7a,updrs8a,updrs9a,updrs10a,updrs11a,updrs12a,updrs13a,updrs14a,updrs15a,updrs16a,updrs17a,updrs18a,updrs19a,updrs20a1,updrs20b1,updrs20c1,updrs20d1,updrs20e1,updrs21a1,updrs21b1,updrs22a1,updrs22b1,updrs22c1,updrs22d1,updrs22e1,updrs23a1,updrs23b1,updrs24a1,updrs24b1,updrs25a1,updrs25b1,updrs26a1,updrs26b1,updrs26c1,updrs26d1,updrs27a,updrs28a,updrs29a,updrs30a,updrs31a,updrs32a,updrs33a,updrs34a,updrs35,updrs36,updrs37,updrs38,updrs39,updrs5b,updrs6b,updrs7b,updrs8b,updrs9b,updrs10b,updrs11b,updrs12b,updrs13b,updrs14b,updrs15b,updrs16b,updrs17b,updrs18b,updrs19b,updrs20a2,updrs20b2,updrs20c2,updrs20d2,updrs20e2,updrs21a2,updrs21b2,updrs22a2,updrs22b2,updrs22c2,updrs22d2,updrs22e2,updrs23a2,updrs23b2,updrs24a2,updrs24b2,updrs25a2,updrs25b2,updrs26a2,updrs26b2,updrs26c2,updrs26d2,updrs27b,updrs28b,updrs29b,updrs30b,updrs31b,updrs32b,updrs33b,updrs34b,updrs5c,updrs6c,updrs7c,updrs8c,updrs9c,updrs10c,updrs11c,updrs12c,updrs13c,updrs14c,updrs15c,updrs16c,updrs17c,updrs32c,updrs33c,updrs34c,updrsmental,updrsadl,updrsadlon,updrsadloff,updrsadlmin,updrst
remor,updrstremortreat,updrstremormin,updrsrigid,updrsrigidtreat,updrsrigidmin,updrsmotor,updrsmotortreat,updrsmotormin,updrs,updrstrt,updrsmin,updrs4a,updrs41,updrs42,updrs43,updrs44,updrs45,updrs46,updrs47,updrs48,updrs49,updrs410,updrs411,vitl1s,vitl1d,vitl2,vitl3s,vitl3d,vitl4,vitl5,vitl6,assess,fbeck,conf,demo1,derm,ecg,ecgr,mrf,nvitl,fphys1,fpreg,fupdrs,fupdrs4,vitl,site,race,rImaged,rPD,rPDlt5,rAgeGt30,rHY,rMed,rMelanoma,rPreclude,rNeed,rEligible,gender,incsae,incsusp,incterm,increlated,inctermat,increason,incafter24,incendp,incres,disp2,disp3,disp4,disp6,inex1,inex2,inex3,inex4,inex5,inex6,inex7,inex8,inex9,inex10,inex11,inex12,inex13,inex14,inex15,inex16,inex17,inex18,inex19,inex20,inex21,inex22,inex23,inex24,inex25,inex26,inex27,inex28,treatment,treat,disp,inex,classify,enrollyr,demoyear,dob_yr,inexdays,demodays,onsetdays,diagdays,medstartdays,physdays,phys21dys,phys23dys,phys25dys,phys27dys,phys29dys,confdays,pregdays,nvitldays,nvitlscandays,vitldays,labdays,ecgdays,ecgtestdays,mrfdays,dermdays,dermexamdays,dermbiopdays,mmsedays,beckdays,updrdays,updr4days,assessdays,daystotherapy,dispdays,endpdys,termdys,SAEdys,resdys,lmeddys,wddays,VISIT_NO
a030,ab304,43.0,0.0,0.0,0.0,,,,,,-2.0,0.0,1.0,1.0,,2.0,1.0,19.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,,,,,,,0.0,2.0,ABNORMAL,75.0,150.0,100.0,-3.0,410.0,460.0,2.0,1000.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,3.0,5.0,2.0,1.0,1.0,1.0,0.0,3.0,1.0,1.0,1.0,26.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,150.0,94.0,73.0,155.0,96.0,71.0,148.0,91.0,69.0,146.0,67.0,72.0,1.0,42840.0,46080.0,46980.0,30600.0,100.0,175.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,4.0,4.0,4.0,2.0,1.0,,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.5,0.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,1.5,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,2.5,95.0,95.0,7.0,,2.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,5.0,,,5.0,1.5,,1.5,7.5,,7.5,20.0,,20.0,25.0,,25.0,,,,,,,,,,,,,138.0,86.0,72.0,130.0,80.0,80.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,abc,1.0,1.0,1.0,0.0,1.0,34.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,,0.0,3.0,0.0,1.0,0.0,4.0,3.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Placebo,1.0,1.0,1.0,1.0,2002.0,2002.0,1914.0,-28.0,-28.0,-404.0,-28.0,0.0,-28.0,,,,,-6.0,-28.0,-13.0,-13.0,-12.0,-28.0,-28.0,-28.0,-28.0,-28.0,-14.0,-14.0,,-28.0,-28.0,-28.0,,-28.0,,659.0,426.0,659.0,,,658.0,100.0,ab
a030,ab304,43.0,0.0,0.0,0.0,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1000.0,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,1.0,2.0,95.0,95.0,7.0,,2.0,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,3.0,,,3.0,0.0,,0.0,3.0,,3.0,13.0,,13.0,16.0,,16.0,,,,,,,,,,,,,140.0,86.0,76.0,132.0,80.0,84.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,abc,0.0,0.0,1.0,0.0,1.0,34.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,,0.0,3.0,0.0,1.0,0.0,4.0,3.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Placebo,1.0,1.0,1.0,1.0,2002.0,,1914.0,-28.0,,,,0.0,,,,,,,,,,,0.0,0.0,,,,,,,,,0.0,,0.0,,659.0,426.0,659.0,,,658.0,100.0,ab
a030,ab304,43.0,0.0,0.0,0.0,,,,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1000.0,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0,0.0,1.0,1.0,0.5,1.0,2.0,90.0,95.0,7.0,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,5.0,,,5.0,0.5,,0.5,2.0,,2.0,16.0,,16.0,21.0,,21.0,0.0,,,,,,,,,,,,149.0,88.0,80.0,136.0,90.0,82.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,abc,0.0,0.0,1.0,1.0,1.0,34.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,,0.0,3.0,0.0,1.0,0.0,4.0,3.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Placebo,1.0,1.0,1.0,1.0,2002.0,,1914.0,-28.0,,,,0.0,,,,,,,,,,,29.0,29.0,,,,,,,,,29.0,29.0,29.0,,659.0,426.0,659.0,,,658.0,100.0,ab
Binary file added pandas/tests/io/sas/data/many_columns.sas7bdat
Binary file not shown.
16 changes: 16 additions & 0 deletions pandas/tests/io/sas/test_sas7bdat.py
Expand Up @@ -199,6 +199,22 @@ def test_compact_numerical_values(datapath):
tm.assert_series_equal(result, expected, check_exact=True)


def test_many_columns(datapath):
    # Test for looking for column information in more places (PR #22628)
    sas_path = datapath("io", "sas", "data", "many_columns.sas7bdat")
    csv_path = datapath("io", "sas", "data", "many_columns.csv")
    result = pd.read_sas(sas_path, encoding='latin-1')
    expected = pd.read_csv(csv_path, encoding='latin-1')
    tm.assert_frame_equal(result, expected)


def test_inconsistent_number_of_rows(datapath):
    # Regression test for issue #16615. (PR #22628)
    path = datapath("io", "sas", "data", "load_log.sas7bdat")
    frame = pd.read_sas(path, encoding='latin-1')
    assert len(frame) == 2097


def test_zero_variables(datapath):
# Check if the SAS file has zero variables (PR #18184)
fname = datapath("io", "sas", "data", "zero_variables.sas7bdat")
Expand Down