diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e4dad8800d78f..eb3e2b875eaba 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -44,6 +44,8 @@ Other enhancements - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) +- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) +- .. --------------------------------------------------------------------------- .. _whatsnew_300.notable_bug_fixes: diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 47d879c022ee6..b87ec94b85bb0 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -91,7 +91,7 @@ _version_error = ( "Version of given Stata file is {version}. pandas supports importing " - "versions 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), " + "versions 105, 108, 110 (Stata 7), 111 (Stata 7SE), 113 (Stata 8/9), " "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16)," "and 119 (Stata 15/16, over 32,767 variables)." ) @@ -1393,7 +1393,7 @@ def _get_seek_variable_labels(self) -> int: def _read_old_header(self, first_char: bytes) -> None: self._format_version = int(first_char[0]) - if self._format_version not in [104, 105, 108, 111, 113, 114, 115]: + if self._format_version not in [104, 105, 108, 110, 111, 113, 114, 115]: raise ValueError(_version_error.format(version=self._format_version)) self._set_encoding() self._byteorder = ">" if self._read_int8() == 0x1 else "<" @@ -1408,7 +1408,7 @@ def _read_old_header(self, first_char: bytes) -> None: self._time_stamp = self._get_time_stamp() # descriptors - if self._format_version > 108: + if self._format_version >= 111: typlist = [int(c) for c in self._path_or_buf.read(self._nvar)] else: buf = self._path_or_buf.read(self._nvar) diff --git a/pandas/tests/io/data/stata/stata-compat-110.dta b/pandas/tests/io/data/stata/stata-compat-110.dta new file mode 100644 index 0000000000000..68e591aba829a Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-110.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-110.dta b/pandas/tests/io/data/stata/stata-compat-be-110.dta new file mode 100644 index 0000000000000..0936be478028c Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-110.dta differ diff --git a/pandas/tests/io/data/stata/stata4_110.dta b/pandas/tests/io/data/stata/stata4_110.dta new file mode 100644 index 0000000000000..3ea01040448b0 Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_110.dta differ diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index d7fb3c0049965..2f981953a6237 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -225,11 +225,9 @@ def test_read_dta3(self, file, datapath): tm.assert_frame_equal(parsed, expected) - @pytest.mark.parametrize( - "file", ["stata4_111", "stata4_113", "stata4_114", "stata4_115", "stata4_117"] - ) - def test_read_dta4(self, file, datapath): - file = datapath("io", "data", "stata", f"{file}.dta") + @pytest.mark.parametrize("version", [110, 111, 113, 114, 115, 117]) + def test_read_dta4(self, version, datapath): + file = datapath("io", "data", "stata", f"stata4_{version}.dta") parsed = self.read_dta(file) expected = DataFrame.from_records( @@ -271,11 +269,11 @@ def test_read_dta4(self, file, datapath): # stata doesn't save .category metadata tm.assert_frame_equal(parsed, expected) - @pytest.mark.parametrize("file", ["stata4_105", "stata4_108"]) - def test_readold_dta4(self, file, datapath): + @pytest.mark.parametrize("version", [105, 108]) + def test_readold_dta4(self, version, datapath): # This test is the same as test_read_dta4 above except that the columns # had to be renamed to match the restrictions in older file format - file = datapath("io", "data", "stata", f"{file}.dta") + file = datapath("io", "data", "stata", f"stata4_{version}.dta") parsed = self.read_dta(file) expected = DataFrame.from_records( @@ -2002,7 +2000,7 @@ def test_read_write_ea_dtypes(self, dtype_backend, temp_file, tmp_path): tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) -@pytest.mark.parametrize("version", [105, 108, 111, 113, 114]) +@pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114]) def test_backward_compat(version, datapath): data_base = datapath("io", "data", "stata") ref = os.path.join(data_base, "stata-compat-118.dta") @@ -2012,7 +2010,7 @@ def test_backward_compat(version, datapath): tm.assert_frame_equal(old_dta, expected, check_dtype=False) -@pytest.mark.parametrize("version", [105, 108, 111, 113, 114, 118]) +@pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114, 118]) def test_bigendian(version, datapath): ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta") big = datapath("io", "data", "stata", f"stata-compat-be-{version}.dta")