From 3533134f17ea3da14bb36e03d614c86739bcb3eb Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Thu, 28 Mar 2024 13:30:16 +0000 Subject: [PATCH 1/5] ENH: Add support for reading 110-format Stata dta files --- pandas/io/stata.py | 6 +++--- pandas/tests/io/data/stata/stata-compat-110.dta | Bin 0 -> 1514 bytes pandas/tests/io/test_stata.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) create mode 100644 pandas/tests/io/data/stata/stata-compat-110.dta diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 47d879c022ee6..37494ff42d650 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -91,7 +91,7 @@ _version_error = ( "Version of given Stata file is {version}. pandas supports importing " - "versions 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), " + "versions 105, 108, 110 (Stata 7), 111 (Stata 7SE), 113 (Stata 8/9), " "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16)," "and 119 (Stata 15/16, over 32,767 variables)." ) @@ -1393,7 +1393,7 @@ def _get_seek_variable_labels(self) -> int: def _read_old_header(self, first_char: bytes) -> None: self._format_version = int(first_char[0]) - if self._format_version not in [104, 105, 108, 111, 113, 114, 115]: + if self._format_version not in [104, 105, 108, 110, 111, 113, 114, 115]: raise ValueError(_version_error.format(version=self._format_version)) self._set_encoding() self._byteorder = ">" if self._read_int8() == 0x1 else "<" @@ -1408,7 +1408,7 @@ def _read_old_header(self, first_char: bytes) -> None: self._time_stamp = self._get_time_stamp() # descriptors - if self._format_version > 108: + if self._format_version > 110: typlist = [int(c) for c in self._path_or_buf.read(self._nvar)] else: buf = self._path_or_buf.read(self._nvar) diff --git a/pandas/tests/io/data/stata/stata-compat-110.dta b/pandas/tests/io/data/stata/stata-compat-110.dta new file mode 100644 index 0000000000000000000000000000000000000000..68e591aba829a31bdce0a3bcfae2f5b5a300801e GIT binary patch literal 1514 zcmc~}Vr1Z8U}m5TNJ`4gNlVG;%*;zkt-xnvrUgD(pj@V*8GaR+#zy!Rq~VulU`QdT zpoD;mVnYK0Dlnu~E%Xf1p`tk1hDLBjs+MpjgDSFkic5TMd?eT;RYQYf42wnuMnhmU z1O`V4F#iAl`~Uy?|7Xvf**j|{2<)^MvSeULOiE5kO-s+n%wmE%^z0d*eGnBV?S)ZP XFf%f;FfueS0A=?XgTZBc5fl{wc`rPh literal 0 HcmV?d00001 diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index d7fb3c0049965..36b47d629a856 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -2002,7 +2002,7 @@ def test_read_write_ea_dtypes(self, dtype_backend, temp_file, tmp_path): tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) -@pytest.mark.parametrize("version", [105, 108, 111, 113, 114]) +@pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114]) def test_backward_compat(version, datapath): data_base = datapath("io", "data", "stata") ref = os.path.join(data_base, "stata-compat-118.dta") From 48f98f0a7eaa423b0ea0741793032d976b977af3 Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Wed, 8 May 2024 21:28:38 +0100 Subject: [PATCH 2/5] Add whatsnew note to v3.0.0.rst --- doc/source/whatsnew/v3.0.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e4dad8800d78f..eb3e2b875eaba 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -44,6 +44,8 @@ Other enhancements - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) +- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) +- .. --------------------------------------------------------------------------- .. _whatsnew_300.notable_bug_fixes: From 605924b932d761b0ab83e5105adfd921d91c1985 Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Tue, 9 Apr 2024 18:23:29 +0100 Subject: [PATCH 3/5] Add a test data file containing value labels --- pandas/tests/io/data/stata/stata4_110.dta | Bin 0 -> 1528 bytes pandas/tests/io/test_stata.py | 14 ++++++-------- 2 files changed, 6 insertions(+), 8 deletions(-) create mode 100644 pandas/tests/io/data/stata/stata4_110.dta diff --git a/pandas/tests/io/data/stata/stata4_110.dta b/pandas/tests/io/data/stata/stata4_110.dta new file mode 100644 index 0000000000000000000000000000000000000000..3ea01040448b09f1efa0bf182ba6b814a105bdb1 GIT binary patch literal 1528 zcmd5+OHRWu5S!RboTcR9(<+un)i)q6D1Al51CS+QN!%xdlh*QF;@Y zNdryQMnM83EWMFOd4BU|YnjRtpA}_B_;~&)@{2p%=1>iu=l!O zNg0QF@pY&Ra+}*q!n)ZKoxH!2uUM_JgkagoSw1#44H6ZETDCW>em@h&rm;+ZPgZPs zV>2uunX@Cg`sP@Gc|MFv=eq>)vV5M$5$K6@l``v*tOc{Xg%^&EtupD)hFJq(0s|1{ zcXfcKzyJi`dl&=-0_XbxOA0`k^9I0z0=&NfyzzrPWa)w*5(Pt+F8L7&V8~Js$GGDZ z=xkHq^Dw`oH-b_oUU64>sgOe_JOnNBMEa+QbHoMW5^;sNM%*B%N$QN+AwHQ$+dwwy zEEytAYb2yI3u&B436Y&75UHsIr=GycsWc$fFiycJB4RqgLzVv#a;WGTf>zOR(b%>D Psz4*D{1K7!Z$|O~T9whd literal 0 HcmV?d00001 diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 36b47d629a856..c31501a631d99 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -225,11 +225,9 @@ def test_read_dta3(self, file, datapath): tm.assert_frame_equal(parsed, expected) - @pytest.mark.parametrize( - "file", ["stata4_111", "stata4_113", "stata4_114", "stata4_115", "stata4_117"] - ) - def test_read_dta4(self, file, datapath): - file = datapath("io", "data", "stata", f"{file}.dta") + @pytest.mark.parametrize("version", [110, 111, 113, 114, 115, 117]) + def test_read_dta4(self, version, datapath): + file = datapath("io", "data", "stata", f"stata4_{version}.dta") parsed = self.read_dta(file) expected = DataFrame.from_records( @@ -271,11 +269,11 @@ def test_read_dta4(self, file, datapath): # stata doesn't save .category metadata tm.assert_frame_equal(parsed, expected) - @pytest.mark.parametrize("file", ["stata4_105", "stata4_108"]) - def test_readold_dta4(self, file, datapath): + @pytest.mark.parametrize("version", [105, 108]) + def test_readold_dta4(self, version, datapath): # This test is the same as test_read_dta4 above except that the columns # had to be renamed to match the restrictions in older file format - file = datapath("io", "data", "stata", f"{file}.dta") + file = datapath("io", "data", "stata", f"stata4_{version}.dta") parsed = self.read_dta(file) expected = DataFrame.from_records( From 524c28b2ae93ca7ed6c037d1843bfa853385d585 Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Tue, 9 Apr 2024 22:00:12 +0100 Subject: [PATCH 4/5] Compare version number inclusively when determining whether to use old or new typlist version --- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 37494ff42d650..b87ec94b85bb0 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1408,7 +1408,7 @@ def _read_old_header(self, first_char: bytes) -> None: self._time_stamp = self._get_time_stamp() # descriptors - if self._format_version > 110: + if self._format_version >= 111: typlist = [int(c) for c in self._path_or_buf.read(self._nvar)] else: buf = self._path_or_buf.read(self._nvar) From ee3bae80cef9d1e8ec429cd68053e9b9315a4ed9 Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Tue, 9 Apr 2024 23:02:32 +0100 Subject: [PATCH 5/5] Add a big-endian version of the test data set --- .../tests/io/data/stata/stata-compat-be-110.dta | Bin 0 -> 1514 bytes pandas/tests/io/test_stata.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 pandas/tests/io/data/stata/stata-compat-be-110.dta diff --git a/pandas/tests/io/data/stata/stata-compat-be-110.dta b/pandas/tests/io/data/stata/stata-compat-be-110.dta new file mode 100644 index 0000000000000000000000000000000000000000..0936be478028c463201c542bba7dc27f0cb89cc5 GIT binary patch literal 1514 zcmc~}WMp9AU|?Wi24Y%+q@>K8w3M9A%)FG;3VfDjTHuof%4Hgw;a8DqY=mDy8h&X8 zh7^JdN(iVZHZ&lh0z+EWLeC%_DvFbBXarZJY6)jDs3MD}xWwniM}kdKH8d#3uxM0Z zGz3ONU~q&0