From b6440efbdb0185fbed32d83248f6ea4ab5776097 Mon Sep 17 00:00:00 2001 From: neva03 <136854031+neva03@users.noreply.github.com> Date: Fri, 28 Mar 2025 14:47:43 +0300 Subject: [PATCH 01/11] Update README.md added my name --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 1a273fdb896c5..3595910961dd2 100644 --- a/README.md +++ b/README.md @@ -188,3 +188,4 @@ As contributors and maintainers to this project, you are expected to abide by pa
[Go to Top](#table-of-contents) +neva aydın \ No newline at end of file From cb6d09cc2942f9255d96cd7a0b4b3beeb504724d Mon Sep 17 00:00:00 2001 From: heba walid <138723578+hebawl@users.noreply.github.com> Date: Fri, 28 Mar 2025 16:33:15 +0300 Subject: [PATCH 02/11] Update README.md Added my name, to the contributors list in the README.md file. --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3595910961dd2..4742954deb5f3 100644 --- a/README.md +++ b/README.md @@ -188,4 +188,6 @@ As contributors and maintainers to this project, you are expected to abide by pa
[Go to Top](#table-of-contents) -neva aydın \ No newline at end of file +neva aydın + +Heba Walid Awad From c782d9f2a17f7606f70b44e08ba081b566ea0e7a Mon Sep 17 00:00:00 2001 From: zeynepkeneth Date: Fri, 28 Mar 2025 18:13:54 +0300 Subject: [PATCH 03/11] Added my name to README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 4742954deb5f3..82aea1e0a5d8d 100644 --- a/README.md +++ b/README.md @@ -191,3 +191,5 @@ As contributors and maintainers to this project, you are expected to abide by pa neva aydın Heba Walid Awad + +Zeynep Genel From 2a0d772496bca4d0bffe671da47bc58a1c0b8abf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCl?= <145806683+GulAkkoca@users.noreply.github.com> Date: Fri, 28 Mar 2025 21:13:10 +0300 Subject: [PATCH 04/11] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 82aea1e0a5d8d..e0db7fa2a88ec 100644 --- a/README.md +++ b/README.md @@ -193,3 +193,4 @@ neva aydın Heba Walid Awad Zeynep Genel +Gül Akkoca From 64237c095d0f42eb1401e8d436a7f7ae24c632f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCl?= <145806683+GulAkkoca@users.noreply.github.com> Date: Fri, 28 Mar 2025 21:13:32 +0300 Subject: [PATCH 05/11] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index e0db7fa2a88ec..6403b30c49115 100644 --- a/README.md +++ b/README.md @@ -193,4 +193,5 @@ neva aydın Heba Walid Awad Zeynep Genel + Gül Akkoca From 37d956fb7f4a1f7565337ea8db6f8f54f6f40898 Mon Sep 17 00:00:00 2001 From: beratnvc Date: Fri, 28 Mar 2025 21:57:01 +0300 Subject: [PATCH 06/11] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6403b30c49115..35d2b29693667 100644 --- a/README.md +++ b/README.md @@ -188,10 +188,12 @@ As contributors and maintainers to this project, you are expected to abide by pa
[Go to Top](#table-of-contents) -neva aydın +Neva Aydın Heba Walid Awad Zeynep Genel Gül Akkoca + +Berat Nevcanoğlu From 6821d1594e14de6a631be19776fc9462514c4199 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCl?= <145806683+GulAkkoca@users.noreply.github.com> Date: Fri, 28 Mar 2025 22:17:56 +0300 Subject: [PATCH 07/11] Update README.md Added my name to the README file From 5295edbb6d9c18d35c464a12826bd77b607201c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCl?= <145806683+GulAkkoca@users.noreply.github.com> Date: Mon, 9 Jun 2025 03:17:06 +0300 Subject: [PATCH 08/11] Update parquet.py --- pandas/io/parquet.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 6a5a83088e986..30b9f8b02861d 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -186,6 +186,19 @@ def write( from_pandas_kwargs["preserve_index"] = index table = self.api.Table.from_pandas(df, **from_pandas_kwargs) + if any(isinstance(dtype,pd.StringDtype) for dtype in df.dtypes): + string_dtypes={ + col:str(dtype.storage) + for col,dtype in df.dtypes.items() + if isinstance(dtype,pd.StringDtype) + } + metadata = table.schema.metadata or{} + for col,storage in string_dtypes.items(): + key=f"pandas_string_dtype_{col}".encode() + val= storage.encode() + metadata[key]= val + table= table.replace_schema_metadata(metadata) + if df.attrs: df_metadata = {"PANDAS_ATTRS": json.dumps(df.attrs)} @@ -255,6 +268,16 @@ def read( ) try: pa_table = self.api.parquet.read_table( + metadata = pa_table.schema.metadata + string_dtypes = {} + if metadata: + for key, value in metadata.items(): + if key.startswith(b"pandas_string_dtype_"): + col_name = key.replace(b"pandas_string_dtype_", b"").decode() + string_dtypes[col_name] = value.decode() + + + path_or_handle, columns=columns, filesystem=filesystem, From d330c5f32b104a215ca6b1a78b41d6d427631c0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCl?= <145806683+GulAkkoca@users.noreply.github.com> 
Date: Mon, 9 Jun 2025 10:18:11 +0300 Subject: [PATCH 09/11] =?UTF-8?q?=20Write=20and=20read=20function=20is=20u?= =?UTF-8?q?pdated=20in=20parquet.py=20By=20G=C3=BCl?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pandas/io/parquet.py | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 30b9f8b02861d..1803c06d564e3 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -184,7 +184,7 @@ def write( from_pandas_kwargs: dict[str, Any] = {"schema": kwargs.pop("schema", None)} if index is not None: from_pandas_kwargs["preserve_index"] = index - +# Record each StringDtype column's storage in the schema metadata. table = self.api.Table.from_pandas(df, **from_pandas_kwargs) if any(isinstance(dtype,pd.StringDtype) for dtype in df.dtypes): string_dtypes={ @@ -267,23 +267,35 @@ def read( mode="rb", ) try: - pa_table = self.api.parquet.read_table( - metadata = pa_table.schema.metadata - string_dtypes = {} - if metadata: - for key, value in metadata.items(): - if key.startswith(b"pandas_string_dtype_"): - col_name = key.replace(b"pandas_string_dtype_", b"").decode() - string_dtypes[col_name] = value.decode() - - - + pa_table = self.api.parquet.read_table( path_or_handle, columns=columns, filesystem=filesystem, filters=filters, **kwargs, ) + + # Read the pandas_string_dtype_* entries back from the schema metadata + string_dtypes = {} + metadata = pa_table.schema.metadata + if metadata: + for key, value in metadata.items(): + if key.startswith(b"pandas_string_dtype_"): + col_name = key[len(b"pandas_string_dtype_"):].decode() + string_dtypes[col_name] = value.decode() + + # NOTE(review): types_mapper is keyed on the Arrow type, not the column, so columns sharing a type all receive the first recorded storage + def types_mapper(pa_type): + for field in pa_table.schema: + if field.type == pa_type: + colname = field.name + if colname in string_dtypes: + return pd.StringDtype(storage=string_dtypes[colname]) + return None # fallback to default mapper + + if to_pandas_kwargs is 
None: + to_pandas_kwargs = {} + to_pandas_kwargs["types_mapper"] = types_mapper with catch_warnings(): filterwarnings( "ignore", From 522ed7f8fbf01d7d77479300118562ea00168184 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCl?= <145806683+GulAkkoca@users.noreply.github.com> Date: Mon, 9 Jun 2025 10:18:58 +0300 Subject: [PATCH 10/11] Write and read funct. is updated in parquet.py From bab5572c99c815c220adaa848bf9e21d4ae9c630 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCl?= <145806683+GulAkkoca@users.noreply.github.com> Date: Mon, 9 Jun 2025 10:43:45 +0300 Subject: [PATCH 11/11] Update test_parquet.py I added test method for parquet stringdtype --- pandas/tests/io/test_parquet.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 78f39b649cb9a..ce77bfa6f36d9 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -718,6 +718,26 @@ def test_basic_subset_columns(self, pa, df_full): expected=df[["string", "int"]], read_kwargs={"columns": ["string", "int"]}, ) + # Round-trip check: StringDtype storage must survive a parquet write/read + @pytest.mark.parametrize("string_storage", ["pyarrow", "python"]) + def test_parquet_stringdtype_roundtrip(self, tmp_path, pa, string_storage): + import pandas as pd + from pandas.testing import assert_frame_equal + + df = pd.DataFrame({ + "a": pd.Series(["x", "y", "z"], dtype=pd.StringDtype(storage=string_storage)) + }) + + file_path = tmp_path / "stringdtype.parquet" + df.to_parquet(file_path, engine="pyarrow") + + result = pd.read_parquet(file_path, engine="pyarrow") + + expected_dtype = pd.StringDtype(storage=string_storage) + assert result["a"].dtype == expected_dtype, f"Dtype mismatch: got {result['a'].dtype}, expected {expected_dtype}" + + assert_frame_equal(result, df) + def test_to_bytes_without_path_or_buf_provided(self, pa, df_full): # GH 37105