From b6440efbdb0185fbed32d83248f6ea4ab5776097 Mon Sep 17 00:00:00 2001
From: neva03 <136854031+neva03@users.noreply.github.com>
Date: Fri, 28 Mar 2025 14:47:43 +0300
Subject: [PATCH 01/11] Update README.md
Added my name.
---
README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/README.md b/README.md
index 1a273fdb896c5..3595910961dd2 100644
--- a/README.md
+++ b/README.md
@@ -188,3 +188,4 @@ As contributors and maintainers to this project, you are expected to abide by pa
[Go to Top](#table-of-contents)
+neva aydın
\ No newline at end of file
From cb6d09cc2942f9255d96cd7a0b4b3beeb504724d Mon Sep 17 00:00:00 2001
From: heba walid <138723578+hebawl@users.noreply.github.com>
Date: Fri, 28 Mar 2025 16:33:15 +0300
Subject: [PATCH 02/11] Update README.md
Added my name to the contributors list in the README.md file.
---
README.md | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 3595910961dd2..4742954deb5f3 100644
--- a/README.md
+++ b/README.md
@@ -188,4 +188,6 @@ As contributors and maintainers to this project, you are expected to abide by pa
[Go to Top](#table-of-contents)
-neva aydın
\ No newline at end of file
+neva aydın
+
+Heba Walid Awad
From c782d9f2a17f7606f70b44e08ba081b566ea0e7a Mon Sep 17 00:00:00 2001
From: zeynepkeneth
Date: Fri, 28 Mar 2025 18:13:54 +0300
Subject: [PATCH 03/11] Added my name to README
---
README.md | 2 ++
1 file changed, 2 insertions(+)
diff --git a/README.md b/README.md
index 4742954deb5f3..82aea1e0a5d8d 100644
--- a/README.md
+++ b/README.md
@@ -191,3 +191,5 @@ As contributors and maintainers to this project, you are expected to abide by pa
neva aydın
Heba Walid Awad
+
+Zeynep Genel
From 2a0d772496bca4d0bffe671da47bc58a1c0b8abf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCl?= <145806683+GulAkkoca@users.noreply.github.com>
Date: Fri, 28 Mar 2025 21:13:10 +0300
Subject: [PATCH 04/11] Update README.md
---
README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/README.md b/README.md
index 82aea1e0a5d8d..e0db7fa2a88ec 100644
--- a/README.md
+++ b/README.md
@@ -193,3 +193,4 @@ neva aydın
Heba Walid Awad
Zeynep Genel
+Gül Akkoca
From 64237c095d0f42eb1401e8d436a7f7ae24c632f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCl?= <145806683+GulAkkoca@users.noreply.github.com>
Date: Fri, 28 Mar 2025 21:13:32 +0300
Subject: [PATCH 05/11] Update README.md
---
README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/README.md b/README.md
index e0db7fa2a88ec..6403b30c49115 100644
--- a/README.md
+++ b/README.md
@@ -193,4 +193,5 @@ neva aydın
Heba Walid Awad
Zeynep Genel
+
Gül Akkoca
From 37d956fb7f4a1f7565337ea8db6f8f54f6f40898 Mon Sep 17 00:00:00 2001
From: beratnvc
Date: Fri, 28 Mar 2025 21:57:01 +0300
Subject: [PATCH 06/11] Update README.md
---
README.md | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 6403b30c49115..35d2b29693667 100644
--- a/README.md
+++ b/README.md
@@ -188,10 +188,12 @@ As contributors and maintainers to this project, you are expected to abide by pa
[Go to Top](#table-of-contents)
-neva aydın
+Neva Aydın
Heba Walid Awad
Zeynep Genel
Gül Akkoca
+
+Berat Nevcanoğlu
From 6821d1594e14de6a631be19776fc9462514c4199 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCl?= <145806683+GulAkkoca@users.noreply.github.com>
Date: Fri, 28 Mar 2025 22:17:56 +0300
Subject: [PATCH 07/11] Update README.md
Added my name to the README file
From 5295edbb6d9c18d35c464a12826bd77b607201c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCl?= <145806683+GulAkkoca@users.noreply.github.com>
Date: Mon, 9 Jun 2025 03:17:06 +0300
Subject: [PATCH 08/11] Update parquet.py
---
pandas/io/parquet.py | 23 +++++++++++++++++++++++
1 file changed, 23 insertions(+)
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 6a5a83088e986..30b9f8b02861d 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -186,6 +186,19 @@ def write(
from_pandas_kwargs["preserve_index"] = index
table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
+ if any(isinstance(dtype,pd.StringDtype) for dtype in df.dtype):
+ string_dtype={
+ col:str(dtype.storage)
+ for col,dtype in df.dtypes.items()
+ if isinstance(dtype,pd.StringDtype)
+ }
+ metadata = table.schema.metadata or{}
+ for col,storage in string_dtypes.items():
+ key=f"pandas_string_dtype_{col}".encode()
+ val= storage.encode()
+ metadata[key]= val
+ table= table.replace_schema_metadata(metadata)
+
if df.attrs:
df_metadata = {"PANDAS_ATTRS": json.dumps(df.attrs)}
@@ -255,6 +268,16 @@ def read(
)
try:
pa_table = self.api.parquet.read_table(
+ metadata = pa_table.schema.metadata
+ string_dtypes = {}
+ if metadata:
+ for key, value in metadata.items():
+ if key.startswith(b"pandas_string_dtype_"):
+ col_name = key.replace(b"pandas_string_dtype_", b"").decode()
+ string_dtypes[col_name] = value.decode()
+
+
+
path_or_handle,
columns=columns,
filesystem=filesystem,
From d330c5f32b104a215ca6b1a78b41d6d427631c0d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCl?= <145806683+GulAkkoca@users.noreply.github.com>
Date: Mon, 9 Jun 2025 10:18:11 +0300
Subject: [PATCH 09/11] =?UTF-8?q?=20Write=20and=20read=20function=20is=20u?=
=?UTF-8?q?pdated=20in=20parquet.py=20By=20G=C3=BCl?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
pandas/io/parquet.py | 36 ++++++++++++++++++++++++------------
1 file changed, 24 insertions(+), 12 deletions(-)
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 30b9f8b02861d..1803c06d564e3 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -184,7 +184,7 @@ def write(
from_pandas_kwargs: dict[str, Any] = {"schema": kwargs.pop("schema", None)}
if index is not None:
from_pandas_kwargs["preserve_index"] = index
-
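+# Added section. NOTE(review): the check below reads `df.dtype` (DataFrame exposes `.dtypes`) and builds `string_dtype` while the loop iterates `string_dtypes` — confirm and fix.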
table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
if any(isinstance(dtype,pd.StringDtype) for dtype in df.dtype):
string_dtype={
@@ -267,23 +267,35 @@ def read(
mode="rb",
)
try:
- pa_table = self.api.parquet.read_table(
- metadata = pa_table.schema.metadata
- string_dtypes = {}
- if metadata:
- for key, value in metadata.items():
- if key.startswith(b"pandas_string_dtype_"):
- col_name = key.replace(b"pandas_string_dtype_", b"").decode()
- string_dtypes[col_name] = value.decode()
-
-
-
+ pa_table = self.api.parquet.read_table(
path_or_handle,
columns=columns,
filesystem=filesystem,
filters=filters,
**kwargs,
)
+
+ # Added section: read the pandas_string_dtype_* metadata entries
+ string_dtypes = {}
+ metadata = pa_table.schema.metadata
+ if metadata:
+ for key, value in metadata.items():
+ if key.startswith(b"pandas_string_dtype_"):
+ col_name = key.replace(b"pandas_string_dtype_", b"").decode()
+ string_dtypes[col_name] = value.decode()
+
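+ # Added section: types_mapper function. NOTE(review): it matches on Arrow type, not column name, and the assignment below replaces any caller-supplied types_mapper — confirm intended.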
+ def types_mapper(pa_type):
+ for field in pa_table.schema:
+ if field.type == pa_type:
+ colname = field.name
+ if colname in string_dtypes:
+ return pd.StringDtype(storage=string_dtypes[colname])
+ return None # fallback to default mapper
+
+ if to_pandas_kwargs is None:
+ to_pandas_kwargs = {}
+ to_pandas_kwargs["types_mapper"] = types_mapper
with catch_warnings():
filterwarnings(
"ignore",
From 522ed7f8fbf01d7d77479300118562ea00168184 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCl?= <145806683+GulAkkoca@users.noreply.github.com>
Date: Mon, 9 Jun 2025 10:18:58 +0300
Subject: [PATCH 10/11] Write and read funct. is updated in parquet.py
From bab5572c99c815c220adaa848bf9e21d4ae9c630 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCl?= <145806683+GulAkkoca@users.noreply.github.com>
Date: Mon, 9 Jun 2025 10:43:45 +0300
Subject: [PATCH 11/11] Update test_parquet.py I added test method for parquet
stringdtype
---
pandas/tests/io/test_parquet.py | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 78f39b649cb9a..ce77bfa6f36d9 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -718,6 +718,26 @@ def test_basic_subset_columns(self, pa, df_full):
expected=df[["string", "int"]],
read_kwargs={"columns": ["string", "int"]},
)
+ # Newly added test
+    @pytest.mark.parametrize("string_storage", ["pyarrow", "python"])
+    def test_parquet_stringdtype_roundtrip(self, tmp_path, pa, string_storage):
+        """Check that StringDtype storage survives a pyarrow parquet round trip."""
+        import pandas as pd
+        from pandas.testing import assert_frame_equal
+
+        dtype = pd.StringDtype(storage=string_storage)
+        df = pd.DataFrame({"a": pd.Series(["x", "y", "z"], dtype=dtype)})
+
+        file_path = tmp_path / "stringdtype.parquet"
+        df.to_parquet(file_path, engine="pyarrow")
+        result = pd.read_parquet(file_path, engine="pyarrow")
+
+        assert result["a"].dtype == dtype, (
+            f"Dtype mismatch: got {result['a'].dtype}, expected {dtype}"
+        )
+
+        assert_frame_equal(result, df)
+
def test_to_bytes_without_path_or_buf_provided(self, pa, df_full):
# GH 37105