-
Notifications
You must be signed in to change notification settings - Fork 646
/
io.py
77 lines (67 loc) · 3.42 KB
/
io.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
from numpy import full
import pandas
import time
from modin.engines.base.io import BaseIO
from modin.db_conn import ModinDatabaseConnection
class RayIO(BaseIO):
    """BaseIO subclass providing a partition-wise ``to_sql``."""

    @classmethod
    def to_sql(cls, qc, name, con, schema=None, **kwargs):
        """Write records stored in a DataFrame to a SQL database.

        The table is created first by writing a zero-row pandas frame; afterwards
        each partition appends its own rows with ``pandas.DataFrame.to_sql``.
        The call returns only after every partition has finished.

        Args:
            qc: the query compiler of the DF that we want to run to_sql on
            name: name of the SQL table to write to
            con: connection object forwarded to ``pandas.DataFrame.to_sql``, or a
                ``ModinDatabaseConnection``, in which case ``get_connection()``
                is called to obtain the connection that is actually used
            schema: optional schema name forwarded to ``pandas.DataFrame.to_sql``
            kwargs: parameters for pandas.to_sql(**kwargs)

        Raises:
            Exception: whatever ``pandas.DataFrame.to_sql`` raises. A failed
                partition write is retried once, but only when the error is a
                ``pyodbc.Error``; any other error propagates unchanged.
        """
        # we first insert an empty DF in order to create the full table in the database
        # This also helps to validate the input against pandas
        # we would like to_sql() to complete only when all rows have been inserted into the database
        # since the mapping operation is non-blocking, each partition will return an empty DF
        # so at the end, the blocking operation will be this empty DF to_pandas
        empty_df = qc.getitem_row_array([0]).to_pandas().head(0)
        empty_df_con = con
        if isinstance(empty_df_con, ModinDatabaseConnection):
            empty_df_con = con.get_connection()
        empty_df.to_sql(name, con=empty_df_con, schema=schema, **kwargs)
        # so each partition will append its respective DF
        kwargs["if_exists"] = "append"
        columns = qc.columns

        def func(df):
            """Append one partition to the table and return an empty frame."""
            df.columns = columns
            partition_con = con
            if isinstance(partition_con, ModinDatabaseConnection):
                # NOTE(review): the connection obtained here is never closed
                # explicitly - confirm who owns its lifetime.
                partition_con = con.get_connection()
                if con.identity_insert:
                    full_table_name = name
                    if schema:
                        full_table_name = f"{schema}.{full_table_name}"
                    partition_con.execute(f"SET IDENTITY_INSERT {full_table_name} ON")
            try:
                df.to_sql(name, con=partition_con, schema=schema, **kwargs)
            except Exception as ex:
                # pyodbc is optional. Resolve it in its own try block so that a
                # missing module neither becomes the ``__context__`` of the real
                # database error nor masks an error raised by the retry below.
                try:
                    import pyodbc
                except ImportError:
                    pyodbc = None
                # if we can't verify that this is a pyodbc (possibly deadlock)
                # error, re-raise the original exception with its traceback intact
                if pyodbc is None or not isinstance(ex, pyodbc.Error):
                    raise
                # retry once after 1/2 second
                time.sleep(0.5)
                df.to_sql(name, con=partition_con, schema=schema, **kwargs)
            return pandas.DataFrame()

        result = qc._modin_frame._apply_full_axis(1, func, new_index=[], new_columns=[])
        # blocking operation
        result.to_pandas()