In [10]:
# Read the Snowflake connection settings from a JSON file kept outside the notebook,
# so that no secret is hard-coded in a cell.
import json

with open('snowflake_credentials.json') as f:
    snowflake_creds = json.load(f)

# Mandatory settings: a missing key raises KeyError.
(
    snowflake_user,
    snowflake_password,
    snowflake_account,
    snowflake_warehouse,
    snowflake_database,
    snowflake_schema,
) = (
    snowflake_creds[key]
    for key in ('user', 'password', 'account', 'warehouse', 'database', 'schema')
)

# Optional setting: None when the file has no 'role' entry.
snowflake_role = snowflake_creds.get('role')

In [None]:
# Connect to the PostgreSQL database through SQLAlchemy and read a table into a pandas DataFrame

import pandas as pd
from sqlalchemy import create_engine

# Load PostgreSQL credentials from external file
import json
from urllib.parse import quote

with open('postgres_credentials.json') as f:
    postgres_creds = json.load(f)
postgresql_user = postgres_creds['user']
postgresql_password = postgres_creds['password']
postgresql_host = postgres_creds['host']
postgresql_port = postgres_creds['port']
postgresql_dbname = postgres_creds['dbname']

# URI layout: postgresql+psycopg2://user:password@host:port/dbname
#
# The user name and password are percent-encoded before being interpolated.
# Without this, a '@', ':', '/' or '%' inside either value would be read as
# URL syntax and yield a wrong or unparseable connection string.
# safe='' makes quote() encode '/' as well; str() keeps non-string JSON values working.
escaped_user = quote(str(postgresql_user), safe='')
escaped_password = quote(str(postgresql_password), safe='')
database_uri = (
    f'postgresql+psycopg2://{escaped_user}:{escaped_password}'
    f'@{postgresql_host}:{postgresql_port}/{postgresql_dbname}'
)

# Create the SQLAlchemy engine
engine = create_engine(database_uri)

# Define the sql query.
# NOTE: everything after '--' is a SQL comment, so the LIMIT is disabled and
# the whole table is read. Move the LIMIT in front of the '--' to sample.
query = 'SELECT * FROM migration.sales_data --LIMIT 100;'

# Use Pandas to read the SQL query into a DataFrame (57.8 sec when last timed)
sales_data_df = pd.read_sql(query, engine)

# Display the dataframe (rich notebook rendering of the last expression)
sales_data_df


Unnamed: 0,row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,...,postal_code,region,product_id,category,sub_category,product_name,sales,quantity,discount,profit
0,1,CA-2024-34188,2024-09-29,2025-05-01,Standard Class,CG-69747,Customer 1,Consumer,United States,City 1,...,69140,East,Off-PROD-3302911,Office Supplies,Appliances,Product B,483.03,5,0.03,2342.70
1,2,CA-2024-98883,2024-12-17,2025-05-09,Standard Class,CG-10912,Customer 2,Home Office,United States,City 2,...,83959,West,Off-PROD-4353748,Office Supplies,Chairs,Product D,432.11,2,0.08,795.08
2,3,CA-2024-58450,2024-05-30,2025-08-19,Second Class,CG-87990,Customer 3,Consumer,United States,City 3,...,86038,West,Fur-PROD-1254745,Furniture,Storage,Product D,279.60,9,0.01,2491.24
3,4,CA-2024-22077,2024-08-22,2025-06-20,Second Class,CG-29866,Customer 4,Home Office,United States,City 4,...,56703,East,Off-PROD-9663815,Office Supplies,Phones,Product D,404.71,5,0.08,1861.67
4,5,CA-2024-89826,2024-10-08,2025-11-06,Same Day,CG-56320,Customer 5,Home Office,United States,City 5,...,36707,South,Tec-PROD-7574801,Technology,Tables,Product A,404.20,1,0.05,383.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,CA-2024-78993,2024-12-08,2025-02-15,First Class,CG-85551,Customer 96,Consumer,United States,City 96,...,94824,East,Fur-PROD-4079308,Furniture,Appliances,Product A,275.70,1,0.25,206.77
96,97,CA-2024-62555,2024-09-09,2025-09-02,Standard Class,CG-44693,Customer 97,Home Office,United States,City 97,...,52468,South,Off-PROD-2113298,Office Supplies,Paper,Product A,675.21,9,0.23,4679.21
97,98,CA-2024-51246,2024-10-21,2025-07-08,Same Day,CG-96305,Customer 98,Home Office,United States,City 98,...,71965,South,Fur-PROD-5403188,Furniture,Phones,Product C,276.93,2,0.08,509.55
98,99,CA-2024-12729,2024-08-26,2025-04-20,Second Class,CG-59387,Customer 99,Home Office,United States,City 99,...,91736,Central,Off-PROD-9831736,Office Supplies,Bookcases,Product C,500.27,7,0.15,2976.61


In [4]:
# Derive "shipping_duration": the number of whole days between order_date and ship_date.

## Parse both date columns as datetimes (the source frame is left untouched)
order_ts = pd.to_datetime(sales_data_df['order_date'])
ship_ts = pd.to_datetime(sales_data_df['ship_date'])

## Build a new dataframe: the two date columns are replaced in place by their
## datetime versions and shipping_duration is appended as the last column
updated_sales_data_df = sales_data_df.assign(
    order_date=order_ts,
    ship_date=ship_ts,
    shipping_duration=(ship_ts - order_ts).dt.days,
)

## Display the updated dataframe (see the last column)
updated_sales_data_df

## Optional: write the updated dataframe back to Postgres as `updated_sales_data` (took ~9 mins)
#updated_sales_data_df.to_sql('updated_sales_data', con=engine, schema='migration', if_exists='replace', index=False)

Unnamed: 0,row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,...,region,product_id,category,sub_category,product_name,sales,quantity,discount,profit,shipping_duration
0,1,CA-2024-34188,2024-09-29,2025-05-01,Standard Class,CG-69747,Customer 1,Consumer,United States,City 1,...,East,Off-PROD-3302911,Office Supplies,Appliances,Product B,483.03,5,0.03,2342.70,214
1,2,CA-2024-98883,2024-12-17,2025-05-09,Standard Class,CG-10912,Customer 2,Home Office,United States,City 2,...,West,Off-PROD-4353748,Office Supplies,Chairs,Product D,432.11,2,0.08,795.08,143
2,3,CA-2024-58450,2024-05-30,2025-08-19,Second Class,CG-87990,Customer 3,Consumer,United States,City 3,...,West,Fur-PROD-1254745,Furniture,Storage,Product D,279.60,9,0.01,2491.24,446
3,4,CA-2024-22077,2024-08-22,2025-06-20,Second Class,CG-29866,Customer 4,Home Office,United States,City 4,...,East,Off-PROD-9663815,Office Supplies,Phones,Product D,404.71,5,0.08,1861.67,302
4,5,CA-2024-89826,2024-10-08,2025-11-06,Same Day,CG-56320,Customer 5,Home Office,United States,City 5,...,South,Tec-PROD-7574801,Technology,Tables,Product A,404.20,1,0.05,383.99,394
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,CA-2024-78993,2024-12-08,2025-02-15,First Class,CG-85551,Customer 96,Consumer,United States,City 96,...,East,Fur-PROD-4079308,Furniture,Appliances,Product A,275.70,1,0.25,206.77,69
96,97,CA-2024-62555,2024-09-09,2025-09-02,Standard Class,CG-44693,Customer 97,Home Office,United States,City 97,...,South,Off-PROD-2113298,Office Supplies,Paper,Product A,675.21,9,0.23,4679.21,358
97,98,CA-2024-51246,2024-10-21,2025-07-08,Same Day,CG-96305,Customer 98,Home Office,United States,City 98,...,South,Fur-PROD-5403188,Furniture,Phones,Product C,276.93,2,0.08,509.55,260
98,99,CA-2024-12729,2024-08-26,2025-04-20,Second Class,CG-59387,Customer 99,Home Office,United States,City 99,...,Central,Off-PROD-9831736,Office Supplies,Bookcases,Product C,500.27,7,0.15,2976.61,237


In [8]:
# Connect to Snowflake with the Snowflake connector and read a query result into a pandas DataFrame.
# Relies on the snowflake_* variables set by the credentials cell.

import snowflake.connector 
from snowflake.connector.pandas_tools import write_pandas

conn = snowflake.connector.connect(
    user=snowflake_user,
    password=snowflake_password,
    account=snowflake_account,
    warehouse=snowflake_warehouse,
    database=snowflake_database,
    schema=snowflake_schema,
    role=snowflake_role,
)

# Statement that generates the result set.
sql = "select * from sales_data limit 100"

# The cursor and the connection are always released, even when the query
# fails, so that no Snowflake session is left open by this cell.
try:
    # Create a cursor object.
    cur = conn.cursor()
    try:
        cur.execute(sql)

        # Fetch the result set from the cursor and deliver it as a pandas DataFrame.
        df = cur.fetch_pandas_all()
    finally:
        cur.close()
finally:
    conn.close()

df

Unnamed: 0,ROW_ID,ORDER_ID,ORDER_DATE,SHIP_DATE,SHIP_MODE,CUSTOMER_ID,CUSTOMER_NAME,SEGMENT,COUNTRY,CITY,...,REGION,PRODUCT_ID,CATEGORY,SUB_CATEGORY,PRODUCT_NAME,SALES,QUANTITY,DISCOUNT,PROFIT,SHIPPING_DURATION
0,1,CA-2024-34188,2024-09-29,2025-05-01,Standard Class,CG-69747,Customer 1,Consumer,United States,City 1,...,East,Off-PROD-3302911,Office Supplies,Appliances,Product B,483.03,5,0.03,2342.70,
1,2,CA-2024-98883,2024-12-17,2025-05-09,Standard Class,CG-10912,Customer 2,Home Office,United States,City 2,...,West,Off-PROD-4353748,Office Supplies,Chairs,Product D,432.11,2,0.08,795.08,
2,3,CA-2024-58450,2024-05-30,2025-08-19,Second Class,CG-87990,Customer 3,Consumer,United States,City 3,...,West,Fur-PROD-1254745,Furniture,Storage,Product D,279.60,9,0.01,2491.24,
3,4,CA-2024-22077,2024-08-22,2025-06-20,Second Class,CG-29866,Customer 4,Home Office,United States,City 4,...,East,Off-PROD-9663815,Office Supplies,Phones,Product D,404.71,5,0.08,1861.67,
4,5,CA-2024-89826,2024-10-08,2025-11-06,Same Day,CG-56320,Customer 5,Home Office,United States,City 5,...,South,Tec-PROD-7574801,Technology,Tables,Product A,404.20,1,0.05,383.99,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,CA-2024-78993,2024-12-08,2025-02-15,First Class,CG-85551,Customer 96,Consumer,United States,City 96,...,East,Fur-PROD-4079308,Furniture,Appliances,Product A,275.70,1,0.25,206.77,
96,97,CA-2024-62555,2024-09-09,2025-09-02,Standard Class,CG-44693,Customer 97,Home Office,United States,City 97,...,South,Off-PROD-2113298,Office Supplies,Paper,Product A,675.21,9,0.23,4679.21,
97,98,CA-2024-51246,2024-10-21,2025-07-08,Same Day,CG-96305,Customer 98,Home Office,United States,City 98,...,South,Fur-PROD-5403188,Furniture,Phones,Product C,276.93,2,0.08,509.55,
98,99,CA-2024-12729,2024-08-26,2025-04-20,Second Class,CG-59387,Customer 99,Home Office,United States,City 99,...,Central,Off-PROD-9831736,Office Supplies,Bookcases,Product C,500.27,7,0.15,2976.61,


In [12]:
# Migrate data from a PostgreSQL table to Snowflake, using a pandas DataFrame as intermediary.
# Relies on the snowflake_* variables set by the credentials cell.

import pandas as pd
from sqlalchemy import create_engine
import snowflake.connector 
from snowflake.connector.pandas_tools import write_pandas

import json
from urllib.parse import quote

with open('postgres_credentials.json') as f:
    postgres_creds = json.load(f)
postgresql_user = postgres_creds['user']
postgresql_password = postgres_creds['password']
postgresql_host = postgres_creds['host']
postgresql_port = postgres_creds['port']
postgresql_dbname = postgres_creds['dbname']

# Create connection string for PostgreSQL.
# The user name and password are percent-encoded first. Without this, a '@',
# ':', '/' or '%' inside either value would be read as URL syntax and yield a
# wrong or unparseable connection string.
# safe='' makes quote() encode '/' as well; str() keeps non-string JSON values working.
escaped_user = quote(str(postgresql_user), safe='')
escaped_password = quote(str(postgresql_password), safe='')
database_uri = (
    f'postgresql+psycopg2://{escaped_user}:{escaped_password}'
    f'@{postgresql_host}:{postgresql_port}/{postgresql_dbname}'
)

# Create the SQLAlchemy engine
engine = create_engine(database_uri)

# Define the sql query.
# NOTE: everything after '--' is a SQL comment, so the LIMIT is disabled and
# the whole table is read into memory.
query = 'SELECT * FROM migration.sales_data -- LIMIT 3000000;'

# Use Pandas to read the SQL query into a DataFrame
sales_data_df = pd.read_sql(query, engine)
print("# of rows in sales_data_df = ", len(sales_data_df))

# Configure connection to Snowflake
snowflake_conn = snowflake.connector.connect(
    user=snowflake_user,
    password=snowflake_password,
    account=snowflake_account,
    warehouse=snowflake_warehouse,
    database=snowflake_database,
    schema=snowflake_schema,
    role=snowflake_role,
)

# Upload the DataFrame to the SALES_DATA4 table. The connection is closed even
# when the upload fails, so that no Snowflake session is left open by this cell.
try:
    # write_pandas returns four values; the fourth is not needed here.
    success, nchunks, nrows, _ = write_pandas(snowflake_conn, sales_data_df, 'SALES_DATA4',  auto_create_table=True)
    print(f'success = {success}, nchunks = {nchunks}, nrows = {nrows}')
finally:
    snowflake_conn.close()

# of rows in sales_data_df =  3000000
success = True, nchunks = 1, nrows = 3000000
