# ETL Using Python
Important points to note here regarding authentication access to the database drivers.

In [None]:
#import needed libraries
from sqlalchemy import create_engine
import pyodbc
import pandas as pd
import os


'sqlalchemy' is the module used to interact with PostgreSQL. 'pyodbc' is the module used to query SQL Server. 'pandas' is the module used to perform the data extraction / loading. 'os' is the module used to retrieve the username and password, which in this case are stored separately under 'System -> Environment Variables -> System Variables (lower window section)'.

The user's credentials could alternatively be stored in a .sh, .ps or .xml file.

Grab the username and password from the environment variables.

In [None]:
#get username and password from environment vars (KeyError if either is unset)
pwd = os.environ['PGPASS']
uid = os.environ['PGUID']
#sql db details
driver = "{SQL Server Native Client 11.0}"
server = "haq-PC"
#bare database name only: extract() inserts the ';' separators itself when it
#builds the connection string, so a trailing ';' here would yield an empty ';;' entry
database = "AdventureWorksDW2019"


In [None]:
#extract data from sql server
def extract():
    """Copy the selected SQL Server tables into the staging database.

    Connects with pyodbc to the ``SQLEXPRESS`` instance on ``server``, asks
    ``sys.tables`` which of the wanted tables exist, reads each of them fully
    into a DataFrame and passes it, together with the table name, to ``load``.

    Errors raised while connecting, querying or reading are printed rather
    than propagated. The connection is closed afterwards if it was opened.
    """
    src_conn = None  # lets `finally` tell whether a connection was ever opened
    try:
        conn_str = (
            'DRIVER=' + driver
            + ';SERVER=' + server + '\\SQLEXPRESS'
            # tolerate a database value that already ends in ';' so the
            # connection string never contains an empty ';;' attribute
            + ';DATABASE=' + database.rstrip(';')
            + ';UID=' + uid
            + ';PWD=' + pwd
        )
        src_conn = pyodbc.connect(conn_str)
        src_cursor = src_conn.cursor()
        # list the wanted tables that actually exist in the source database
        src_cursor.execute(""" select  t.name as table_name
        from sys.tables t where t.name in ('DimProduct','DimProductSubcategory','DimProductCategory','DimSalesTerritory','FactInternetSales') """)
        src_tables = src_cursor.fetchall()
        for tbl in src_tables:
            # tbl[0] is the table_name column; it comes from sys.tables, not user input
            df = pd.read_sql_query(f'select * FROM {tbl[0]}', src_conn)
            load(df, tbl[0])
    except Exception as e:
        print("Data extract error: " + str(e))
    finally:
        # only close what was opened; a failed connect must not raise a second
        # error here and hide the one already reported above
        if src_conn is not None:
            src_conn.close()


## Transform
This is the phase where I need to check for missing values and generally clean the data by exploring the type of information present and determining the scale or units of measurement to produce a snapshot which can be used in production.

In [None]:
#load data to postgres
def load(df, tbl):
    """Write ``df`` to the PostgreSQL staging table ``stg_<tbl>``.

    Parameters:
        df: DataFrame holding the rows to store.
        tbl: source table name; the target table is ``stg_`` + this name and
            is replaced if it already exists.

    Errors are printed rather than propagated. The engine created for this
    call is disposed afterwards so its pooled connections are released.
    """
    from urllib.parse import quote

    engine = None  # lets `finally` tell whether an engine was ever created
    try:
        # percent-encode the credentials so characters such as '@', '/' or ':'
        # in a password cannot be misread as URL delimiters
        user = quote(uid, safe='')
        secret = quote(pwd, safe='')
        engine = create_engine(f'postgresql://{user}:{secret}@{server}:5432/AdventureWorks')
        print(f'importing rows 0 to {len(df)}... for table {tbl}')
        # save df to postgres
        df.to_sql(f'stg_{tbl}', engine, if_exists='replace', index=False)
        # TODO: add elapsed time to final print out
        print("Data imported successful")
    except Exception as e:
        print("Data load error: " + str(e))
    finally:
        if engine is not None:
            engine.dispose()

# Run the pipeline. extract() reports most failures itself; this guard only
# prints whatever still escapes it so the cell does not end in a traceback.
try:
    extract()
except Exception as err:
    print("Error while extracting data: " + str(err))

May need to provide a list of dependencies.