In [1]:
# import necessary modules
from datetime import datetime
import pandas as pd
import db_connection
# bind the connection handle, database name and raw table name exposed by db_connection
conn, db_name, raw_table = (
    db_connection.db_conn,
    db_connection.database_name,
    db_connection.table_name,
)

Database connected successfully


In [2]:
# record when the job starts and announce it
# (start_time stays in the notebook namespace for any later cell that reads it)
start_time = datetime.now()
print(f'Data Load job started at {start_time}')

Data Load job started at at 2023-05-23 19:40:26.826438


In [3]:
# load the combined report from the reports folder (staging environment)
# and keep only the first row seen for each BatchTransactionID
df_raw = (
    pd.read_csv('Reports/Combined_pcard_expenditures.csv')
    .drop_duplicates(subset=['BatchTransactionID'])
)
print(df_raw.shape)
df_raw.head()

(24322, 17)


Unnamed: 0,Division,BatchTransactionID,TransactionDate,CardPostingDt,MerchantName,TransactionAmt,TrxCurrency,OriginalAmount,OriginalCurrency,GLAccount,GLAccountDescription,CostCenterWBSElementOrder,CostCenterWBSElementOrderDescription,MerchantType,MerchantTypeDescription,Purpose,SourceFileName
0,PUBLIC HEALTH,4608-1,2017-06-15,2017-06-16,PAYPAL *OBC2012,50.0,CAD,50.0,CAD,4760,MEMBERSHIP FEES,PH3071,MATERNAL INFANT HEALTH PROGRAM SUPPORT,8641.0,"Associations - Civic, Social, and Frater",MEMBERSHIP FEE,PCard Expenses_201706.xlsx
1,PUBLIC HEALTH,4617-1,2017-06-21,2017-06-23,CHNC,423.75,CAD,423.75,CAD,4256,CONFERENCES/SEMINARS - REGISTRATION FEES,PH3071,MATERNAL INFANT HEALTH PROGRAM SUPPORT,8641.0,"Associations - Civic, Social, and Frater",CHNC CONFERENCE,PCard Expenses_201706.xlsx
2,PUBLIC HEALTH,4621-1,2017-06-26,2017-06-27,POST MD-CPD-UOFT,2601.0,CAD,2601.0,CAD,4256,CONFERENCES/SEMINARS - REGISTRATION FEES,PH3071,MATERNAL INFANT HEALTH PROGRAM SUPPORT,8220.0,"Colleges, Universities, Professional Sch",UOFT CONFERENCE,PCard Expenses_201706.xlsx
3,PUBLIC HEALTH,4626-1,2017-06-28,2017-06-29,EVENTBRITE/PERINATALMO,480.66,CAD,480.66,CAD,4256,CONFERENCES/SEMINARS - REGISTRATION FEES,PH3071,MATERNAL INFANT HEALTH PROGRAM SUPPORT,7922.0,Theatrical Producers (except Motion Pict,REGISTRATION FEES,PCard Expenses_201706.xlsx
4,ECONOMIC DEVELOPMENT & CULTURE,4590-1,2017-06-02,2017-06-05,LEE VALLEY - DOWNTOWN,38.31,CAD,38.31,CAD,2600,RECREATIONAL & EDUCATIONAL SUPPLIES,AH0073,HS-MH-EDUCATION/OUTRCH-FORT YORK,5072.0,Hardware Equipment and Supplies,LEMON RASPS,PCard Expenses_201706.xlsx


In [4]:
# show the column labels of the loaded frame
df_raw.columns

Index(['Division', 'BatchTransactionID', 'TransactionDate', 'CardPostingDt',
       'MerchantName', 'TransactionAmt', 'TrxCurrency', 'OriginalAmount',
       'OriginalCurrency', 'GLAccount', 'GLAccountDescription',
       'CostCenterWBSElementOrder', 'CostCenterWBSElementOrderDescription',
       'MerchantType', 'MerchantTypeDescription', 'Purpose', 'SourceFileName'],
      dtype='object')

In [5]:
# we now want to save our data in the database
# every record becomes a plain tuple so that all of them can be saved at once
#
# Columns are picked by name, in the order the raw-table INSERT lists them,
# instead of by position (row[0] ... row[16]): positional lookups on a
# label-indexed Series are deprecated in recent pandas, and they silently
# mis-map values if the CSV column order ever changes.
raw_columns = [
    'Division', 'BatchTransactionID', 'TransactionDate', 'CardPostingDt',
    'MerchantName', 'TransactionAmt', 'TrxCurrency', 'OriginalAmount',
    'OriginalCurrency', 'GLAccount', 'GLAccountDescription',
    'CostCenterWBSElementOrder', 'CostCenterWBSElementOrderDescription',
    'MerchantType', 'MerchantTypeDescription', 'Purpose', 'SourceFileName',
]

# itertuples(index=False, name=None) yields one plain tuple per row without
# building an intermediate Series for each row the way iterrows does
# NOTE(review): missing values are passed through as NaN, not None — confirm
# that this is what the database columns should receive
raw_rows = list(df_raw[raw_columns].itertuples(index=False, name=None))
raw_rows[:2]

[('PUBLIC HEALTH',
  '4608-1',
  '2017-06-15',
  '2017-06-16',
  'PAYPAL *OBC2012',
  50.0,
  'CAD',
  50.0,
  'CAD',
  '4760',
  'MEMBERSHIP FEES',
  'PH3071',
  'MATERNAL INFANT HEALTH PROGRAM SUPPORT',
  8641.0,
  'Associations - Civic, Social, and Frater',
  'MEMBERSHIP FEE',
  'PCard Expenses_201706.xlsx'),
 ('PUBLIC HEALTH',
  '4617-1',
  '2017-06-21',
  '2017-06-23',
  'CHNC',
  423.75,
  'CAD',
  423.75,
  'CAD',
  '4256',
  'CONFERENCES/SEMINARS - REGISTRATION FEES',
  'PH3071',
  'MATERNAL INFANT HEALTH PROGRAM SUPPORT',
  8641.0,
  'Associations - Civic, Social, and Frater',
  'CHNC CONFERENCE',
  'PCard Expenses_201706.xlsx')]

In [6]:
# save the raw data: replace the contents of the raw table with raw_rows
#
# The truncate and the insert are meant to succeed or fail together, so
# autocommit is switched OFF and a single commit is issued at the end.
# With autocommit on, the TRUNCATE is committed immediately and a failing
# insert leaves the table empty.
# NOTE(review): this relies on the backend treating TRUNCATE as transactional
# (PostgreSQL does) — confirm for the database in use.
# The connection is left with autocommit off; a later cell that needs
# autocommit has to enable it itself.
cursor = None
try:
    # disable autocommit
    conn.autocommit = False

    # Create a cursor object
    cursor = conn.cursor()

    # step 1: truncate the raw data table
    sql = f'TRUNCATE TABLE {raw_table}'
    cursor.execute(sql)

    # step 2: save the new data
    sql = f"""
        INSERT INTO {raw_table} ("Division","BatchTransactionID","TransactionDate","CardPostingDt",
        "MerchantName","TransactionAmt","TrxCurrency","OriginalAmount",
        "OriginalCurrency","GLAccount","GLAccountDescription",
        "CostCenterWBSElementOrder","CostCenterWBSElementOrderDescription",
        "MerchantType","MerchantTypeDescription","Purpose","SourceFileName"
        )
        VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    cursor.executemany(sql,raw_rows)

    # make the truncate and the insert visible together
    conn.commit()

    print(f'{len(df_raw)} rows successfully saved in {db_name}.{raw_table}')

except Exception as e:
    # undo whatever part of the truncate/insert already ran
    try:
        conn.rollback()
    except Exception as rollback_error:
        print(f'Rollback failed: {rollback_error}')
    print(f'An error occurred while inserting records: {e}')

finally:
    # Close the cursor on success and on failure alike
    if cursor is not None:
        cursor.close()

24322 rows successfully saved in pcard_expenditures.raw_data_transactions


In [7]:
# preview the first rows of the loaded frame
df_raw.head()

Unnamed: 0,Division,BatchTransactionID,TransactionDate,CardPostingDt,MerchantName,TransactionAmt,TrxCurrency,OriginalAmount,OriginalCurrency,GLAccount,GLAccountDescription,CostCenterWBSElementOrder,CostCenterWBSElementOrderDescription,MerchantType,MerchantTypeDescription,Purpose,SourceFileName
0,PUBLIC HEALTH,4608-1,2017-06-15,2017-06-16,PAYPAL *OBC2012,50.0,CAD,50.0,CAD,4760,MEMBERSHIP FEES,PH3071,MATERNAL INFANT HEALTH PROGRAM SUPPORT,8641.0,"Associations - Civic, Social, and Frater",MEMBERSHIP FEE,PCard Expenses_201706.xlsx
1,PUBLIC HEALTH,4617-1,2017-06-21,2017-06-23,CHNC,423.75,CAD,423.75,CAD,4256,CONFERENCES/SEMINARS - REGISTRATION FEES,PH3071,MATERNAL INFANT HEALTH PROGRAM SUPPORT,8641.0,"Associations - Civic, Social, and Frater",CHNC CONFERENCE,PCard Expenses_201706.xlsx
2,PUBLIC HEALTH,4621-1,2017-06-26,2017-06-27,POST MD-CPD-UOFT,2601.0,CAD,2601.0,CAD,4256,CONFERENCES/SEMINARS - REGISTRATION FEES,PH3071,MATERNAL INFANT HEALTH PROGRAM SUPPORT,8220.0,"Colleges, Universities, Professional Sch",UOFT CONFERENCE,PCard Expenses_201706.xlsx
3,PUBLIC HEALTH,4626-1,2017-06-28,2017-06-29,EVENTBRITE/PERINATALMO,480.66,CAD,480.66,CAD,4256,CONFERENCES/SEMINARS - REGISTRATION FEES,PH3071,MATERNAL INFANT HEALTH PROGRAM SUPPORT,7922.0,Theatrical Producers (except Motion Pict,REGISTRATION FEES,PCard Expenses_201706.xlsx
4,ECONOMIC DEVELOPMENT & CULTURE,4590-1,2017-06-02,2017-06-05,LEE VALLEY - DOWNTOWN,38.31,CAD,38.31,CAD,2600,RECREATIONAL & EDUCATIONAL SUPPLIES,AH0073,HS-MH-EDUCATION/OUTRCH-FORT YORK,5072.0,Hardware Equipment and Supplies,LEMON RASPS,PCard Expenses_201706.xlsx


In [8]:
# list the available columns before splitting the frame into tables
df_raw.columns

Index(['Division', 'BatchTransactionID', 'TransactionDate', 'CardPostingDt',
       'MerchantName', 'TransactionAmt', 'TrxCurrency', 'OriginalAmount',
       'OriginalCurrency', 'GLAccount', 'GLAccountDescription',
       'CostCenterWBSElementOrder', 'CostCenterWBSElementOrderDescription',
       'MerchantType', 'MerchantTypeDescription', 'Purpose', 'SourceFileName'],
      dtype='object')

#### Infer tables

In [33]:
# transactions: one row per distinct combination of the transaction-level columns
# (group on the keys, count, then throw the helper 'Count' column away)
transactions = (
    df_raw
    .groupby([
        'BatchTransactionID', 'TransactionDate', 'CardPostingDt',
        'TransactionAmt', 'TrxCurrency', 'OriginalAmount', 'OriginalCurrency',
        'GLAccount', 'CostCenterWBSElementOrder', 'MerchantName',
        'Division', 'Purpose',
    ])
    .size()
    .reset_index(name='Count')
    .drop(columns='Count')
)
print(transactions.shape)
transactions.head()

(24322, 12)


Unnamed: 0,BatchTransactionID,TransactionDate,CardPostingDt,TransactionAmt,TrxCurrency,OriginalAmount,OriginalCurrency,GLAccount,CostCenterWBSElementOrder,MerchantName,Division,Purpose
0,4582-1,2017-05-31,2017-06-01,18.13,CAD,18.13,CAD,2750,P09913,METRO #442,"PARKS, FORESTRY & RECREATION","PEPPERS, TOMATOES, AIR FRESHENER"
1,4582-10,2017-05-31,2017-06-01,69.93,CAD,69.93,CAD,2750,P13580,NOFRILLS ROSS 782,"PARKS, FORESTRY & RECREATION",GROCERIES FOR PRGM
2,4582-100,2017-05-31,2017-06-01,13.86,CAD,13.86,CAD,2099,C01205,REAL CANADIAN WHOLESAL,EMPLOYMENT & SOCIAL SERVICES,SNACKS-OMMUNITY OF PRACTICE MEETING MAY31/17
3,4582-101,2017-05-31,2017-06-01,55.6,CAD,55.6,CAD,4820,C01281,SOBEYS 657 QPS,EMPLOYMENT & SOCIAL SERVICES,STAFF MEETING-WELLNESS KICKOFF EVENT JUN 2/17
4,4582-102,2017-05-31,2017-06-01,179.12,CAD,179.12,CAD,4414,ED0139,FACEBK *3CA2CCS2E2,ECONOMIC DEVELOPMENT & CULTURE,FACEBOOK ADS


In [35]:
# merchants: distinct (MerchantName, MerchantType) pairs
# NOTE(review): groupby drops rows whose key is missing by default, so a
# merchant without a MerchantType would not appear here — confirm that is intended
merchants = (
    df_raw
    .groupby(['MerchantName', 'MerchantType'])
    .size()
    .reset_index(name='Count')
    .drop(columns='Count')
)
print(merchants.shape)
merchants.head()

(4252, 2)


Unnamed: 0,MerchantName,MerchantType
0,"""77"" AUTO PARTS",5533.0
1,#183 MARK'S,5651.0
2,#187 MARK'S,5651.0
3,#241 SPORT CHEK,5941.0
4,#263 SPORT CHEK,5941.0


In [36]:
# merchant types: distinct (MerchantType, MerchantTypeDescription) pairs
merchant_types = (
    df_raw
    .groupby(['MerchantType', 'MerchantTypeDescription'])
    .size()
    .reset_index(name='Count')
    .drop(columns='Count')
)
print(merchant_types.shape)
merchant_types.head()

(239, 2)


Unnamed: 0,MerchantType,MerchantTypeDescription
0,742.0,Veterinary Services
1,763.0,Agricultural Cooperatives
2,780.0,Horticultural and Landscaping Services
3,1520.0,General Contractors - Residential and Co
4,1711.0,"Air Conditioning, Heating, and Plumbing"


In [38]:
# gl_accounts: distinct (GLAccount, GLAccountDescription) pairs
gl_accounts = (
    df_raw
    .groupby(['GLAccount', 'GLAccountDescription'])
    .size()
    .reset_index(name='Count')
    .drop(columns='Count')
)
print(gl_accounts.shape)
gl_accounts.head()

(259, 2)


Unnamed: 0,GLAccount,GLAccountDescription
0,*****,BLDG REPAIRS & RENOVATION SUPPLIES
1,*****,BUSINESS MEETING EXPENSES
2,*****,BUSINESS TRAV - ACCOMMODATION
3,*****,BUSINESS TRAV - OTHER EXPENSES
4,*****,CANVAS & CORDAGE


In [39]:
# cost_centre: distinct (CostCenterWBSElementOrder, description) pairs
cost_centre = (
    df_raw
    .groupby(['CostCenterWBSElementOrder', 'CostCenterWBSElementOrderDescription'])
    .size()
    .reset_index(name='Count')
    .drop(columns='Count')
)
print(cost_centre.shape)
cost_centre.head()

(2485, 2)


Unnamed: 0,CostCenterWBSElementOrder,CostCenterWBSElementOrderDescription
0,*****,"MULTIPLE COST CENTRES, WBS ELEMENTS OR WORK OR..."
1,61251855,BLDG REPAIRS & RENOVATIONS - VARIOUS LOCATIONS
2,61251855,"MULTIPLE COST CENTRES, WBS ELEMENTS OR WORK OR..."
3,61307473,BLDG REPAIRS & RENOVATIONS - VARIOUS LOCATIONS
4,61307473,"MULTIPLE COST CENTRES, WBS ELEMENTS OR WORK OR..."


In [37]:
# re-check the source columns before writing the inferred tables
df_raw.columns

Index(['Division', 'BatchTransactionID', 'TransactionDate', 'CardPostingDt',
       'MerchantName', 'TransactionAmt', 'TrxCurrency', 'OriginalAmount',
       'OriginalCurrency', 'GLAccount', 'GLAccountDescription',
       'CostCenterWBSElementOrder', 'CostCenterWBSElementOrderDescription',
       'MerchantType', 'MerchantTypeDescription', 'Purpose', 'SourceFileName'],
      dtype='object')

In [None]:
# divisions table
try:
    # disable autocommit
    conn.autocommit = True