In [1]:
# import necessary modules
import pandas as pd
import requests
import zipfile
import os

In [2]:
# Set display option to show all columns
pd.set_option('display.max_columns', None)

In [3]:
# create a folder to hold the data
data_folder = 'Data'
# Check if the folder exists
if not os.path.exists(data_folder):
    # Create the folder
    os.makedirs(data_folder)
    print(f'{data_folder} folder created successfully')
else:
    print(f'{data_folder} folder already exists')

Data folder already exists


In [4]:
# url of the API
base_url = "https://ckan0.cf.opendata.inter.prod-toronto.ca"

# retrieve the metadata for pcard-expenditures package and its resources
url = base_url + "/api/3/action/package_show"
params = { "id": "pcard-expenditures"}
package = requests.get(url, params = params, verify=False).json()



In [5]:
package

{'help': 'https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/help_show?name=package_show',
 'success': True,
 'result': {'author': 'pcard@toronto.ca',
  'author_email': 'pcard@toronto.ca',
  'civic_issues': 'Fiscal responsibility',
  'creator_user_id': '329e1506-b545-4fc7-a4ea-e614f220eea7',
  'dataset_category': 'Document',
  'date_published': '2019-07-23 17:52:47.150105',
  'excerpt': 'The dataset contains details of all purchases made by City staff members using City-issued credit cards also referred to as PCards or purchasing cards.   ',
  'formats': 'ZIP,XLS',
  'id': 'ebc3f9c2-2f80-4405-bf4f-5fb309581485',
  'information_url': 'http://www.toronto.ca/finance/index.htm',
  'is_retired': 'false',
  'isopen': False,
  'last_refreshed': '2019-09-03 19:44:00',
  'license_id': 'open-government-licence-toronto',
  'license_title': 'open-government-licence-toronto',
  'maintainer': None,
  'maintainer_email': 'pcard@toronto.ca',
  'metadata_created': '2022-03-10T19:34:25.429521'

In [6]:
package.keys()

dict_keys(['help', 'success', 'result'])

In [7]:
# get the urls with data
data_url = ''
for r in package['result']['resources']:
    print(r['format'],r['url'])
    if r['format'] == 'ZIP':
        data_url = r['url']

print(f'Link for data {data_url}')

XLS https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/ebc3f9c2-2f80-4405-bf4f-5fb309581485/resource/070bdbd3-9bae-4269-b096-e3a8bd7460c8/download/pcard_expenditures_readme.xls
ZIP https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/ebc3f9c2-2f80-4405-bf4f-5fb309581485/resource/d83a5249-fb07-4c38-9145-9e12a32ce1d4/download/expenditures.zip
Link for data https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/ebc3f9c2-2f80-4405-bf4f-5fb309581485/resource/d83a5249-fb07-4c38-9145-9e12a32ce1d4/download/expenditures.zip


In [8]:
# Send a GET request to the URL
data_response = requests.get(data_url, verify=False)



In [9]:
# Save the content of the response to a file
downloaded_zip = 'expenditures.zip'
with open(downloaded_zip, "wb") as file:
    file.write(data_response.content)

In [10]:
# we want to extract the contents of the downloaded zip folder
# Open the ZIP file
with zipfile.ZipFile(downloaded_zip,'r') as zip_obj:
    # extract the data into the data folder we created
    zip_obj.extractall(data_folder)

In [61]:
# create a function to clean column names
def clean_column_names(columns):
    # replace / with nothing
    cleaned_columns = columns.str.replace('/','')
    # remove double spaces
    cleaned_columns = cleaned_columns.str.replace(' ','')
    # remove full stops
    cleaned_columns = cleaned_columns.str.replace('.','')
    # remove hyphens
    cleaned_columns = cleaned_columns.str.replace('-','')
    # strip spaces from column names
    cleaned_columns = cleaned_columns.str.strip()
    # standardize the column names (especially centre and center)
    cleaned_columns = cleaned_columns.str.replace('Centre','Center')
    # standardize (GL Account and Expense Type Descriptions)
    cleaned_columns = cleaned_columns.str.replace('ExpTypeDesc','GLAccountDescription')
    
    return cleaned_columns

In [64]:
# import data from the extracted files above

# an empty list that will hold our data
li_df = []
# list of files we extracted
li_files = os.listdir(data_folder)
print(f'we shall import data from {len(li_files)} files')

# initialize a counter
c = 0

for file in li_files:
    # import the data from the files
    df = pd.read_excel(data_folder + '\\' + file)
    
    # clean column names     
    df.columns = clean_column_names(df.columns)
    
    # drop nulls
    df.dropna(how='any',inplace=True)
    
    # add a column to idenfity which file we extracted the data from
    df['SourceFileName'] = file
    
    # add the data to our list of dataframes
    li_df.append(df)
    print(f'{c+1}. data from {file} imported successfully. Rows: {df.shape[0]}. Columns: {df.shape[1]}')
    c += 1
    
    # +++ COMMENT OUT THE FOLLOWING IN PRODUCTION +++ 
    if c == 5:
        break
    # +++ END OF COMMENT +++ 

we shall import data from 103 files


  cleaned_columns = cleaned_columns.str.replace('.','')


1. data from PCard Expenses_201706.xlsx imported successfully. Rows: 5259. Columns: 17
2. data from PCard Expenses_201707.xlsx imported successfully. Rows: 5617. Columns: 17
3. data from PCard Expenses_201708.xlsx imported successfully. Rows: 4761. Columns: 17
4. data from PCard Expenses_201709.xlsx imported successfully. Rows: 3895. Columns: 17
5. data from PCard Expenses_201710.xlsx imported successfully. Rows: 4790. Columns: 17


In [65]:
# combine the imported data frames into one
combined_df = pd.concat(li_df)
print(combined_df.shape)
combined_df

(24322, 17)


Unnamed: 0,Division,BatchTransactionID,TransactionDate,CardPostingDt,MerchantName,TransactionAmt,TrxCurrency,OriginalAmount,OriginalCurrency,GLAccount,GLAccountDescription,CostCenterWBSElementOrder,CostCenterWBSElementOrderDescription,MerchantType,MerchantTypeDescription,Purpose,SourceFileName
0,PUBLIC HEALTH,4608-1,2017-06-15,2017-06-16,PAYPAL *OBC2012,50.00,CAD,50.00,CAD,4760,MEMBERSHIP FEES,PH3071,MATERNAL INFANT HEALTH PROGRAM SUPPORT,8641.0,"Associations - Civic, Social, and Frater",MEMBERSHIP FEE,PCard Expenses_201706.xlsx
1,PUBLIC HEALTH,4617-1,2017-06-21,2017-06-23,CHNC,423.75,CAD,423.75,CAD,4256,CONFERENCES/SEMINARS - REGISTRATION FEES,PH3071,MATERNAL INFANT HEALTH PROGRAM SUPPORT,8641.0,"Associations - Civic, Social, and Frater",CHNC CONFERENCE,PCard Expenses_201706.xlsx
2,PUBLIC HEALTH,4621-1,2017-06-26,2017-06-27,POST MD-CPD-UOFT,2601.00,CAD,2601.00,CAD,4256,CONFERENCES/SEMINARS - REGISTRATION FEES,PH3071,MATERNAL INFANT HEALTH PROGRAM SUPPORT,8220.0,"Colleges, Universities, Professional Sch",UOFT CONFERENCE,PCard Expenses_201706.xlsx
3,PUBLIC HEALTH,4626-1,2017-06-28,2017-06-29,EVENTBRITE/PERINATALMO,480.66,CAD,480.66,CAD,4256,CONFERENCES/SEMINARS - REGISTRATION FEES,PH3071,MATERNAL INFANT HEALTH PROGRAM SUPPORT,7922.0,Theatrical Producers (except Motion Pict,REGISTRATION FEES,PCard Expenses_201706.xlsx
5,ECONOMIC DEVELOPMENT & CULTURE,4590-1,2017-06-02,2017-06-05,LEE VALLEY - DOWNTOWN,38.31,CAD,38.31,CAD,2600,RECREATIONAL & EDUCATIONAL SUPPLIES,AH0073,HS-MH-EDUCATION/OUTRCH-FORT YORK,5072.0,Hardware Equipment and Supplies,LEMON RASPS,PCard Expenses_201706.xlsx
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5743,SOLID WASTE MANAGEMENT,4791-4,2017-10-23,2017-10-25,PETROCAN,22.03,CAD,22.03,CAD,2280,DIESEL - CLEAR,CSW312-16-30,KEELE VALLEY LF,5541.0,Service Stations (with or without Ancill,diesel-city use,PCard Expenses_201710.xlsx
5745,FIRE SERVICES,4761-4,2017-09-29,2017-10-03,ESSO,49.12,CAD,49.12,CAD,*****,WASHES-LICENSED MOBILE EQUIP.,FR0025,MECHANICAL MAINT-VEHICLE MAINT,5541.0,Service Stations (with or without Ancill,**MORE** GASOLINE - SELF SERVICE,PCard Expenses_201710.xlsx
5746,FIRE SERVICES,4780-3,2017-10-14,2017-10-17,PETROCAN,55.54,CAD,55.54,CAD,2260,GASOLINE,FR0025,MECHANICAL MAINT-VEHICLE MAINT,5542.0,"Fuel Dispenser, Automated",GASOLINE - SELF SERVICE,PCard Expenses_201710.xlsx
5747,FIRE SERVICES,4795-9,2017-10-27,2017-10-30,PETROCAN,60.00,CAD,60.00,CAD,2260,GASOLINE,FR0025,MECHANICAL MAINT-VEHICLE MAINT,5542.0,"Fuel Dispenser, Automated",GASOLINE - SELF SERVICE,PCard Expenses_201710.xlsx


In [56]:
combined_df.columns

Index(['Division', 'BatchTransactionID', 'TransactionDate', 'CardPostingDt',
       'MerchantName', 'TransactionAmt', 'TrxCurrency', 'OriginalAmount',
       'OriginalCurrency', 'GLAccount', 'GLAccountDescription',
       'CostCenterWBSElementOrder', 'CostCenterWBSElementOrderDescription',
       'MerchantType', 'MerchantTypeDescription', 'Purpose',
       'Source_File_Name', 'ExpTypeDesc'],
      dtype='object')