## 1. Load packages and set global variables

In [1]:
import os
import pandas as pd
from datetime import datetime as dt
import requests
import json
import database as db # our script for simpler db integration
from config import config # pull sensitive data from .ini files

# Folders for data files and SQL scripts, resolved against the directory
# the notebook is started from (both strings end with a slash).
data_path = "{}/Data/".format(os.getcwd())
script_path = "{}/SQL/".format(os.getcwd())

# Connection to the Postgres database, opened through the project's `database`
# helper module. `conn` is a notebook-level global: `etl` passes it to every
# db.execute_query / db.execute_values call.
# NOTE(review): where the connection settings come from is decided inside
# database.connect(), which is not visible in this notebook.
conn = db.connect()

In [2]:
# Fields kept from each Pipedrive endpoint's JSON payload. Dotted names address
# nested objects after pd.json_normalize has flattened the records.
# The order of each list matters: json_to_df pairs these names positionally
# with the column names of the target table.
# NOTE(review): 'deals' keeps 'org_id.value' while 'persons' keeps plain
# 'org_id' - confirm the difference is intended.
JSON_KEEPS = {
    'deals': [
        'id',
        'creator_user_id.id',
        'user_id.id',
        'org_id.value',
        'stage_id',
        'title',
        'value',
        'stage_change_time',
        'status',
        'won_time',
        'lost_reason',
        'email_messages_count',
        'activities_count',
        'active',
        'add_time',
        'update_time',
    ],
    'organizations': [
        'id',
        'name',
        'owner_id.id',
        'address',
        'active_flag',
        'add_time',
        'update_time',
    ],
    'persons': [
        'id',
        'owner_id.id',
        'org_id',
        'first_name',
        'last_name',
        'active_flag',
        'add_time',
        'update_time',
    ],
    'pipelines': [
        'id',
        'name',
        'order_nr',
        'active',
        'add_time',
        'update_time',
    ],
    'stages': [
        'id',
        'order_nr',
        'name',
        'pipeline_id',
        'rotten_flag',
        'rotten_days',
        'active_flag',
        'add_time',
        'update_time',
    ],
    'users': [
        'id',
        'name',
        'email',
        'active_flag',
        'created',
        'modified',
    ],
}

## 2. Create tables for database

See details in Schema folder.

## 3. Extract data from Pipedrive

In [26]:
def get_data(endpoint, start, limit):
    """
    Request one batch of records from a Pipedrive endpoint.

    arguments:
        endpoint - endpoint name, appended to the configured company domain
        start - pagination offset sent with the request
        limit - maximum number of records sent with the request

    returns a (data, next_start) tuple
        data - the 'data' member of the JSON response; None when the request
               did not return HTTP 200 or the response holds no data
        next_start - position to store for the next run; equal to `start`
                     when nothing was retrieved
    """
    
    # The whole 'pipedrive' config section plus the paging values is sent as
    # the query string of the request.
    params = config(section='pipedrive')
    params.update({'start':start,'limit':limit})
    
    response = requests.get(params['company_domain'] + endpoint, params=params)
    
    if response.status_code != 200:
        print(response.status_code)
        # Keep the (data, next_start) shape so callers that unpack the result
        # do not fail; an unchanged position means "nothing retrieved".
        return None, start
    
    json_msg = response.json()
    print('Extraction of {} data from Pipedrive complete.'.format(endpoint))
    
    data = json_msg.get('data')
    
    if endpoint in ['users','stages']:
        # These endpoints are read without pagination metadata, so the
        # position is the number of rows returned (or unchanged if none).
        next_start = start if data is None else len(data)
    else:
        # Tolerate responses that carry no pagination block at all.
        pagination = (json_msg.get('additional_data') or {}).get('pagination') or {}
        if pagination.get('more_items_in_collection'):
            next_start = pagination['next_start']
        elif data is None:
            next_start = start
        else:
            # NOTE(review): on the last page this stores len(data), not
            # start + len(data) - confirm the tracker is meant to hold the
            # size of the final batch rather than an absolute offset.
            next_start = len(data)
    
    return data, next_start

In [4]:
def to_date(data):
    """
    Turn every column whose name contains '_dte' into plain date values.

    arguments:
        data - DataFrame to convert; it is modified in place

    returns the same DataFrame object
    """
    
    is_date_col = lambda name: '_dte' in name
    
    for name in filter(is_date_col, list(data.columns)):
        parsed = pd.to_datetime(data[name])
        data[name] = parsed.dt.date
    
    return data

In [7]:
def fill_missing(data):
    """
    Fill missing values, choosing the filler from markers in the column name.
    
    Following rulesets should be applied for filling missing data
    
    ID's : -1
    Names : blank ('')
    Descriptions : blank ('')
    Timestamps: 1900-01-01
    Counts/amounts/numbers: -1
    
    arguments:
        data - DataFrame whose column names contain the markers
               _id, _nme, _dsc, _cnt, _amt, _nbr or _dte
    
    returns the same DataFrame object, with the matching columns filled
    """
    
    # (markers looked for in the column name, value written into the gaps);
    # groups are applied in this order, so a column matching several groups
    # keeps the filler of the first one
    fill_rules = [
        (('_id', '_cnt', '_amt', '_nbr'), -1),
        (('_nme', '_dsc'), ''),
        (('_dte',), '1900-01-01'),
    ]
    
    for markers, filler in fill_rules:
        for col in list(data.columns):
            if any(marker in col for marker in markers):
                # Assign the result back instead of calling
                # data[col].fillna(..., inplace=True): the chained in-place
                # form only updates a temporary object under pandas
                # Copy-on-Write and would leave the frame unfilled.
                data[col] = data[col].fillna(filler)
    
    return data

In [22]:
def json_to_df(json_data, json_keeps, df_rename):
    """
    Flatten Pipedrive JSON records into a cleaned DataFrame.
    
    arguments:
        json_data - list of record dicts from the API response
        json_keeps - flattened (dotted) field names to keep, in output order
        df_rename - new column names, paired positionally with json_keeps
    
    returns a DataFrame holding the kept fields under their new names, passed
    through to_date and fill_missing
    """
    
    # reindex rather than [] selection: a kept field that does not occur in
    # this batch (e.g. a dotted key whose parent object is null on every
    # record, or an empty batch) becomes an all-missing column instead of
    # raising KeyError; fill_missing then supplies the placeholder value.
    # reindex also hands back a fresh frame, so the column assignments done
    # by the helpers below do not write into a slice of another frame.
    data = pd.json_normalize(json_data).reindex(columns=json_keeps)
    
    data = data.rename(columns=dict(zip(json_keeps, df_rename)))
    
    data = to_date(data)
    data = fill_missing(data)
    
    return data

In [15]:
def etl(endpoint, limit=500):
    """
    Extract one batch from a Pipedrive endpoint and load it into the database.
    
    arguments:
        endpoint - Pipedrive endpoint to retrieve data from (a key of JSON_KEEPS)
        limit - the amount of data to retrieve in this batch
    
    prints an output message for failure/success; always returns None.
    The position stored in the endpoints table is only moved forward after
    the batch has been written.
    """
    
    if endpoint not in JSON_KEEPS:
        print('{} is not a valid endpoint. Please try again.'.format(endpoint))
        return
    
    # endpoint has just been checked against the JSON_KEEPS keys, so only
    # those fixed names are ever formatted into the SQL text below
    start = db.execute_query(conn, "select start_nbr from endpoints where endpoint_nme = '{}';".format(endpoint))
    
    # the checks in this function treat -1 from the db helper as "call failed"
    if start == -1:
        return
    if not start:
        # no row for this endpoint: indexing the result would raise IndexError
        print('No tracker row found for {} in endpoints table.'.format(endpoint))
        return
    start = start[0][0] # unpack tuple
    
    # read max rows from most recently stored data
    extracted = get_data(endpoint, start=start, limit=limit)
    if extracted is None:
        # guard against get_data handing back nothing instead of a
        # (data, next_start) pair, which would break the unpacking below
        return
    json_data, next_start = extracted
    
    if start == next_start:
        print('No new data. Check back later.')
        return
    
    if json_data is None:
        print('No more {} data - tracker at position {}.'.format(endpoint, start))
        return
    
    # users are written to the employees table; every other endpoint is
    # written to the table carrying its own name
    table = 'employees' if endpoint == 'users' else endpoint
    
    # Look up the columns of the table that is actually written to, in their
    # declared order: json_to_df pairs them positionally with JSON_KEEPS.
    table_cols = db.execute_query(conn, "select column_name from information_schema.columns where table_name = '{}' order by ordinal_position;".format(table))
    
    if table_cols == -1:
        return
    
    name_updates = [col_name for row in table_cols for col_name in row]
    data = json_to_df(
        json_data, # json data
        JSON_KEEPS[endpoint], # data names from json
        name_updates # new column names for df
    )
    
    # write data to database (will rollback if not successful)
    write_data = db.execute_values(conn, data, table)
    
    if write_data == -1:
        return
    
    # increment tracker to new start
    update_endpoint = db.execute_query(conn, "update endpoints set start_nbr = {pos}, update_dtm = current_timestamp where endpoint_nme = '{endpoint}'".format(pos=next_start, endpoint=endpoint))
    
    if update_endpoint == -1:
        return
    
    print('Load to database complete.')

In [9]:
etl('users')

Extraction of users data from Pipedrive complete.
No new data. Check back later.


In [23]:
etl('pipelines')

Extraction of pipelines data from Pipedrive complete.
Load to database complete.


In [24]:
etl('pipelines')

Extraction of pipelines data from Pipedrive complete.
No new data. Check back later.


In [28]:
etl('stages')

Extraction of stages data from Pipedrive complete.
No new data. Check back later.


In [30]:
etl('organizations')

Extraction of organizations data from Pipedrive complete.
Load to database complete.
