# Setup

## library import

In [1]:
#api website: https://api.industrialinfo.com/

import numpy as np
import pandas as pd
import datetime
import requests
pd.set_option('display.max_columns', None)
import os

import json
from io import StringIO
import boto3

# Working directory of the kernel at the time this cell runs.
# NOTE(review): not referenced by any other cell visible in this notebook — confirm it is still needed.
codePath = os.getcwd()

## generate token

In [48]:
# Bearer token sent with every API request (see the Authorization header built in get_data).
# It is read from the IIR_API_TOKEN environment variable so the secret does not have to be
# saved inside the notebook; the original placeholder text is kept as the fallback for
# anyone who still pastes a token here by hand.
token = os.environ.get('IIR_API_TOKEN', 'INPUT THE TOKEN HERE THAT IS PRINTED')


def generate_token(username=None, password=None, tokenLifeTime='30'):
    """Request a fresh API token from the token endpoint.

    The function is only defined here, never called automatically, so running
    the notebook does not mint a new token on every execution.

    Parameters
    ----------
    username, password : str, optional
        API credentials.  When omitted they are read from the
        IIR_API_USERNAME / IIR_API_PASSWORD environment variables
        (a KeyError is raised if those are not set).
    tokenLifeTime : str
        Forwarded unchanged as the ``tokenLifeTime`` request parameter.

    Returns
    -------
    (token, response)
        The token string and the full ``requests`` response object.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with an HTTP error status.
    """
    params = {
        'username': username if username is not None else os.environ['IIR_API_USERNAME'],
        'password': password if password is not None else os.environ['IIR_API_PASSWORD'],
        'tokenLifeTime': tokenLifeTime,
    }
    response = requests.post('https://api.industrialinfo.com/idb/v2.0/token', params=params)
    # Fail here with the HTTP status instead of a KeyError on the missing header below.
    response.raise_for_status()
    # The first 7 characters of the header are dropped
    # (NOTE(review): presumably a "Bearer " prefix — confirm against the API docs).
    token = response.headers['AUTHORIZATION'][7:]
    return token, response


# To mint a new token, run the two lines below once and export the printed value
# as IIR_API_TOKEN before restarting the kernel:
# token, response = generate_token()
# print(token)

# Data

## download data

In [78]:
def get_data(eventStartDateMin, eventStartDateMax='2024-12-31', derate=None):
    """Download offline-event summaries page by page and return them as one DataFrame.

    Pages of up to 1000 rows are requested with an increasing ``offset`` until a
    page comes back with fewer rows than the limit.

    Parameters
    ----------
    eventStartDateMin, eventStartDateMax
        Sent (as strings) in the request parameters of the same names.
    derate : optional
        When not None, sent as the ``derate`` request parameter; when None the
        parameter is left out of the request entirely.

    Returns
    -------
    pandas.DataFrame
        All rows of the ``offlineEvents`` lists, with a fresh 0..n-1 index
        (no repeated labels across pages).  Empty DataFrame if nothing came back.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status (for example a rejected
        token), instead of failing later on a missing ``offlineEvents`` key.

    Notes
    -----
    Reads the module-level ``token`` for the Authorization header.
    """
    limit = 1000
    # The headers do not change between pages, so build them once.
    headers = {
        'Accept': 'application/json',
        'Authorization': 'Bearer {}'.format(token),
    }

    pages = []          # non-empty pages, concatenated once after the loop
    recvdTotal = 0
    offset = 0
    i = 1
    while True:
        params = {
            'eventStartDateMin': str(eventStartDateMin),
            'eventStartDateMax': str(eventStartDateMax),
            'limit': str(limit),
            'offset': str(offset),
        }
        if derate is not None:
            params['derate'] = derate
        response = requests.post('https://api.industrialinfo.com/idb/v2.0/offlineevents/summary', params=params, headers=headers)
        response.raise_for_status()
        page = pd.DataFrame(response.json()['offlineEvents'])

        recvd = page.shape[0]
        recvdTotal += recvd
        if recvd:
            pages.append(page)
        print('API call #' + str(i) + ' returned ' + str(recvd) + ' rows for a total of ' + str(recvdTotal))

        # A short page means there is nothing left to fetch.
        if recvd < limit:
            break
        offset += limit
        i += 1

    # One concat at the end avoids re-copying the accumulated frame on every page.
    fd = pd.concat(pages, axis=0, ignore_index=True) if pages else pd.DataFrame()
    print("API returned a total of " + str(fd.shape[0]) + " rows")

    return fd
# January-February 2023 download, using the get_data defined directly above.
df = get_data(eventStartDateMin='2023-01-01', eventStartDateMax='2023-02-28')

API call #1 returned 873 rows for a total of 873
API returned a total of 873 rows


In [79]:
def get_data(eventStartDateMin, eventStartDateMax='2024-12-31', derate=1):
    """Download offline-event summaries with the ``derate`` request parameter set.

    NOTE: this definition reuses the name ``get_data``, so it replaces any
    earlier definition of that name for every cell executed after it.  With the
    default ``derate=1`` those later calls send ``derate=1`` as well; pass
    ``derate=None`` to leave the parameter out of the request.

    Pages of up to 1000 rows are requested with an increasing ``offset`` until a
    page comes back with fewer rows than the limit.

    Parameters
    ----------
    eventStartDateMin, eventStartDateMax
        Sent (as strings) in the request parameters of the same names.
    derate : optional
        Sent as the ``derate`` request parameter unless it is None.

    Returns
    -------
    pandas.DataFrame
        All rows of the ``offlineEvents`` lists, with a fresh 0..n-1 index
        (no repeated labels across pages).  Empty DataFrame if nothing came back.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status (for example a rejected
        token), instead of failing later on a missing ``offlineEvents`` key.

    Notes
    -----
    Reads the module-level ``token`` for the Authorization header.
    """
    limit = 1000
    # The headers do not change between pages, so build them once.
    headers = {
        'Accept': 'application/json',
        'Authorization': 'Bearer {}'.format(token),
    }

    pages = []          # non-empty pages, concatenated once after the loop
    recvdTotal = 0
    offset = 0
    i = 1
    while True:
        params = {
            'eventStartDateMin': str(eventStartDateMin),
            'eventStartDateMax': str(eventStartDateMax),
            'limit': str(limit),
            'offset': str(offset),
        }
        if derate is not None:
            params['derate'] = derate
        response = requests.post('https://api.industrialinfo.com/idb/v2.0/offlineevents/summary', params=params, headers=headers)
        response.raise_for_status()
        page = pd.DataFrame(response.json()['offlineEvents'])

        recvd = page.shape[0]
        recvdTotal += recvd
        if recvd:
            pages.append(page)
        print('API call #' + str(i) + ' returned ' + str(recvd) + ' rows for a total of ' + str(recvdTotal))

        # A short page means there is nothing left to fetch.
        if recvd < limit:
            break
        offset += limit
        i += 1

    # One concat at the end avoids re-copying the accumulated frame on every page.
    fd = pd.concat(pages, axis=0, ignore_index=True) if pages else pd.DataFrame()
    print("API returned a total of " + str(fd.shape[0]) + " rows")

    return fd
# Same date window as the earlier download, fetched with the get_data defined directly above.
df_derate = get_data(eventStartDateMin='2023-01-01', eventStartDateMax='2023-02-28')

API call #1 returned 360 rows for a total of 360
API returned a total of 360 rows


In [82]:
# Write both downloads to CSV so they can be compared side by side.
df.to_csv('asdf_noderate.csv')
df_derate.to_csv('asdf_derate.csv')

# Keep this as the LAST expression of the cell: a notebook only displays the value
# of the final expression, so placed first (as before) the shapes were never shown.
df.shape, df_derate.shape

## clean data

In [50]:
class cleanData(object):
    """Cleaning steps for the raw offline-events DataFrame returned by get_data.

    Every method takes a DataFrame and returns a new one; the input frame is
    never modified.  ``runAll`` applies all steps in order.
    """

    # Columns whose cells hold nested JSON objects (dicts); flattened by normalizeJSON.
    _NESTED_COLUMNS = ('plantPhysicalAddress', 'offlineCapacity', 'fuel')

    # Columns holding date strings; converted by convertDates.
    _DATE_COLUMNS = (
        'associatedEntityStartDate',
        'associatedEntityEndDate',
        'associatedEntityPrevStartDate',
        'associatedEntityPrevEndDate',
        'eventStartDate',
        'eventEndDate',
        'prevStartDate',
        'prevEndDate',
        'liveDate',
        'releaseDate',
    )

    # Columns removed by dropUnecessaryColumns.
    _UNNEEDED_COLUMNS = (
        'associatedEntityStartDate',
        'associatedEntityEndDate',
        'associatedEntityPrevStartDate',
        'associatedEntityPrevEndDate',
        'associatedEntityType',
        'prevStartDate',
        'prevEndDate',
        'liveDate',
        'isoRtoRegion',
        'uom',
        'releaseDate',
    )

    # Exact-value replacements applied by renameTradeRegion.
    _REGION_ALIASES = {'I': 'P1', 'II': 'P2', 'III': 'P3', 'IV': 'P4', 'V': 'P5'}
    _COUNTRY_ALIASES = {'U.S.A.': 'USA'}

    # unitTypeDesc -> short unit type code, used by map_unitType / colUnitType.
    _UNIT_TYPE_BY_DESC = {
        'Delayed Coker': 'COK',
        'Hydrofluoric Alkylation': 'ALK',
        'Semiregen/Cyclic Reformer': 'RFM',
        'Distillate Hydrotreater': 'HT',
        'Olefin/Aromatics Hydrotreater': 'HT',
        'Diluent Recovery Unit (DRU)': 'DRU',
        'Vacuum Distillation': 'VDU',
        'Mid Distillate Hydrotreater': 'HT',
        'Diesel Hydrotreater': 'HT',
        'Distillate Hydrocracker': 'HCU',
        'FCCU (Fluid Catalytic Cracker)': 'FCC',
        'CCR(Continuous Catalytic Reformer)': 'RFM',
        'HGO (Heavy Gas Oils) Hydrotreater': 'HT',
        'Isomerization': 'ISO',
        'LPG Liquid Petroleum Gas': 'LPG',
        'Atmospheric Distillation': 'CDU',
        'FCCU Gasoline Hydrotreater': 'HT',
        'FCCU Feed Hydrotreater': 'HT',
        'Reformer Feed Hydrotreater': 'HT',
        'Visbreaker': 'VBU',
        'Sulfuric Alkylation': 'ALK',
        'Lube Oil Hydrotreater': 'HT',
        'Residual Hydrocracker': 'HCU',
        'Residual Hydrotreater': 'HT',
        'Light Naphtha Hydrotreater': 'HT',
        'RFCCU (Residue Fluid Catalytic Cracker)': 'FCC',
        'Condensate Splitter': 'CDU',
        'SR (Straight-Run) Hydrotreater': 'HT',
        'Fluidized-Bed Coker': 'COK',
        'Thermal Cracker': 'COK',
        'TCCU Thermal Catalytic Cracking Unit': 'FCC',
        'Hydrodealkylation': 'ALK',
        'Lube Oil Hydrocracker': 'HCU',
    }

    def __init__(self):
        pass

    def normalizeJSON(self,raw):
        '''Index the frame by offlineEventKey and flatten the nested JSON columns.

        Each column in _NESTED_COLUMNS is expanded with pd.json_normalize and
        replaced by the resulting flat columns, which are appended on the right
        in the order the nested columns are listed.
        Raises KeyError if offlineEventKey or a nested column is missing.
        '''
        df = raw.set_index('offlineEventKey')
        nested = list(self._NESTED_COLUMNS)
        parts = [df.drop(columns=nested)]
        for col in nested:
            flat = pd.json_normalize(df[col])
            # Re-attach the event keys so the column-wise concat lines rows up by position.
            flat.index = df.index
            parts.append(flat)
        return pd.concat(parts, axis=1)

    def convertDates(self,raw):
        '''Convert the date-string columns to datetime.

        Only the first 10 characters of each string are kept before parsing, so
        any time-of-day part is discarded.  Columns from _DATE_COLUMNS that are
        not present in the frame are skipped instead of raising KeyError.
        '''
        df = raw.copy()
        for col in self._DATE_COLUMNS:
            if col not in df.columns:
                continue
            df[col] = pd.to_datetime(df[col].str[:10])
        return df

    def dropUnecessaryColumns(self,raw):
        '''Drop the columns listed in _UNNEEDED_COLUMNS.

        Columns that are already absent are ignored rather than raising KeyError.
        '''
        return raw.drop(columns=list(self._UNNEEDED_COLUMNS), errors='ignore')

    def renameTradeRegion(self,raw):
        '''Rename trading regions (I..V -> P1..P5) and the country 'U.S.A.' -> 'USA'.

        Only cells equal to one of the listed values are changed.
        '''
        df = raw.copy()
        df['tradingRegionName'] = df['tradingRegionName'].replace(self._REGION_ALIASES)
        df['countryName'] = df['countryName'].replace(self._COUNTRY_ALIASES)
        return df

    def map_unitType(self):
        '''Return the unit-type lookup as a two-column frame: unitTypeDesc, unitType.'''
        return pd.DataFrame(
            list(self._UNIT_TYPE_BY_DESC.items()),
            columns=['unitTypeDesc', 'unitType'],
        )

    def colUnitType(self,raw):
        '''Add a unitType column holding the short code for each row's unitTypeDesc.

        The lookup is the in-class table (the same one map_unitType returns).
        Descriptions that are not in the table get NaN.  The frame's index is
        left untouched, so it does not need to be named offlineEventKey.
        '''
        df = raw.copy()
        df['unitType'] = df['unitTypeDesc'].map(self._UNIT_TYPE_BY_DESC)
        return df

    def runAll(self,raw):
        '''Run every cleaning step in order and return the cleaned frame.'''
        df = self.normalizeJSON(raw)
        df = self.convertDates(df)
        df = self.dropUnecessaryColumns(df)
        df = self.renameTradeRegion(df)
        df = self.colUnitType(df)
        return df
    

## process data

In [75]:
class processData(object):
    """Turn cleaned offline events into per-event, per-month CHARGE rows."""

    def __init__(self):
        pass

    def helperDateRange(self,beginning,ending):
        '''Build one row per month-end between beginning and ending (inclusive).

        Columns:
          EOM    - the month-end date
          BOM    - the PREVIOUS month's last day (not the 1st of the month), so
                   that EOM - BOM equals the number of days in the month
          period - EOM formatted as 'YYYY-MM'
        '''
        start = pd.to_datetime(beginning, format='%Y-%m-%d')
        end = pd.to_datetime(ending, format='%Y-%m-%d')
        # An offset object is used instead of the 'M' alias, which recent pandas deprecates.
        eom = pd.date_range(start=start, end=end, freq=pd.offsets.MonthEnd())
        bom = eom - pd.offsets.MonthEnd(1)
        rng = pd.DataFrame({'BOM': bom, 'EOM': eom})
        rng['period'] = rng['EOM'].dt.strftime('%Y-%m')
        return rng

    def selectFilters(self,raw,tradingRegionName=None,unitType=None,countryName=None):
        '''Keep rows matching the given filters; rows with status 'Cancelled' are always removed.

        A filter left as None is not applied (and its column is not accessed).
        '''
        keep = raw['eventStatusDesc'] != 'Cancelled'
        requested = (
            ('tradingRegionName', tradingRegionName),
            ('unitType', unitType),
            ('countryName', countryName),
        )
        for column, wanted in requested:
            if wanted is not None:
                keep &= raw[column] == wanted
        return raw.loc[keep].copy()

    def minimizeData(self,raw):
        '''Return a copy holding only the date and capacity columns.'''
        return raw[['eventStartDate','eventEndDate','unitCapacity','capacityOffline']].copy()

    def calculateMonthly(self,raw,beginning,ending):
        '''Compute the CHARGE of every event overlapping the period (beginning, ending].

        beginning is expected to be the last day of the previous month and
        ending the last day of the month, as produced by helperDateRange.

        For each event the number of days falling inside the period is counted,
        then CHARGE = round(DAYS / days_in_period * capacityOffline / 1000, 2).
        Events with zero days in the period are left out.

        Returns a frame with columns MMM ('YYYY-MM' of ending) and CHARGE,
        indexed like the input rows that were kept.
        '''
        bom = pd.to_datetime(beginning)
        eom = pd.to_datetime(ending)

        # Only events that end on/after the period start and begin on/before its end.
        overlaps = (raw['eventEndDate'] >= bom) & (raw['eventStartDate'] <= eom)
        # .copy() so later work never writes into a view of the caller's frame.
        df = raw.loc[overlaps].copy()

        # Clamp the event to the period: min(eventEndDate, EOM) and max(eventStartDate, BOM).
        last_day = df['eventEndDate'].where(df['eventEndDate'] <= eom, eom)
        first_day = df['eventStartDate'].where(df['eventStartDate'] >= bom, bom)
        days = (last_day - first_day).dt.days.clip(lower=0)

        # When the event STARTS inside this month the subtraction above does not
        # count the start day itself, so add it back.  (Events that started
        # earlier are measured from BOM, the day before the month, and already
        # come out right.)
        starts_in_period = (df['eventStartDate'] + pd.offsets.MonthEnd(0)) == eom
        days = days + starts_in_period.astype(int)

        # Drop events that contribute no days to this period.
        contributes = days != 0
        df = df.loc[contributes]
        days = days.loc[contributes]

        days_in_period = (eom - bom).days
        charge = ((days / days_in_period * df['capacityOffline']) / 1000).round(2)

        out = charge.to_frame('CHARGE')
        out.insert(0, 'MMM', eom.strftime('%Y-%m'))
        return out

    def runOneMonth(self,raw,beginning,ending,tradingRegionName=None,unitType='CDU',countryName='USA'):
        '''Filter the events, then compute the CHARGE rows for a single period.'''
        df = self.selectFilters(raw,tradingRegionName,unitType,countryName)
        df = self.minimizeData(df)
        df = self.calculateMonthly(df,beginning,ending)
        return df

    def runMultipleMonths(self,raw,beginning,ending,tradingRegionName=None,unitType='CDU',countryName='USA'):
        '''Compute the CHARGE rows for every month-end between beginning and ending.

        Returns the monthly frames stacked in chronological order; an empty
        frame with columns MMM and CHARGE when there is nothing to report.
        '''
        rng = self.helperDateRange(beginning,ending)
        empty = pd.DataFrame(columns=['MMM', 'CHARGE'])
        if rng.empty:
            return empty

        # The filters do not depend on the month, so apply them once up front
        # instead of re-filtering the whole frame for every month.
        events = self.minimizeData(self.selectFilters(raw,tradingRegionName,unitType,countryName))

        frames = []
        for bom, eom in zip(rng['BOM'], rng['EOM']):
            month = self.calculateMonthly(events, bom, eom)
            if not month.empty:
                frames.append(month)

        # Single concat at the end instead of growing a frame inside the loop.
        return pd.concat(frames, axis=0) if frames else empty

# Finalizing

In [None]:
# Build the 2015-2021 historical file only when it is not already in the working
# directory; otherwise reuse the saved CSV instead of downloading seven years of
# events again on every run of this cell.
if not os.path.isfile('IIR RAW 2015-2021.csv'):
    raw_1521 = get_data('2015-01-01','2021-12-31')
    raw_1521 = cleanData().runAll(raw_1521)
    raw_1521.to_csv('IIR RAW 2015-2021.csv')
else:
    raw_1521 = pd.read_csv('IIR RAW 2015-2021.csv',index_col='offlineEventKey',parse_dates=['eventStartDate','eventEndDate'])

In [68]:
if __name__ == '__main__':

    # Initial historical download: performed only when the CSV is not already
    # in the working directory.
    if not os.path.isfile('IIR RAW 2015-2021.csv'):
        raw_1521 = get_data('2015-01-01','2021-12-31')
        raw_1521 = cleanData().runAll(raw_1521)
        raw_1521.to_csv('IIR RAW 2015-2021.csv')

    # Always reload from the CSV so the historical frame is the same whether it
    # was just downloaded or was already on disk.
    raw_1521 = pd.read_csv('IIR RAW 2015-2021.csv',index_col='offlineEventKey',parse_dates=['eventStartDate','eventEndDate'])
    # Convert productId to object dtype.
    # NOTE(review): presumably to match the dtype of the freshly downloaded frame below — confirm.
    raw_1521['productId'] = raw_1521['productId'].astype('object')

    # Refresh the recent data (2022-2024) on every run and keep a CSV copy.
    raw = get_data('2022-01-01','2024-12-31')
    raw = cleanData().runAll(raw)
    raw.to_csv('IIR RAW 2022-2024.csv')

    # Combine the historical 2015-2021 data with the refreshed data.
    raw = pd.concat([raw_1521,raw],axis=0)
    del raw_1521

API call #1 returned 1000 rows for a total of 1000
API call #2 returned 1000 rows for a total of 2000
API call #3 returned 1000 rows for a total of 3000
API call #4 returned 1000 rows for a total of 4000
API call #5 returned 1000 rows for a total of 5000
API call #6 returned 1000 rows for a total of 6000
API call #7 returned 1000 rows for a total of 7000
API call #8 returned 1000 rows for a total of 8000
API call #9 returned 1000 rows for a total of 9000
API call #10 returned 1000 rows for a total of 10000
API call #11 returned 1000 rows for a total of 11000
API call #12 returned 1000 rows for a total of 12000
API call #13 returned 1000 rows for a total of 13000
API call #14 returned 1000 rows for a total of 14000
API call #15 returned 1000 rows for a total of 15000
API call #16 returned 4 rows for a total of 15004
API returned a total of 15004 rows


In [73]:
if __name__ == '__main__':

    # Per-event monthly CHARGE rows for USA CDU events, one period per
    # month-end from 2017-12-31 through 2023-12-31, all trading regions.
    df = processData().runMultipleMonths(
        raw,
        beginning='2017-12-31',
        ending='2023-12-31',
        tradingRegionName=None,
        unitType='CDU',
        countryName='USA',
    )