In [1]:

import pandas as pd
import numpy as np
import os
import openpyxl as xl

import re

import warnings
warnings.filterwarnings('ignore')

from Utilities import FileIO
from Utilities import DataframeCleaning


# Load File

In [2]:
df_raw = FileIO.importFile("transformFinished_separate.csv")

df_raw.head()


Unnamed: 0,product_id,product,mail_class,fy,revenue,attributable_cost,volume_variable_cost,product_specific_cost,rev_pc,attributable_cost_pc,cont_pc,cost_coverage,volume,weight,weight_pc
0,3,Single Piece Letters,First Class,2008,14353.524033,8443.075618748211,8434.32261874821,8.753,0.4283392480234627,0.2519590766159719,0.1763801714074908,1.7000349968591284,33509710.0,1000328.975,0.477631
1,4,Single Piece Cards,First Class,2008,500.489597,446.5087719559969,446.02677195599694,0.482,0.2711417408674555,0.2418974669332412,0.0292442739342143,1.12089532935161,1845860.0,11830.08,0.102544
2,5,Total Single Piece Letters and Cards,First Class,2008,14854.01363,8889.584390704207,8880.349390704207,9.235,0.4201322091775561,0.2514337755281081,0.168698433649448,1.6709457919689432,35355570.0,1012159.055,0.458048
3,8,Presort Letters,First Class,2008,16327.804358,5441.6680782620415,5429.030078262042,12.638,0.3374916702247292,0.1124779307905796,0.2250137394341495,3.000514570748088,48379870.0,2174874.013,0.719266
4,9,Presort Cards,First Class,2008,732.236796,282.27046183504393,281.3414618350439,0.929,0.2059160155088801,0.0793787052418541,0.126537310267026,2.594096425250163,3555997.0,28969.254,0.130345


# Convert Numeric Columns

In [3]:
df = df_raw.copy()

def convertNumericColumns(frame):
    nonNumericCols = ['product_id', 'product', 'mail_class', 'fy']
    numericCols = [c for c in frame.columns if c not in nonNumericCols]

    def applyNumConversion(row):
        rowString = str(row)
        deleteChars = [",", "$"]
        for char in deleteChars:
            rowString = rowString.replace(char, "")

        if rowString in ["None", "NaN", "(D-E)", "(D/E)"]:
            return np.nan
        
        if len(rowString)<2:
            return np.nan
        
        rowFloat = float(rowString)

        return rowFloat


    for col in numericCols: 
        frame[col] = frame[col].apply(lambda row: applyNumConversion(row))
    
    return frame


df_converted = convertNumericColumns(df)

# df_converted.info()
# df.info()

# Correct Units

In [4]:
df = df_converted.copy()

millionCols = ['revenue',
       'attributable_cost', 'volume_variable_cost', 'product_specific_cost']

def correctUnits(frame):


    thousandCols = ['volume','weight']

    million = 1000000
    thousand = 1000

    for col in millionCols:
        frame[col] =frame[col]*million
    
    for col in thousandCols:
        frame[col] =frame[col]*thousand


    return frame

df_correctUnits = correctUnits(df)
# df_correctUnits.head()
df.columns

Index(['product_id', 'product', 'mail_class', 'fy', 'revenue',
       'attributable_cost', 'volume_variable_cost', 'product_specific_cost',
       'rev_pc', 'attributable_cost_pc', 'cont_pc', 'cost_coverage', 'volume',
       'weight', 'weight_pc'],
      dtype='object')

# Round Columns

In [5]:
df = df_correctUnits.copy()

identifierColumns = ['product_id', 'product', 'mail_class', 'fy']


def roundColumns(frame):

    valueColumns = [c for c in frame.columns if c not in  identifierColumns]

    for col in valueColumns:
        frame[col] = frame[col].round(6) 

    return frame

df_rounded = roundColumns(df)
# df_rounded.head()

# Reassign Current Totals as International

In [6]:
keyFrame = FileIO.importKey()



keyFilt =  keyFrame[keyFrame['notes'].notnull()]
keyFilt =keyFilt[keyFilt['notes'].str.contains("sum of ids")][['product_id','product','notes']]

def applyNumberMatch(row):
    numbers =  re.findall(r"\d+",row)
    return numbers

keyFilt['children'] = keyFilt['notes'].apply(lambda row: applyNumberMatch(row))
# keyFilt


summaryRowsDictStringList= dict(
    zip(
        list(keyFilt['product_id']), 
        list(keyFilt['children'])
    )
)


summaryRowsDict = {}
for keyDict in summaryRowsDictStringList.keys():
    stringList= summaryRowsDictStringList[keyDict]
    intList = [int(n) for n in stringList]
    summaryRowsDict[keyDict] = intList


summaryRowsDict


{80: [5, 10, 14, 19, 222, 230, 103, 127],
 90: [80, 81, 82, 83, 85, 86],
 92: [90, 91],
 248: [227, 101, 102, 220, 92],
 81: [21, 22, 23, 24, 25, 26, 27, 104, 139, 231, 223],
 83: [41, 42, 43, 44, 45, 226, 233, 106, 253],
 82: [31, 32, 224, 105, 232],
 91: [51, 52, 54, 55, 56, 57, 58, 61, 62, 73, 74, 142, 225, 76]}

In [7]:
df = df_rounded.copy()




summaryReassignDict = {80: 262 ,
81: 263, 
83: 265, 
82: 264,
91: 266}

def reassignSummaryRows(frame):
    def applyReassign(row):
        if row in summaryReassignDict.keys():
            return summaryReassignDict[row]
        return row
    
    frame['product_id'] = frame['product_id'].apply(lambda row: applyReassign(row))

    frame = DataframeCleaning.realignWithKey(frame, keyFrame)

    return frame


df_reassignedSummary = reassignSummaryRows(df)

# df_reassignedSummary[df_reassignedSummary['product_id']==266]
# # df_reassignedSummary[df_reassignedSummary['product_id'].isin(summaryReassignDict.values())]

# Calculate Domestic Totals

In [8]:
df = df_reassignedSummary.copy()

def calculateDomesticTotals(frame, idToRecalculate, childrenList):
    frameOrig = frame.copy()
    frame = frame[frame['product_id']!= idToRecalculate]
    frameIdListFilter = frame['product_id'].isin(childrenList)
    valueColumns = [c for c in frame.columns if c not in  identifierColumns]

    grp = frame[frameIdListFilter].groupby("fy")[valueColumns].sum()
    grp.reset_index(inplace=True)

    grp['product_id'] = idToRecalculate

    for col in frameOrig.columns:
        if col not in grp:
            grp[col] = np.nan
    
    rez = pd.concat([frame, grp])

    return rez

def calcDomesticTotals_aggregate(frame):
    for keyDict in summaryRowsDict.keys():
        summaryRow = keyDict
        childrenList = summaryRowsDict[keyDict]
        frame = calculateDomesticTotals(frame, summaryRow, childrenList)

    rez = DataframeCleaning.realignWithKey(frame, keyFrame)
    return rez


df_domesticSums = calcDomesticTotals_aggregate(df)


df = df_domesticSums.copy()

df[df['product_id'].isin(summaryRowsDict.keys())].head()


Unnamed: 0,product_id,product,mail_class,fy,revenue,attributable_cost,volume_variable_cost,product_specific_cost,rev_pc,attributable_cost_pc,cont_pc,cost_coverage,volume,weight,weight_pc
958,80,Total First Class (Domestic),Summary,2008,37276600000.0,18264230000.0,18240390000.0,23841000.0,3.799691,2.905515,0.894177,7.267123,91276700000.0,4104553000.0,9.573919
959,80,Total First Class (Domestic),Summary,2009,34955050000.0,17240190000.0,17208130000.0,32052070.0,4.25629,3.14642,1.10987,9.797005,83313730000.0,3634747000.0,10.317914
960,80,Total First Class (Domestic),Summary,2010,33131070000.0,16454660000.0,16425120000.0,29544220.0,4.016086,3.224905,0.791181,7.049557,77868720000.0,3635889000.0,9.835242
961,80,Total First Class (Domestic),Summary,2011,31313950000.0,15483610000.0,15450430000.0,33177510.0,4.070924,3.099234,0.971692,7.144492,73210160000.0,3675243000.0,9.786556
962,80,Total First Class (Domestic),Summary,2012,29516120000.0,14214900000.0,14169460000.0,45449460.0,4.689478,3.645034,1.044444,9.905881,68984540000.0,3400451000.0,10.542421


In [12]:
isYr  = df['fy']==2023
isMM = df['mail_class'].str.contains("Mark")

tempFrame = df[isYr & isMM]
vals = list(tempFrame[['revenue','attributable_cost']].sum().values)
vals[0]/vals[1]



1.4188262041089261

In [9]:

stop 

NameError: name 'stop' is not defined

# Reorder Columns



In [None]:
df = df_domesticSums.copy()

def reorderCols(frame):
    leadingCols = ['product_id', 'product', 'mail_class', 'fy', 'volume', 'revenue',
       'attributable_cost']
    
    trailingCols = [c for c in frame.columns if c not in leadingCols]
    finalCols = leadingCols + trailingCols
    return frame[finalCols]

# reorderCols(df)
# df.columns

# Aggregate Funcs

In [None]:
df_raw=  FileIO.importFile("transformFinished_separate.csv")


transformFuncs = [
convertNumericColumns,
correctUnits,
roundColumns,
reassignSummaryRows,
calcDomesticTotals_aggregate,
reorderCols,
DataframeCleaning.realignWithKey
]

def applyTransformations(frame, funcs):
    for func in funcs:
        frame = func(frame)
    return frame

df_final = applyTransformations(df_raw, transformFuncs)

df_final.head()



Unnamed: 0,product_id,product,mail_class,fy,volume,revenue,attributable_cost,volume_variable_cost,product_specific_cost,rev_pc,attributable_cost_pc,cont_pc,cost_coverage,weight,weight_pc
0,3,Single Piece Letters,First Class,2008,33509710000.0,14353520000.0,8443076000.0,8434323000.0,8753000.0,0.428339,0.251959,0.17638,1.700035,1000329000.0,0.477631
1,4,Single Piece Cards,First Class,2008,1845860000.0,500489600.0,446508800.0,446026800.0,482000.0,0.271142,0.241897,0.029244,1.120895,11830080.0,0.102544
2,5,Total Single Piece Letters and Cards,First Class,2008,35355570000.0,14854010000.0,8889584000.0,8880349000.0,9235000.0,0.420132,0.251434,0.168698,1.670946,1012159000.0,0.458048
3,8,Presort Letters,First Class,2008,48379870000.0,16327800000.0,5441668000.0,5429030000.0,12638000.0,0.337492,0.112478,0.225014,3.000515,2174874000.0,0.719266
4,9,Presort Cards,First Class,2008,3555997000.0,732236800.0,282270500.0,281341500.0,929000.0,0.205916,0.079379,0.126537,2.594096,28969250.0,0.130345


# Upload File


In [None]:

FileIO.exportFile(df_final, "transformFinished_merged.csv")

# Garbage

In [None]:
df  = df_final.copy()
# # df= df_raw.copy()
# df[df['product_id']==262]

# df[df['product'].str.contains("Cre")]

# df_raw[df_raw['product'].str.contains("Cre")]
# df_raw[df_raw['product'].str.contains("Ch")]

countSer = df.groupby(["fy", "product_id"])['product'].count()
countSer.loc[countSer>1]

Series([], Name: product, dtype: int64)

In [None]:
isYearRaw = df_raw['fy']==2010
isIdRaw = df_raw['product_id']==3
df_raw[isYearRaw & isIdRaw]

Unnamed: 0,product_id,product,mail_class,fy,revenue,attributable_cost,volume_variable_cost,product_specific_cost,rev_pc,attributable_cost_pc,cont_pc,cost_coverage,volume,weight,weight_pc
87,3,Single Piece Letters,First Class,2010,12339560000.0,7376828000.0,7354369000.0,22459501.0,0.45453,0.271727,0.182803,1.672745,27147920000.0,847776797.0,0.499649
