# PFG Sell-out

Goal: 
1. Process PFG data (source = mealticket.com)
2. Analyze data using COVID segmentation
3. Compare sell-out (PFG) to sell-in (McCain) data

### 1. Load libraries, initiate folder/file paths
Run cell below

In [1]:
import pandas as pd
import datetime
from datetime import datetime as dt
import numpy as np
import teradatasql

#path where dictionary file can be found
#Neil
DICTIONARY = r'C:\Users\NEWATTER\OneDrive - McCain Foods Limited\Distributor Sell-Out Dictionaries\\'
#Joe
#DICTIONARY = r'C:\Users\jcronk\McCain Foods Limited\GNA Data Strategy & Analytics - COVID Recovery\Distributor Sell-Out Dictionaries\\'

#main path
#Neil
PATH = r'C:\Users\NEWATTER\OneDrive - McCain Foods Limited\Historical Sell-Out Sales\\'
#Joe
#PATH = r'C:\Users\jcronk\McCain Foods Limited\GNA Data Strategy & Analytics - COVID Recovery\Historical Sell-Out Sales\\'

#backup path
#Neil
BACKUP = r'C:\Users\NEWATTER\OneDrive - McCain Foods Limited\Historical Sell-Out Sales\Backups\\'
#Joe
#BACKUP = r'C:\Users\jcronk\McCain Foods Limited\GNA Data Strategy & Analytics - COVID Recovery\Historical Sell-Out Sales\Backups\\'

#time dataframe
TIME = pd.read_excel(DICTIONARY + 'Time Definitions.xlsx')

### 2. Data Dictionary
Run cell below

In [2]:
def us_states():
    us_state_abbrev = {
        'Alabama': 'AL',
        'Alaska': 'AK',
        'American Samoa': 'AS',
        'Arizona': 'AZ',
        'Arkansas': 'AR',
        'California': 'CA',
        'Colorado': 'CO',
        'Connecticut': 'CT',
        'Delaware': 'DE',
        'District of Columbia': 'DC',
        'Florida': 'FL',
        'Georgia': 'GA',
        'Guam': 'GU',
        'Hawaii': 'HI',
        'Idaho': 'ID',
        'Illinois': 'IL',
        'Indiana': 'IN',
        'Iowa': 'IA',
        'Kansas': 'KS',
        'Kentucky': 'KY',
        'Louisiana': 'LA',
        'Maine': 'ME',
        'Maryland': 'MD',
        'Massachusetts': 'MA',
        'Michigan': 'MI',
        'Minnesota': 'MN',
        'Mississippi': 'MS',
        'Missouri': 'MO',
        'Montana': 'MT',
        'Nebraska': 'NE',
        'Nevada': 'NV',
        'New Hampshire': 'NH',
        'New Jersey': 'NJ',
        'New Mexico': 'NM',
        'New York': 'NY',
        'North Carolina': 'NC',
        'North Dakota': 'ND',
        'Northern Mariana Islands':'MP',
        'Ohio': 'OH',
        'Oklahoma': 'OK',
        'Oregon': 'OR',
        'Pennsylvania': 'PA',
        'Puerto Rico': 'PR',
        'Rhode Island': 'RI',
        'South Carolina': 'SC',
        'South Dakota': 'SD',
        'Tennessee': 'TN',
        'Texas': 'TX',
        'Utah': 'UT',
        'Vermont': 'VT',
        'Virgin Islands': 'VI',
        'Virginia': 'VA',
        'Washington': 'WA',
        'West Virginia': 'WV',
        'Wisconsin': 'WI',
        'Wyoming': 'WY'
    }

    # thank you to @kinghelix and @trevormarburger for this idea
    abbrev_us_state = dict(map(reversed, us_state_abbrev.items()))
    
    return pd.DataFrame.from_dict(abbrev_us_state, orient = 'index', columns = ['State Name']).rename_axis('State').reset_index()


def apply_dictionary(df, file_name):
    
    print(f'Starting dataframe shape: {df.shape}', flush = True)
    
    #create dictionary object from Excel file
    #adding sheet_name = None makes it a dictionary type
    _dict = pd.read_excel(DICTIONARY + file_name, sheet_name = None, engine='openpyxl')
    
    #for testing, keys = sheet names
    #print(_dict.keys())
    
    #create DataFrame from dictionary object called dict (short for dictionary)
    dict_df = pd.DataFrame.from_dict(_dict['Segment Mapping'])
    sku_df = pd.DataFrame.from_dict(_dict['SKU Mapping'])
    
    man_df = pd.DataFrame.from_dict(_dict['Manufacturer Mapping'])
    
    manufacturer = man_df[man_df['Mfg. Inclusion Flag'] == 'Include']['Manufacturer'].tolist()
    
    print(f'These manufacturers were included: {manufacturer}', flush = True)
    
    excluded = df['Manufacturer'].value_counts().reset_index().drop(columns={'Manufacturer'}).rename(columns = {'index':'Manufacturer'})
    excluded = excluded[~excluded['Manufacturer'].isin(manufacturer)]
    
    #display(df.groupby['Manufacturer'].size().reset_index().drop(columns={0}))
    
    print(f'These manufacturers were not included: {excluded}', flush = True)
    
    df = df[df['Manufacturer'].isin(manufacturer)]
    
    #strip blanks from segment
    df.loc[:, 'Segment'] = df['Segment'].str.strip()
    
    #convert Invoice Week to date
    df.loc[:, 'Invoice Week'] = pd.to_datetime(df['Invoice Week'])
    
    
    #print shape of df (dimensions)
    print(f'Shape before adding dictionary: {df.shape}', flush = True)
    #add lower case for merging
    
    dict_df.loc[:, 'customer_class_lower'] = dict_df['Customer Class'].str.lower()
    dict_df.loc[:, 'segment_lower'] = dict_df['Segment'].str.strip().str.lower()
    dict_df.loc[:, 'account_type_lower'] = dict_df['Account Type'].str.strip().str.lower()
    
    #Type Name	Category Name	COVID Segmentation - L1	COVID Segmentation - L2	COVID Segmentation - (Restaurants)	COVID Segmentation - (Restaurants: Sub-Segment)	Restaurant Service Type
    
    dict_df = dict_df.groupby(['customer_class_lower','segment_lower','account_type_lower','COVID Segmentation - L1','COVID Segmentation - L2',
                               'COVID Segmentation - (Restaurants)','COVID Segmentation - (Restaurants: Sub-Segment)','Restaurant Service Type','Cuisine Type']
                              , dropna = False).size().reset_index().drop(columns={0})
    
    #add lower case key columns for merging (removes case mismatch)
    
    df.loc[:, 'customer_class_lower'] = df['Customer Class'].str.lower()
    df.loc[:, 'segment_lower'] = df['Segment'].str.strip().str.lower()
    df.loc[:, 'account_type_lower'] = df['Account Type'].str.strip().str.lower()
    
    
    df = df.merge(dict_df, how = 'left', left_on = [
        'customer_class_lower','segment_lower','account_type_lower'],
        right_on = ['customer_class_lower','segment_lower','account_type_lower']).drop(columns = {
            'customer_class_lower','segment_lower','account_type_lower'})
    
    #print(f'Shape after 1st merge: {df.shape}', flush = True)
    
    sku_df = sku_df.groupby(['Mfr SKU','Consolidated Category','L1 Product Hierarchy','L2 Product Hierarchy','Case Weight Lbs'], dropna = False).size().reset_index().drop(columns={0})
    
    #both SKU fields need to be strings in order to match
    sku_df.loc[:, 'Mfr SKU'] = sku_df['Mfr SKU'].astype(str)
    df.loc[:, 'MFR SKU'] = df['MFR SKU'].astype(str)
    
    df = df.merge(sku_df[['Mfr SKU','Consolidated Category','L1 Product Hierarchy','L2 Product Hierarchy','Case Weight Lbs']], how = 'left', left_on = ['MFR SKU'], right_on = ['Mfr SKU']).drop(columns = {'Mfr SKU'})
    
    #print(f'Shape after 2nd merge: {df.shape}', flush = True)
    
    df = df.astype({'Qty':'float64','Weight':'float64','Case Weight Lbs':'float64'})
    
    #calculate case weight if weight = 0 and qty > 0
    df.loc[(df['Weight'] == 0) & (df['Qty'] > 0), 'Weight'] = df['Case Weight Lbs'] * df['Qty']
    
    #add time
    
    df = df.merge(TIME[['Week Starting (Mon)', 'Calendar Week Year']], how = 'left', left_on = ['Invoice Week'], right_on = ['Week Starting (Mon)']).drop(columns={'Week Starting (Mon)'})
    
    #rename metric Weight for consistancy
    df = df.rename(columns={
        'Weight':'LBS',
        'MFR SKU':'SKU ID'})
    
    df = df[~df['Calendar Week Year'].isna()]
    
    #Clean US States
    df.loc[df['State'] == 'tn', 'State'] = 'TN'
    df = df.merge(us_states(), how = 'left', on = 'State')
    df.loc[df['State Name'].isna(), ['State', 'State Name']] = 'None'
    
    print(f'Shape after adding dictionary: {df.shape}', flush = True)
    
    return df

### 3. Import File
Run cell below

In [3]:
def import_file(file_name):
    
#import file
    if '.csv' in file_name:
        _import = pd.read_csv(file_name, low_memory = False)
    else:
        _import = pd.read_excel(file_name, engine='openpyxl')
    
    return _import

### 4. Calculation Functions
Run cell below

In [4]:
def add_rolling(df, _list):
    #groupby _list
    df = df.groupby(_list, dropna = False)[['LBS','LBS_LY','LBS_Baseline']].sum().reset_index()
    
    #set index to all but last column in list
    df = df.set_index(_list)
    
    #add new metric SMA_4 (simple moving average - 4 periods)
    #level = all but last 2 items in list
    df['LBS_Lag_1'] = df.groupby(level=_list[0:-1])['LBS'].shift(periods = 1)
    df['LBS_Lag_2'] = df.groupby(level=_list[0:-1])['LBS'].shift(periods = 2)
    df['LBS_Lag_3'] = df.groupby(level=_list[0:-1])['LBS'].shift(periods = 3)
    df['LBS_Lag_4'] = df.groupby(level=_list[0:-1])['LBS'].shift(periods = 4)
    
    df['SMA_4'] = df.groupby(level=_list[0:-1])['LBS'].apply(lambda x: x.rolling(4, min_periods=1).mean())
    df['SMA_8'] = df.groupby(level=_list[0:-1])['LBS'].apply(lambda x: x.rolling(8, min_periods=1).mean())
    df['SMA_12'] = df.groupby(level=_list[0:-1])['LBS'].apply(lambda x: x.rolling(12, min_periods=1).mean())
    
    df['SMA_4_LY'] = df.groupby(level=_list[0:-1])['LBS_LY'].apply(lambda x: x.rolling(4, min_periods=1).mean())
    df['SMA_8_LY'] = df.groupby(level=_list[0:-1])['LBS_LY'].apply(lambda x: x.rolling(8, min_periods=1).mean())
    df['SMA_12_LY'] = df.groupby(level=_list[0:-1])['LBS_LY'].apply(lambda x: x.rolling(12, min_periods=1).mean())
    
    df['SMA_4_Baseline'] = df.groupby(level=_list[0:-1])['LBS_Baseline'].apply(lambda x: x.rolling(4, min_periods=1).mean())
    df['SMA_8_Baseline'] = df.groupby(level=_list[0:-1])['LBS_Baseline'].apply(lambda x: x.rolling(8, min_periods=1).mean())
    df['SMA_12_Baseline'] = df.groupby(level=_list[0:-1])['LBS_Baseline'].apply(lambda x: x.rolling(12, min_periods=1).mean())
    
    df['LBS_Baseline_Lag_1'] = df.groupby(level=_list[0:-1])['LBS_Baseline'].shift(periods = 1)
    df['LBS_LY_Lag_1'] = df.groupby(level=_list[0:-1])['LBS'].shift(periods = 1)
    
    df['SMA_4_Lag_1'] = df.groupby(level=_list[0:-1])['SMA_4'].shift(periods = 1)
    df['SMA_4_LY_Lag_1'] = df.groupby(level=_list[0:-1])['SMA_4_LY'].shift(periods = 1)
    df['SMA_4_Baseline_Lag_1'] = df.groupby(level=_list[0:-1])['SMA_4_Baseline'].shift(periods = 1)
    
    return df.reset_index()


def add_last_year(df, _list):
    #list of groupby columns
    #last item in list is Calendar Week Year which is used to pull previous history (Baseline Week = Calendar Week Year) of copied dataframe
    _groupby = _list.copy()
    
    _merge_yoy = _list.copy()[0:-1]
    _merge_yoy.extend(['YOY Week'])
    
    _merge_baseline = _list.copy()[0:-1]
    _merge_baseline.extend(['Baseline Week'])
    
    df1 = df.groupby(_list, dropna = False)['LBS'].sum().reset_index()
    
    #groupby _list
    df_new = df.groupby(_list, dropna = False)['LBS'].sum().reset_index()
    
    #add week dimensions to main dataframe
    df_new = df_new.merge(TIME[['Calendar Week Year','YOY Week','Baseline Week']], how = 'left', left_on = 'Calendar Week Year', right_on = 'Calendar Week Year')
    
    df_new = df_new.merge(df1, how='left', left_on=_merge_yoy, right_on=_groupby).drop(columns={'Calendar Week Year_y'}).rename(columns={'LBS_y':'LBS_LY'})
    
    df_new = df_new.merge(df1, how='left', left_on=_merge_baseline, right_on=_groupby).drop(columns={'Calendar Week Year'}).rename(columns={
        'LBS':'LBS_Baseline','Calendar Week Year_x':'Calendar Week Year','LBS_x':'LBS'})
    
    return df_new


def add_precovid(df, _list, begin, end):
    #datefield should be last in _list
    datefield = _list[-1]
          
    #remove datefield from list
    _list = _list[0:-1]
    
    #filter data not using last and rename columns
    _df = df[(df[datefield] >= begin) & (df[datefield] <= end)].groupby(_list)['LBS'].sum() / 52
    
    return df.merge(
        _df, how = 'left', left_on = _list, right_on = _list).rename(
        columns = {'LBS_x':'LBS', 'LBS_y':'LBS_PRECOVID'}).fillna(
        value = {'LBS_PRECOVID': 0})


def add_time(df):
    df = df.merge(TIME[['Calendar Week Year','Week Starting (Sun)','Week Ending (Sat)', 'COVID Week']],
                   how = 'left', 
                   on = 'Calendar Week Year')
    
    df = df.merge(TIME[['Calendar Week Year','YOY Week','Baseline Week']], how = 'left', left_on = 'Calendar Week Year', right_on = 'Calendar Week Year')
    
    return df


def analyze_1(df, _list, begin, end):
    if 'Calendar Week Year' not in _list:
        _list.extend(['Calendar Week Year'])
    
    df = full_dataframe(df, _list)
    
    #add last year lbs
    df = add_last_year(df, _list)
    
    #add rolling calculation
    df = add_rolling(df, _list)
        
    #add preCOVID baseline
    df = add_precovid(df, _list, begin, end)
    
    df = df.round({
        'LBS' : 2,    
        'SMA_4' : 2,
        'SMA_8' : 2,
        'SMA_12' : 2,
        'LBS_LY' : 2,    
        'SMA_4_LY' : 2,
        'SMA_8_LY' : 2,
        'SMA_12_LY' : 2,
        'LBS_Baseline' : 2,    
        'SMA_4_Baseline' : 2,
        'SMA_8_Baseline' : 2,
        'SMA_12_Baseline' : 2,
        'LBS_PRECOVID' : 2,
        'LBS_Lag_1' : 2,
        'LBS_Lag_2' : 2,
        'LBS_Lag_3' : 2,
        'LBS_Lag_4' : 2,
        'LBS_Baseline_Lag_1': 2,
        'LBS_LY_Lag_1': 2,
        'SMA_4_Lag_1' : 2,
        'SMA_4_LY_Lag_1' : 2,
        'SMA_4_Baseline_Lag_1' : 2
        
    }).fillna(value = {
        'LBS' : 0,    
        'SMA_4' : 0,
        'SMA_8' : 0,
        'SMA_12' : 0,
        'LBS_LY' : 0,    
        'SMA_4_LY' : 0,
        'SMA_8_LY' : 0,
        'SMA_12_LY' : 0,
        'LBS_Baseline' : 0,    
        'SMA_4_Baseline' : 0,
        'SMA_8_Baseline' : 0,
        'SMA_12_Baseline' : 0,
        'LBS_PRECOVID' : 0,
        'LBS_Lag_1' : 0,
        'LBS_Lag_2' : 0,
        'LBS_Lag_3' : 0,
        'LBS_Lag_4' : 0,
        'LBS_Baseline_Lag_1': 2,
        'LBS_LY_Lag_1': 2,
        'SMA_4_Lag_1' : 0,
        'SMA_4_LY_Lag_1' : 0,
        'SMA_4_Baseline_Lag_1' : 0
    })
    
    return df


def save_backup(df, file_name):
    
    df.to_csv(BACKUP + file_name)
    
    return


def td_to_pandas(query, cur, title=''):
    _data = []
    _start=dt.now()
    print(dt.now().strftime('%m/%d/%Y %r'))
    print(f'{title} Execution started...', end='', flush=True)
    cur.execute (query)
    print(f'finished. {dt.now() - _start}', flush=True) 
    _start_fetch=dt.now()
    print(f'{title} Fetching data started...', end='', flush=True)
    for row in cur.fetchall():
        _data.append(row) 
    print(f'finished. {dt.now() - _start_fetch}', flush=True) 
    _start=dt.now()
    print(f'{title} Creating DataFrame for started...', end='', flush=True)
    _df = pd.DataFrame(_data)
    _df.columns = [x[0].replace('SAP_', '').lower() for x in cur.description]
    print(f'finished. {dt.now() - _start}', flush=True)
    return _df


def td_dataframe(select_db, query):
    with teradatasql.connect(None, 
                         host='172.29.3.43',
                         user='PNWATTERS',
                         password='teradata123') as con:
        with con.cursor() as cur:
            cur.execute (select_db)
            print('Database selected!', flush=True)            
            dim_df = td_to_pandas(query, cur, 'Query:')
            print('Dim:', dim_df.shape)
    
    return dim_df


def restaurants(df):
    #restaurants = df.loc[df['COVID Segmentation - (Restaurants)'] == 'Restaurants', :]
    
    #Rename rows
    df.loc[df['COVID Segmentation - L2'] == 'Independents (IOs) / Local Eateries / Takeaway', 'COVID Segmentation - L2'] = 'IO'
    df.loc[
        (df['COVID Segmentation - L2'] == 'All Other') | 
        (df['COVID Segmentation - L2'] == 'National Account') | 
        (df['COVID Segmentation - L2'] == 'Region Chains')| 
        (df['COVID Segmentation - L2'] == 'National Accounts'),
        'COVID Segmentation - L2'] = 'Chain'
    
    return df

def process_list(df, work_list):
    
    _process = analyze_1(df, work_list, 201910, 202009)
    
    _process = restaurants(_process)
    
    _process['Distributor'] = 'PFG'
    
    _process = add_time(_process)
    
    #for standardizing output
    work_list.extend(['Distributor','LBS','SMA_4','SMA_8','SMA_12',
                      'YOY Week','LBS_LY','SMA_4_LY','SMA_8_LY','SMA_12_LY',
                      'Baseline Week','LBS_Baseline','SMA_4_Baseline','SMA_8_Baseline','SMA_12_Baseline',
                      'LBS_Lag_1','LBS_Lag_2','LBS_Lag_3','LBS_Lag_4','LBS_Baseline_Lag_1','LBS_LY_Lag_1',
                      'SMA_4_Lag_1', 'SMA_4_LY_Lag_1', 'SMA_4_Baseline_Lag_1',
                      'LBS_PRECOVID','Week Starting (Sun)','Week Ending (Sat)','COVID Week'])
    
    if 'SKU ID' in work_list:
        #last 26 weeks in dataframe
        week_list = _process.groupby(['Calendar Week Year']).size().reset_index().drop(columns={0}).sort_values(by = 'Calendar Week Year', ascending = True).squeeze().tolist()[-26:]
        
        #filter to only the last 26 weeks
        _process = _process[_process['Calendar Week Year'].isin(week_list)]
    
    return _process[work_list]

def is_missing(df):
    #check for COVID Segmentation - L1
    missing = _new_df[_new_df['COVID Segmentation - L1'].isna()].groupby(['Customer Class','Segment', 'Account Type'], as_index = False, dropna = False)['LBS'].sum()

    if len(missing) > 0:
        print('The following segments are missing:')
        display(missing)
        missing.to_excel(DICTIONARY + 'Segments Missing Dump\\' + dt.now().strftime('%Y%m%d') + '_PFG_L1_missing.xlsx', index = False)
    else:
        print(f'Nothing missing for COVID Segmentation - L1', flush = True)

    #check for product
    missing = _new_df[_new_df['Consolidated Category'].isna()].groupby(['Brand','Sub-Category','SKU ID','Item Name','Pack','Size','Unit Type','GTIN','Dist SKU'], as_index = False, dropna = False)['LBS'].sum()

    if len(missing) > 0:
        print('The following products are missing:')
        display(missing)
        missing.to_excel(DICTIONARY + 'Segments Missing Dump\\' + dt.now().strftime('%Y%m%d') + r'_PFG_product_missing.xlsx', index = False)
    else:
        print(f'Nothing missing for Product', flush = True)

        
def full_dataframe(df, _list):
    weeks = df.groupby(['Calendar Week Year']).size().reset_index().drop(columns={0})
    segments = df.groupby(_list[0:-1]).size().reset_index().drop(columns={0})
    
    _df = segments.assign(key=1).merge(weeks.assign(key=1), how='outer', on='key').drop(columns = {'key'}) 
    
    return _df.merge(df, how = 'left', on = _list) 

def clean_city(df):
    df['City'] = df['City'].str.strip()
    df['City'] = df['City'].str.upper()
    df['City'].fillna('NA', inplace = True)
    
    #cities = 'TORONTO|MONTREAL|OTTAWA|CALGARY|VANCOUVER|WINNIPEG|MONTREAL|HAMILTON|HALIFAX'
    cities = 'NOT USED CURRENTLY'
    
    #change each city name to the name of the city that matches, cleans up the city names
    for c in cities.split('|'):
        df.loc[df['City'].str.match(c), 'City'] = c
    
    #change all other cities to NA
    df.loc[~df['City'].str.match(cities), 'City'] = 'NA'
    
    return df

### 5. Sell-in vs. Sell-out
Run cell below

In [5]:
def teradata_sales(sellout):
    #SET QUERY_BAND = 'ApplicationName=MicroStrategy;Version=9.0;ClientUser=NEWATTER;Source=Vantage; Action=BEK Performance;StartTime=20200901T101924;JobID=55096;Importance=666;'  FOR SESSION;
    
    #the current week is pulled from the time dictionary table
    to_week = int(TIME[(TIME['Week Starting (Mon)'] <= dt.now()) & (TIME['Week Ending (Sun)'] >= dt.now())]['Calendar Week Year'].values)
    
    print(f'Starting Teradata connect...', flush = True)
    
    select_db = "DATABASE DL_GBL_TAS_BI"

    query = '''
    select a14.FISCAL_WEEK_NUMBER as FISCAL_WEEK_NUMBER,
        (a14.FISCAL_WEEK_NUMBER_DESCR || ' ' || a14.START_DATE_OF_SAPYW) as FISCAL_WEEK,
        a14.CALENDAR_WEEK_NAME as CALENDAR_WEEK_NUMBER,
        (a14.CALENDAR_WEEK_LONG_DESCRIPTION || ' ' || a14.START_DATE_OF_SAPYW) as CALENDAR_WEEK,
        RIGHT(a16.CUSTOMER_HIER_LVL_1,CAST(10 AS INTEGER)) as CUSTOMER_HIER_LVL_1,
        a16.CUSTOMER_HIER_LVL_1_NAME as CUSTOMER_HIER_LVL_1_NAME,
        a13.DIVISION_ID as DIVISION,
        a17.DIVISION_NAME as DIVISION_NAME,
        a12.CATEGORY_SHORT_CODE as CATEGORY_SHORT_CODE,
        a12.CATEGORY_DESC as CATEGORY_DESC,
        a12.SUB_CATEGORY_SHORT_CODE as SUB_CATEGORY_SHORT_CODE,
        a12.SUB_CATEGORY_DESC as SUB_CATEGORY_DESC,
        a15.MATERIAL_PRICING_GROUP_ID as MATERIAL_PRICING_GROUP_ID,
        a18.MATERIAL_PRICING_GROUP_DESCRIPTION as MATERIAL_PRICING_GROUP_DESCRIPTION,
        TRIM (LEADING '0' FROM a13.MATERIAL_ID) as MATERIAL_ID,
        a13.MATERIAL_DESCRIPTION as MATERIAL_NAME,
        sum(a11.SALES_VOLUME_WEIGHT_LBS) as ACTUAL_VOLUME_LBS
    from DL_GBL_TAS_BI.FACT_SALES_ACTUAL as a11
    join DL_GBL_TAS_BI.VW_H_PRODUCT_ALL_SALES as a12
        on (a11.MATERIAL_ID = a12.MATERIAL_ID)
    join DL_GBL_TAS_BI.D_MATERIAL_DN_ALL as a13
        on (a11.MATERIAL_ID = a13.MATERIAL_ID)
    join DL_GBL_TAS_BI.D_TIME_FY_V6 as a14
        on (a11.ACCOUNTING_PERIOD_DATE = a14.DAY_CALENDAR_DATE)
    join DL_GBL_TAS_BI.D_MATERIAL_SALES_DATA as a15
        on (a11.DISTRIBUTION_CHANNEL_ID = a15.DISTRIBUTION_CHANNEL_ID and 
        a11.MATERIAL_ID = a15.MATERIAL_ID and 
        a11.SALES_ORGANISATION_ID = a15.SALES_ORGANISATION_ID)
    join DL_GBL_TAS_BI.VW_H_CUSTOMER_ALL_DIVISION00 as a16
        on (a11.CUSTOMER_ID = a16.CUSTOMER and 
        a11.DISTRIBUTION_CHANNEL_ID = a16.DISTRIBUTION_CHANNEL and 
        a11.SALES_ORGANISATION_ID = a16.SALES_ORGANISATION)
    join DL_GBL_TAS_BI.D_DIVISION as a17
        on (a13.DIVISION_ID = a17.DIVISION_ID)
    join DL_GBL_TAS_BI.D_MATERIAL_PRICING_GROUP as a18
        on (a15.MATERIAL_PRICING_GROUP_ID = a18.MATERIAL_PRICING_GROUP_ID)
    where (a14.FISCAL_YEAR_CODE in ('FY2019', 'FY2020', 'FY2021','FY2022')
        and a11.SALES_ORGANISATION_ID in ('US01')
        and a11.DISTRIBUTION_CHANNEL_ID in ('10')
        and RIGHT(a16.CUSTOMER_HIER_LVL_1,CAST(10 AS INTEGER)) in ('6500002801','6500002807'))
        and a14.CALENDAR_WEEK_NAME < ''' + str(to_week) + ''' 
    group by a14.FISCAL_WEEK_NUMBER,
        (a14.FISCAL_WEEK_NUMBER_DESCR || ' ' || a14.START_DATE_OF_SAPYW),
        a14.CALENDAR_WEEK_NAME,
        (a14.CALENDAR_WEEK_LONG_DESCRIPTION || ' ' || a14.START_DATE_OF_SAPYW),
        RIGHT(a16.CUSTOMER_HIER_LVL_1,CAST(10 AS INTEGER)),
        a16.CUSTOMER_HIER_LVL_1_NAME,
        a13.DIVISION_ID,
        a17.DIVISION_NAME,
        a12.CATEGORY_SHORT_CODE,
        a12.CATEGORY_DESC,
        a12.SUB_CATEGORY_SHORT_CODE,
        a12.SUB_CATEGORY_DESC,
        a15.MATERIAL_PRICING_GROUP_ID,
        a18.MATERIAL_PRICING_GROUP_DESCRIPTION,
        TRIM (LEADING '0' FROM a13.MATERIAL_ID),
        a13.MATERIAL_DESCRIPTION
;'''

    #create dataframe using both functions td_to_pandas and td_dataframe
    df = td_dataframe(select_db, query)
    
    return teradata_transform(df, sellout)


def teradata_transform(sellin, sellout):
    #consolidates teradata sales with sellout data
    
    #convert from object datatype to float (exports as a number instead of string)
    sellin['actual_volume_lbs'] = sellin['actual_volume_lbs'].astype('float64')
    
    #rename columns for consistancy
    sellin = sellin.rename(columns = {'actual_volume_lbs':'LBS', 'calendar_week_number':'Calendar Week Year'})
    
    #transform calendar week year from teradata
    sellin['Calendar Week Year'] = pd.to_numeric(sellin['Calendar Week Year'], errors = 'coerce')

    #transform category so its consolidated
    sellin['Consolidated Category'] = sellin['category_desc']
    sellin.loc[sellin['Consolidated Category'] == 'Sweet Potato' , 'Consolidated Category'] = 'Potato'
    sellin.loc[sellin['Consolidated Category'] != 'Potato' , 'Consolidated Category'] = 'Prepared Foods'
    
    #analyze sellin data
    sellin = analyze_1(sellin, ['Consolidated Category'], 201910, 202009)
    
    #rename columns accordingly
    sellin = sellin.rename(columns = {'LBS':'MCCAIN LBS',
                                      'SMA_4':'MCCAIN SMA_4',
                                      'SMA_8':'MCCAIN SMA_8',
                                      'SMA_12':'MCCAIN SMA_12',
                                      'LBS_PRECOVID':'MCCAIN PRECOVID',
                                      'LBS_Lag_1':'MCCAIN Lag_1',
                                      'LBS_Lag_2':'MCCAIN Lag_2',
                                      'LBS_Lag_3':'MCCAIN Lag_3',
                                      'LBS_Lag_4':'MCCAIN Lag_4',
                                      'LBS_Baseline' : 'MCCAIN LBS_Baseline',
                                      'SMA_4_Baseline' : 'MCCAIN SMA_4_Baseline',
                                      'SMA_8_Baseline' : 'MCCAIN SMA_8_Baseline',
                                      'SMA_12_Baseline' : 'MCCAIN SMA_12_Baseline',
                                      'SMA_4_Lag_1':'MCCAIN SMA_4_Lag_1',
                                      'SMA_4_Baseline_Lag_1' : 'MCCAIN SMA_4_Baseline_Lag_1',
                                      'LBS_Baseline_Lag_1': 'MCCAIN LBS_Baseline_Lag_1'
                                     })
    
    #analyze sellout data
    df = analyze_1(sellout, ['Consolidated Category'], 201910, 202009)
                                    
    df = df.merge(sellin[['Calendar Week Year','Consolidated Category','MCCAIN LBS','MCCAIN SMA_4','MCCAIN SMA_8','MCCAIN SMA_12','MCCAIN PRECOVID',
                          'MCCAIN LBS_Baseline','MCCAIN SMA_4_Baseline','MCCAIN SMA_8_Baseline','MCCAIN SMA_12_Baseline',
                          'MCCAIN Lag_1', 'MCCAIN Lag_2', 'MCCAIN Lag_3', 'MCCAIN Lag_4','MCCAIN LBS_Baseline_Lag_1',
                          'MCCAIN SMA_4_Lag_1', 'MCCAIN SMA_4_Baseline_Lag_1']], how = 'left', 
                  left_on = ['Calendar Week Year','Consolidated Category'], right_on = ['Calendar Week Year','Consolidated Category'])
    
    df = df.fillna({'MCCAIN LBS': 0,
                    'MCCAIN SMA_4': 0,
                    'MCCAIN SMA_8': 0,
                    'MCCAIN SMA_12': 0,
                    'MCCAIN PRECOVID': 0,
                    'MCCAIN Lag_1': 0,
                    'MCCAIN Lag_2': 0,
                    'MCCAIN Lag_3': 0,
                    'MCCAIN Lag_4': 0,
                    'MCCAIN LBS_Baseline': 0,
                    'MCCAIN SMA_4_Baseline': 0,
                    'MCCAIN SMA_8_Baseline': 0,
                    'MCCAIN SMA_12_Baseline': 0,
                    'MCCAIN LBS_Baseline_Lag_1':0,
                    'MCCAIN SMA_4_Lag_1' : 0,
                    'MCCAIN SMA_4_Baseline_Lag_1' : 0
                   })
    
    df['Distributor'] = 'PFG'

    df = add_time(df)
    
    df = df[['Consolidated Category','Distributor','Calendar Week Year',
             'LBS','SMA_4','SMA_8','SMA_12','LBS_PRECOVID',
             'LBS_Baseline','SMA_4_Baseline','SMA_8_Baseline','SMA_12_Baseline',
             'LBS_Lag_1', 'LBS_Lag_2', 'LBS_Lag_3', 'LBS_Lag_4', 'LBS_Baseline_Lag_1', 'SMA_4_Lag_1', 'SMA_4_Baseline_Lag_1',
             'MCCAIN LBS','MCCAIN SMA_4','MCCAIN SMA_8','MCCAIN SMA_12','MCCAIN PRECOVID',
             'MCCAIN LBS_Baseline','MCCAIN SMA_4_Baseline','MCCAIN SMA_8_Baseline','MCCAIN SMA_12_Baseline',
             'MCCAIN Lag_1', 'MCCAIN Lag_2', 'MCCAIN Lag_3', 'MCCAIN Lag_4','MCCAIN LBS_Baseline_Lag_1','MCCAIN SMA_4_Lag_1','MCCAIN SMA_4_Baseline_Lag_1',
             'Week Starting (Sun)','Week Ending (Sat)','COVID Week']]
    
    return df

### 5. Import Raw Data
Run cell below

In [6]:
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425666 entries, 0 to 425665
Data columns (total 29 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   Division          425666 non-null  object        
 1   Sales Manager     322447 non-null  object        
 2   Sales Rep Number  293051 non-null  object        
 3   Sales Rep         381138 non-null  object        
 4   Account Number    425666 non-null  int64         
 5   Account Name      425666 non-null  object        
 6   Address           406082 non-null  object        
 7   City              425666 non-null  object        
 8   State             425666 non-null  object        
 9   Postal Code       425666 non-null  object        
 10  Customer Class    425666 non-null  object        
 11  Segment           425655 non-null  object        
 12  Account Type      425666 non-null  object        
 13  Vendor            425666 non-null  object        
 14  Manufacturer      425666 non-null  object        
 15  Brand             425666 non-null  object        
 16  Brand Type        424398 non-null  object        
 17  Product Category  380120 non-null  object        
 18  Sub-Category      380120 non-null  object        
 19  GTIN              425666 non-null  int64         
 20  MFR SKU           425666 non-null  object        
 21  Dist SKU          425666 non-null  object        
 22  Item Name         425666 non-null  object        
 23  Pack              425666 non-null  object        
 24  Size              412230 non-null  object        
 25  Unit Type         425666 non-null  object        
 26  Qty               425666 non-null  float64       
 27  Weight            425666 non-null  float64       
 28  Invoice Week      425666 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(2), object(24)
memory usage: 94.2+ MB
'''

#new file name in historical file directory
%time _new = import_file(PATH + 'Meal Ticket - PFG Sales Report v1.xlsx')

print(f'Imported {_new.shape[0]} records', flush = True)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\NEWATTER\\OneDrive - McCain Foods Limited\\Historical Sell-Out Sales\\\\Meal Ticket - PFG Sales Report v1.xlsx'

NameError: name '_new' is not defined

### 7. Apply Dictionary to Raw Data / Check For Missing Segmentation
Run cell below

In [29]:
#add dictionary to new data
_new_df = apply_dictionary(_new, 'PFG - US.xlsx')

is_missing(_new_df)

Starting dataframe shape: (516436, 29)
These manufacturers were included: ['ANCHOR FROZEN FOODS INC', 'DOT FOODS (FROZEN)', 'ENDICO POTATOES INC.', 'MC CAIN FOODS INC', 'MCCAIN FOODS AFD', 'MCCAIN FOODS USA INC', 'MCCAIN FOODS USA INC-ACH', 'MCCAIN FOODS, INC. ', 'MCCAIN FOODSERVICE INC', 'MCCAIN PRODUCE INC', 'WH MOSELEY CO', 'MCCAIN FOODS', 'McCain Foods USA']
These manufacturers were not included:                  Manufacturer
2             SUPPLIER VARIES
4   JR SIMPLOT COMPANY FROZEN
6         CAVENDISH FARMS INC
8       LAMB WESTON SALES INC
9                 LAMB WESTON
10   ROSINA FOOD PRODUCTS INC
16       ROSINA FOOD PRODUCTS
18        BIG DADDY FOODS INC
19   J. R. SIMPLOT (ROCHELLE)
Shape before adding dictionary: (501993, 29)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


Shape after adding dictionary: (501993, 41)
Nothing missing for COVID Segmentation - L1
Nothing missing for Product


### 8. Combine New and Base Datasets
Run cell below

In [30]:
_import_df = pd.read_csv(BACKUP + 'PFG.csv', low_memory = False, thousands = ',', decimal = '.', dtype = {
            'Calendar Week Year':np.int64,
            'LBS':np.float64})

print(f'Imported shape...{_import_df.shape}', flush = True)

#create list to exclude from base data
exclude_list = _new_df['Calendar Week Year'].values.squeeze().tolist()
#exclude_list = _new_df['Calendar Week Year'].to_list()

#turn list to string

#only keep last 12 weeks
exclude_list = exclude_list[-12:]

include = str(exclude_list)[1:-1]

#import all records from base data minus the new data
_base = _import_df[~_import_df['Calendar Week Year'].isin(exclude_list)][['City','State','State Name','COVID Segmentation - L1','COVID Segmentation - L2',
                 'COVID Segmentation - (Restaurants)','COVID Segmentation - (Restaurants: Sub-Segment)','Restaurant Service Type',
                 'Consolidated Category','SKU ID','Brand','Cuisine Type','L1 Product Hierarchy','L2 Product Hierarchy','LBS','Calendar Week Year']].append(
        _new_df[['City','State','State Name','COVID Segmentation - L1','COVID Segmentation - L2',
                 'COVID Segmentation - (Restaurants)','COVID Segmentation - (Restaurants: Sub-Segment)','Restaurant Service Type',
                 'Consolidated Category','SKU ID','Brand','Cuisine Type','L1 Product Hierarchy','L2 Product Hierarchy','LBS','Calendar Week Year']])

_base = clean_city(_base)

#create a copy but just for restaurants#
#_restaurants = restaurants(_base)

print(f'Final shape...{_base.shape}', flush = True)
#print(f'Restaurants shape...{_restaurants.shape}', flush = True)


Imported shape...(7151430, 17)
Final shape...(7620063, 16)


In [85]:
exclude_list = _new_df.groupby(['Calendar Week Year']).size().reset_index().drop(columns={0})['Calendar Week Year'].sort_values().to_list()[-12:]
#exclude_list = _new_df['Calendar Week Year'].to_list()

#turn list to string

print(_new_df['Calendar Week Year'].max())

#only keep last 12 weeks
exclude_list = exclude_list[-12:]

include = str(exclude_list)[1:-1]

print(exclude_list)
print(include)

202205
[202146, 202147, 202148, 202149, 202150, 202151, 202152, 202201, 202202, 202203, 202204, 202205]
202146, 202147, 202148, 202149, 202150, 202151, 202152, 202201, 202202, 202203, 202204, 202205


### 9. Execute Analysis
Run cell below

In [14]:

_list = []

#Output 1: COVID L1 - List 0
_list.append(['City', 'State Name','COVID Segmentation - L1','COVID Segmentation - L2','Restaurant Service Type','Consolidated Category'])

#Output 2: COVID L1 - List 1
_list.append(['State Name','COVID Segmentation - L1','COVID Segmentation - L2','Restaurant Service Type','SKU ID','Consolidated Category','L1 Product Hierarchy','L2 Product Hierarchy'])

print(f'Processing Region', flush = True)
output1 = process_list(_base, _list[0])

print(f'Processing Sell in vs Sell out', flush = True)
output2 = teradata_sales(_base)

#print(f'Processing SKU', flush = True)
#output3 = process_list(_base, _list[1])

#Output 2: COVID L1 - List 1
#_list.append(['COVID Segmentation - L1','COVID Segmentation - L2','Restaurant Service Type','Consolidated Category','L1 Product Hierarchy','L2 Product Hierarchy'])
#output4 = process_list(_base, _list[2])

print('All done')

Processing Region
Processing Sell in vs Sell out
Starting Teradata connect...
Database selected!
07/07/2022 04:11:34 PM
Query: Execution started...finished. 0:01:09.049508
Query: Fetching data started...finished. 0:00:48.116217
Query: Creating DataFrame for started...finished. 0:00:00.171968
Dim: (87008, 17)
All done


### 10. Upload Analysis to Teradata
Run cell below

### Teradata Functions

In [15]:
def td_upload(select_db, df, table_name):
    with teradatasql.connect(None, 
                         host='172.29.3.43',
                         user='PNWATTERS',
                         password='teradata123') as con:
        with con.cursor() as cur:
            cur.execute (select_db)
            d = dt.now().strftime('%m/%d/%Y %r')
            print(f'Database selected! {d}', flush=True)           

            delete_from_td(df, table_name, cur)
            insert_into_td(df, table_name, cur)

def delete_from_td(df, table_name, cur):
    distributor = df.groupby('Distributor').size().reset_index().drop(columns=0).to_numpy()[0][0]
    
    print(f'Deleting records for: {distributor} in table: {table_name}', flush = True)          
        
    query = '''
    DELETE FROM ''' + table_name  + ''' 
    WHERE "Distributor" = ''' + "'" + distributor + "'" + ''' AND "Calendar Week Year" IN (''' + include + ")"
    
    #query = '''
    #DELETE FROM ''' + table_name  + ''' 
    #WHERE "Distributor" = ''' + "'" + distributor + "'"
    
    cur.execute (query)
    
def insert_into_td(df, table_name, cur):
    insert_list = df.values.tolist()
    
    #creates ?, ?,.... string used in query for teradata fastload
    insert_columns = ('?, ' * len(df.columns)).rstrip(', ')
    
    print(f'Inserting records into {table_name}', flush = True)
    
    query = "INSERT INTO " + table_name  + " (" + insert_columns + ")"
    #query = "{fn teradata_try_fastload}INSERT INTO " + table_name  + " (" + insert_columns + ")"
    
    cur.execute (query, insert_list)
    
    print(f'Inserted {df.shape[0]} records', flush = True)
    

select_db = 'DATABASE DL_NA_PROTOTYPING'

exclude_list = _base.groupby(['Calendar Week Year']).size().reset_index().drop(columns={0})['Calendar Week Year'].sort_values(ascending=True).to_list()[-12:]
include = str(exclude_list)[1:-1]

td_upload(select_db, output1[output1['Calendar Week Year'].isin(exclude_list)], 'SELLOUT_REGION')
td_upload(select_db, output2[output2['Calendar Week Year'].isin(exclude_list)], 'SELLOUT_AND_SELLIN')
#td_upload(select_db, output3[output3['Calendar Week Year'].isin(exclude_list)].astype({'SKU ID':'str'}), 'SELLOUT_REGION_SKU')

#For testing and cleanup
#td_upload(select_db, output1, 'SELLOUT_REGION')
#td_upload(select_db, output2, 'SELLOUT_AND_SELLIN')


Database selected! 07/07/2022 04:13:43 PM
Deleting records for: PFG in table: SELLOUT_REGION
Inserting records into SELLOUT_REGION
Inserted 22548 records
Database selected! 07/07/2022 04:14:44 PM
Deleting records for: PFG in table: SELLOUT_AND_SELLIN
Inserting records into SELLOUT_AND_SELLIN
Inserted 24 records


### Save Backup When Ready

In [13]:
save_backup(_base, 'PFG.csv')

In [16]:
def teradata_sellout():
    
    print(f'Starting Teradata connect...', flush = True)
    
    select_db = "DATABASE DL_NA_PROTOTYPING"

    query = '''
    select * from SELLOUT_REGION where DISTRIBUTOR not in ('SYSCO US');
    '''
   
    return td_dataframe(select_db, query)

teradata_sellout().to_csv('SELLOUT_REGION.csv', index=False)

Starting Teradata connect...
Database selected!
07/07/2022 04:25:23 PM
Query: Execution started...finished. 0:00:18.348386
Query: Fetching data started...finished. 0:30:22.081655
Query: Creating DataFrame for started...finished. 0:00:06.457176
Dim: (825468, 35)


In [17]:
def teradata_sellout_and_sellin():
    
    print(f'Starting Teradata connect...', flush = True)
    
    select_db = "DATABASE DL_NA_PROTOTYPING"

    query = '''
    select * from SELLOUT_AND_SELLIN where DISTRIBUTOR not in ('SYSCO US');
    '''
   
    return td_dataframe(select_db, query)

teradata_sellout_and_sellin().to_csv('SELLOUT_AND_SELLIN.csv', index=False)

Starting Teradata connect...
Database selected!
07/07/2022 04:56:33 PM
Query: Execution started...finished. 0:00:06.303671
Query: Fetching data started...finished. 0:00:00.241042
Query: Creating DataFrame for started...finished. 0:00:00.013714
Dim: (1973, 38)


In [20]:
output1.columns

Index(['City', 'State Name', 'COVID Segmentation - L1',
       'COVID Segmentation - L2', 'Restaurant Service Type',
       'Consolidated Category', 'Calendar Week Year', 'Distributor', 'LBS',
       'SMA_4', 'SMA_8', 'SMA_12', 'YOY Week', 'LBS_LY', 'SMA_4_LY',
       'SMA_8_LY', 'SMA_12_LY', 'Baseline Week', 'LBS_Baseline',
       'SMA_4_Baseline', 'SMA_8_Baseline', 'SMA_12_Baseline', 'LBS_Lag_1',
       'LBS_Lag_2', 'LBS_Lag_3', 'LBS_Lag_4', 'LBS_Baseline_Lag_1',
       'LBS_LY_Lag_1', 'SMA_4_Lag_1', 'SMA_4_LY_Lag_1', 'SMA_4_Baseline_Lag_1',
       'LBS_PRECOVID', 'Week Starting (Sun)', 'Week Ending (Sat)',
       'COVID Week'],
      dtype='object')

### PFG Refresh
Refresh with raw data

Noticed very high spikes 15-20m LBs. when regular volume is 2-4m LBs.

In [6]:
#backup file
backup = pd.read_csv(r'C:\Users\newatter\OneDrive - McCain Foods Limited\Historical Sell-Out Sales\Backups\PFG.csv')

In [7]:
def build_frame(file_name):
    date = file_name[118:128]
    
    #print(len(file_name))
    #print(file_name[118:128])
    
    df = pd.read_csv(file_name, low_memory=False, thousands=',', dtype={'Qty':'float64','Weight':'float64'})
    
    df['Invoice Week'] = pd.to_datetime(date, format='%Y-%m-%d')
    
    return(df)

In [8]:
import os
# assign directory
directory = r'C:\Users\newatter\OneDrive - McCain Foods Limited\Historical Sell-Out Sales\PFG Refresh'

#Loop through all files in directory and create a dataframe

# iterate over files in
# that directory
df = pd.DataFrame()

for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    # checking if it is a file

    if os.path.isfile(f):
        if len(f) > 120:
            df = pd.concat([df, build_frame(f)])
        
print(df.shape[0])
#df.head()

2446248


In [9]:
df['Invoice Week'].max()

Timestamp('2022-06-27 00:00:00')

### Run new data through dictionary

In [11]:
#add dictionary to new data
_new_df = apply_dictionary(df, 'PFG - US.xlsx')

is_missing(_new_df)

Starting dataframe shape: (2446248, 29)
These manufacturers were included: ['ANCHOR FROZEN FOODS INC', 'DOT FOODS (FROZEN)', 'ENDICO POTATOES INC.', 'MC CAIN FOODS INC', 'MCCAIN FOODS AFD', 'MCCAIN FOODS USA INC', 'MCCAIN FOODS USA INC-ACH', 'MCCAIN FOODS, INC. ', 'MCCAIN FOODSERVICE INC', 'MCCAIN PRODUCE INC', 'WH MOSELEY CO', 'MCCAIN FOODS', 'McCain Foods USA']
These manufacturers were not included:                  Manufacturer
2             SUPPLIER VARIES
4   JR SIMPLOT COMPANY FROZEN
5         CAVENDISH FARMS INC
8       LAMB WESTON SALES INC
11                LAMB WESTON
15   ROSINA FOOD PRODUCTS INC
16        BIG DADDY FOODS INC
17       ROSINA FOOD PRODUCTS
19          PFS IFH (HICKORY)
20   J. R. SIMPLOT (ROCHELLE)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'Segment'] = df['Segment'].str.strip()


Shape before adding dictionary: (2377491, 29)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'Invoice Week'] = pd.to_datetime(df['Invoice Week'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'customer_class_lower'] = df['Customer Class'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'segment_lower'] = df['Segment'].str.strip().str.lower()
A valu

Shape after adding dictionary: (2377491, 41)
Nothing missing for COVID Segmentation - L1
Nothing missing for Product


In [12]:
exclude = _new_df['Calendar Week Year'].to_list()

print(f'back up rows: {backup.shape[0]}')

#import all records from base data minus the new data
_base = backup[~backup['Calendar Week Year'].isin(exclude)][['City','State','State Name','COVID Segmentation - L1','COVID Segmentation - L2',
                 'COVID Segmentation - (Restaurants)','COVID Segmentation - (Restaurants: Sub-Segment)','Restaurant Service Type',
                 'Consolidated Category','SKU ID','Brand','Cuisine Type','L1 Product Hierarchy','L2 Product Hierarchy','LBS','Calendar Week Year']].append(
        _new_df[['City','State','State Name','COVID Segmentation - L1','COVID Segmentation - L2',
                 'COVID Segmentation - (Restaurants)','COVID Segmentation - (Restaurants: Sub-Segment)','Restaurant Service Type',
                 'Consolidated Category','SKU ID','Brand','Cuisine Type','L1 Product Hierarchy','L2 Product Hierarchy','LBS','Calendar Week Year']])

print(f'base rows: {_base.shape[0]}')

_base = clean_city(_base)

_base.shape[0]

back up rows: 6043544


  _base = backup[~backup['Calendar Week Year'].isin(exclude)][['City','State','State Name','COVID Segmentation - L1','COVID Segmentation - L2',


base rows: 6080988


6080988

In [12]:
print(_new_df[_new_df['Calendar Week Year']==202205]['LBS'].sum())

print(_base[_base['Calendar Week Year']==202205]['LBS'].sum())

2910607.42
2910607.42


In [13]:
save_backup(_base, 'PFG.csv')