# US Foods COVID Analysis

Goal: 
1. Process US Foods data (source = https://usfoods.precima.io/)
2. Analyze data using COVID segmentation
3. Compare sell-out (US Foods) to sell-in (McCain) data

### 1. Load libraries, initiate folder/file paths
Run cell below

In [1]:
import pandas as pd
import datetime
from datetime import datetime as dt
import numpy as np
#import teradatasql
import pyodbc

from distributor_transformation import transform_usfoods
from sellout_model import process_list, analyze, add_time
#from sellout_teradata import teradata_sales
from sellout_import import import_usfoods, all_df
from sellout_azure import azure_sellin

### 2. Import File
Run cell below

In [2]:
def setup_connection():
    """Open a pyodbc connection to the PWRAPPDB database plus a cursor on it.

    The connection string requests AAD integrated authentication, so no
    user name or password is embedded here.

    Returns:
        tuple: ``(cnxn, cursor)`` -- the open connection and a cursor created
        from it. Closing both is left to the caller.
    """
    # Key/value pairs of the ODBC connection string, joined with ';' below
    connection_parts = [
        'DRIVER={ODBC Driver 17 for SQL Server}',
        'SERVER=tcp:mf-enterprise-dev-sql.46ac3df1733c.database.windows.net',
        'DATABASE=PWRAPPDB',
        'Authentication=ActiveDirectoryIntegrated',
    ]

    connection = pyodbc.connect(';'.join(connection_parts))

    return connection, connection.cursor()

In [3]:
# Weekly sell-out pounds from the US Foods table, summed to one row per
# area / region / market / state / segment / product / week combination.
query_str = '''
SELECT
[Area],
[Region],
[Market],
[State],
[Pyramid Segment],
[ASYS ID],
[Manufacturer GTIN],
[McCain SKU ID],
[ASYS Description],
[Week Beginning Date],
SUM([LBS]) as LBS
FROM [PWRAPPDB].[na_dist].[US_USFoods_Sellout]
GROUP BY
[Area],
[Region],
[Market],
[State],
[Pyramid Segment],
[ASYS ID],
[Manufacturer GTIN],
[McCain SKU ID],
[ASYS Description],
[Week Beginning Date]
'''

cnxn, cursor = setup_connection()

# try/finally: the cursor and connection are released even when the query
# fails, instead of staying open until the kernel is restarted.
try:
    df = pd.read_sql_query(query_str, cnxn, parse_dates=['Week Beginning Date'])
finally:
    # Close the connection
    cursor.close()
    cnxn.close()

In [4]:
# Run the raw sell-out rows through the project helper together with the
# 'US Foods - US.xlsx' workbook name. Per the log it prints, the helper adds
# data-dictionary columns and reports products it could not map.
df_usfoods = transform_usfoods(df, 'US Foods - US.xlsx')

Shape before adding dictionary: (906325, 11)
Total before dictionary: 577827822.2299998
Total after dictionary: 579381464.2700002
Shape after adding dictionary: (909970, 22)
Nothing missing for COVID Segmentation - L1
The following products are missing:
     ASYS ID Manufacturer GTIN McCain SKU ID         LBS
0    1004527    10072714008143    1000010772   209088.00
1    1008899    10072714008310    1000010868    10454.16
2    1009467    10072714008204    1000010795     6224.08
3    1009468    10072714008228    1000010809    56919.58
4    1009550    10072714008129    1000010649    35024.96
..       ...               ...           ...         ...
282  9841260    50758108767183    1000006237  1496940.00
283  9865114    10072714002103    1000001467    26064.00
284   987032    10072714100946      80010094    21150.00
285  9946046               NaN           NaN    15816.00
286  9972388    10072714100106      70010010   688425.00

[287 rows x 4 columns]


In [5]:
# Backup folder. The raw string ends in two literal backslashes so a file name
# can be appended to it with plain string concatenation.
backup_path = r'C:\Users\newatter\OneDrive - McCain Foods Limited\Historical Sell-Out Sales\Backups\\'

# NOTE(review): all_df is an opaque project helper; its log ("Imported shape" /
# "Final shape") suggests it loads the 'US FOODS.csv' backup and combines it
# with df_usfoods -- confirm against sellout_import.
_base = all_df(df_usfoods, backup_path, 'US FOODS.csv')

Imported shape...(76223, 37)
Final shape...(2002089, 13)


In [6]:
# Column lists passed to process_list; each entry describes one output.
_list = []

#Output 1: COVID L1 - List 0 (city / state level, no SKU detail)
_list.append(['City', 'State Name','COVID Segmentation - L1','COVID Segmentation - L2','Restaurant Service Type','Consolidated Category'])

#Output 2: COVID L1 - List 1 (state level with SKU and product hierarchy).
# Built here but not referenced again in the visible cells.
_list.append(['State Name','COVID Segmentation - L1','COVID Segmentation - L2','Restaurant Service Type','SKU ID','Consolidated Category','L1 Product Hierarchy','L2 Product Hierarchy'])

print(f'Processing Region', flush = True)
# `%time` is an IPython line magic: it times the call and the result is still
# assigned to output1. This line only runs inside IPython/Jupyter.
output1 = %time process_list(_base, _list[0], 'US Foods')


Processing Region
CPU times: total: 11.3 s
Wall time: 11.3 s


In [7]:
print(f'Processing Sell in vs Sell out', flush = True)
# The join turns a list of IDs into one "','"-separated string (a single ID
# here). NOTE(review): presumably this is spliced into a quoted SQL IN (...)
# list inside azure_sellin, with 'US01' as the sales org -- confirm.
output2 = azure_sellin(_base, 'US01', "','".join(['6500002818']), 'US Foods')

print('All done')

Processing Sell in vs Sell out
Query ran for 6500002818 under sales org US01 for sales on or before 2024-01-21
All done, took 11.0 seconds...
All done


In [8]:
# Write both result frames as zip-compressed CSV files (row index omitted)
# into the relative 'files/' folder.
output1.to_csv('files/sellout_region_us_foods.zip', compression='zip', index=False)
output2.to_csv('files/sellout_sellin_us_foods.zip', compression='zip', index=False)

In [9]:
# Overwrite the backup CSV with the combined data. No index=False is passed,
# so the row index is written as an unnamed first column.
# NOTE(review): confirm the import side expects that extra column.
_base.to_csv(backup_path + 'US FOODS.csv')

# US Foods - Precima Combined File

In [38]:
import pandas as pd
import regex as re
import os

## Extracting files from SharePoint
1) Loop through the directory folders and extract volume data from CSV files whose names contain "US Foods Update"
2) Combine them into one large dataframe (newer files replace the weeks they overlap with) and export it as a zipped CSV

In [85]:

def extract_usf_prima(file_path):
    """Read one "US Foods Update" CSV, keeping only the columns used later on.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        pandas.DataFrame: The file's rows restricted to the selected columns.
        Values written with ``,`` as thousands separator are parsed as numbers.
    """
    wanted_columns = [
        'Market', 'Pyramid Segment', 'ASYS #', 'MFG #',
        'Product Description', 'Year Week', 'LB Current',
    ]

    return pd.read_csv(
        file_path,
        usecols=wanted_columns,
        thousands=',',
        low_memory=False,
    )

# Path to the directory you want to search
directory_path = r'C:\Users\newatter\OneDrive - McCain Foods Limited\Distributor Sell-Out\Weekly Update Files'

# Combined result; stays empty until the first matching file has been read
us_foods = pd.DataFrame()

# Loop through each folder and file in the directory.
# A later file replaces the weeks it shares with earlier files, so processing
# order matters. os.walk does not guarantee any order, hence folders and files
# are sorted explicitly (sorting `dirs` in place steers the walk itself).
# NOTE(review): assumes folder names sort chronologically (e.g. '2024-04-05').
for root, dirs, files in os.walk(directory_path):
    dirs.sort()
    for file in sorted(files):
        # Only CSV files whose name contains "US Foods Update" are loaded
        if "US Foods Update" in file and file.lower().endswith('.csv'):

            # Construct the full file path
            file_path = os.path.join(root, file)

            # Create dataframe from file
            df = extract_usf_prima(file_path)

            # Distinct weeks present in the new data
            weeks_in_data = df['Year Week'].unique()

            # First file: take it as is
            if us_foods.empty:
                us_foods = df

            # Later files: drop the overlapping weeks already collected, then
            # append the new file so its version of those weeks wins
            else:
                us_foods = us_foods[~us_foods['Year Week'].isin(weeks_in_data)]

                us_foods = pd.concat([us_foods, df])

# info() prints its report itself and returns None, so it is not wrapped in print()
us_foods.info()


<class 'pandas.core.frame.DataFrame'>
Index: 824184 entries, 1 to 76222
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Market               824184 non-null  object 
 1   Pyramid Segment      824184 non-null  object 
 2   MFG #                824184 non-null  float64
 3   ASYS #               824184 non-null  int64  
 4   Product Description  824184 non-null  object 
 5   Year Week            824184 non-null  int64  
 6   LB Current           824184 non-null  float64
dtypes: float64(2), int64(2), object(3)
memory usage: 50.3+ MB
None


### Add Time from Data Dictionary

In [86]:
# Path of the time definitions workbook, used to convert the week code into an actual date (week beginning date)
file_path = r'C:\Users\newatter\OneDrive - McCain Foods Limited\Distributor Sell-Out\Data Dictionaries\Time Definitions.xlsx'

# Only use the time worksheet in the Excel file
time_df = pd.read_excel(file_path, sheet_name='time')

#time_df.info()    

In [87]:
# Week code -> week start date lookup taken from the time definitions sheet
week_lookup = time_df[['Calendar Week Year', 'Week Starting (Sun)']]

# Match each row's 'Year Week' against the lookup, then discard the lookup key
# so that only the week start date is added.
us_foods_with_time = (
    us_foods
    .merge(week_lookup, left_on='Year Week', right_on='Calendar Week Year')
    .drop(columns=['Calendar Week Year'])
)

us_foods_with_time.head()

Unnamed: 0,Market,Pyramid Segment,MFG #,ASYS #,Product Description,Year Week,LB Current,Week Starting (Sun)
0,BISMARCK,EDUCATION,50758110000000.0,9841260,"FRENCH TOAST, STICK CKD FZN",202103,430.0,2021-01-17
1,BISMARCK,EDUCATION,10072710000000.0,4862793,"SANDWICH, EGG BACN PTATO & CHS",202103,60.0,2021-01-17
2,BISMARCK,EDUCATION,10072710000000.0,8292377,"POTATO, MSHD PTY SMILE SKNLS",202103,48.0,2021-01-17
3,BISMARCK,GOVERNMENT,10072710000000.0,1074871,"POTATO, FF 3/8 SC BTRD BUTR",202103,150.0,2021-01-17
4,BISMARCK,GOVERNMENT,50758110000000.0,9841260,"FRENCH TOAST, STICK CKD FZN",202103,10.0,2021-01-17


### Export File as Zip

In [88]:
# Destination of the combined Precima data: a zip-compressed CSV in the 'Sell-Out' folder
file_path = r'C:\Users\newatter\OneDrive - McCain Foods Limited\Distributor Sell-Out\Sell-Out\us_foods_precima.zip'

# index=False: the row index carries no information and is left out of the file
us_foods_with_time.to_csv(file_path, index=False, compression='zip')

## Working with Old Data - Precima File

In [8]:
# Reload the combined Precima data, parsing the week start column as datetime.
# NOTE(review): this reads from the 'Source Files' folder, whereas the export
# cell writes us_foods_precima.zip to 'Sell-Out' -- confirm the file is copied
# between the two locations.
precima = pd.read_csv(r'C:\Users\newatter\OneDrive - McCain Foods Limited\Distributor Sell-Out\Source Files\us_foods_precima.zip', 
                      parse_dates=['Week Starting (Sun)'])

In [15]:
# Old column name -> new column name
rename_columns = {
    'LB Current': 'LBS',
    'Week Starting (Sun)': 'Week Beginning Date',
}

# Rebind to the renamed frame; columns missing from the mapping are untouched
precima = precima.rename(columns=rename_columns)

print(precima.columns)
print(precima.info())

Index(['Market', 'Pyramid Segment', 'MFG #', 'ASYS #', 'Product Description',
       'Year Week', 'LBS', 'Week Beginning Date'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Index: 824184 entries, 0 to 824183
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   Market               824184 non-null  object        
 1   Pyramid Segment      824184 non-null  object        
 2   MFG #                824184 non-null  float64       
 3   ASYS #               824184 non-null  int64         
 4   Product Description  824184 non-null  object        
 5   Year Week            824184 non-null  int64         
 6   LBS                  824184 non-null  float64       
 7   Week Beginning Date  824184 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(2), object(3)
memory usage: 56.6+ MB
None


### Data Dictionary

In [16]:
# Path of the US Foods data dictionary workbook; the segment, region and SKU mapping sheets are read from it
file_path = r'C:\Users\newatter\OneDrive - McCain Foods Limited\Distributor Sell-Out\Data Dictionaries\US Foods - US.xlsx'

segments = pd.read_excel(file_path, sheet_name='Segment Mapping v2')
regions = pd.read_excel(file_path, sheet_name='Region Mapping')
# 'Manufacturer Item Number' is forced to str so it is not parsed as a number
skus = pd.read_excel(file_path, sheet_name='SKU Mapping v3', dtype={'Manufacturer Item Number':'str'})

#print(segments.info())
#print(regions.info())
#print(skus.info())

In [12]:
# Display the SKU mapping sheet (pandas abbreviates the rendering of long frames)
skus

Unnamed: 0,ASYS,Manufacturer Item Number,Merch Category,PIM Group,Product,Consolidated Category,L1 Product Hierarchy,L2 Product Hierarchy,McCain SKU ID
0,5601641.0,10072714003742,COOKIES,"COOKIES, READY TO EAT, FROZEN","COOKIE, SNDWH CHOC VNL CRM",Prepared Foods,Local Portfolio,Bakery,1000004861
1,,10072714006040,COOKIES,"COOKIES, READY TO EAT, FROZEN","COOKIE, SNDWH CHOC VNL CRM",Prepared Foods,Local Portfolio,Bakery,1000007909
2,,10072714105224,DESSERT BARS,"BROWNIES, FROZEN","BROWNIE, DBL CHOC NOT ICED",Prepared Foods,Local Portfolio,Bakery,15010522
3,9771668.0,6007713363174,FRIED APPETIZERS,"APPETIZERS, ONIONS, BREADED & BATTERED",ONION CHIPS-MCCAIN-1000000689,Prepared Foods,Appetizer,Onion Shapes,1000000689
4,,10041493107606,FRIED APPETIZERS,"APPETIZERS, ONIONS, BREADED & BATTERED","ONION RING, BTRD BEER EX THCK",Prepared Foods,Appetizer,Onion Rings,10210732
...,...,...,...,...,...,...,...,...,...
440,9040775.0,,,,"APPETIZER, CHS MOZZ BTRD BEER",,,,
441,9071986.0,,,,"POTATO, FF 1/2 CC FCY FZN",,,,
442,9946046.0,,,,ONION SLIVERS BREADED FZN BAG,,,,
443,950099.0,11204530026,,,,,,,1120453002


In [17]:
# Left join keeps every sell-out row; a Market without a mapping gets a null Region
precima_regions = pd.merge(precima, regions, on='Market', how='left')

# Number of rows whose Market was not found in the region mapping
missing_rows = precima_regions['Region'].isnull().sum()

print(f'Missing rows = {missing_rows}')

Missing rows = 0


In [18]:
# Left join on the product code: 'ASYS #' in the sell-out data, 'ASYS' in the SKU mapping
precima_skus = pd.merge(precima_regions, skus, how='left', left_on='ASYS #', right_on='ASYS')

# Number of rows whose product code was not found in the SKU mapping
missing_rows = precima_skus['ASYS'].isnull().sum()

print(f'Missing rows = {missing_rows}')

Missing rows = 0


### Clean ASYS column

In [19]:
# Cast via int first so the text form has no trailing '.0', then to str.
# astype(int) raises if the column still holds NaN, i.e. this relies on the
# preceding missing-rows check having reported 0.
precima_skus['ASYS'] = precima_skus['ASYS'].astype(int).astype(str)

print(precima_skus.columns)

Index(['Market', 'Pyramid Segment', 'MFG #', 'ASYS #', 'Product Description',
       'Year Week', 'LBS', 'Week Beginning Date', 'Region', 'Area', 'State',
       'ASYS', 'Manufacturer Item Number', 'Merch Category', 'PIM Group',
       'Product', 'Consolidated Category', 'L1 Product Hierarchy',
       'L2 Product Hierarchy', 'McCain SKU ID'],
      dtype='object')


In [96]:
# Columns written to the Precima pickle
columns_to_keep = ['Area', 'Region', 'Market','State','Pyramid Segment',
                   'ASYS','Product', 'Manufacturer Item Number','McCain SKU ID','Week Starting (Sun)','LBS']

file_path = r'C:\Users\newatter\OneDrive - McCain Foods Limited\Distributor Sell-Out\Sell-Out\us_foods_precima.pkl'

# The date column was renamed to 'Week Beginning Date' after the zip was
# loaded, so selecting 'Week Starting (Sun)' directly raises KeyError on a
# top-to-bottom run. Restore the original name for the export; the rename is a
# no-op when the column already carries that name.
precima_export = precima_skus.rename(columns={'Week Beginning Date': 'Week Starting (Sun)'})[columns_to_keep]

precima_export.to_pickle(file_path)

# info() prints its report itself and returns None, so it is not wrapped in print()
precima_export.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 824184 entries, 0 to 824183
Data columns (total 21 columns):
 #   Column                                           Non-Null Count   Dtype         
---  ------                                           --------------   -----         
 0   Area                                             824184 non-null  object        
 1   Region                                           824184 non-null  object        
 2   Market                                           824184 non-null  object        
 3   State                                            824184 non-null  object        
 4   Pyramid Segment                                  824184 non-null  object        
 5   COVID Segmentation - L1                          824184 non-null  object        
 6   COVID Segmentation - L2                          824184 non-null  object        
 7   COVID Segmentation - (Restaurants)               824184 non-null  object        
 8   COVID Segmentation - (Re

### US Foods - New File 

In [1]:
import pandas as pd
import regex as re
import os

In [77]:
def extract_usf_sharedtable(file_path):
    """Read one "SharedTable" weekly CSV export and return it in tidy form.

    The last column of the export holds the pounds for a single week; its
    header carries the week's date followed by the week number, e.g.
    ``2024-03-24 11``. That date is extracted and stored on every row.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        pandas.DataFrame: Columns ``Area``, ``Market``, ``Pyramid Segment``,
        ``ASYS Code``, ``GTIN``, ``MFG Number``, ``Week Starting (Sun)`` and
        ``LBS``. Rows without a ``Market`` are removed. ``GTIN`` and
        ``MFG Number`` are strings, ``LBS`` is float; ``ASYS Code`` is left as
        read from the file.

    Raises:
        ValueError: If the last column header contains no ``YYYY-MM-DD`` date.
    """
    df = pd.read_csv(file_path, low_memory=False, thousands=',')

    # Last column name = date (2024-03-24 11) with 11 being the week number of the year
    last_column = df.columns[-1]

    # Fail with a clear message instead of an AttributeError on None.group()
    match = re.search(r'(\d{4}-\d{2}-\d{2})', last_column)
    if match is None:
        raise ValueError(
            f'No YYYY-MM-DD date found in last column {last_column!r} of {file_path}'
        )
    week_begin = match.group(1)

    # Change data types
    df = df.astype({
        'GTIN': 'str',
        'MFG Number': 'str',
        last_column: 'float',
    })

    # Rename the week's volume column to LBS and align the date column name
    df = df.rename(columns={last_column: 'LBS', 'Week Beginning': 'Week Starting (Sun)'})

    # Every row gets the date taken from the header (overwrites any file value)
    df['Week Starting (Sun)'] = pd.to_datetime(week_begin)

    # Drop rows without a Market and realign the column order
    output_columns = ['Area', 'Market', 'Pyramid Segment', 'ASYS Code',
                      'GTIN', 'MFG Number', 'Week Starting (Sun)', 'LBS']
    return df.dropna(subset=['Market'])[output_columns]

# Path to the directory you want to search
# NOTE(review): unlike the other cells this path has no 'Distributor Sell-Out'
# folder in it -- confirm it is the intended location.
directory_path = r'C:\Users\newatter\OneDrive - McCain Foods Limited\Weekly Update Files'

# One frame per matching file; concatenated once after the walk instead of
# re-copying a growing DataFrame on every iteration
weekly_frames = []

# Loop through each folder and file in the directory
for root, dirs, files in os.walk(directory_path):
    for file in files:
        # Check if "SharedTable_weekly_cases" is in the filename
        if "SharedTable_weekly_cases" in file:

            # Construct the full file path
            file_path = os.path.join(root, file)

            df = extract_usf_sharedtable(file_path)

            weekly_frames.append(df)

if not weekly_frames:
    raise FileNotFoundError(f'No "SharedTable_weekly_cases" files found under {directory_path}')

# ignore_index gives every row a unique label; without it each file restarts
# its labels, so the same label occurs once per file.
us_foods = pd.concat(weekly_frames, ignore_index=True)

# Drop rows whose Pyramid Segment contains '-'. A boolean mask removes exactly
# those rows; dropping by index label would also remove unrelated rows from
# other files that happen to share the label.
us_foods = us_foods[~us_foods['Pyramid Segment'].str.contains('-')].copy()

us_foods['ASYS Code'] = us_foods['ASYS Code'].astype(int)

print(us_foods.shape)

us_foods.head(4)


(61354, 8)


Unnamed: 0,Area,Market,Pyramid Segment,ASYS Code,GTIN,MFG Number,Week Starting (Sun),LBS
1,ANCHORAGE,ANCHORAGE-4190,HOSPITALITY,4180733,10072714802550,BCI00255,2024-01-21,60.0
2,ANCHORAGE,ANCHORAGE-4190,HOSPITALITY,6364970,10072714102346,80010234,2024-01-21,30.0
3,ANCHORAGE,ANCHORAGE-4190,INDEPENDENT RESTAURATEURS,987149,10072714036023,MCX03602,2024-01-21,102.0
4,ANCHORAGE,ANCHORAGE-4190,NATIONAL CHAINS,8332298,10072714101950,82910195,2024-01-21,408.0


In [78]:
# Show the column names of the combined weekly data
us_foods.columns

Index(['Area', 'Market', 'Pyramid Segment', 'ASYS Code', 'GTIN', 'MFG Number',
       'Week Starting (Sun)', 'LBS'],
      dtype='object')

In [79]:
# Path of the US Foods data dictionary workbook holding the mapping sheets
file_path = r'C:\Users\newatter\OneDrive - McCain Foods Limited\Distributor Sell-Out\Data Dictionaries\US Foods - US.xlsx'

segments = pd.read_excel(file_path, sheet_name='Segment Mapping v2')
regions = pd.read_excel(file_path, sheet_name='Region Mapping v2')

# Keep the manufacturer item number as text rather than a parsed number
skus = pd.read_excel(
    file_path,
    sheet_name='SKU Mapping v3',
    dtype={'Manufacturer Item Number': 'str'},
)

# Independent copy of the mapping rows that actually carry an ASYS code
skus_clean = skus[skus['ASYS'].notna()].copy()

In [80]:
# Left join keeps every sell-out row; unmapped segments get null COVID columns
us_foods_segments = pd.merge(us_foods, segments, on='Pyramid Segment', how='left')

# Number of rows whose Pyramid Segment was not found in the segment mapping
missing_rows = us_foods_segments['COVID Segmentation - L1'].isnull().sum()

print(f'Missing rows = {missing_rows}')

Missing rows = 0


In [81]:
# Only Region and State are taken from the region mapping, keyed by Market
region_lookup = regions[['Market', 'Region', 'State']]

us_foods_regions = pd.merge(us_foods_segments, region_lookup, on='Market', how='left')

# Number of rows whose Market was not found in the region mapping
missing_rows = us_foods_regions['State'].isnull().sum()

print(f'Missing rows = {missing_rows}')

Missing rows = 0


In [99]:
# Integer key on the mapping side so it compares equal to the integer 'ASYS Code'
skus_clean['ASYS'] = skus_clean['ASYS'].astype(int)

us_foods_skus = us_foods_regions.merge(skus_clean, how='left', left_on='ASYS Code', right_on = 'ASYS')

# Count unmatched products BEFORE the cast to str: astype(str) stringifies NaN
# as 'nan', after which isna() would always report 0 missing rows.
missing_rows = us_foods_skus['ASYS'].isna().sum()

us_foods_skus['ASYS'] = us_foods_skus['ASYS'].astype(str)

print(f'Missing rows = {missing_rows}')

Missing rows = 0


In [83]:
# Show the column names after the segment, region and SKU merges
us_foods_skus.columns

Index(['Area', 'Market', 'Pyramid Segment', 'ASYS Code', 'GTIN', 'MFG Number',
       'Week Starting (Sun)', 'LBS', 'COVID Segmentation - L1',
       'COVID Segmentation - L2', 'COVID Segmentation - (Restaurants)',
       'COVID Segmentation - (Restaurants: Sub-Segment)',
       'Restaurant Service Type', 'Region', 'State', 'ASYS',
       'Manufacturer Item Number', 'Merch Category', 'PIM Group', 'Product',
       'Consolidated Category', 'L1 Product Hierarchy', 'L2 Product Hierarchy',
       'McCain SKU ID'],
      dtype='object')

In [100]:
# Final column selection and order for the new weekly data; reused when the
# data is combined with the Precima history.
columns_to_keep = ['Area', 'Region', 'Market','State','Pyramid Segment',
                   'COVID Segmentation - L1', 'COVID Segmentation - L2',
                   'COVID Segmentation - (Restaurants)','COVID Segmentation - (Restaurants: Sub-Segment)',
                   'Restaurant Service Type','ASYS', 'Product', 'Manufacturer Item Number',
                   'Merch Category','PIM Group','McCain SKU ID','Consolidated Category', 'L1 Product Hierarchy','L2 Product Hierarchy',
                   'Week Starting (Sun)','LBS']

# info() prints its report and returns None, so print() adds a trailing "None" line
print(us_foods_skus[columns_to_keep].info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61354 entries, 0 to 61353
Data columns (total 21 columns):
 #   Column                                           Non-Null Count  Dtype         
---  ------                                           --------------  -----         
 0   Area                                             61354 non-null  object        
 1   Region                                           61354 non-null  object        
 2   Market                                           61354 non-null  object        
 3   State                                            61354 non-null  object        
 4   Pyramid Segment                                  61354 non-null  object        
 5   COVID Segmentation - L1                          61354 non-null  object        
 6   COVID Segmentation - L2                          61354 non-null  object        
 7   COVID Segmentation - (Restaurants)               61354 non-null  object        
 8   COVID Segmentation - (Restaurants: S

### Combine Precima with New Data

In [111]:
# Precima history, stored as a pickle.
# NOTE(review): unpickling executes code embedded in the file -- only load
# pickles from a trusted location.
source_path = r'C:/Users/newatter/OneDrive - McCain Foods Limited/Distributor Sell-Out/Source Files/us_foods_precima.pkl'

precima = pd.read_pickle(source_path)

# New weekly data first, history second. The original row labels are kept, so
# labels repeat in the combined frame (harmless here: the CSV omits the index).
us_foods_all = pd.concat([us_foods_skus[columns_to_keep], precima])

file_name = 'us_foods'
# pickle_path is only used by the commented-out to_pickle call below
pickle_path = 'C:/Users/newatter/OneDrive - McCain Foods Limited/Distributor Sell-Out/Source Files'
csv_path = 'C:/Users/newatter/OneDrive - McCain Foods Limited/Distributor Sell-Out/Sell-Out'

#us_foods_all.to_pickle(f'{pickle_path}/{file_name}.pkl')

# archive_name sets the name of the CSV stored inside the zip archive
compression_opts = dict(method='zip', archive_name=f'{file_name}.csv')

us_foods_all.to_csv(f'{csv_path}/{file_name}.zip', index=False, 
                    compression=compression_opts)

In [104]:
# Summarise the combined frame. The frame built in the combine cell is named
# us_foods_all; `all_df` is not defined as a DataFrame anywhere in this notebook.
us_foods_all.info()

<class 'pandas.core.frame.DataFrame'>
Index: 885538 entries, 0 to 824183
Data columns (total 21 columns):
 #   Column                                           Non-Null Count   Dtype         
---  ------                                           --------------   -----         
 0   Area                                             885538 non-null  object        
 1   Region                                           885538 non-null  object        
 2   Market                                           885538 non-null  object        
 3   State                                            885538 non-null  object        
 4   Pyramid Segment                                  885538 non-null  object        
 5   COVID Segmentation - L1                          885538 non-null  object        
 6   COVID Segmentation - L2                          885538 non-null  object        
 7   COVID Segmentation - (Restaurants)               885538 non-null  object        
 8   COVID Segmentation - (Restaur

In [34]:
import pyodbc
import sqlalchemy
from sqlalchemy import create_engine, MetaData, Table
from sqlalchemy.engine import URL
from datetime import datetime
import pandas as pd
import os
import regex as re

class us_foods_sellout:
    """Load one US Foods "SharedTable" weekly CSV and append its new weeks to SQL.

    Instantiating the class runs the whole pipeline as a side effect:

    1. ``process_file`` reads and normalises the CSV (kept in ``self.data``).
    2. ``filter_rows`` removes weeks already present in the target table
       (kept in ``self.data_import``).
    3. ``insert_rows`` appends the remaining rows to
       ``[na_dist].[US_USFoods_Sellout]``.

    Attributes:
        filepath: Path of the CSV file given to the constructor.
        data: Normalised content of the file.
        data_import: Subset of ``data`` whose week is not yet in the database.
    """

    # Connection settings shared by the pyodbc connection and the SQLAlchemy engine
    _SERVER = 'mf-enterprise-dev-sql.46ac3df1733c.database.windows.net'
    _DATABASE = 'PWRAPPDB'
    _DRIVER = '{ODBC Driver 17 for SQL Server}'

    # Workbook whose 'Region Mapping v2' sheet maps raw market labels to clean ones
    _DICTIONARY_PATH = r'C:\Users\newatter\OneDrive - McCain Foods Limited\Distributor Sell-Out\Data Dictionaries\US Foods - US.xlsx'

    def __init__(self, file_path) -> None:
        self.filepath = file_path
        self.data = self.process_file()
        self.data_import = self.filter_rows()
        #self.delete_rows()
        self.insert_rows()

    @staticmethod
    def fill_in_missing_data(df, regions=None):
        """Replace the raw market label by its clean name and add Region/State.

        Args:
            df: Frame with a ``Market`` column. It is not modified.
            regions: Optional mapping frame with the columns ``Market``,
                ``Market Clean``, ``Region`` and ``State``. When omitted, the
                'Region Mapping v2' sheet of the data dictionary workbook is
                read.

        Returns:
            pandas.DataFrame: A new frame in which ``Market`` holds the value
            of ``Market Clean`` and ``Region``/``State`` are added. The join is
            a left join, so markets without a mapping end up with nulls in all
            three columns.
        """
        if regions is None:
            regions = pd.read_excel(us_foods_sellout._DICTIONARY_PATH, sheet_name='Region Mapping v2')

        merged = df.merge(regions[['Market', 'Market Clean', 'Region', 'State']], how='left', on='Market')

        # Swap the raw label for the clean one
        return merged.drop(columns=['Market']).rename(columns={'Market Clean': 'Market'})

    def process_file(self):
        """Read ``self.filepath`` and return it with the target table's column layout.

        The last column of the export holds the pounds of a single week and
        its header carries that week's date (e.g. ``2024-03-24 11``, 11 being
        the week number). Rows without a Market and rows whose Pyramid Segment
        contains ``-`` are removed.

        Returns:
            pandas.DataFrame: Columns Area, Region, Market, State, Pyramid
            Segment, ASYS ID, ASYS Description, Manufacturer GTIN, McCain SKU
            ID, Week Beginning Date and LBS.

        Raises:
            ValueError: If the last column header contains no ``YYYY-MM-DD`` date.
        """
        df = pd.read_csv(self.filepath, low_memory=False, thousands=',')

        last_column = df.columns[-1]

        # Fail with a clear message instead of an AttributeError on None.group()
        match = re.search(r'(\d{4}-\d{2}-\d{2})', last_column)
        if match is None:
            raise ValueError(
                f'No YYYY-MM-DD date found in last column {last_column!r} of {self.filepath}'
            )
        week_begin = match.group(1)

        # Change data types
        df = df.astype({
            'GTIN': 'str',
            'MFG Number': 'str',
            last_column: 'float',
        })

        rename_columns = {
            last_column: 'LBS',
            'Week Beginning': 'Week Beginning Date',
            'ASYS Code': 'ASYS ID',
            'ASYS': 'ASYS Description',
            'GTIN': 'Manufacturer GTIN',
            'MFG Number': 'McCain SKU ID',
        }
        df = df.rename(columns=rename_columns)

        # Every row gets the date taken from the header
        df['Week Beginning Date'] = pd.to_datetime(week_begin)

        # Drop rows without a Market
        df = df.dropna(subset=['Market'])

        # Drop rows whose segment contains '-'. Mask + copy yields an
        # independent frame, so the assignment below does not write to a slice.
        df = df[~df['Pyramid Segment'].str.contains('-')].copy()

        # int first so the text form has no trailing '.0'
        df['ASYS ID'] = df['ASYS ID'].astype(int).astype(str)

        df = us_foods_sellout.fill_in_missing_data(df)

        return df[['Area', 'Region', 'Market', 'State', 'Pyramid Segment', 'ASYS ID', 'ASYS Description', 'Manufacturer GTIN', 'McCain SKU ID','Week Beginning Date', 'LBS']]

    @staticmethod
    def setup_connection():
        """Open a pyodbc connection (AAD integrated authentication) and a cursor.

        Returns:
            tuple: ``(cnxn, cursor)``; closing both is left to the caller.
        """
        conn_str = (
            f'DRIVER={us_foods_sellout._DRIVER};'
            f'SERVER=tcp:{us_foods_sellout._SERVER};'
            f'DATABASE={us_foods_sellout._DATABASE};'
            'Authentication=ActiveDirectoryIntegrated'
        )

        cnxn = pyodbc.connect(conn_str)
        cursor = cnxn.cursor()

        return cnxn, cursor

    def filter_rows(self):
        """Return the rows of ``self.data`` whose week is not yet in the database.

        Returns:
            pandas.DataFrame: Rows of ``self.data`` whose 'Week Beginning Date'
            does not occur in ``[na_dist].[US_USFoods_Sellout]``.
        """
        sql_select = """
        SELECT [Week Beginning Date] 
        FROM [na_dist].[US_USFoods_Sellout]
        GROUP BY [Week Beginning Date] """

        cnxn, cursor = us_foods_sellout.setup_connection()

        # Release cursor and connection even when the query fails. The
        # statement only reads, so there is nothing to commit.
        try:
            cursor.execute(sql_select)
            results = cursor.fetchall()
        finally:
            cursor.close()
            cnxn.close()

        # Distinct non-null weeks already stored
        dates_in_db = pd.to_datetime([row[0] for row in results if row[0] is not None])

        return self.data[~self.data['Week Beginning Date'].isin(dates_in_db)]

    @staticmethod
    def setup_miengine():
        """Create a SQLAlchemy engine (mssql+pyodbc, AAD integrated authentication).

        Returns:
            The engine, created with autocommit connections and
            ``fast_executemany`` enabled.
        """
        conn_str = URL.create(
            'mssql+pyodbc',
            query={
                'odbc_connect': (
                    f'DRIVER={us_foods_sellout._DRIVER};'
                    f'SERVER=tcp:{us_foods_sellout._SERVER};'
                    f'DATABASE={us_foods_sellout._DATABASE};'
                    'Authentication=ActiveDirectoryIntegrated;'
                )
            }
        )

        engine = create_engine(conn_str, connect_args={"autocommit": True}, fast_executemany=True, use_insertmanyvalues=False)

        return engine

    def insert_rows(self):
        """Append ``self.data_import`` to ``[na_dist].[US_USFoods_Sellout]``."""
        engine = us_foods_sellout.setup_miengine()

        table_name = 'US_USFoods_Sellout'
        schema_name = 'na_dist'

        try:
            # If the table doesn't exist, it will be created automatically
            self.data_import.to_sql(table_name, con=engine, schema=schema_name, if_exists='append', index=False)
        finally:
            # A new engine is created per instance; dispose it so its pooled
            # connections do not pile up when many files are processed.
            engine.dispose()


In [38]:
# Process a single weekly export. Constructing the object runs the whole
# pipeline (read, filter against the database, insert) as a side effect.
file_path = r'C:\Users\newatter\OneDrive - McCain Foods Limited\Distributor Sell-Out\Weekly Update Files\2024-04-05\SharedTable_weekly_cases 2024-01-21.csv'

us_foods_sellout(file_path)

<__main__.us_foods_sellout at 0x1a3411dbdc0>

In [39]:
import os

# The path to the directory containing the folders
directory_path = r'C:\Users\newatter\OneDrive - McCain Foods Limited\Distributor Sell-Out\Weekly Update Files'

# Loop through each folder and file in the directory
for root, dirs, files in os.walk(directory_path):
    for file in files:
        # Process both export flavours: names containing "SharedTable_weekly" or "SharedTable_export"
        if "SharedTable_weekly" in file or "SharedTable_export" in file:

            # Construct the full file path
            file_path = os.path.join(root, file)

            # Log which file is being loaded
            print(file_path)

            # Constructing the object reads the file and inserts its new weeks into the database
            us_foods_sellout(file_path)

C:\Users\newatter\OneDrive - McCain Foods Limited\Distributor Sell-Out\Weekly Update Files\2024-04-05\SharedTable_export 2024-03-31.csv
C:\Users\newatter\OneDrive - McCain Foods Limited\Distributor Sell-Out\Weekly Update Files\2024-04-05\SharedTable_export 2024-04-07.csv
C:\Users\newatter\OneDrive - McCain Foods Limited\Distributor Sell-Out\Weekly Update Files\2024-04-05\SharedTable_weekly_cases 2023-12-31.csv
C:\Users\newatter\OneDrive - McCain Foods Limited\Distributor Sell-Out\Weekly Update Files\2024-04-05\SharedTable_weekly_cases 2024-01-07.csv
C:\Users\newatter\OneDrive - McCain Foods Limited\Distributor Sell-Out\Weekly Update Files\2024-04-05\SharedTable_weekly_cases 2024-01-14.csv
C:\Users\newatter\OneDrive - McCain Foods Limited\Distributor Sell-Out\Weekly Update Files\2024-04-05\SharedTable_weekly_cases 2024-01-21.csv
C:\Users\newatter\OneDrive - McCain Foods Limited\Distributor Sell-Out\Weekly Update Files\2024-04-05\SharedTable_weekly_cases 2024-01-28.csv
C:\Users\newatter\

# Upload Precima data to SQL database

In [1]:
import pyodbc
import sqlalchemy
from sqlalchemy import create_engine, MetaData, Table
from sqlalchemy.engine import URL
from datetime import datetime
import pandas as pd


def setup_miengine():
    """Create a SQLAlchemy engine for the PWRAPPDB database.

    The engine uses the ``mssql+pyodbc`` dialect with a raw ODBC connection
    string that requests AAD integrated authentication.

    Returns:
        The engine, created with autocommit connections, ``fast_executemany``
        enabled and ``use_insertmanyvalues`` disabled.
    """
    # Key/value pairs of the ODBC connection string; it ends with a ';'
    odbc_parts = [
        'DRIVER={ODBC Driver 17 for SQL Server}',
        'SERVER=tcp:mf-enterprise-dev-sql.46ac3df1733c.database.windows.net',
        'DATABASE=PWRAPPDB',
        'Authentication=ActiveDirectoryIntegrated',
    ]
    odbc_connect = ';'.join(odbc_parts) + ';'

    engine_url = URL.create('mssql+pyodbc', query={'odbc_connect': odbc_connect})

    return create_engine(
        engine_url,
        connect_args={"autocommit": True},
        fast_executemany=True,
        use_insertmanyvalues=False,
    )

In [11]:
def insert_rows(df):
    """Append ``df`` to the ``[na_dist].[US_USFoods_Sellout]`` table.

    Args:
        df: Rows to insert; the column names must match the table's columns.
            The index is not written.
    """
    engine = setup_miengine()

    table_name = 'US_USFoods_Sellout'
    schema_name = 'na_dist'

    try:
        # If the table doesn't exist, it will be created automatically
        df.to_sql(table_name, con=engine, schema=schema_name, if_exists='append', index=False, chunksize = None)
    finally:
        # Each call builds a fresh engine; dispose it so its pooled
        # connections are released when this is called in a loop.
        engine.dispose()

In [3]:
# Precima history pickle.
# NOTE(review): unpickling executes code embedded in the file -- only load
# pickles from a trusted location.
source_path = r'C:/Users/newatter/OneDrive - McCain Foods Limited/Distributor Sell-Out/Source Files/us_foods_precima.pkl'

precima = pd.read_pickle(source_path)

print(precima.columns)
# info() prints its report and returns None, so print() adds a trailing "None" line
print(precima.info())

Index(['Area', 'Region', 'Market', 'State', 'Pyramid Segment',
       'COVID Segmentation - L1', 'COVID Segmentation - L2',
       'COVID Segmentation - (Restaurants)',
       'COVID Segmentation - (Restaurants: Sub-Segment)',
       'Restaurant Service Type', 'ASYS', 'Product',
       'Manufacturer Item Number', 'Merch Category', 'PIM Group',
       'McCain SKU ID', 'Consolidated Category', 'L1 Product Hierarchy',
       'L2 Product Hierarchy', 'Week Starting (Sun)', 'LBS'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 824184 entries, 0 to 824183
Data columns (total 21 columns):
 #   Column                                           Non-Null Count   Dtype         
---  ------                                           --------------   -----         
 0   Area                                             824184 non-null  object        
 1   Region                                           824184 non-null  object        
 2   Market                               

In [4]:
# Subset of the Precima columns that is uploaded
columns_to_keep = ['Area', 'Region', 'Market', 'State', 'Pyramid Segment','ASYS', 'Product',
       'Manufacturer Item Number', 'McCain SKU ID','Week Starting (Sun)', 'LBS']

# Pickle column name -> column name used for the SQL upload
rename_columns = {
    'ASYS':'ASYS ID', 
    'Product':'ASYS Description',
    'Manufacturer Item Number':'Manufacturer GTIN',
    'Week Starting (Sun)':'Week Beginning Date'
}

# Select first, then rename; `precima` itself is left unchanged
usf = precima[columns_to_keep].rename(columns=rename_columns)

print(usf.columns)

Index(['Area', 'Region', 'Market', 'State', 'Pyramid Segment', 'ASYS ID',
       'ASYS Description', 'Manufacturer GTIN', 'McCain SKU ID',
       'Week Beginning Date', 'LBS'],
      dtype='object')


In [6]:
# Distinct weeks present in the Precima history (order of first appearance)
usf['Week Beginning Date'].unique()

<DatetimeArray>
['2021-01-17 00:00:00', '2021-01-24 00:00:00', '2021-01-31 00:00:00',
 '2021-02-14 00:00:00', '2021-02-07 00:00:00', '2021-02-21 00:00:00',
 '2021-02-28 00:00:00', '2021-03-07 00:00:00', '2021-03-14 00:00:00',
 '2021-03-28 00:00:00',
 ...
 '2023-12-10 00:00:00', '2024-01-14 00:00:00', '2023-11-26 00:00:00',
 '2023-12-03 00:00:00', '2023-10-29 00:00:00', '2023-11-05 00:00:00',
 '2023-11-12 00:00:00', '2023-11-19 00:00:00', '2023-12-17 00:00:00',
 '2023-12-24 00:00:00']
Length: 157, dtype: datetime64[ns]

In [21]:
def setup_connection():
    """Open a pyodbc connection to the PWRAPPDB database.

    The connection string requests ``Authentication=ActiveDirectoryIntegrated``
    through the ``ODBC Driver 17 for SQL Server`` driver.

    Returns:
        tuple: ``(connection, cursor)`` where ``cursor`` was created from
        ``connection``. The caller is responsible for closing both.
    """
    server = 'mf-enterprise-dev-sql.46ac3df1733c.database.windows.net'
    database = 'PWRAPPDB'
    driver = '{ODBC Driver 17 for SQL Server}'

    # Key=value pairs of the ODBC connection string, joined with ';' below.
    settings = [
        f'DRIVER={driver}',
        f'SERVER=tcp:{server}',
        f'DATABASE={database}',
        'Authentication=ActiveDirectoryIntegrated',
    ]

    connection = pyodbc.connect(';'.join(settings))
    return connection, connection.cursor()


def filter_rows():
    """Return the week-start dates already present in the sell-out table.

    Runs a read-only query for the distinct ``[Week Beginning Date]`` values of
    ``[na_dist].[US_USFoods_Sellout]`` and drops NULLs.

    Returns:
        pandas.DatetimeIndex: the non-null dates, in the order the database
        returned them (empty when the table has no dated rows).

    Raises:
        Whatever ``setup_connection`` or the cursor raises; the cursor and
        connection are closed before the error propagates.
    """
    sql_select = """
    SELECT [Week Beginning Date] 
    FROM [na_dist].[US_USFoods_Sellout]
    GROUP BY [Week Beginning Date] """

    cnxn, cursor = setup_connection()
    try:
        cursor.execute(sql_select)
        results = cursor.fetchall()
    finally:
        # Always release the connection, even when the query fails.
        # No commit is issued: the statement only reads.
        cursor.close()
        cnxn.close()

    # Each row holds a single column; skip NULL dates before converting.
    return pd.to_datetime([row[0] for row in results if row[0] is not None])

In [12]:
# One-off load of a single week.
# NOTE(review): insert_rows is defined in another cell and presumably appends
# rows - confirm. Guard on the weeks already stored (same check the loop cell
# uses) so re-running this cell cannot load the week a second time.
backfill_week = pd.Timestamp('2021-02-14')
if backfill_week not in filter_rows():
    insert_rows(usf[usf['Week Beginning Date'] == backfill_week])

In [37]:
# Insert weeks that are in `usf` but not yet in the database, at most ten per
# run of this cell; `cnt` ends up as the number of weeks inserted.
cnt = 0

weeks_in_db = filter_rows()

# Weeks still to load, in the order they appear in `usf`, capped at ten.
pending_weeks = [
    candidate
    for candidate in usf['Week Beginning Date'].unique()
    if candidate not in weeks_in_db
][:10]

for cnt, week in enumerate(pending_weeks, start=1):
    print(week)
    insert_rows(usf[usf['Week Beginning Date'] == week])
        

    
    



2023-11-19 00:00:00
2023-12-17 00:00:00
2023-12-24 00:00:00


In [None]:
class bek_dictionary:
    """Reload ``[na_dist].[US_BEK_Segmentation]`` from a segment-mapping workbook.

    Instantiating the class does all the work, in this order:

    1. read the 'Segment Mapping' sheet of ``file_path`` (``process_file``),
    2. delete every row currently in the table (``delete_rows``),
    3. append the rows just read (``insert_rows``).

    NOTE(review): steps 2 and 3 use separate connections, so if the insert
    fails the table is left empty until the load is re-run.

    Attributes:
        filepath: path of the workbook passed to the constructor.
        data: DataFrame returned by ``process_file``.
    """

    # Connection settings shared by the pyodbc and SQLAlchemy helpers.
    _SERVER = 'mf-enterprise-dev-sql.46ac3df1733c.database.windows.net'
    _DATABASE = 'PWRAPPDB'
    _DRIVER = '{ODBC Driver 17 for SQL Server}'

    def __init__(self, file_path) -> None:
        """Read ``file_path`` and replace the table contents with it."""
        self.filepath = file_path
        # Read the workbook first so a bad file fails before anything is deleted.
        self.data = self.process_file()
        self.delete_rows()
        self.insert_rows()

    def process_file(self):
        """Read the 'Segment Mapping' sheet and keep/rename the mapping columns.

        Returns:
            pandas.DataFrame: the nine columns listed in ``columns_to_keep``,
            with the 'COVID Segmentation - ...' and 'Restaurant Service Type'
            headers shortened per ``rename_columns``.

        Raises:
            KeyError: if the sheet lacks any of the expected columns.
        """
        df = pd.read_excel(self.filepath, sheet_name='Segment Mapping')

        columns_to_keep = [
            'Business Unit',
            'SIC Code',
            'SIC Sub',
            'COVID Segmentation - L1',
            'COVID Segmentation - L2',
            'COVID Segmentation - (Restaurants)',
            'COVID Segmentation - (Restaurants: Sub-Segment)',
            'Restaurant Service Type',
            'Cuisine Type'
        ]

        rename_columns = {
            'COVID Segmentation - L1':'Segmentation L1',
            'COVID Segmentation - L2':'Segmentation L2',
            'COVID Segmentation - (Restaurants)':'Restaurants',
            'COVID Segmentation - (Restaurants: Sub-Segment)':'Restaurants Sub',
            'Restaurant Service Type':'Service Type'
        }

        return df[columns_to_keep].rename(columns=rename_columns)

    @staticmethod
    def setup_connection():
        """Open a pyodbc connection using AAD Integrated Authentication.

        Returns:
            tuple: ``(connection, cursor)``; the caller must close both.
        """
        conn_str = (
            f'DRIVER={bek_dictionary._DRIVER};'
            f'SERVER=tcp:{bek_dictionary._SERVER};'
            f'DATABASE={bek_dictionary._DATABASE};'
            'Authentication=ActiveDirectoryIntegrated'
        )

        cnxn = pyodbc.connect(conn_str)
        cursor = cnxn.cursor()

        return cnxn, cursor

    def delete_rows(self):
        """Delete every row of ``[na_dist].[US_BEK_Segmentation]`` and commit.

        The cursor and connection are closed even when the statement or the
        commit raises; in that case nothing is committed.
        """
        sql_delete = """
        DELETE FROM [na_dist].[US_BEK_Segmentation] 
        """

        cnxn, cursor = self.setup_connection()
        try:
            cursor.execute(sql_delete)
            cnxn.commit()
        finally:
            cursor.close()
            cnxn.close()

    @staticmethod
    def setup_miengine():
        """Build a SQLAlchemy engine (mssql+pyodbc) for the same database.

        The ODBC settings are passed through the ``odbc_connect`` query
        parameter; the engine options are those of the original code.

        Returns:
            The engine returned by ``create_engine``.
        """
        conn_str = URL.create(
            'mssql+pyodbc',
            query={
                'odbc_connect':(
                    f'DRIVER={bek_dictionary._DRIVER};'
                    f'SERVER=tcp:{bek_dictionary._SERVER};'
                    f'DATABASE={bek_dictionary._DATABASE};'
                    'Authentication=ActiveDirectoryIntegrated;'
                )
            }
        )

        engine = create_engine(conn_str, connect_args={"autocommit": True}, fast_executemany=True, use_insertmanyvalues=False)

        return engine

    def insert_rows(self):
        """Append ``self.data`` to ``[na_dist].[US_BEK_Segmentation]``.

        The engine is disposed afterwards, whether or not the write succeeds,
        so its pooled connections are not left open.
        """
        engine = self.setup_miengine()

        table_name = 'US_BEK_Segmentation'
        schema_name = 'na_dist'

        try:
            # If the table doesn't exist, it will be created automatically
            self.data.to_sql(table_name, con=engine, schema=schema_name, if_exists='append', index=False)
        finally:
            engine.dispose()