# Table of FSA approved premises

## Import required libraries

In [13]:
import re
import collections
import getpass
from pathlib import Path
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
import epydemiology as epy

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/philipjones/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
import pymysql
import sqlalchemy
#engine = sqlalchemy.create_engine('postgresql://scott:tiger@localhost:5432/mydatabase')

## Define functions that will combine lists of FSA approved premises
The list of files must be in chronological order so that the most recent file provides the most up-to-date information.

In [15]:
def phjUpdateData(phjOriginalDF,
                  phjNewDF,
                  phjColsOfInterestList,
                  phjIDColName = 'id',
                  phjYrColName = 'yr',
                  phjMthColName = 'mth',
                  phjGrpColName = 'grp',
                  phjNameColName = 'name',
                  phjCountColName = 'count',
                  phjTokColName = 'tokens',
                  phjPrevTokColName = 'prevtokens',
                  phjJaccardColName = 'jd',
                  phjMatchColName = 'match',
                  phjPrintResults = False,
                  phjCutOffValue = 0.5):
    """Merge a newer dataframe into an existing one, re-using IDs where names are unchanged.

    The two dataframes are concatenated and sorted by group, year and month.
    For each group the last two rows are tokenised and compared using the
    Jaccard distance between their name tokens. Rows whose name is judged
    unchanged replace the preceding row (keeping its ID); rows whose name
    has changed are kept alongside the preceding row. Rows that still have
    no ID afterwards are given new IDs following on from the current maximum.

    Parameters
    ----------
    phjOriginalDF, phjNewDF : pandas.DataFrame
        Existing data and newer data to be combined.
    phjColsOfInterestList : list
        Currently unused by this function (see the to-do notes below).
    phjIDColName, phjYrColName, phjMthColName, phjGrpColName, phjNameColName : str
        Names of the ID, year, month, grouping and name columns.
    phjCountColName, phjTokColName, phjPrevTokColName, phjJaccardColName, phjMatchColName : str
        Names of the working columns that are added to the dataframe.
    phjPrintResults : bool
        If true, intermediate dataframes are printed.
    phjCutOffValue : float
        Jaccard distance below which two names are treated as the same.

    Returns
    -------
    pandas.DataFrame
        Combined dataframe sorted by group, year and month, with the ID, year
        and month columns cast to int.
    """

    # Other things to do:
    # • Check that cols of interest appear in both dataframes
    # • Check that year and month columns already appear in original DF
    # • Check no duplicates in phjGrpColName columns

    # Concatenate data frames and sort by group, year and month to ensure
    # earliest rows occur as first row in each group.
    phjConcatDF = pd.concat([phjOriginalDF,phjNewDF],sort = False).sort_values(by = [phjGrpColName,
                                                                                     phjYrColName,
                                                                                     phjMthColName])

    # Reset index to make sure no duplicated values in index
    phjConcatDF = phjConcatDF.reset_index(drop = True)

    # Add count column to indicate how many rows in each groupby group
    phjConcatDF[phjCountColName] = phjConcatDF.groupby(phjGrpColName)[phjGrpColName].transform('count')

    if phjPrintResults:
        print('Concatenated dataframe\n======================')
        print(phjConcatDF)
        print('\n')

        print('Groups that appear only once (i.e. occur only in either original or new dataframes')
        print('==================================================================================')
        print(phjConcatDF.loc[phjConcatDF[phjCountColName] < 2,:])
        print('\n')

        print('Groups that appear more than twice (i.e. already two occurrences of group in database')
        print('=====================================================================================')
        print(phjConcatDF.loc[phjConcatDF[phjCountColName] > 2,:])
        print('\n')

    # Create mask identifying the last two rows of every group; only those rows are
    # processed further, earlier rows of larger groups are set aside and re-attached later.
    # Suggestion for this method given as answer by Nick Becker in response to question at:
    # https://stackoverflow.com/questions/59761547/create-mask-to-identify-final-two-rows-in-groups-in-pandas-dataframe
    phjLast2RowsMask = phjConcatDF.index.isin(phjConcatDF.groupby(phjGrpColName).tail(2).index)

    # Add columns of tokens and tokens from previous rows
    phjLast2RowsDF = phjAddColOfTokens(phjDF = phjConcatDF[phjLast2RowsMask],
                                       phjNameColName = phjNameColName,
                                       phjTokColName = phjTokColName,
                                       phjPrevTokColName = phjPrevTokColName,
                                       phjGrpColName = phjGrpColName,
                                       phjYrColName = phjYrColName,
                                       phjMthColName = phjMthColName,
                                       phjPrintResults = phjPrintResults)

    if phjPrintResults:
        print('Token columns added (only last 2 rows of each group retained)\n===================')
        print(phjLast2RowsDF)
        print('\n')

    # Compare tokens and previous tokens using Jaccard distance
    phjLast2RowsDF = phjCompareTokCols(phjDF = phjLast2RowsDF,
                                       phjGrpColName = phjGrpColName,
                                       phjTokColName = phjTokColName,
                                       phjPrevTokColName = phjPrevTokColName,
                                       phjJaccardColName = phjJaccardColName,
                                       phjPrintResults = phjPrintResults)

    # Define names that have changed
    phjLast2RowsDF = phjDefineChangedNames(phjDF = phjLast2RowsDF,
                                           phjGrpColName = phjGrpColName,
                                           phjNameColName = phjNameColName,
                                           phjJaccardColName = phjJaccardColName,
                                           phjMatchColName = phjMatchColName,
                                           phjCutOffValue = phjCutOffValue,
                                           phjPrintResults = phjPrintResults)

    # Remove unchanged names
    phjLast2RowsDF = phjRemoveUnchanged(phjDF = phjLast2RowsDF,
                                        phjIDColName = phjIDColName,
                                        phjGrpColName = phjGrpColName,
                                        phjMatchColName = phjMatchColName,
                                        phjCountColName = phjCountColName,
                                        phjPrintResults = phjPrintResults)

    # Re-attach the first rows from groups with more than 2 rows.
    # (pd.concat is used because DataFrame.append was removed in pandas 2.0.)
    phjConcatDF = pd.concat([phjConcatDF[~phjLast2RowsMask],phjLast2RowsDF],
                            ignore_index = False,
                            sort = False)

    if phjPrintResults:
        print('Recombined dataframes\n=====================')
        print(phjConcatDF)
        print('\n')

    # Update id column
    phjConcatDF = phjUpdateID(phjDF = phjConcatDF,
                              phjIDColName = phjIDColName,
                              phjYrColName = phjYrColName,
                              phjMthColName = phjMthColName,
                              phjPrintResults = phjPrintResults)

    # Some final corrections...
    phjConcatDF[phjIDColName] = phjConcatDF[phjIDColName].astype('int')
    phjConcatDF[phjYrColName] = phjConcatDF[phjYrColName].astype('int')
    phjConcatDF[phjMthColName] = phjConcatDF[phjMthColName].astype('int')
    phjConcatDF = phjConcatDF.reset_index(drop = True)
    phjConcatDF = phjConcatDF.sort_values(by = [phjGrpColName,phjYrColName,phjMthColName])

    if phjPrintResults:
        print('Final returned dataframe\n===================')
        print(phjConcatDF)
        print('\n')

    return phjConcatDF



def phjPreProcess(phjDF,
                  phjCountryDF,
                  phjCountryMissCode,
                  phjColsOfInterestList,
                  phjPrintResults = False):
    """Clean a raw approved-premises dataframe and keep only slaughterhouse rows.

    Pre-processing ensures only data of interest is retained and that minor
    anomalies in data entry are addressed (e.g. upper and lower case,
    punctuation marks, white space, etc.).

    Parameters
    ----------
    phjDF : pandas.DataFrame
        Raw data. After column-name cleaning it must contain the columns in
        phjColsOfInterestList, including 'slaughterhouse', 'tradingname',
        'appno', 'postcode' and 'country', which are used below.
    phjCountryDF : pandas.DataFrame
        Lookup table with 'country' and 'country_id' columns.
    phjCountryMissCode : int
        Value used for country_id where the country is not in the lookup.
    phjColsOfInterestList : list
        Cleaned (lowercase, no spaces) names of the columns to retain.
    phjPrintResults : bool
        If true, the counts of countries found in the data are printed.

    Returns
    -------
    pandas.DataFrame
        New dataframe; the input dataframe is not modified.
    """

    # Convert column names to lowercase, remove spaces and '???' characters ... just for consistency
    # (For some reason, the AppNo column in Dec 2019 was called '???AppNo').
    # rename() returns a new dataframe so the caller's column labels are left untouched.
    phjCleanColNames = phjDF.columns.str.replace(' ','',regex = False).str.replace('???','',regex = False).str.lower()
    phjDF = phjDF.rename(columns = dict(zip(phjDF.columns,phjCleanColNames)))

    # Retain only the columns of interest
    phjDF = phjDF[phjColsOfInterestList].copy()

    # Retain slaughterhouse approved premises only
    phjDF['slaughterhouse'] = phjDF['slaughterhouse'].str.lower()
    phjDF = phjDF.loc[phjDF['slaughterhouse'] == 'yes',:].copy()

    # Remove whitespace from front and back of trading name column
    phjDF['tradingname'] = phjDF['tradingname'].str.strip()

    # Format the appno column to make uppercase and remove non-word characters.
    # regex = True is given explicitly because the default changed to False in pandas 2.0.
    phjDF['appno'] = phjDF['appno'].str.upper().str.replace(r'[^\w]','',regex = True)

    # Add a column that contains the trading name but in lower case and with all punctuation
    # and spaces removed. Also, '&' is converted to 'and', 'ltd' to 'limited' and 'bros' to
    # 'brothers'. This ensures that subtle variations in the way the TradingName is written
    # won't be misinterpreted as a different company.
    # Actually probably not needed and not used - but leave for the moment
    phjDF['tradingname_lcase'] = (phjDF['tradingname']
                                  .str.replace('&','and',regex = False)
                                  .str.replace(r'[Ll]td','limited',regex = True)
                                  .str.replace(r'[Bb]ros','Brothers',regex = True)
                                  .str.lower()
                                  .str.replace(r'[^\w]','',regex = True))

    # Add column with postcode formatted to 7 characters
    phjDF = epy.phjPostcodeFormat7(phjDF = phjDF,
                                   phjPostcodeVarName = 'postcode',
                                   phjPostcodeCheckVarName = None,
                                   phjPostcode7VarName = 'postcode7',
                                   phjPrintResults = False)

    # Display all countries contained in file (a bare expression shows nothing inside a function)
    if phjPrintResults:
        print(phjDF['country'].value_counts())

    # Replace country names with country codes defined in phjCountryDF
    phjDF = pd.merge(phjDF,
                     phjCountryDF,
                     on = "country",
                     how = 'left')

    phjDF['country_id'] = phjDF['country_id'].fillna(phjCountryMissCode)

    # Remove original postcode and country columns
    phjDF = phjDF[[col for col in list(phjDF.columns) if col not in ['postcode','country']]].copy()

    return phjDF



# Combine two dataframes ensuring that if there are any repeated rows, only the latest version is retained.

def phjRetrieveOrigData():
    """Placeholder that currently does nothing and returns None."""
    return



def phjFlattenList(phjList):
    """Flatten a list whose items may themselves be lists (one level only).

    Items that are lists are expanded in place; every other item (including
    tuples and strings) is kept as a single element. Lists nested more than
    one level deep are only unpacked by one level.
    """
    return [phjItem
            for phjElement in phjList
            for phjItem in (phjElement if isinstance(phjElement,list) else [phjElement])]



def phjAddColOfTokens(phjDF,
                      phjNameColName,
                      phjTokColName,
                      phjPrevTokColName,
                      phjGrpColName,
                      phjYrColName,
                      phjMthColName,
                      phjPrintResults = False):
    """Add a column of name tokens and a column holding the previous row's tokens.

    The dataframe is sorted by group, year and month. Each name is lower-cased,
    hyphens and dashes are converted to spaces, and the result is tokenised
    with nltk.word_tokenize(). Tokens are normalised through a replacement
    dictionary and tokens in the stop list are dropped.

    The previous-token column is a plain one-row shift, so the first row of
    each group holds the tokens of the preceding group's last row.

    Parameters
    ----------
    phjDF : pandas.DataFrame
        Data containing the name, group, year and month columns.
    phjNameColName : str
        Column containing the names to tokenise (values must be strings).
    phjTokColName, phjPrevTokColName : str
        Names of the token and previous-token columns to create.
    phjGrpColName, phjYrColName, phjMthColName : str
        Columns used to sort the data.
    phjPrintResults : bool
        Currently unused.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of the data with the two token columns added.
    """

    phjStopList = ['(',')','/','&',',','-','–','—','and','the','limited']

    # Replacement values are strings because nltk.word_tokenize() returns strings;
    # an integer 2 would never match the '2' token produced from '2 sisters'.
    phjReplaceDict = {'t/a':'ta',
                      'ltd':'limited',
                      'bros':'brothers',
                      '&':'and',
                      '2sisters':['2','sisters']}

    # Sort data based on grouping variable and date variables
    phjDF = phjDF.sort_values(by = [phjGrpColName,
                                    phjYrColName,
                                    phjMthColName])

    def phjTokeniseName(phjName):
        # Convert hyphens and en and em dashes to spaces before tokenising
        phjCleanName = phjName.lower().replace('-',' ').replace('–',' ').replace('—',' ')

        # Dictionary replacements may produce lists within the list; phjFlattenList()
        # flattens these providing the nested lists are only 1 deep.
        phjRawToks = [phjReplaceDict.get(tok,tok) for tok in nltk.word_tokenize(phjCleanName)]

        return [tok for tok in phjFlattenList(phjRawToks) if tok not in phjStopList]

    phjDF[phjTokColName] = phjDF[phjNameColName].apply(phjTokeniseName)

    # Create a new column with tokens offset by 1 place so new column contains the
    # tokens from the previous row
    phjDF[phjPrevTokColName] = phjDF[phjTokColName].shift(1)

    return phjDF



def _phjJaccardDistance(phjToks,phjPrevToks):
    """Return the Jaccard distance (|A∪B| - |A∩B|) / |A∪B| between two token collections.

    Two empty collections are treated as identical (distance 0.0) rather than
    raising ZeroDivisionError.
    """
    phjSetA = set(phjToks)
    phjSetB = set(phjPrevToks)
    phjUnion = phjSetA | phjSetB

    if not phjUnion:
        return 0.0

    return (len(phjUnion) - len(phjSetA & phjSetB)) / len(phjUnion)


def phjCompareTokCols(phjDF,
                      phjGrpColName,
                      phjTokColName,
                      phjPrevTokColName,
                      phjJaccardColName,
                      phjPrintResults = False):
    """Add a column holding the Jaccard distance between tokens and previous tokens.

    The previous-token value in the first row of every group is set to NaN
    because that row has no predecessor within its group. The distance is
    only calculated where the previous-token cell contains a list; all other
    rows receive NaN.

    Parameters
    ----------
    phjDF : pandas.DataFrame
        Data containing the grouping and token columns; modified in place.
    phjGrpColName, phjTokColName, phjPrevTokColName : str
        Names of the grouping, token and previous-token columns.
    phjJaccardColName : str
        Name of the distance column to create.
    phjPrintResults : bool
        If true, the resulting dataframe is printed.

    Returns
    -------
    pandas.DataFrame or None
        The dataframe with the distance column added, or None (after printing
        a message) if the index contains duplicate values.
    """

    # Need to check that index does not include duplicate values (because the
    # next step makes changes based on index labels)
    if not phjDF.index.is_unique:
        print("Index values not unique")
        return None

    # For each group, the first row can't be compared with the preceding
    # row. Therefore, remove the tokens from the preceding row. The method for doing this
    # was given by EdChum - Reinstate Monica at
    # https://stackoverflow.com/questions/46242488/change-first-element-of-each-group-in-pandas-dataframe
    phjFirstRowIndex = phjDF.groupby(phjGrpColName)[phjPrevTokColName].head(1).index
    phjDF.loc[phjFirstRowIndex,phjPrevTokColName] = np.nan

    # Only calculate the distance if the previous-token cell contains a list, i.e. don't try
    # to calculate a difference against a NaN value. A list is built (rather than using
    # apply(axis = 1)) so that an empty dataframe still produces a float column.
    phjDistances = [_phjJaccardDistance(phjToks,phjPrevToks) if isinstance(phjPrevToks,list) else np.nan
                    for phjToks,phjPrevToks in zip(phjDF[phjTokColName],phjDF[phjPrevTokColName])]

    phjDF[phjJaccardColName] = pd.Series(phjDistances,index = phjDF.index,dtype = float)

    if phjPrintResults:
        print('Token columns compared\n======================')
        print(phjDF)
        print('\n')

    return phjDF



def phjDefineChangedNames(phjDF,
                          phjGrpColName,
                          phjNameColName,
                          phjJaccardColName,
                          phjMatchColName,
                          phjCutOffValue = 0.5,
                          phjPrintResults = False):
    """Add a 'yes'/'no' column stating whether a name matches the previous row's name.

    A row is marked 'yes' when its Jaccard distance is less than
    phjCutOffValue, 'no' when it is greater than or equal to the cut-off, and
    NaN when the distance itself is missing.

    Parameters
    ----------
    phjDF : pandas.DataFrame
        Data containing the Jaccard distance column; modified in place.
    phjGrpColName, phjNameColName : str
        Names of the grouping and name columns (not used in the calculation).
    phjJaccardColName : str
        Name of the column holding the Jaccard distance.
    phjMatchColName : str
        Name of the match column to create.
    phjCutOffValue : float
        Distance below which names are considered to match.
    phjPrintResults : bool
        If true, print rows with a distance greater than 0 together with the
        row positioned immediately before each of them.

    Returns
    -------
    pandas.DataFrame
        The dataframe with the match column added.
    """

    phjJaccard = phjDF[phjJaccardColName]

    # Provide a cut-off to interpret Jaccard distance; rows with no distance get NaN
    phjMatch = phjJaccard.lt(phjCutOffValue).map({True:'yes',False:'no'})
    phjDF[phjMatchColName] = phjMatch.where(phjJaccard.notnull(),np.nan)

    if phjPrintResults:
        # Positions (not labels) are used so that the preceding row can be found
        # whatever the index labels are.
        phjAlteredPosList = [int(i) for i in np.flatnonzero(phjJaccard.gt(0).to_numpy())]
        phjShowPosList = sorted(set(phjAlteredPosList) | {i - 1 for i in phjAlteredPosList if i > 0})

        print('Altered rows of data\n====================')
        print(phjDF.iloc[phjShowPosList,:])
        print('\n')

    return phjDF



def phjRemoveUnchanged(phjDF,
                       phjIDColName,
                       phjGrpColName,
                       phjMatchColName,
                       phjCountColName,
                       phjPrintResults = False):
    """Drop superseded rows where the name is unchanged, carrying the ID forward.

    Where a row is marked 'yes' in the match column it takes the ID of the
    row immediately above it. A row is then retained if it is marked 'yes',
    if it belongs to a group containing at least one 'no' row, or if its
    count column equals 1. The approach was suggested as an answer by Erfin at:
    https://stackoverflow.com/questions/59568154/updating-a-pandas-dataframe-with-new-data-whilst-retaining-existing-id-number

    The ID column of the dataframe passed in is overwritten in place; the
    filtered dataframe is returned.
    """

    phjIsSameName = phjDF[phjMatchColName] == 'yes'
    phjIsChangedName = phjDF[phjMatchColName] == 'no'
    phjIsOnlyRow = phjDF[phjCountColName] == 1

    # Rows with an unchanged name inherit the ID from the row above
    phjPrevIDs = phjDF[phjIDColName].shift()
    phjDF[phjIDColName] = np.where(phjIsSameName,phjPrevIDs,phjDF[phjIDColName])

    # Flag every row of any group that contains a changed name so the whole group is kept
    phjGrpHasChange = phjIsChangedName.groupby(phjDF[phjGrpColName]).transform('any')

    phjKeepMask = phjGrpHasChange | phjIsSameName | phjIsOnlyRow
    phjDF = phjDF[phjKeepMask]

    if phjPrintResults == True:
        print('Updated dataframe\n=================')
        print(phjDF)
        print('\n')

    return phjDF



def phjUpdateID(phjDF,
                phjIDColName = 'id',
                phjYrColName = 'yr',
                phjMthColName = 'mth',
                phjPrintResults = False):
    
    # Sort dataframe by ID and date columns, ensuring that NaNs are left at the end
    phjDF = phjDF.sort_values(by = [phjIDColName,phjYrColName,phjMthColName],
                              na_position = 'last')
    
    # Create a list of new ID values that follow-on from those ID values that already exist.
    # But first, need to check whether the max() value is null (which can happen if the dataframe is empty).
    if pd.isnull(phjDF[phjIDColName].max()):
        phjNewIDs = list(range(1,
                         len(phjDF.loc[phjDF[phjIDColName].isnull(),:]) + 1))
    else:
        phjNewIDs = list(range(phjDF[phjIDColName].max().astype(int) + 1,
                               phjDF[phjIDColName].max().astype(int) + len(phjDF.loc[phjDF[phjIDColName].isnull(),:]) + 1))

    # Where ID values are NaN, replace with range of new IDs
    phjDF.loc[phjDF[phjIDColName].isnull(),[phjIDColName]] = phjNewIDs

    if phjPrintResults == True:
        print('Updated ID values\n=================')
        print(phjDF)
        print('\n')
    
    return phjDF

## Testing with hypothetical data

In [16]:
# Missing-value codes used with the hypothetical data
phjMissingCodesDict = dict(phjMissValueStr = 'missing',
                           phjCPHCountyMissCode = 999,
                           phjCountryMissCode = 99,
                           phjAnimSrcMissCode = 400,
                           phjSppGrpMissCode = 99,
                           phjOffCCIRMissCode = 999,
                           phjInspectionTypeMissCode = 785)

# Lookup table of country codes (the final code is the missing-country code)
phjCountryDF = pd.DataFrame({'country_id':list(range(1,8)) + [phjMissingCodesDict['phjCountryMissCode']],
                             'country':['England','Wales','Scotland','Northern Ireland','Guernsey','Jersey','Isle of Man','missing']})

print(phjCountryDF)
print(phjCountryDF.dtypes)
print('\n')

phjColsOfInterestList = ['premisesname_id','appno','tradingname','country','yr','mth','slaughterhouse','postcode']

# Original dataframe: every row already has a premises ID.
# Group '556' has four rows so that groups with more than two rows are exercised.
origDF = pd.DataFrame({'premisesname_id':[1,2,3,4,5,6,11,12,13,7,8,9,10],
                       'appno':['123','226','321','356','403','556','556','556','556','598','612','663','785'],
                       'tradingname':['Smith and Jones','ABC Ltd','Stuff Galore Ltd','Quality Food','Pure Heaven','XYZ Foods','ABC Foods','LMN Foods','AAA Foods','Fry Foods','Farm Foods','Good Eggs Limited','Farm to Fork Food'],
                       'country':['England']*4 + ['Wales'] + ['Scotland']*4 + ['Wales','England','England','Northern Ireland'],
                       'yr':[2018]*13,
                       'mth':[4]*5 + [1,2,3] + [4]*5,
                       'slaughterhouse':['yes']*13,
                       'postcode':['np45df','ab123df','w14sd','cd49gs','eg349dg'] + ['h118gs']*4 + ['j28gd','h89ad','p98gs','ts19yu']})

# Preprocess the original data
origDF = phjPreProcess(phjDF = origDF,
                       phjColsOfInterestList = phjColsOfInterestList,
                       phjCountryMissCode = phjMissingCodesDict['phjCountryMissCode'],
                       phjCountryDF = phjCountryDF)

print('Original dataframe (post processing)\n==================')
print(origDF)
print('\n')



# Newer dataframe: the premises ID column exists but every value is NaN initially
newDF = pd.DataFrame({'premisesname_id':[np.nan]*9,
                      'appno':['123','226','356','403','556','598','663','785','444'],
                      'tradingname':['Smith and Jones Bros','2sisters ABC Ltd','Completely Different','Pure Heaven','AAA Foods','Fry & Sons Foods','Good Eggs Ltd','Farm-2-Fork Food','Food Heaven'],
                      'country':['England']*3 + ['Wales','Scotland','England','England','Northern Ireland','Wales'],
                      'yr':[2018]*9,
                      'mth':[5]*9,
                      'slaughterhouse':['yes']*9,
                      'postcode':['np45df','ab123df','w14sd','eg349dg','h118gs','j28gd','p98gs','ts19yu','xy987ab']})

# Preprocess the newer data
newDF = phjPreProcess(phjDF = newDF,
                      phjColsOfInterestList = phjColsOfInterestList,
                      phjCountryMissCode = phjMissingCodesDict['phjCountryMissCode'],
                      phjCountryDF = phjCountryDF)

print('Newer dataframe (post processing)\n===============')
print(newDF)
print('\n')



# Combine the two dataframes, printing every intermediate step
phjCombinedDF = phjUpdateData(phjOriginalDF = origDF,
                              phjNewDF = newDF,
                              phjColsOfInterestList = phjColsOfInterestList,
                              phjIDColName = 'premisesname_id',
                              phjGrpColName = 'appno',
                              phjNameColName = 'tradingname',
                              phjYrColName = 'yr',
                              phjMthColName = 'mth',
                              phjCountColName = 'count',
                              phjTokColName = 'tokens',
                              phjPrevTokColName = 'prevtokens',
                              phjJaccardColName = 'jd',
                              phjMatchColName = 'samename',
                              phjPrintResults = True)

   country_id           country
0           1           England
1           2             Wales
2           3          Scotland
3           4  Northern Ireland
4           5          Guernsey
5           6            Jersey
6           7       Isle of Man
7          99           missing
country_id     int64
country       object
dtype: object


Original dataframe (post processing)
    premisesname_id appno        tradingname    yr  mth slaughterhouse  \
0                 1   123    Smith and Jones  2018    4            yes   
1                 2   226            ABC Ltd  2018    4            yes   
2                 3   321   Stuff Galore Ltd  2018    4            yes   
3                 4   356       Quality Food  2018    4            yes   
4                 5   403        Pure Heaven  2018    4            yes   
5                 6   556          XYZ Foods  2018    1            yes   
6                11   556          ABC Foods  2018    2            yes   
7                12   556

## Test with two files

In [17]:
# Directory holding the monthly CSV files
phjPath = './monthly_approved_premises'

# The two files to compare (oldest first)
phjFilesList = ["approved-food-establishments-as-at-1-january-2018.csv",
                "approved-food-establishments-as-at-1-december-2019.csv"]

# Missing-value codes
phjMissingCodesDict = dict(phjMissValueStr = 'missing',
                           phjCPHCountyMissCode = 999,
                           phjCountryMissCode = 999,
                           phjAnimSrcMissCode = 999,
                           phjSppGrpMissCode = 999,
                           phjOffCCIRMissCode = 999,
                           phjInspectionTypeMissCode = 999)

# Lookup table of country codes (the final code is the missing-country code)
phjCountryDF = pd.DataFrame({'country_id':list(range(1,8)) + [phjMissingCodesDict['phjCountryMissCode']],
                             'country':['England','Wales','Scotland','Northern Ireland','Guernsey','Jersey','Isle of Man','missing']})

print(phjCountryDF)
print(phjCountryDF.dtypes)
print('\n')

phjColsOfInterestList = ['appno','tradingname','town','postcode','country','slaughterhouse','x','y']

# Read and preprocess the older file
origDF = pd.read_csv(Path(phjPath) / phjFilesList[0])

origDF = phjPreProcess(phjDF = origDF,
                       phjColsOfInterestList = phjColsOfInterestList,
                       phjCountryMissCode = phjMissingCodesDict['phjCountryMissCode'],
                       phjCountryDF = phjCountryDF)

# The original data gets consecutive IDs (as first column) and its file date
origDF.insert(0,'id',range(1,len(origDF.index)+1))
origDF['yr'] = 2018
origDF['mth'] = 1


print('Original dataframe (post processing)\n==================')
print(origDF.sort_values(by = ['appno','id']))
print('\n')



# Read and preprocess the newer file
newDF = pd.read_csv(Path(phjPath) / phjFilesList[1])


newDF = phjPreProcess(phjDF = newDF,
                      phjColsOfInterestList = phjColsOfInterestList,
                      phjCountryMissCode = phjMissingCodesDict['phjCountryMissCode'],
                      phjCountryDF = phjCountryDF)

# The newer data has an ID column (as first column) but all values are NaN initially
newDF.insert(0,'id',np.nan)
newDF['yr'] = 2019
newDF['mth'] = 12

print('Newer dataframe (post processing)\n===============')
print(newDF.sort_values(by = ['appno','id']))
print('\n')



# Combine the two dataframes, printing every intermediate step
phjCombinedDF = phjUpdateData(phjOriginalDF = origDF,
                              phjNewDF = newDF,
                              phjColsOfInterestList = phjColsOfInterestList,
                              phjIDColName = 'id',
                              phjGrpColName = 'appno',
                              phjNameColName = 'tradingname',
                              phjYrColName = 'yr',
                              phjMthColName = 'mth',
                              phjCountColName = 'count',
                              phjTokColName = 'tokens',
                              phjPrevTokColName = 'prevtokens',
                              phjJaccardColName = 'jd',
                              phjMatchColName = 'samename',
                              phjPrintResults = True)

print(phjCombinedDF.sort_values(by = ['appno','id']))

   country_id           country
0           1           England
1           2             Wales
2           3          Scotland
3           4  Northern Ireland
4           5          Guernsey
5           6            Jersey
6           7       Isle of Man
7         999           missing
country_id     int64
country       object
dtype: object


Original dataframe (post processing)
      id  appno                                        tradingname  \
0      1   1007                           Frank Bird (Poultry) Ltd   
1      2   1077                Joe Simpson (T/A F Simpson and Son)   
281  282   1100  2 Sisters Poultry Ltd (TA 2 Sisters Coupar Angus)   
282  283   1101             Anglo Beef Processors UK T/A ABP Perth   
283  284   1103            Neerock Limited (T/A Woodhead Brothers)   
..   ...    ...                                                ...   
277  278   9509                                 Moy Park Ballymena   
278  279   9518                             McKeown Fine 

Token columns added (only last 2 rows of each group retained)
        id  appno                                        tradingname  \
0      1.0   1007                           Frank Bird (Poultry) Ltd   
1      NaN   1007                           Frank Bird (Poultry) Ltd   
2      2.0   1077                Joe Simpson (T/A F Simpson and Son)   
3      NaN   1077                Joe Simpson (T/A F Simpson and Son)   
4    282.0   1100  2 Sisters Poultry Ltd (TA 2 Sisters Coupar Angus)   
..     ...    ...                                                ...   
648    NaN   9552                                     Kearns Poultry   
649  281.0   9554                                   Rockvale Poultry   
650    NaN   9554                                   Rockvale Poultry   
651  262.0  GX401                         State of Guernsey Abattoir   
652    NaN   IOM1                                  Isle of Man Meats   

                town slaughterhouse         x         y  \
0            P

      id  appno                                        tradingname  \
0      1   1007                           Frank Bird (Poultry) Ltd   
1      2   1077                Joe Simpson (T/A F Simpson and Son)   
281  282   1100  2 Sisters Poultry Ltd (TA 2 Sisters Coupar Angus)   
282  283   1101             Anglo Beef Processors UK T/A ABP Perth   
283  284   1103            Neerock Limited (T/A Woodhead Brothers)   
..   ...    ...                                                ...   
278  279   9518                             McKeown Fine Foods Ltd   
279  280   9552                                     Kearns Poultry   
280  281   9554                                   Rockvale Poultry   
261  262  GX401                         State of Guernsey Abattoir   
368  369   IOM1                                  Isle of Man Meats   

                town slaughterhouse         x         y  \
0            Penrith            yes  357081.0  533629.0   
1    Bishop Auckland            yes  4128

## Testing with real data files

### Define additional required data

#### Run real datafiles and add to MySQL database

In [18]:
# Years for which monthly files are processed
phjYearList = [2018,2019,2020]

# Month names in calendar order
phjMonthList = ['january','february','march','april','may','june',
                'july','august','september','october','november','december']

# Ordered mapping of month name to month number (january = 1 ... december = 12)
phjMonthOrdDict = collections.OrderedDict((phjMonthName,phjMonthNum)
                                          for phjMonthNum,phjMonthName in enumerate(phjMonthList,start = 1))

print(phjMonthList)

['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']


In [19]:
# Codes used to represent missing values
phjMissingCodesDict = dict(phjMissValueStr = 'missing',
                           phjCPHCountyMissCode = 999,
                           phjCountryMissCode = 999,
                           phjAnimSrcMissCode = 999,
                           phjSppGrpMissCode = 999,
                           phjOffCCIRMissCode = 999,
                           phjInspectionTypeMissCode = 999)

In [20]:
# Lookup table of country codes (the final code is the missing-country code)
phjCountryDF = pd.DataFrame({'country_id':list(range(1,8)) + [phjMissingCodesDict['phjCountryMissCode']],
                             'country':['England','Wales','Scotland','Northern Ireland','Guernsey','Jersey','Isle of Man','missing']})

print(phjCountryDF)

   country_id           country
0           1           England
1           2             Wales
2           3          Scotland
3           4  Northern Ireland
4           5          Guernsey
5           6            Jersey
6           7       Isle of Man
7         999           missing


#### Run files

In [21]:
# Make connection to database where data will be retrieved and stored
# Instructions for creating sql engine taken from: https://docs.sqlalchemy.org/en/13/dialects/mysql.html
from urllib.parse import quote_plus

phjUsername = input('Enter username: ')
phjPassword = getpass.getpass('Enter password: ')

# Credentials are URL-escaped so that characters such as '@', ':' or '/' in the
# username or password cannot corrupt the connection URL.
phjSQLEng = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@localhost/ccir_data'.format(quote_plus(phjUsername),
                                                                                          quote_plus(phjPassword)))

# Drop and re-create the `appno` table, then empty the `abattoir` table.
# The `abattoir` table is emptied with DELETE (rather than being replaced) so that its
# structure is left unchanged. All statements run in a single transaction and are wrapped
# in sqlalchemy.text() because newer SQLAlchemy versions do not accept plain SQL strings.
with phjSQLEng.begin() as conn:
    conn.execute(sqlalchemy.text('DROP TABLE IF EXISTS `appno`'))

#    Earlier table definition using an auto-increment id as primary key:
#    conn.execute("""CREATE TABLE IF NOT EXISTS `ccir_data`.`appno` (
#                      `id` INT(4) UNSIGNED NOT NULL AUTO_INCREMENT,
#                      `appno` VARCHAR(12) CHARACTER SET 'utf8mb4' COLLATE 'utf8mb4_0900_ai_ci' NOT NULL DEFAULT '',
#                      `yr` INT(4) UNSIGNED NOT NULL,
#                      `mth` INT(4) UNSIGNED NOT NULL,
#                      `abattoir_id` INT(4) UNSIGNED NOT NULL,
#                    PRIMARY KEY (`id`),
#                    UNIQUE INDEX `appno-yr-mth_UNIQUE` (`appno` ASC, `yr` ASC, `mth` ASC) VISIBLE)
#                    ENGINE = InnoDB
#                    AUTO_INCREMENT = 1
#                    DEFAULT CHARACTER SET = utf8mb4
#                    COLLATE = utf8mb4_0900_ai_ci
#                    """)

    conn.execute(sqlalchemy.text("""CREATE TABLE IF NOT EXISTS `ccir_data`.`appno` (
                      `appno` VARCHAR(12) CHARACTER SET 'utf8mb4' COLLATE 'utf8mb4_0900_ai_ci' NOT NULL DEFAULT '',
                      `yr` INT(4) UNSIGNED NOT NULL,
                      `mth` INT(4) UNSIGNED NOT NULL,
                      `abattoir_id` INT(4) UNSIGNED NOT NULL,
                      PRIMARY KEY (`appno`,`yr`,`mth`))
                    ENGINE = InnoDB
                    DEFAULT CHARACTER SET = utf8mb4
                    COLLATE = utf8mb4_0900_ai_ci
                    """))


    conn.execute(sqlalchemy.text('DELETE FROM `abattoir`'))
    


Enter username: root
Enter password: ········


  result = self._query(query)


In [22]:
# Files listing FSA approved premises taken from:
# https://data.food.gov.uk/catalog/datasets/1e61736a-2a1a-4c6a-b8b1-e45912ebc8e3
from urllib.parse import quote

# Path to directory
phjPath = './monthly_approved_premises'

# List of files. Years form the outer loop and months the inner loop, so files are processed
# month-by-month within each year (chronological order assumes phjYearList is ascending and
# phjMonthList is in calendar order).
phjFilesList = ["approved-food-establishments-as-at-1-{}-{}.csv".format(mth,yr) for yr in phjYearList for mth in phjMonthList]

# Columns to retain in data
phjColsOfInterestList  = ['appno','tradingname','town','postcode','country','slaughterhouse','x','y']

# Make connection to database where data will be retrieved and stored
# Instructions for creating sql engine taken from: https://docs.sqlalchemy.org/en/13/dialects/mysql.html
phjUsername = input('Enter username: ')
phjPassword = getpass.getpass('Enter password: ')

# Percent-encode the credentials before embedding them in the URL; a raw password containing
# characters such as '@', ':' or '/' would otherwise be mis-parsed as part of the URL structure.
phjSQLEng = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@localhost/ccir_data'.format(quote(phjUsername, safe = ''),
                                                                                          quote(phjPassword, safe = '')))

# Maps database column names (keys) to the names used in the dataframes (values). The mapping is
# applied when reading from the 'abattoir' table and reversed when writing back.
phjRenameYrMthDict = {'last_yr':'yr',
                      'last_mth':'mth'}

for phjFilename in phjFilesList:
    
    print('Filename: {}'.format(phjFilename))
    
    # Retrieve data from database to act as original data
    # ---------------------------------------------------
    try:
        fsaAppno1DF = pd.read_sql('SELECT * from `abattoir`', con=phjSQLEng)
        
        # The year and month columns read from the database are renamed to 'yr' and 'mth'
        # (see phjRenameYrMthDict) to enable the dataframe to be combined with the new data file.
        fsaAppno1DF = fsaAppno1DF.rename(columns = phjRenameYrMthDict)
    
    # Depending on the pandas version, a failure in the SQLAlchemy layer may surface either as a
    # pandas DatabaseError or as the underlying SQLAlchemy exception; both stop the loop.
    except (pd.io.sql.DatabaseError, sqlalchemy.exc.SQLAlchemyError) as e:
        print('\nA DatabaseError occurred.')
        print(e)
        fsaAppno1DF = None
        break
    
    print('First dataframe (retrieved from DB)\n===============')
    print(fsaAppno1DF)
    print('\n')

    # Retrieve next file in list to act as latest data
    # ------------------------------------------------
    try:
        # Read csv file
        fsaAppno2DF = pd.read_csv(Path(phjPath) / phjFilename)

        # Preprocess the new file
        fsaAppno2DF = phjPreProcess(phjDF = fsaAppno2DF,
                                    phjCountryDF = phjCountryDF,
                                    phjCountryMissCode = phjMissingCodesDict['phjCountryMissCode'],
                                    phjColsOfInterestList = phjColsOfInterestList)

        # Add id column (initially empty) and move it to front
        fsaAppno2DF['id'] = np.nan
        fsaAppno2DF = fsaAppno2DF[['id'] + [i for i in fsaAppno2DF.columns if i not in ['id']]]

        # Add yr and mth columns
        # Extract year and month from filename (month name is converted to a number using
        # phjMonthOrdDict) and add to dataframe
        phjExtrDate = re.search(r'as-at-1-(?P<mth>.*)-(?P<yr>.*)\.csv',phjFilename)
        phjFileYr = int(phjExtrDate['yr'])
        phjFileMth = int(phjMonthOrdDict[phjExtrDate['mth']])
        fsaAppno2DF['yr'] = phjFileYr
        fsaAppno2DF['mth'] = phjFileMth

        print('Second dataframe (post processing)\n================')
        print(fsaAppno2DF)
        print('\n')
    
    except FileNotFoundError as e:
        # A missing monthly file is not fatal: report it and move on to the next file.
        print('There was error opening the file {}.'.format(phjFilename))
        print(e)
        print('\n')
            
        fsaAppno2DF = None
        
    
    # Only proceed if both the database data and the new file were loaded
    if (fsaAppno1DF is not None) and (fsaAppno2DF is not None):
        
        # Combine dataframes
        # ------------------
        phjCombinedDF = phjUpdateData(phjOriginalDF = fsaAppno1DF,
                                      phjNewDF = fsaAppno2DF,
                                      phjColsOfInterestList = phjColsOfInterestList,
                                      phjIDColName = 'id',
                                      phjYrColName = 'yr',
                                      phjMthColName = 'mth',
                                      phjGrpColName = 'appno',
                                      phjNameColName = 'tradingname',
                                      phjCountColName = 'count',
                                      phjTokColName = 'tokens',
                                      phjPrevTokColName = 'prevtokens',
                                      phjJaccardColName = 'jd',
                                      phjMatchColName = 'samename',
                                      phjPrintResults = False)
        
        print('Combined dataframe (post processing)\n================')
        print(phjCombinedDF)
        print('\n')


        # Write data to 'abattoir' table
        # ------------------------------
        
        # The 'abattoir' table defines an id number for each incarnation of an abattoir. For abattoirs that
        # have only minor changes in details, the same id number will be retained but the latest version
        # of an abattoir's metadata will be retained. If an abattoir name changes considerably (e.g. the
        # abattoir is bought by another company) then a new id number will be created.
        
        # The columns that need to be written back to the database are included in the phjColsOfInterestList
        # but with 'country' replaced by 'country_id' and 'postcode' replaced by 'postcode7'.
        # Before writing back to the database, the 'yr' and 'mth' columns must be renamed to the database
        # column names; this is done by reversing the key:value pairs in the phjRenameYrMthDict dictionary.
        # NOTE(review): earlier comments and dtype keys referred to 'latest_yr'/'latest_mth' whereas
        # phjRenameYrMthDict uses 'last_yr'/'last_mth' - confirm against the actual table schema.
        # For code to replace the items within a list (i.e. 'country' to 'country_id' etc.) see:
        # https://stackoverflow.com/questions/53294611/pandas-to-sql-changing-datatype-in-database-table
        
        # Delete contents of table first (don't use 'replace' because it changes the structure of the table)
        with phjSQLEng.begin() as conn:     
            conn.execute(sqlalchemy.text('DELETE FROM `abattoir`'))
        
        # Then append data in dataframe (to empty database)
        phjColNameRepl = {'country':'country_id',
                          'postcode':'postcode7'}

        # Select the columns to store (using database names for country and postcode) and rename
        # the year and month columns back to their database names.
        phjAbattoirColsList = [phjColNameRepl.get(i,i) for i in ['id']+phjColsOfInterestList+['yr','mth']]
        phjAbattoirDF = phjCombinedDF[phjAbattoirColsList].rename(columns = {v:k for k,v in phjRenameYrMthDict.items()})

        phjAbattoirDtypesDict = {'id':sqlalchemy.types.INTEGER(),
                                 'appno':sqlalchemy.types.VARCHAR(length=12),
                                 'tradingname':sqlalchemy.types.VARCHAR(length=254),
                                 'town':sqlalchemy.types.VARCHAR(length=254),
                                 'country_id':sqlalchemy.types.INTEGER(),
                                 'slaughterhouse':sqlalchemy.types.VARCHAR(length=12),
                                 'x':sqlalchemy.types.FLOAT(),
                                 'y':sqlalchemy.types.FLOAT()}

        # Year and month dtype keys are taken from phjRenameYrMthDict so that they always match
        # the column names actually written.
        phjAbattoirDtypesDict.update({phjDBColName:sqlalchemy.types.INTEGER() for phjDBColName in phjRenameYrMthDict})

        phjAbattoirDF.to_sql(con = phjSQLEng,
                             name = 'abattoir',
                             if_exists = 'append',
                             index = False,
                             dtype = phjAbattoirDtypesDict)
        
        # Write data to 'appno' table
        # ===========================
        # This table links the appno for any given month and year with the name of the abattoir on that
        # particular occasion using the 'abattoir_id' field. The only data that needs to be retained is
        # the information relating to the yr and mth of the current file being processed.
        # Rows are added to the table for each new file. The table is not cleared inside this loop.
        #
        # NOTE(review): earlier notes here described an AUTO_INCREMENT `id` column on `appno` and how to
        # reset it (DELETE FROM `appno`; ALTER TABLE `appno` AUTO_INCREMENT = 1;) or to drop and recreate
        # the table with PRIMARY KEY (`id`). The CREATE TABLE statement run earlier in this notebook
        # instead defines a composite primary key (`appno`,`yr`,`mth`) with no `id` column, so those
        # notes appear to be out of date. With that key, re-running this cell without first recreating
        # the table would presumably fail on duplicate keys - confirm.
        
        print('Combined appno dataframe')
        print('------------------------')
        
        # Rows belonging to the year and month of the file currently being processed
        phjCurrentFileMask = (phjCombinedDF['yr'] == phjFileYr) & (phjCombinedDF['mth'] == phjFileMth)
        phjAppnoDF = phjCombinedDF.loc[phjCurrentFileMask,['appno','yr','mth','id']]

        print(phjAppnoDF)

        phjAppnoDF.rename(columns = {'id':'abattoir_id'}).to_sql(con = phjSQLEng,
                                                                 name = 'appno',
                                                                 if_exists = 'append',
                                                                 index = False,
                                                                 dtype = {'appno':sqlalchemy.types.VARCHAR(length=12),
                                                                          'yr':sqlalchemy.types.INTEGER(),
                                                                          'mth':sqlalchemy.types.INTEGER(),
                                                                          'abattoir_id':sqlalchemy.types.INTEGER()})

        
print('All done')

Enter username: root
Enter password: ········
Filename: approved-food-establishments-as-at-1-january-2018.csv
First dataframe (retrieved from DB)
Empty DataFrame
Columns: [id, appno, tradingname, town, postcode7, country_id, slaughterhouse, x, y, yr, mth]
Index: []


Second dataframe (post processing)
     id appno                                        tradingname  \
0   NaN  1007                           Frank Bird (Poultry) Ltd   
1   NaN  1077                Joe Simpson (T/A F Simpson and Son)   
2   NaN  2019                                    HCF Poultry Ltd   
3   NaN  2023                       T Soanes & Son (Poultry) Ltd   
4   NaN  2037                           2 Sisters Food Group Ltd   
..   ..   ...                                                ...   
335 NaN  7161                  Capestone Organic Poultry Limited   
336 NaN  7164                               Usk Vale Poultry Ltd   
337 NaN  7176  2 Sisters Red Meat Ltd T/A St Merryn Foods - M...   
338 NaN  7182    

Second dataframe (post processing)
     id appno                                        tradingname  \
0   NaN  1007                           Frank Bird (Poultry) Ltd   
1   NaN  1077                Joe Simpson (T/A F Simpson and Son)   
2   NaN  2019                                    HCF Poultry Ltd   
3   NaN  2023                       T Soanes & Son (Poultry) Ltd   
4   NaN  2037                           2 Sisters Food Group Ltd   
..   ..   ...                                                ...   
335 NaN  7161                  Capestone Organic Poultry Limited   
336 NaN  7164                               Usk Vale Poultry Ltd   
337 NaN  7176  2 Sisters Red Meat Ltd T/A St Merryn Foods - M...   
338 NaN  7182                                     David T Havard   
339 NaN  7198       Cig Calon Cymru 2010 CYF T/A Cig Calon Cymru   

                town slaughterhouse         x         y  \
0            Penrith            yes  357081.0  533629.0   
1    Bishop Auckland          

Combined dataframe (post processing)
      id  appno                                        tradingname  \
0      1   1007                           Frank Bird (Poultry) Ltd   
1      2   1077                Joe Simpson (T/A F Simpson and Son)   
2      3   1100  2 Sisters Poultry Ltd (TA 2 Sisters Coupar Angus)   
3      4   1101             Anglo Beef Processors UK T/A ABP Perth   
4      5   1103            Neerock Limited (T/A Woodhead Brothers)   
..   ...    ...                                                ...   
335  336   9509                                 Moy Park Ballymena   
336  337   9518                             McKeown Fine Foods Ltd   
337  338   9552                                     Kearns Poultry   
338  339   9554                                   Rockvale Poultry   
339  340  GX401                         State of Guernsey Abattoir   

                 town postcode7  country_id slaughterhouse         x  \
0             Penrith   CA101NB           1       

Filename: approved-food-establishments-as-at-1-may-2018.csv
First dataframe (retrieved from DB)
      id  appno                                        tradingname  \
0      1   1007                           Frank Bird (Poultry) Ltd   
1      2   1077                Joe Simpson (T/A F Simpson and Son)   
2      3   1100  2 Sisters Poultry Ltd (TA 2 Sisters Coupar Angus)   
3      4   1101             Anglo Beef Processors UK T/A ABP Perth   
4      5   1103            Neerock Limited (T/A Woodhead Brothers)   
..   ...    ...                                                ...   
338  339   9554                                   Rockvale Poultry   
339  340  GX401                         State of Guernsey Abattoir   
340  341   4014                         Freemans of Newent Limited   
341  342   5001                         Bernard Matthews Foods Ltd   
342  343   7132                      Farmers Fresh (Wales) Limited   

                town postcode7  country_id slaughterhouse      

Second dataframe (post processing)
     id appno                                        tradingname  \
0   NaN  1007                           Frank Bird (Poultry) Ltd   
1   NaN  1077                Joe Simpson (T/A F Simpson and Son)   
2   NaN  2019                                    HCF Poultry Ltd   
3   NaN  2023                       T Soanes & Son (Poultry) Ltd   
4   NaN  2037                           2 Sisters Food Group Ltd   
..   ..   ...                                                ...   
328 NaN  7161                  Capestone Organic Poultry Limited   
329 NaN  7164                               Usk Vale Poultry Ltd   
330 NaN  7176  2 Sisters Red Meat Ltd T/A St Merryn Foods - M...   
331 NaN  7182                                     David T Havard   
332 NaN  7198       Cig Calon Cymru 2010 CYF T/A Cig Calon Cymru   

                town slaughterhouse         x         y  \
0            Penrith            yes  357081.0  533629.0   
1    Bishop Auckland          

Second dataframe (post processing)
     id appno                                        tradingname  \
0   NaN  1007                           Frank Bird (Poultry) Ltd   
1   NaN  1077                Joe Simpson (T/A F Simpson and Son)   
2   NaN  2019                                    HCF Poultry Ltd   
3   NaN  2023                       T Soanes & Son (Poultry) Ltd   
4   NaN  2037                           2 Sisters Food Group Ltd   
..   ..   ...                                                ...   
328 NaN  7161                  Capestone Organic Poultry Limited   
329 NaN  7164                               Usk Vale Poultry Ltd   
330 NaN  7176  2 Sisters Red Meat Ltd T/A St Merryn Foods - M...   
331 NaN  7182                                     David T Havard   
332 NaN  7198       Cig Calon Cymru 2010 CYF T/A Cig Calon Cymru   

                town slaughterhouse         x         y  \
0            Penrith            yes  357081.0  533629.0   
1    Bishop Auckland          

Combined dataframe (post processing)
      id  appno                                        tradingname  \
0      1   1007                           Frank Bird (Poultry) Ltd   
1      2   1077                Joe Simpson (T/A F Simpson and Son)   
2      3   1100  2 Sisters Poultry Ltd (TA 2 Sisters Coupar Angus)   
3      4   1101             Anglo Beef Processors UK T/A ABP Perth   
4      5   1103            Neerock Limited (T/A Woodhead Brothers)   
..   ...    ...                                                ...   
335  336   9509                                 Moy Park Ballymena   
336  337   9518                             McKeown Fine Foods Ltd   
337  338   9552                                     Kearns Poultry   
338  339   9554                                   Rockvale Poultry   
339  340  GX401                         State of Guernsey Abattoir   

                 town postcode7  country_id slaughterhouse         x  \
0             Penrith   CA101NB           1       

     appno    yr  mth   id
0     1007  2018    9    1
1     1077  2018    9    2
2     1100  2018    9    3
3     1101  2018    9    4
4     1103  2018    9    5
..     ...   ...  ...  ...
335   9509  2018    9  336
336   9518  2018    9  337
337   9552  2018    9  338
338   9554  2018    9  339
339  GX401  2018    9  340

[329 rows x 4 columns]
Filename: approved-food-establishments-as-at-1-october-2018.csv
First dataframe (retrieved from DB)
      id appno                                        tradingname  \
0      1  1007                           Frank Bird (Poultry) Ltd   
1      2  1077                Joe Simpson (T/A F Simpson and Son)   
2      3  1100  2 Sisters Poultry Ltd (TA 2 Sisters Coupar Angus)   
3      4  1101             Anglo Beef Processors UK T/A ABP Perth   
4      5  1103            Neerock Limited (T/A Woodhead Brothers)   
..   ...   ...                                                ...   
354  355  4073                                      ABP ELLESMERE   


Second dataframe (post processing)
     id appno                                   tradingname             town  \
0   NaN  1007                      Frank Bird (Poultry) Ltd          Penrith   
1   NaN  1077           Joe Simpson (T/A F Simpson and Son)  Bishop Auckland   
2   NaN  2019                               HCF Poultry Ltd         Bradford   
3   NaN  2023                  T Soanes & Son (Poultry) Ltd        Driffield   
4   NaN  2037                      2 Sisters Food Group Ltd       Scunthorpe   
..   ..   ...                                           ...              ...   
318 NaN  7161             Capestone Organic Poultry Limited    Haverfordwest   
319 NaN  7164                          Usk Vale Poultry Ltd        Pontypool   
320 NaN  7176                      Kepak Food Group Limited   Merthyr Tydfil   
321 NaN  7182                                David T Havard       Caerphilly   
322 NaN  7198  Cig Calon Cymru 2010 CYF T/A Cig Calon Cymru         Llanelli   

    

Second dataframe (post processing)
     id appno                                   tradingname             town  \
0   NaN  1007                      Frank Bird (Poultry) Ltd          Penrith   
1   NaN  1077           Joe Simpson (T/A F Simpson and Son)  Bishop Auckland   
2   NaN  2019                               HCF Poultry Ltd         Bradford   
3   NaN  2023                  T Soanes & Son (Poultry) Ltd        Driffield   
4   NaN  2037                      2 Sisters Food Group Ltd       Scunthorpe   
..   ..   ...                                           ...              ...   
317 NaN  7161             Capestone Organic Poultry Limited    Haverfordwest   
318 NaN  7164                          Usk Vale Poultry Ltd        Pontypool   
319 NaN  7176                      Kepak Food Group Limited   Merthyr Tydfil   
320 NaN  7182                                David T Havard       Caerphilly   
321 NaN  7198  Cig Calon Cymru 2010 CYF T/A Cig Calon Cymru         Llanelli   

    

Second dataframe (post processing)
     id appno                                   tradingname             town  \
0   NaN  1007                      Frank Bird (Poultry) Ltd          Penrith   
1   NaN  1077           Joe Simpson (T/A F Simpson and Son)  Bishop Auckland   
2   NaN  2019                               HCF Poultry Ltd         Bradford   
3   NaN  2023                  T Soanes & Son (Poultry) Ltd        Driffield   
4   NaN  2037                      2 Sisters Food Group Ltd       Scunthorpe   
..   ..   ...                                           ...              ...   
317 NaN  7161             Capestone Organic Poultry Limited    Haverfordwest   
318 NaN  7164                          Usk Vale Poultry Ltd        Pontypool   
319 NaN  7176                      Kepak Food Group Limited   Merthyr Tydfil   
320 NaN  7182                                David T Havard       Caerphilly   
321 NaN  7198  Cig Calon Cymru 2010 CYF T/A Cig Calon Cymru         Llanelli   

    

Second dataframe (post processing)
     id appno                                   tradingname             town  \
0   NaN  1007                      Frank Bird (Poultry) Ltd          Penrith   
1   NaN  1077           Joe Simpson (T/A F Simpson and Son)  Bishop Auckland   
2   NaN  2019                               HCF Poultry Ltd         Bradford   
3   NaN  2023                  T Soanes & Son (Poultry) Ltd        Driffield   
4   NaN  2037                      2 Sisters Food Group Ltd       Scunthorpe   
..   ..   ...                                           ...              ...   
314 NaN  7161             Capestone Organic Poultry Limited    Haverfordwest   
315 NaN  7164                          Usk Vale Poultry Ltd        Pontypool   
316 NaN  7176                      Kepak Food Group Limited   Merthyr Tydfil   
317 NaN  7182                                David T Havard       Caerphilly   
318 NaN  7198  Cig Calon Cymru 2010 CYF T/A Cig Calon Cymru         Llanelli   

    

Second dataframe (post processing)
     id appno                                   tradingname             town  \
0   NaN  1007                      Frank Bird (Poultry) Ltd          Penrith   
1   NaN  1077           Joe Simpson (T/A F Simpson and Son)  Bishop Auckland   
2   NaN  2019                               HCF Poultry Ltd         Bradford   
3   NaN  2023                  T Soanes & Son (Poultry) Ltd        Driffield   
4   NaN  2037                      2 Sisters Food Group Ltd       Scunthorpe   
..   ..   ...                                           ...              ...   
313 NaN  7161             Capestone Organic Poultry Limited    Haverfordwest   
314 NaN  7164                          Usk Vale Poultry Ltd        Pontypool   
315 NaN  7176                      Kepak Food Group Limited   Merthyr Tydfil   
316 NaN  7182                                David T Havard       Caerphilly   
317 NaN  7198  Cig Calon Cymru 2010 CYF T/A Cig Calon Cymru         Llanelli   

    

Second dataframe (post processing)
     id appno                                   tradingname             town  \
0   NaN  1007                      Frank Bird (Poultry) Ltd          Penrith   
1   NaN  1077           Joe Simpson (T/A F Simpson and Son)  Bishop Auckland   
2   NaN  2019                               HCF Poultry Ltd         Bradford   
3   NaN  2023                  T Soanes & Son (Poultry) Ltd        Driffield   
4   NaN  2037                      2 Sisters Food Group Ltd       Scunthorpe   
..   ..   ...                                           ...              ...   
312 NaN  7161             Capestone Organic Poultry Limited    Haverfordwest   
313 NaN  7164                          Usk Vale Poultry Ltd        Pontypool   
314 NaN  7176                      Kepak Food Group Limited   Merthyr Tydfil   
315 NaN  7182                                David T Havard       Caerphilly   
316 NaN  7198  Cig Calon Cymru 2010 CYF T/A Cig Calon Cymru         Llanelli   

    

Combined dataframe (post processing)
      id  appno                                        tradingname  \
0      1   1007                           Frank Bird (Poultry) Ltd   
1      2   1077                Joe Simpson (T/A F Simpson and Son)   
2      3   1100  2 Sisters Poultry Ltd (TA 2 Sisters Coupar Angus)   
3      4   1101             Anglo Beef Processors UK T/A ABP Perth   
4      5   1103            Neerock Limited (T/A Woodhead Brothers)   
..   ...    ...                                                ...   
336  337   9518                             McKeown Fine Foods Ltd   
337  338   9552                                     Kearns Poultry   
338  339   9554                                   Rockvale Poultry   
339  340  GX401                         State of Guernsey Abattoir   
361  362   IOM1                                  Isle of Man Meats   

                town postcode7  country_id slaughterhouse         x         y  \
0            Penrith   CA101NB           

Combined appno dataframe
------------------------
     appno    yr  mth   id
0     1007  2019    6    1
1     1077  2019    6    2
2     1100  2019    6    3
3     1101  2019    6    4
4     1103  2019    6    5
..     ...   ...  ...  ...
336   9518  2019    6  337
337   9552  2019    6  338
338   9554  2019    6  339
339  GX401  2019    6  340
361   IOM1  2019    6  362

[316 rows x 4 columns]
Filename: approved-food-establishments-as-at-1-july-2019.csv
First dataframe (retrieved from DB)
      id appno                                        tradingname  \
0      1  1007                           Frank Bird (Poultry) Ltd   
1      2  1077                Joe Simpson (T/A F Simpson and Son)   
2      3  1100  2 Sisters Poultry Ltd (TA 2 Sisters Coupar Angus)   
3      4  1101             Anglo Beef Processors UK T/A ABP Perth   
4      5  1103            Neerock Limited (T/A Woodhead Brothers)   
..   ...   ...                                                ...   
362  363  2399        

Combined appno dataframe
------------------------
     appno    yr  mth   id
0     1007  2019    7    1
1     1077  2019    7    2
2     1100  2019    7    3
3     1101  2019    7    4
4     1103  2019    7    5
..     ...   ...  ...  ...
336   9518  2019    7  337
337   9552  2019    7  338
338   9554  2019    7  339
339  GX401  2019    7  340
361   IOM1  2019    7  362

[314 rows x 4 columns]
Filename: approved-food-establishments-as-at-1-august-2019.csv
First dataframe (retrieved from DB)
      id appno                                        tradingname  \
0      1  1007                           Frank Bird (Poultry) Ltd   
1      2  1077                Joe Simpson (T/A F Simpson and Son)   
2      3  1100  2 Sisters Poultry Ltd (TA 2 Sisters Coupar Angus)   
3      4  1101             Anglo Beef Processors UK T/A ABP Perth   
4      5  1103            Neerock Limited (T/A Woodhead Brothers)   
..   ...   ...                                                ...   
367  368  2748      

[376 rows x 17 columns]


Combined appno dataframe
------------------------
     appno    yr  mth   id
0     1007  2019    8    1
1     1077  2019    8    2
2     1100  2019    8    3
3     1101  2019    8    4
4     1103  2019    8    5
..     ...   ...  ...  ...
336   9518  2019    8  337
337   9552  2019    8  338
338   9554  2019    8  339
339  GX401  2019    8  340
361   IOM1  2019    8  362

[313 rows x 4 columns]
Filename: approved-food-establishments-as-at-1-september-2019.csv
First dataframe (retrieved from DB)
      id appno                                        tradingname  \
0      1  1007                           Frank Bird (Poultry) Ltd   
1      2  1077                Joe Simpson (T/A F Simpson and Son)   
2      3  1100  2 Sisters Poultry Ltd (TA 2 Sisters Coupar Angus)   
3      4  1101             Anglo Beef Processors UK T/A ABP Perth   
4      5  1103            Neerock Limited (T/A Woodhead Brothers)   
..   ...   ...                                              

Combined appno dataframe
------------------------
    appno    yr  mth   id
0    1007  2019    9    1
1    1077  2019    9    2
2    1100  2019    9    3
3    1101  2019    9    4
4    1103  2019    9    5
..    ...   ...  ...  ...
335  9509  2019    9  336
336  9518  2019    9  337
337  9552  2019    9  338
338  9554  2019    9  339
361  IOM1  2019    9  362

[309 rows x 4 columns]
Filename: approved-food-establishments-as-at-1-october-2019.csv
First dataframe (retrieved from DB)
      id appno                                        tradingname  \
0      1  1007                           Frank Bird (Poultry) Ltd   
1      2  1077                Joe Simpson (T/A F Simpson and Son)   
2      3  1100  2 Sisters Poultry Ltd (TA 2 Sisters Coupar Angus)   
3      4  1101             Anglo Beef Processors UK T/A ABP Perth   
4      5  1103            Neerock Limited (T/A Woodhead Brothers)   
..   ...   ...                                                ...   
371  372  7226                 

Combined appno dataframe
------------------------
    appno    yr  mth   id
0    1007  2019   10    1
1    1077  2019   10    2
2    1100  2019   10    3
3    1101  2019   10    4
4    1103  2019   10    5
..    ...   ...  ...  ...
335  9509  2019   10  336
336  9518  2019   10  337
337  9552  2019   10  338
338  9554  2019   10  339
361  IOM1  2019   10  362

[309 rows x 4 columns]
Filename: approved-food-establishments-as-at-1-november-2019.csv
First dataframe (retrieved from DB)
      id appno                                        tradingname  \
0      1  1007                           Frank Bird (Poultry) Ltd   
1      2  1077                Joe Simpson (T/A F Simpson and Son)   
2      3  1100  2 Sisters Poultry Ltd (TA 2 Sisters Coupar Angus)   
3      4  1101             Anglo Beef Processors UK T/A ABP Perth   
4      5  1103            Neerock Limited (T/A Woodhead Brothers)   
..   ...   ...                                                ...   
371  372  7226                

Combined appno dataframe
------------------------
    appno    yr  mth   id
0    1007  2019   11    1
1    1077  2019   11    2
2    1100  2019   11    3
3    1101  2019   11    4
4    1103  2019   11    5
..    ...   ...  ...  ...
335  9509  2019   11  336
336  9518  2019   11  337
337  9552  2019   11  338
338  9554  2019   11  339
361  IOM1  2019   11  362

[311 rows x 4 columns]
Filename: approved-food-establishments-as-at-1-december-2019.csv
First dataframe (retrieved from DB)
      id appno                                        tradingname  \
0      1  1007                           Frank Bird (Poultry) Ltd   
1      2  1077                Joe Simpson (T/A F Simpson and Son)   
2      3  1100  2 Sisters Poultry Ltd (TA 2 Sisters Coupar Angus)   
3      4  1101             Anglo Beef Processors UK T/A ABP Perth   
4      5  1103            Neerock Limited (T/A Woodhead Brothers)   
..   ...   ...                                                ...   
372  373  2662                

Second dataframe (post processing)
     id appno                              tradingname              town  \
0   NaN  1007                 Frank Bird (Poultry) Ltd           Penrith   
1   NaN  1077      Joe Simpson (T/A F Simpson and Son)   Bishop Auckland   
2   NaN  1100                 2 Sisters Food Group Ltd      Coupar Angus   
3   NaN  1101   Anglo Beef Processors UK T/A ABP Perth             Perth   
4   NaN  1103  Neerock Limited (T/A Woodhead Brothers)           Turriff   
..   ..   ...                                      ...               ...   
307 NaN  9509                       Moy Park Ballymena  BALLYMENA          
308 NaN  9518                   McKeown Fine Foods Ltd         BALLYMENA   
309 NaN  9552                           Kearns Poultry        BALLYMENA    
310 NaN  9554                         Rockvale Poultry            ARMAGH   
311 NaN  IOM1                        Isle of Man Meats       Isle of Man   

    slaughterhouse         x         y              

Second dataframe (post processing)
     id appno                              tradingname              town  \
0   NaN  1007                 Frank Bird (Poultry) Ltd           Penrith   
1   NaN  1077      Joe Simpson (T/A F Simpson and Son)   Bishop Auckland   
2   NaN  1100                 2 Sisters Food Group Ltd      Coupar Angus   
3   NaN  1101   Anglo Beef Processors UK T/A ABP Perth             Perth   
4   NaN  1103  Neerock Limited (T/A Woodhead Brothers)           Turriff   
..   ..   ...                                      ...               ...   
305 NaN  9509                       Moy Park Ballymena  BALLYMENA          
306 NaN  9518                   McKeown Fine Foods Ltd         BALLYMENA   
307 NaN  9552                           Kearns Poultry        BALLYMENA    
308 NaN  9554                         Rockvale Poultry            ARMAGH   
309 NaN  IOM1                        Isle of Man Meats       Isle of Man   

    slaughterhouse         x         y              

Second dataframe (post processing)
     id appno                              tradingname              town  \
0   NaN  1007                 Frank Bird (Poultry) Ltd           Penrith   
1   NaN  1077      Joe Simpson (T/A F Simpson and Son)   Bishop Auckland   
2   NaN  1100                 2 Sisters Food Group Ltd      Coupar Angus   
3   NaN  1101   Anglo Beef Processors UK T/A ABP Perth             Perth   
4   NaN  1103  Neerock Limited (T/A Woodhead Brothers)           Turriff   
..   ..   ...                                      ...               ...   
304 NaN  9509                       Moy Park Ballymena  BALLYMENA          
305 NaN  9518                   McKeown Fine Foods Ltd         BALLYMENA   
306 NaN  9552                           Kearns Poultry        BALLYMENA    
307 NaN  9554                         Rockvale Poultry            ARMAGH   
308 NaN  IOM1                        Isle of Man Meats       Isle of Man   

    slaughterhouse         x         y              

Second dataframe (post processing)
     id appno                              tradingname              town  \
0   NaN  1007                 Frank Bird (Poultry) Ltd           Penrith   
1   NaN  1077      Joe Simpson (T/A F Simpson and Son)   Bishop Auckland   
2   NaN  1100                 2 Sisters Food Group Ltd      Coupar Angus   
3   NaN  1101   Anglo Beef Processors UK T/A ABP Perth             Perth   
4   NaN  1103  Neerock Limited (T/A Woodhead Brothers)           Turriff   
..   ..   ...                                      ...               ...   
301 NaN  9509                       Moy Park Ballymena  BALLYMENA          
302 NaN  9518                   McKeown Fine Foods Ltd         BALLYMENA   
303 NaN  9552                           Kearns Poultry        BALLYMENA    
304 NaN  9554                         Rockvale Poultry            ARMAGH   
305 NaN  IOM1                        Isle of Man Meats       Isle of Man   

    slaughterhouse         x         y              

Second dataframe (post processing)
     id appno                              tradingname              town  \
0   NaN  1007                 Frank Bird (Poultry) Ltd           Penrith   
1   NaN  1077      Joe Simpson (T/A F Simpson and Son)   Bishop Auckland   
2   NaN  1100                 2 Sisters Food Group Ltd      Coupar Angus   
3   NaN  1101   Anglo Beef Processors UK T/A ABP Perth             Perth   
4   NaN  1103  Neerock Limited (T/A Woodhead Brothers)           Turriff   
..   ..   ...                                      ...               ...   
300 NaN  9509                       Moy Park Ballymena  BALLYMENA          
301 NaN  9518                   McKeown Fine Foods Ltd         BALLYMENA   
302 NaN  9552                           Kearns Poultry        BALLYMENA    
303 NaN  9554                         Rockvale Poultry            ARMAGH   
304 NaN  IOM1                        Isle of Man Meats       Isle of Man   

    slaughterhouse         x         y              

First dataframe (retrieved from DB)
      id appno                                        tradingname  \
0      1  1007                           Frank Bird (Poultry) Ltd   
1      2  1077                Joe Simpson (T/A F Simpson and Son)   
2      3  1100  2 Sisters Poultry Ltd (TA 2 Sisters Coupar Angus)   
3      4  1101             Anglo Beef Processors UK T/A ABP Perth   
4      5  1103            Neerock Limited (T/A Woodhead Brothers)   
..   ...   ...                                                ...   
379  380  1100                           2 Sisters Food Group Ltd   
380  381  2762                                    H M Venison Ltd   
381  382  2761                    Westcountry Premium Venison Ltd   
382  383  2768                   Farm Fresh Quality Foods Co. Ltd   
383  384  1160                                Millers of Speyside   

                 town postcode7  country_id slaughterhouse         x  \
0             Penrith   CA101NB           1            yes  357

## Retrieve data from database and pickle it

In [23]:
# Retrieve the `appno` and `abattoir` tables from the database, print each one
# (with its dtypes) and export it to the output directory as a pickle and a csv.

# Path to directory (kept as a plain string so that it can still be joined
# with file names elsewhere in the notebook)
phjPklPath = './outputs'

# Make sure the output directory exists before writing to it; to_pickle() and
# to_csv() raise an OSError if the target directory is missing.
phjPklDir = Path(phjPklPath)
phjPklDir.mkdir(parents = True, exist_ok = True)

# Define queries
phjAppnoQuery = "SELECT * FROM `appno`"
phjAbattoirQuery = "SELECT * FROM `abattoir`"

# Retrieve Appno data
# ===================
phjRetrievedAppnoDF = pd.read_sql(phjAppnoQuery,
                                  con = phjSQLEng)

print('Retrieved Appno data from database')
print('----------------------------------')
print(phjRetrievedAppnoDF)
print(phjRetrievedAppnoDF.dtypes)
print('\n')

# Export as pickle and csv
phjRetrievedAppnoDF.to_pickle(phjPklDir / 'phjLatestAppno.pkl')
phjRetrievedAppnoDF.to_csv(phjPklDir / 'phjLatestAppno.csv',
                           header = True,
                           index = False)


# Retrieve Abattoir data
# ======================
phjRetrievedAbattoirDF = pd.read_sql(phjAbattoirQuery,
                                     con = phjSQLEng)

print('Retrieved Abattoir data from database')
print('-------------------------------------')
print(phjRetrievedAbattoirDF)
print(phjRetrievedAbattoirDF.dtypes)

# Export as pickle and csv
phjRetrievedAbattoirDF.to_pickle(phjPklDir / 'phjLatestAbattoir.pkl')
phjRetrievedAbattoirDF.to_csv(phjPklDir / 'phjLatestAbattoir.csv',
                              header = True,
                              index = False)

Retrieved Appno data from database
----------------------------------
     appno    yr  mth  abattoir_id
0     1007  2018    1            1
1     1007  2018    2            1
2     1007  2018    3            1
3     1007  2018    4            1
4     1007  2018    5            1
...    ...   ...  ...          ...
9302  IOM1  2020    1          362
9303  IOM1  2020    2          362
9304  IOM1  2020    3          362
9305  IOM1  2020    4          362
9306  IOM1  2020    5          362

[9307 rows x 4 columns]
appno          object
yr              int64
mth             int64
abattoir_id     int64
dtype: object


Retrieved Abattoir data from database
-------------------------------------
      id appno                                        tradingname  \
0      1  1007                           Frank Bird (Poultry) Ltd   
1      2  1077                Joe Simpson (T/A F Simpson and Son)   
2      3  1100  2 Sisters Poultry Ltd (TA 2 Sisters Coupar Angus)   
3      4  1101             An

In [24]:
# Sanity check: read both pickled dataframes back from the output directory
# and print their contents and dtypes.

# Appno data
phjRetrievedPklAppnoDF = pd.read_pickle(Path(f'{phjPklPath}/phjLatestAppno.pkl'))

print('Retrieved pickled Appno data',
      '----------------------------',
      sep = '\n')
print(phjRetrievedPklAppnoDF,
      phjRetrievedPklAppnoDF.dtypes,
      '\n',
      sep = '\n')

# Abattoir data
phjRetrievedPklAbattoirDF = pd.read_pickle(Path(f'{phjPklPath}/phjLatestAbattoir.pkl'))

print('Retrieved pickled Abattoir data',
      '-------------------------------',
      sep = '\n')
print(phjRetrievedPklAbattoirDF,
      phjRetrievedPklAbattoirDF.dtypes,
      '\n',
      sep = '\n')

Retrieved pickled Appno data
----------------------------
     appno    yr  mth  abattoir_id
0     1007  2018    1            1
1     1007  2018    2            1
2     1007  2018    3            1
3     1007  2018    4            1
4     1007  2018    5            1
...    ...   ...  ...          ...
9302  IOM1  2020    1          362
9303  IOM1  2020    2          362
9304  IOM1  2020    3          362
9305  IOM1  2020    4          362
9306  IOM1  2020    5          362

[9307 rows x 4 columns]
appno          object
yr              int64
mth             int64
abattoir_id     int64
dtype: object


Retrieved pickled Abattoir data
-------------------------------
      id appno                                        tradingname  \
0      1  1007                           Frank Bird (Poultry) Ltd   
1      2  1077                Joe Simpson (T/A F Simpson and Son)   
2      3  1100  2 Sisters Poultry Ltd (TA 2 Sisters Coupar Angus)   
3      4  1101             Anglo Beef Processors UK T