# Table of FSA approved premises

In [15]:
import re
import collections
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
import epydemiology as epy

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/philipjones/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
# Years of interest
phjYearList = list(range(2018, 2021))

# Ordered lookup from lowercase month name to calendar month number (1 to 12)
phjMonthOrdDict = collections.OrderedDict(
    (phjName, phjNumber)
    for phjNumber, phjName in enumerate(['january', 'february', 'march', 'april',
                                         'may', 'june', 'july', 'august',
                                         'september', 'october', 'november', 'december'],
                                        start = 1))

# Month names in calendar order (iterating an OrderedDict yields keys in insertion order)
phjMonthList = list(phjMonthOrdDict.keys())

print(phjMonthList)

['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']


In [17]:
# Codes (and the string label) used to represent missing values, keyed by variable
phjMissingCodesDict = dict(phjMissValueStr = 'missing',
                           phjCPHCountyMissCode = 999,
                           phjCountryMissCode = 99,
                           phjAnimSrcMissCode = 400,
                           phjSppGrpMissCode = 99,
                           phjOffCCIRMissCode = 999,
                           phjInspectionTypeMissCode = 785)

In [18]:
# Lookup table of country names and their numeric codes; the final row holds the
# code used when the country is missing.
phjCountryDF = pd.DataFrame([(1, 'England'),
                             (2, 'Wales'),
                             (3, 'Scotland'),
                             (4, 'Northern Ireland'),
                             (5, 'Guernsey'),
                             (6, 'Jersey'),
                             (7, 'Isle of Man'),
                             (phjMissingCodesDict['phjCountryMissCode'], 'missing')],
                            columns = ['country_id', 'country'])

print(phjCountryDF)

   country_id           country
0           1           England
1           2             Wales
2           3          Scotland
3           4  Northern Ireland
4           5          Guernsey
5           6            Jersey
6           7       Isle of Man
7          99           missing


In [19]:
def phjPreProcess(phjDF,
                  phjCountryDF,
                  phjCountryMissCode,
                  phjColsOfInterestList):
    """Clean one imported FSA approved-premises file and keep slaughterhouse rows only.

    Parameters
    ----------
    phjDF : pandas.DataFrame
        Dataframe as read from the csv file. It is not modified; a copy is processed.
    phjCountryDF : pandas.DataFrame
        Lookup table with columns 'country' and 'country_id'.
    phjCountryMissCode : int
        Value written to 'country_id' when the country name is not found in phjCountryDF.
    phjColsOfInterestList : list of str
        Names (after conversion to lowercase with spaces removed) of the columns to retain.
        Must include 'appno', 'tradingname', 'postcode', 'country' and 'slaughterhouse'
        because all of these are used below.

    Returns
    -------
    pandas.DataFrame
        Rows where 'slaughterhouse' equals 'Yes', with a fresh 0-based index (produced by
        the merge). Contains the retained columns except 'postcode' and 'country', plus
        'tradingname_lcase', 'country_id' and the 'postcode7' column requested from
        epy.phjPostcodeFormat7().
    """
    # Work on a copy so that the caller's dataframe keeps its original column names
    phjDF = phjDF.copy()

    # Convert column names to lowercase, remove spaces and '???' characters ... just for consistency
    # (For some reason, the AppNo column in Dec 2019 was called '???AppNo')
    # Both patterns are plain text, so literal matching (regex = False) is requested explicitly.
    phjDF.columns = (phjDF.columns
                     .str.replace(' ', '', regex = False)
                     .str.replace('???', '', regex = False)
                     .str.lower())

    # Retain only the columns of interest
    phjDF = phjDF[phjColsOfInterestList].copy()

    # Retain slaughterhouse approved premises only
    phjDF = phjDF.loc[phjDF['slaughterhouse'] == 'Yes',:].copy()

    # Remove whitespace from front and back of trading name column
    phjDF['tradingname'] = phjDF['tradingname'].str.strip()

    # Format the appno column to make uppercase and remove white space (and any other
    # non-word character). regex = True is required because pandas 2.0 changed the default
    # of str.replace() to literal matching, which would silently leave the values unchanged.
    phjDF['appno'] = phjDF['appno'].str.upper().str.replace(r'[^\w]', '', regex = True)

    # Add a column that contains the trading name but in lower case and with all punctuation
    # and spaces removed. Also, '&' is converted to 'and', 'ltd' to 'limited' and 'bros' to
    # 'brothers'. This ensures that subtle variations in the way the TradingName is written
    # won't be misinterpreted as a different company.
    phjDF['tradingname_lcase'] = (phjDF['tradingname']
                                  .str.replace('&', 'and', regex = False)
                                  .str.replace(r'[Ll]td', 'limited', regex = True)
                                  .str.replace(r'[Bb]ros', 'Brothers', regex = True)
                                  .str.lower()
                                  .str.replace(r'[^\w]', '', regex = True))

    # Add column with postcode formatted to 7 characters
    # (behaviour of the epydemiology helper is not visible here; presumably it returns the
    # dataframe with the 'postcode7' column added - TODO confirm)
    phjDF = epy.phjPostcodeFormat7(phjDF = phjDF,
                                   phjPostcodeVarName = 'postcode',
                                   phjPostcodeCheckVarName = None,
                                   phjPostcode7VarName = 'postcode7',
                                   phjPrintResults = False)

    # Replace country names with country codes defined in phjCountryDF
    phjDF = pd.merge(phjDF,
                     phjCountryDF,
                     on = "country",
                     how = 'left')

    # Country names not present in the lookup table receive the missing code
    phjDF['country_id'] = phjDF['country_id'].fillna(phjCountryMissCode)

    # Remove original postcode and country columns
    phjDF = phjDF[[col for col in list(phjDF.columns) if col not in ['postcode','country']]].copy()

    return phjDF

## Create initial tables

In [20]:
# Folder containing the monthly FSA csv files
# NOTE(review): absolute, user-specific path - edit to match the local machine.
phjPath = '/Users/philipjones/Documents/ccir/FSA approved food premises'

# Columns (lowercase, no spaces) imported from each file. 'town', 'x' and 'y' are
# deliberately not imported at present; add them here to populate those columns in
# the premisesname table below.
phjColsOfInterest  = ['appno','tradingname','postcode','country','slaughterhouse']

# The initial tables are built from the January file of the earliest year
yr = min(phjYearList)
mth = 'january'

phjFilename = "approved-food-establishments-as-at-1-{}-{}.csv".format(mth,yr)

# Read csv file
phjFSAAppNoDF = pd.read_csv('/'.join([phjPath,phjFilename]))

# Process the imported file
phjFSAAppNoDF = phjPreProcess(phjDF = phjFSAAppNoDF,
                              phjCountryDF = phjCountryDF,
                              phjCountryMissCode = phjMissingCodesDict['phjCountryMissCode'],
                              phjColsOfInterestList = phjColsOfInterest)

# Add year and month columns
phjFSAAppNoDF['yr'] = yr
phjFSAAppNoDF['mth'] = phjMonthOrdDict[mth]

# Reset index (and make 1-based) to produce column of premisesname_id
phjFSAAppNoDF = phjFSAAppNoDF.reset_index(drop = False).rename(columns = {'index':'premisesname_id'})
phjFSAAppNoDF['premisesname_id'] = phjFSAAppNoDF['premisesname_id'] + 1

phjAppNoTable = phjFSAAppNoDF.loc[:,['appno','yr','mth','premisesname_id']]

# reindex() is used instead of .loc because 'town', 'x' and 'y' are not in phjColsOfInterest:
# selecting missing labels with .loc raises KeyError in pandas >= 1.0, whereas reindex()
# keeps the table layout and fills the columns that were not imported with NaN.
phjPremisesNameTable = phjFSAAppNoDF.reindex(columns = ['premisesname_id','tradingname','tradingname_lcase','town','postcode7','country_id','slaughterhouse','x','y'])

print('appno table\n===========')
print(phjAppNoTable)
print('\n')
print('premisesname table\n==================')
print(phjPremisesNameTable)

appno table
    appno    yr  mth  premisesname_id
0    1007  2018    1                1
1    1077  2018    1                2
2    2019  2018    1                3
3    2023  2018    1                4
4    2037  2018    1                5
..    ...   ...  ...              ...
335  7161  2018    1              336
336  7164  2018    1              337
337  7176  2018    1              338
338  7182  2018    1              339
339  7198  2018    1              340

[340 rows x 4 columns]


premisesname table
     premisesname_id                                        tradingname  \
0                  1                           Frank Bird (Poultry) Ltd   
1                  2                Joe Simpson (T/A F Simpson and Son)   
2                  3                                    HCF Poultry Ltd   
3                  4                       T Soanes & Son (Poultry) Ltd   
4                  5                           2 Sisters Food Group Ltd   
..               ...                 

## Update table with updated data file

In [21]:
# Snapshot that will be used to update the existing tables
yr, mth = 2019, 'january'

phjFilename = "approved-food-establishments-as-at-1-{}-{}.csv".format(mth,yr)

# Read the csv file and pass it straight through the same cleaning step as the first file
temp_phjFSAAppNoDF = phjPreProcess(phjDF = pd.read_csv('/'.join([phjPath,phjFilename])),
                                   phjCountryDF = phjCountryDF,
                                   phjCountryMissCode = phjMissingCodesDict['phjCountryMissCode'],
                                   phjColsOfInterestList = phjColsOfInterest)

# Record the year and month (as a number) that the rows came from
temp_phjFSAAppNoDF = temp_phjFSAAppNoDF.assign(yr = yr,
                                               mth = phjMonthOrdDict[mth])

# Put an empty premisesname_id column at the front; ids are not known yet for these
# rows so every value starts as NaN.
temp_phjFSAAppNoDF = temp_phjFSAAppNoDF.reset_index(drop = True)
temp_phjFSAAppNoDF.insert(0, 'premisesname_id', np.nan)

print('Latest version of premises table\n==================')
print(temp_phjFSAAppNoDF)

Latest version of premises table
     premisesname_id appno                                   tradingname  \
0                NaN  1007                      Frank Bird (Poultry) Ltd   
1                NaN  1077           Joe Simpson (T/A F Simpson and Son)   
2                NaN  2019                               HCF Poultry Ltd   
3                NaN  2023                  T Soanes & Son (Poultry) Ltd   
4                NaN  2037                      2 Sisters Food Group Ltd   
..               ...   ...                                           ...   
317              NaN  7161             Capestone Organic Poultry Limited   
318              NaN  7164                          Usk Vale Poultry Ltd   
319              NaN  7176                      Kepak Food Group Limited   
320              NaN  7182                                David T Havard   
321              NaN  7198  Cig Calon Cymru 2010 CYF T/A Cig Calon Cymru   

    slaughterhouse                    tradingname_lcas

In [22]:
# Stack the existing rows on top of the newly imported rows, order by appno, year and
# month so that the earliest row comes first within each appno, and renumber the index
# so that it contains no duplicated values.
temp_phjFSAAppNoDF = (pd.concat([phjFSAAppNoDF,temp_phjFSAAppNoDF],sort = False)
                      .sort_values(by = ['appno','yr','mth','tradingname_lcase'])
                      .reset_index(drop = True))

# Number of rows sharing each appno (2 means present in both files)
temp_phjFSAAppNoDF['countappno'] = temp_phjFSAAppNoDF.groupby('appno')['appno'].transform('count')

print('Concatenated dataframe\n======================')
print(temp_phjFSAAppNoDF)
print('\n')

print('AppNos that appear only once in either first or second part',
      '===========================================================',
      sep = '\n')

print(temp_phjFSAAppNoDF.loc[temp_phjFSAAppNoDF['countappno'].ne(2),:])

Concatenated dataframe
     premisesname_id  appno  \
0                1.0   1007   
1                NaN   1007   
2                2.0   1077   
3                NaN   1077   
4              282.0   1100   
..               ...    ...   
657            281.0   9554   
658              NaN   9554   
659            262.0  GX401   
660              NaN  GX401   
661              NaN   IOM1   

                                           tradingname slaughterhouse  \
0                             Frank Bird (Poultry) Ltd            Yes   
1                             Frank Bird (Poultry) Ltd            Yes   
2                  Joe Simpson (T/A F Simpson and Son)            Yes   
3                  Joe Simpson (T/A F Simpson and Son)            Yes   
4    2 Sisters Poultry Ltd (TA 2 Sisters Coupar Angus)            Yes   
..                                                 ...            ...   
657                                   Rockvale Poultry            Yes   
658                 

In [23]:
# Create a column of tokens found in the trading name column

# Tokens that are ignored when comparing trading names
phjStopList = ['(',')','/','&',',','-','–','—','and','the','limited']

# Tokens that are rewritten to a standard form. A value may be a list when one token
# expands to several. All replacement tokens are strings: the tokenizer returns
# strings, so a numeric 2 would never equal the '2' token produced from '2 Sisters'.
phjReplaceDict = {'t/a':'ta',
                  'ltd':'limited',
                  'bros':'brothers',
                  '&':'and',
                  '2sisters':['2','sisters']}

# Following function flattens a list that contains lists providing the nested lists are
# only 1 level deep.
def phjFlattenList(phjList):
    """Return a new list in which items that are lists are expanded by one level.

    Only items that are instances of list are expanded; every other item (including
    strings and tuples) is copied across unchanged. Lists nested more than one level
    deep are not expanded further.
    """
    flatlist = []

    for element in phjList:
        # Wrap non-list items so that both cases can be added with a single extend()
        flatlist.extend(element if isinstance(element,list) else [element])

    return flatlist

# Convert a trading name to a list of tokens.
# Hyphens and en and em dashes are converted to spaces before tokenizing.
# Replacements from phjReplaceDict are applied BEFORE stop words are removed so that
# spelling variants reduce to the same tokens: e.g. 'ltd' becomes 'limited' and '&'
# becomes 'and', and both are then dropped as stop words. (Filtering first would keep
# 'limited' for 'Foo Ltd' but drop it for 'Foo Limited', making identical names look
# different.)
# The dictionary replacements may result in some lists-within-a-list scenarios; the function
# phjFlattenList() flattens lists providing the nested lists are only 1 deep.
def phjTokeniseTradingName(phjName):
    phjText = phjName.lower().replace('-',' ').replace('–',' ').replace('—',' ')
    phjTokens = phjFlattenList([phjReplaceDict.get(tok,tok) for tok in nltk.word_tokenize(phjText)])
    return [tok for tok in phjTokens if tok not in phjStopList]

# Jaccard distance between the tokens of a row and the tokens of the preceding row.
# Returns NaN if there are no preceding tokens (i.e. prevtokens is not a list) and 0.0 if
# both token lists are empty (nltk.jaccard_distance() would divide by zero in that case).
def phjTokenDistance(phjRow):
    if not isinstance(phjRow['prevtokens'],list):
        return np.nan

    phjCurrSet = set(phjRow['tokens'])
    phjPrevSet = set(phjRow['prevtokens'])

    if not (phjCurrSet or phjPrevSet):
        return 0.0

    return nltk.jaccard_distance(phjCurrSet,phjPrevSet)

temp_phjFSAAppNoDF['tokens'] = temp_phjFSAAppNoDF['tradingname'].apply(phjTokeniseTradingName)

# Create a new column with tokens offset by 1 place so new column contains the
# tokens from the previous row
temp_phjFSAAppNoDF['prevtokens'] = temp_phjFSAAppNoDF['tokens'].shift(1)

# Need to check that index does not include duplicate values (because next line makes changes
# based on index position)
if temp_phjFSAAppNoDF.index.is_unique:
    print("No duplicates in index")
else:
    print("Index values not unique")

# For each group (groupby appno), the first row can't be compared with the preceding
# row. Therefore, remove the tokens from the preceding row. The method for doing this
# was given by EdChum - Reinstate Monica at
# https://stackoverflow.com/questions/46242488/change-first-element-of-each-group-in-pandas-dataframe
temp_phjFSAAppNoDF.loc[temp_phjFSAAppNoDF.groupby('appno')['prevtokens'].head(1).index, 'prevtokens'] = np.nan

# In groups with two rows, compare tokens with previous tokens using NLTK's Jaccard Distance
temp_phjFSAAppNoDF['jd'] = temp_phjFSAAppNoDF.apply(phjTokenDistance,axis = 1)

print(temp_phjFSAAppNoDF)

No duplicates in index
     premisesname_id  appno  \
0                1.0   1007   
1                NaN   1007   
2                2.0   1077   
3                NaN   1077   
4              282.0   1100   
..               ...    ...   
657            281.0   9554   
658              NaN   9554   
659            262.0  GX401   
660              NaN  GX401   
661              NaN   IOM1   

                                           tradingname slaughterhouse  \
0                             Frank Bird (Poultry) Ltd            Yes   
1                             Frank Bird (Poultry) Ltd            Yes   
2                  Joe Simpson (T/A F Simpson and Son)            Yes   
3                  Joe Simpson (T/A F Simpson and Son)            Yes   
4    2 Sisters Poultry Ltd (TA 2 Sisters Coupar Angus)            Yes   
..                                                 ...            ...   
657                                   Rockvale Poultry            Yes   
658                 

In [24]:
# Identify those names that have changed even slightly
# ====================================================

# Jaccard distance below which two trading names are treated as the same name
phjSameNameCutOff = 0.5

# Get a list of the indexes of rows where Jaccard distance is greater than 0
phAlteredNamesIndexList = list(temp_phjFSAAppNoDF.index[temp_phjFSAAppNoDF['jd'] > 0])

# Edit to include index of immediately preceding row, and sort so that consecutive
# index numbers are adjacent to each other
phAlteredNamesIndexList = sorted(phAlteredNamesIndexList + [i-1 for i in phAlteredNamesIndexList])

# Provide a cut-off to interpret Jaccard distance.
# map() with boolean keys is used rather than replace({1:'yes',0:'no'}): whether
# replace() matches True/False against the integer keys 1/0 depends on the pandas
# version, and when it does not the column silently stays boolean.
# Rows with no Jaccard distance (first row of each appno) are set to NaN.
temp_phjFSAAppNoDF['samename'] = ((temp_phjFSAAppNoDF['jd'] < phjSameNameCutOff)
                                  .map({True:'yes',False:'no'})
                                  .where(temp_phjFSAAppNoDF['jd'].notnull()))

# Print rows where the name changed, together with the row they were compared with
print(temp_phjFSAAppNoDF.loc[phAlteredNamesIndexList,['appno','tradingname','tokens','jd','samename']])

# Print full dataframe
print(temp_phjFSAAppNoDF)

    appno                                        tradingname  \
10   1106    2Sisters Red Meat Limited (T/A McIntosh Donald)   
11   1106                                Kepak Group Limited   
53   1598                 Dawn Meats UK (T/A Highland Meats)   
54   1598                                        Dunbia (UK)   
99   2100                                       DAWN CARNABY   
100  2100                                     Dunbia Carnaby   
156  2388                                       DAWN CUMBRIA   
157  2388                                     Dunbia Cumbria   
170  2450                     Al-Ummah Halal Poultry Limited   
171  2450                                   AL UMMAH LIMITED   
193  2630                              Charing Meats Limited   
194  2630                       Agro Foods (Ashford) Limited   
217  4014                               Cargill Meats Europe   
218  4014                         Freemans of Newent Limited   
410  5106                               

In [25]:
# If trading name has not changed then keep only the last row in the group
# (but ensure the same id number is retained).
# If the trading name has changed then keep both rows in the group.
# The suggested approach to produce the above was given as an answer by Erfin at:
# https://stackoverflow.com/questions/59568154/updating-a-pandas-dataframe-with-new-data-whilst-retaining-existing-id-number

mask_yes = temp_phjFSAAppNoDF['samename'].eq('yes') # array with True for rows with 'yes'
mask_no = temp_phjFSAAppNoDF['samename'].eq('no')   # array with True for rows with 'no'
mask_single = temp_phjFSAAppNoDF['countappno'].eq(1)  # array with True for rows in single-row groups


# if the row is 'yes', get the shifted id (i.e. the id of the preceding row), else the original id
temp_phjFSAAppNoDF['premisesname_id'] = np.where(mask_yes, temp_phjFSAAppNoDF['premisesname_id'].shift(), temp_phjFSAAppNoDF['premisesname_id'])

# if a group has 'no' mark all rows as True so we can keep the whole group
mask = temp_phjFSAAppNoDF.assign(indicator=mask_no).groupby('appno')['indicator'].transform('any')

# filter on groups with 'no' or only the row 'yes' (or rows that are alone in their group).
# .copy() makes the result an independent dataframe so that later cells can assign
# values into it with .loc without raising SettingWithCopyWarning.
temp_phjFSAAppNoDF = temp_phjFSAAppNoDF[mask | mask_yes | mask_single].copy()

print(temp_phjFSAAppNoDF)

     premisesname_id  appno  \
1                1.0   1007   
3                2.0   1077   
5              282.0   1100   
7              283.0   1101   
9              284.0   1103   
..               ...    ...   
654            279.0   9518   
656            280.0   9552   
658            281.0   9554   
660            262.0  GX401   
661              NaN   IOM1   

                                           tradingname slaughterhouse  \
1                             Frank Bird (Poultry) Ltd            Yes   
3                  Joe Simpson (T/A F Simpson and Son)            Yes   
5    2 Sisters Poultry Ltd (TA 2 Sisters Coupar Angus)            Yes   
7               Anglo Beef Processors UK T/A ABP Perth            Yes   
9              Neerock Limited (T/A Woodhead Brothers)            Yes   
..                                                 ...            ...   
654                             McKeown Fine Foods Ltd            Yes   
656                                     Kea

In [26]:
# An example of the code used to update the data in the dataframe.
# The code is based on an answer to a StackOverflow question that was given by Erfin at:
# https://stackoverflow.com/questions/59568154/updating-a-pandas-dataframe-with-new-data-whilst-retaining-existing-id-number

# Create dataframe
df = pd.DataFrame({'id':[1,2,3,4,5,6],
                   'gp':['x','a','a','b','b','y'],
                   'meta':['five','one','two','three','four','six'],
                   'matchvar':['nmo','wwww','w ww w','xxxx','xyxx','wxyz'],
                   'match':[np.nan,np.nan,'yes',np.nan,'no',np.nan]})

# Add count column to indicate how many rows in each groupby group
df['count'] = df.groupby('gp')['gp'].transform('count')

print('Original dataframe\n==================')
print(df)
print('\n')

mask_yes = df['match'].eq('yes') # array with True for rows with 'yes'
mask_no = df['match'].eq('no')   # array with True for rows with 'no'
mask_single = df['count'].eq(1)  # array with True for rows in single-row groups

# if the row is 'yes', get the shifted id, else the original id
df['id'] = np.where(mask_yes, df['id'].shift(), df['id']) 

# if a group has 'no' mark all rows as True so we can keep the whole group
mask = df.assign(indicator=mask_no).groupby('gp')['indicator'].transform('any')

print('mask\n====')
print(mask)
print('\n')

print('mask_yes\n========')
print(mask_yes)
print('\n')

print('mask_single\n===========')
print(mask_single)
print('\n')

# filter on groups with 'no' or only the row 'yes'
df = df[mask | mask_yes | mask_single]

print('Updated dataframe\n=================')
print(df)

Original dataframe
   id gp   meta matchvar match  count
0   1  x   five      nmo   NaN      1
1   2  a    one     wwww   NaN      2
2   3  a    two   w ww w   yes      2
3   4  b  three     xxxx   NaN      2
4   5  b   four     xyxx    no      2
5   6  y    six     wxyz   NaN      1


mask
====
0    False
1    False
2    False
3     True
4     True
5    False
Name: indicator, dtype: bool


mask_yes
0    False
1    False
2     True
3    False
4    False
5    False
Name: match, dtype: bool


mask_single
0     True
1    False
2    False
3    False
4    False
5     True
Name: count, dtype: bool


Updated dataframe
    id gp   meta matchvar match  count
0  1.0  x   five      nmo   NaN      1
2  2.0  a    two   w ww w   yes      2
3  4.0  b  three     xxxx   NaN      2
4  5.0  b   four     xyxx    no      2
5  6.0  y    six     wxyz   NaN      1


In [27]:
# Rows that still have no premisesname_id
phjNoIDMask = temp_phjFSAAppNoDF['premisesname_id'].isnull()

print(temp_phjFSAAppNoDF.loc[phjNoIDMask,:])

phjNoIDCount = len(temp_phjFSAAppNoDF.loc[phjNoIDMask,:])

print(phjNoIDCount)

# New ids continue on from the largest id already in use, one per row without an id
phjMaxID = temp_phjFSAAppNoDF['premisesname_id'].max().astype(int)
phjNewIDs = list(range(phjMaxID + 1,
                       phjMaxID + phjNoIDCount + 1))

print(phjNewIDs)

temp_phjFSAAppNoDF.loc[phjNoIDMask,['premisesname_id']] = phjNewIDs

print(temp_phjFSAAppNoDF)

     premisesname_id appno                    tradingname slaughterhouse  \
11               NaN  1106            Kepak Group Limited            Yes   
30               NaN  1216               Pasture-to-Plate            Yes   
54               NaN  1598                    Dunbia (UK)            Yes   
100              NaN  2100                 Dunbia Carnaby            Yes   
157              NaN  2388                 Dunbia Cumbria            Yes   
171              NaN  2450               AL UMMAH LIMITED            Yes   
194              NaN  2630   Agro Foods (Ashford) Limited            Yes   
199              NaN  2662       Norfolk Meat Traders Ltd            Yes   
203              NaN  2670          Manifold Valley Meats            Yes   
218              NaN  4014     Freemans of Newent Limited            Yes   
230              NaN  4073                  ABP ELLESMERE            Yes   
411              NaN  5106              Dunbia Cardington            Yes   
444         

In [28]:
# Tidy the dataframe: renumber the index from 0 and store premisesname_id as integers
# (every row has an id by this point, so the column no longer needs to hold NaN)
temp_phjFSAAppNoDF = (temp_phjFSAAppNoDF
                      .reset_index(drop = True)
                      .astype({'premisesname_id':'int'}))

print(temp_phjFSAAppNoDF)

     premisesname_id  appno  \
0                  1   1007   
1                  2   1077   
2                282   1100   
3                283   1101   
4                284   1103   
..               ...    ...   
353              279   9518   
354              280   9552   
355              281   9554   
356              262  GX401   
357              358   IOM1   

                                           tradingname slaughterhouse  \
0                             Frank Bird (Poultry) Ltd            Yes   
1                  Joe Simpson (T/A F Simpson and Son)            Yes   
2    2 Sisters Poultry Ltd (TA 2 Sisters Coupar Angus)            Yes   
3               Anglo Beef Processors UK T/A ABP Perth            Yes   
4              Neerock Limited (T/A Woodhead Brothers)            Yes   
..                                                 ...            ...   
353                             McKeown Fine Foods Ltd            Yes   
354                                     Kea