In [1]:
# Import Statements
import pandas as pd
import numpy as np
import difflib

pd.set_option('display.max_rows', None)

In [2]:
# Brewery Data
#print("\n".join(pd.read_csv('Data/wa_breweries.csv')['brewery'].unique()))

In [3]:
# Production Data
prod = pd.read_csv('Data/brewery_production_2017-2019.csv')

In [4]:
prod.head()

Unnamed: 0,brewery,year,estimate,annual_production
0,101 Brewery,2019,1,41
1,122 West Brewing Co.,2019,1,165
2,192 Brewing,2019,1,209
3,20 Corners Brewing LLC,2019,1,1722
4,210 Brewing Co,2019,1,350


In [5]:
# update brewery names
def update_names(df):
    
    # remove periods
    df['brewery'] = df['brewery'].str.replace('.','')
    
    # replace end 'co' with 'company'
    df['brewery'] = np.where(df['brewery'].str[-2:] == 'Co',
                             df['brewery'].str[:-2] + 'Company',
                             df['brewery'])

    # replace '&'' and 'and'
    df['brewery'] = df['brewery'].str.replace('&','and')
    
    # remove ' LLC'
    df['brewery'] = df['brewery'].str.replace(' LLC','')
    
    return df

prod = update_names(prod)

In [6]:
# prod 22 names
names_22 = pd.read_csv('Data/brewery_production_2022.csv')['brewery_name'].unique()
#print("\n".join(names_22))

In [7]:
# compare brewery names
match = []
not_match = []
for name in prod['brewery'].unique():
    if name in names_22:
        match.append(name)
    else:
        not_match.append(name)
        
print(len(match))
print(len(not_match))

# create df from match
match1 = pd.DataFrame(np.column_stack([match, match]), 
                               columns=['brewery_production', 'brewery_name'])

225
221


In [8]:
# function for matching names
def match_names(prod, brewery_names, cut=0.90):    

    # brewery names to test from
    names = []
    for n in brewery_names:
        n2 = n.lower().replace(' company','').replace(' brewery',' brewing')
        names.append((n,n2))

    # match to production data names
    prod_name_match = []
    for name in prod:
        name_test = name.lower().replace(' company','').replace(' inc','').replace(
            ' brewery',' brewing').replace(' -','').replace(' taproom','')
        matches = difflib.get_close_matches(name_test, [n[1] for n in names], cutoff=cut)
        if len(matches) > 0:
            m = matches[0]
            best_match = [n[0] for n in names if n[1] == m][0]
        else:
            best_match = None
        prod_name_match.append((name, best_match))
        
    return pd.DataFrame(prod_name_match, columns =['brewery_production', 'brewery_name'])

In [9]:
# second round of matching
match2 = match_names(not_match, names_22)
match2 = match2[(~match2['brewery_name'].isna()) & (~match2['brewery_name'].isin(
    ['Backwoods Brewing Company','Wild Man Brewing Company']))]

In [10]:
# manually match rest
not_match2 = np.setdiff1d(not_match, match2['brewery_production'].unique()).tolist()
#print("\n".join(not_match2))

In [11]:
match3list = [
    ("54-40 Brewing Company", "54°40' Brewing Company"),
    ("Ale Spike", "Ale Spike Camano Island Brewing"),
    ("Anacortes Brewery/Rockfish Grill", "Anacortes Brewery"),
    ("Atwood Ales", "Atwood Farm Brewery"),
    ("Bainbridge Island Brewing", "Bainbridge Brewing Company"),
    ("Bardic Brewing", "Bardic Brewing and Cider"),
    ("Belltown Brewing", "Belltown Brewery"),
    ("Bent Bine Brew Co", "Bent Bine Brewing Company"),
    ("Big Barn Brewing Co / Bodacious Berries Fruits and Brews", "Big Barn Brewing Company"),
    ("Boundary Bay Brewery and Bistro", "Boundary Bay Brewery"),
    ("BrewBakers Brewery", "Brewbakers Brewing Company"),
    ("Bron Yr Aur Brewing", "Bron Yr Aur Brewing Company"),
    ("Cardinal Craft Brewing Academy/ Skagit Valley College", "Cardinal Craft Brewing"),
    ("Cascadia Brewing Co ", "Cascadia Brewing Company "),
    ("Columbia Valley Brewing", "Columbia Valley Brewing Company"),
    ("Craft Brewing Academy/ Skagit Valley College ", "Craft Brewing Academy/Skagit Valley College"),
    ("Diamond Knot Brewery B2 Brewery and Taproom", "Diamond Knot Brewing Company"),
    ("Dirty Bucket Brewery / Locust Brewing", "Locust Brewing Company"),
    ("Dirty Bucket Brewery", "Dirty Bucket Brewing Company"),
    ("Dog and Pony Alehouse and Grill", "Dog and Pony Brewing Company"),
    ("Dunagan Brewing Company", "Dunagan Irish Pub and Brewery"),
    ("Dystopian State Brewing", "Dystopian State Brewing Company"),
    ("E9 Brewing Co ", "E9 Brewing Company"),
    ("Engine House No 9", "E9 Brewing Company"),
    ("Forward Operating Base Brewing Company / FOB Brewing", "Forward Operating Base Brewing Company"),
    ("Genus Brewing / Nu Home Brew and Bottles", "Genus Brewing"),
    ("Golden Handle Project", "Golden Handle Brewing Company"),
    ("Hale's Ales Brewery and Pub", "Hale's Ales"),
    ("Hookum Brewing Company", "Hookum Brewing Company "),
    ("Locust Cider and Brewing Company", "Locust Brewing Company"),
    ("North Fork Brewing Company", "North Fork Brewery"),
    ("Pastime Brewery Bar and Grill", "Pastime Brewery"),
    ("RAM/Big Horn Brewery", "RAM Restaurant and Brewery"),
    ("Rattlesnake Mountain Brewery / Kimo's Restaurant", "Rattlesnake Mountain Brewing Company"),
    ("Resonate Brewery + Pizzeria", "Resonate Brewery and Pizzeria"),
    ("Schooner Brewing Company", "Schooner Exact Brewing Company"),
    ("Skagit River Brewery", "Skagit River Brewing Company"),
    ("Snipes Mountain Brewing Company", "Snipes Mountain Brewing"),
    ("Sound To Summit", "Sound To Summit Brewing"),
    ("Steam Plant Grill", "Steam Plant Brewing Company"),
    ("Terramar", "Terramar Brewing and Distilling"),
    ("Triceratops Brewing", "Triceratops Brewing Company"),
    ("TTs Old Iron Brewery", "TT's Old Iron Brewery"),
    ("Walking Man Brewing Company", "Walking Man Brewing")]
match3 = pd.DataFrame(match3list, columns =['brewery_production','brewery_name'])

In [12]:
# rest are not in 2022 data so use existing names
not_match3 = np.setdiff1d(not_match, match3['brewery_production'].unique()).tolist()
match4 = pd.DataFrame(list(zip(iter(not_match3), iter(not_match3))), 
                      columns =['brewery_production','brewery_name'])

In [13]:
# compile match table
dfs = [match1, match2, match3, match4]
name_match_df = pd.concat(dfs)

name_match_df.head()

Unnamed: 0,brewery_production,brewery_name
0,5 North Brewing Company,5 North Brewing Company
1,7 Seas Brewing Company,7 Seas Brewing Company
2,Acorn Brewing,Acorn Brewing
3,Airways Brewing Company,Airways Brewing Company
4,Ashtown Brewing Company,Ashtown Brewing Company


In [19]:
# merge back into production data
brewery_production_2017_2019 = prod.merge(name_match_df, 
                                          how='left', left_on='brewery', right_on='brewery_production')
cols = ['brewery_name','year','estimate','annual_production']
brewery_production_2017_2019 = brewery_production_2017_2019[cols]

Unnamed: 0,brewery_name,year,estimate,annual_production
0,101 Brewery,2019,1,41
1,122 West Brewing Company,2019,1,165
2,192 Brewing Company,2019,1,209
3,192 Brewing,2019,1,209
4,20 Corners Brewing Company,2019,1,1722


In [None]:
# save as csv
brewery_production_2017_2019.to_csv('Data/brewery_production_2017_2019', index=False) 