In [1]:
import pandas as pd
import glob

file_names = glob.glob('./Production Data by Brewery by Month 2022-20240226T230240Z-001/Production Data by Brewery by Month 2022/*.pdf')

import docx
def getText(filename):
    # importing required modules 
    from pypdf import PdfReader 
    
    # creating a pdf reader object 
    reader = PdfReader(filename) 
    fullText = ""
    for page in reader.pages:
        fullText += page.extract_text() + '\n'
    return fullText 

import pandas as pd
contents = {}
for file_name in file_names:
    content = getText(file_name)
    contents[file_name] = content

In [2]:
# read in data that are four columns situated at certain positions
def checkLine(line):
    parts = line.strip('\n').replace(',', '').split()
    # check if have 3 float at the end of each line
    fCount = 0
    for i in range(len(parts) - 3, len(parts)):
        try:
            float(parts[i])
            fCount += 1
        except:
            pass
    try:
        assert(fCount == 3)
        name = ' '.join(parts[:-3])
        assert("Total" not in name)
        cond = True
    except:
        cond = False
    return cond

def formatLine(line):
    parts = line.strip('\n').replace(',', '').split()
    # check if have 3 float in each line
    fCount = 0
    for i in range(len(parts) - 3, len(parts)):
        try:
            float(parts[i])
            fCount += 1
        except:
            pass
    name = ' '.join(parts[:-3])
    return (name, float(parts[-3]), float(parts[-2]), float(parts[-1]))

import os

dataObject = {}
for file_name in file_names:
    month = os.path.basename(file_name)[:3].upper()
    lines = contents[file_name].split('\n')
    # concat lines that have brewery name split into two lines
    for i in range(len(lines) - 1):
        if (not checkLine(lines[i]) and ("BREWERY" in lines[i] or "COMPANY" in lines[i] or "CO." in lines[i])) and checkLine(lines[i + 1]):
            lines[i] = lines[i] + ' ' + lines[i + 1]
            lines[i + 1] = ''
    
    # also filter lines that are not in state
    for i in range(len(lines)):
        line = lines[i]
        if "Out of State Brewery Trade Name" in line:
            break
    lines = lines[:i]

    filtLines = list(filter(lambda text: checkLine(text), lines))

    data = list(map(lambda text: formatLine(text), filtLines))
    df = pd.DataFrame(data, columns=['Brewery', 'Over 60000', '60000 Under', 'Total'])

    # merge data with same brewery name
    df = df.groupby('Brewery').sum().reset_index()
    dataObject[month] = df

In [3]:
monthNumberObject = {
    "JAN": 1, "FEB": 2, "MAR": 3, "APR": 4, "MAY": 5, "JUN": 6, "JUL": 7, "AUG": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DEC": 12
}

# aggregate data from different breweries into one dataframe
dfAll = pd.DataFrame(columns=['Brewery'])
for month in dataObject:
    df = dataObject[month]
    dfSub = df[['Brewery', 'Total']].rename(columns={'Total': monthNumberObject[month]})
    dfAll = pd.merge(dfAll, dfSub, on='Brewery', how='outer')

In [4]:
# sort breweries by total production
sortedDfAll = dfAll.iloc[dfAll[[1,2,3,4,5,6,7,8,9,10,11,12]].sum(axis=1).sort_values(ascending=False).index].reset_index(drop=True)
# drop row total production
sortedDfAll = sortedDfAll[sortedDfAll['Brewery'].apply(lambda x: "Total" not in x)].reset_index(drop=True)


In [5]:
# sortedDfAll.to_csv('brewery_monthly_production_2022.csv', index=False, header=True)

In [6]:
sortedDfAll.head()

Unnamed: 0,Brewery,4,8,12,2,1,7,6,3,5,11,10,9
0,GEORGETOWN BREWING CO.,8672.18,10193.51,7380.14,6503.01,6409.97,8722.62,9571.99,8538.23,8855.93,9315.17,8985.2,9227.2
1,FREMONT BREWING,2536.74,2731.87,3229.78,2144.6,2466.78,2720.6,3077.69,2800.91,2616.72,2435.67,2050.42,2819.8
2,MAC & JACKS BREWERY INC.,2036.64,2412.73,1755.14,1841.37,1951.9,2052.96,2776.15,1780.41,2171.85,1902.2,2225.78,2378.33
3,REUBENS BREWS,2070.59,2617.09,1592.8,1676.63,1705.85,2303.87,2519.17,1933.15,2077.57,1709.33,1740.98,2433.18
4,BALE BREAKER BREWING COMPANY,1900.88,2282.84,1416.19,1276.72,1535.06,2302.83,2140.11,1822.97,1802.26,2088.51,1536.51,1709.75


In [7]:
# Create a new column 'total' that is the sum of all month columns
sortedDfAll['total'] = sortedDfAll.iloc[:, 1:].sum(axis=1)

In [8]:
breweries1 = sortedDfAll.Brewery.tolist()

In [9]:
wa_breweries_df = pd.read_csv("wa_breweries.csv")

In [10]:
breweries2 = wa_breweries_df.brewery.tolist()

In [11]:
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords

unnecessary_words = stopwords.words('english')
unnecessary_words += ["brewery", "brewing", "company", "co", "brew", "pub", "works", "house", "beer", "ciders", "ale", "brewpub", "eatery"]

# Find the best match from the production data breweries to the WA breweries
matches = []
for brewery1 in breweries1:
    match = None
    match_score = 0
    brewery1 = brewery1.lower()
    for brewery2 in breweries2:
        brewery2 = brewery2.lower()
        # find the best match that has the most matched words
        score = 0
        for w1 in brewery1.split():
            for w2 in brewery2.split():
                if w1 == w2:
                    score += 1 * (0 if w1 in unnecessary_words else 1)
                elif fuzz.ratio(w1, w2) > 80:
                    score += 0.5 * (0 if w1 in unnecessary_words or w2 in unnecessary_words else 1)
        if score > match_score:
            match = brewery2
            match_score = score
    if match:
        matches.append((brewery1, match, match_score))

In [12]:
matches

[('georgetown brewing co.', 'georgetown brewing company', 1),
 ('fremont brewing', 'fremont brewing company', 1),
 ('mac & jacks brewery inc.', "mac and jack's brewing company", 1.5),
 ('reubens brews', "reuben's brews", 1.5),
 ('bale breaker brewing company', 'bale breaker brewing company', 2),
 ('silver city brewery', 'silver city brewery', 2),
 ('schooner exact brewing company', 'schooner exact brewing company', 2),
 ('no-li brewhouse', 'no-li brewhouse', 2),
 ('iron horse brewery', 'iron horse brewery', 2),
 ('elysian brewing company', 'elysian brewing', 1),
 ('kulshan brewing company', 'kulshan brewing company', 1),
 ('icicle brewing company', 'icicle brewing company', 1),
 ('black raven brewing company', 'black raven brewing company', 2),
 ('boundary bay brewery and bistro', 'boundary bay brewery', 2),
 ('stoup brewing', 'stoup brewing', 1),
 ('aslan brewing company', 'aslan brewing company', 1),
 ('7 seas brewing', '7 seas brewing company', 2),
 ('scuttlebutt brewing co.', 'scut

In [13]:
# after careful comparing the match results, we found that some matches are wrong 
wrong_matches = [
 ('bainbridge island brewing', 'ale spike camano island brewing', 1),
 ('bodacious berries fruits and brews', "reuben's brews", 1),
 ('mcmenamins', 'mcmenamins anderson school brewery', 1),
 ('wenatchee brewing co', 'wenatchee valley brewing company', 1),
 ('valley brewing company', 'columbia valley brewing company', 1),
 ('lagunitas tap room and beer sanctuary',
  'rose garage brewing and taps',
  0.5),
 ('big house brew pub', 'big barn brewing company', 1),
 ('timber city ginger beer', 'river city brewing', 1),
 ('valley house brewing company', 'columbia valley brewing company', 1),
 ('good. hard seltzer', 'good brewing company', 0.5),
 ('atwood ales', 'atwood farm brewery', 1),
 ('tilted tree hard cider', 'bardic brewing and cider', 1),
 ('papas casino restaurant & lounge-ten pin bre',
  'flyers restaurant and brewery',
  1),
 ('yakima valley hops', 'columbia valley brewing company', 1),
 ('yakima chief hops', "chief spring's fire and irons brewpub", 1),
 ('twenty-eight ten llc', 'ten pin brewing company', 1),
 ('zen zymurgist fermentation', 'fermentation initiative', 1),
 ('pear up cider', 'bardic brewing and cider', 1),
 ('the shed', 'the grain shed', 1),
 ('great western malting', 'western red brewing', 1),
 ('lost bear brews', 'lost woods brewery', 1),
 ]

In [14]:
# prune the wrong matches
matches = [match for match in matches if match not in wrong_matches]

In [15]:
# find the breweries in production breweries that are not matched
unmatched_breweries = [brewery for brewery in breweries1 if brewery.lower() not in [match[0].lower() for match in matches]]

In [16]:
unmatched_breweries

['54-40 BREWING COMPANY',
 'BAINBRIDGE ISLAND BREWING',
 'NOBOAT BREWING COMPANY',
 'BODACIOUS BERRIES FRUITS AND BREWS',
 'SOUND2SUMMIT BREWERY',
 'MCMENAMINS',
 'GLOBAL BEER COMPANY',
 'WENATCHEE BREWING CO',
 'VALLEY BREWING COMPANY',
 'WALLA WALLA STEAK CO/CROSS BUCK BREWING',
 'LAGUNITAS TAP ROOM AND BEER SANCTUARY',
 'BIG HOUSE BREW PUB',
 'TIMBER CITY GINGER BEER',
 'VALLEY HOUSE BREWING COMPANY',
 'GLORYBUCHA KOMBUCHA',
 'GOOD. HARD SELTZER',
 'COMMUNITEA KOMBUCHA',
 'SQUEEZE HARD SELTZER',
 'SMOKING MOS',
 'OFF-CAMBER BREWING',
 'MT BREW',
 'ATWOOD ALES',
 'FOB BREWING COMPANY',
 'TILTED TREE HARD CIDER',
 'JM CELLARS COMPANY',
 'PAPAS CASINO RESTAURANT & LOUNGE-TEN PIN BRE',
 'YAKIMA VALLEY HOPS',
 'YAKIMA CHIEF HOPS',
 'HAAS INNOVATIONS BREWING',
 'TWENTY-EIGHT TEN LLC',
 'BREW BAKERS',
 'BREAKING WAVES BREWING',
 'ZEN ZYMURGIST FERMENTATION',
 'AT',
 'FIREFIGHTER BREWING COMPANY',
 'PLAIN BREWING',
 'PEAR UP CIDER',
 'KINE KOMBUCHA',
 'ENCHANTMENT BREWING',
 'THE SHED',
 'G

In [17]:
should_match = { 
 '54-40 BREWING COMPANY': "54°40' Brewing Company",
 'BAINBRIDGE ISLAND BREWING': "Bainbridge Brewing Company",
 'NOBOAT BREWING COMPANY': "No Boat Brewing Company",
 'SOUND2SUMMIT BREWERY': "Sound To Summit Brewing",
 'VALLEY BREWING COMPANY': "Valley Brewing Company",
 'BIG HOUSE BREW PUB': "Big House Brewpub",
 'VALLEY HOUSE BREWING COMPANY': "Valley House Brewing Company",
 'OFF-CAMBER BREWING': "Off Camber Brewing",
 'FOB BREWING COMPANY': "Forward Operating Base Brewing Company",
 'BREW BAKERS': "Brewbakers Brewing Company",
 'WALLA WALLA STEAK CO/CROSS BUCK BREWING': "Crossbuck Brewing",
 'WENATCHEE BREWING CO': "Wenatchee Valley Brewing Company",
 "ATWOOD ALES": "Atwood Farm Brewery",
 "TWENTY-EIGHT TEN LLC": "Ancient Lakes Brewing Company",
 "LOST BEAR BREWS": "Wild Oak Project"
}

In [18]:
# append the should match items along with matches and render the result
matches_object = dict(list(map(lambda tup: (tup[0].lower(), tup[1].lower()), matches)))
should_match_object = dict(list(map(lambda kv: (kv[0].lower(), kv[1].lower()) ,should_match.items())))
# concat both dictionaries
matches_object = {**matches_object, **should_match_object}

In [19]:
# this provides a mapping between the production list with the wa list
matches_object

{'georgetown brewing co.': 'georgetown brewing company',
 'fremont brewing': 'fremont brewing company',
 'mac & jacks brewery inc.': "mac and jack's brewing company",
 'reubens brews': "reuben's brews",
 'bale breaker brewing company': 'bale breaker brewing company',
 'silver city brewery': 'silver city brewery',
 'schooner exact brewing company': 'schooner exact brewing company',
 'no-li brewhouse': 'no-li brewhouse',
 'iron horse brewery': 'iron horse brewery',
 'elysian brewing company': 'elysian brewing',
 'kulshan brewing company': 'kulshan brewing company',
 'icicle brewing company': 'icicle brewing company',
 'black raven brewing company': 'black raven brewing company',
 'boundary bay brewery and bistro': 'boundary bay brewery',
 'stoup brewing': 'stoup brewing',
 'aslan brewing company': 'aslan brewing company',
 '7 seas brewing': '7 seas brewing company',
 'scuttlebutt brewing co.': 'scuttlebutt brewing company',
 'pike brewing company and liberty malt supply': 'pike brewing c

In [23]:
# regroup the production numbers by the mapping from the matches_object
sortedDfAll["BreweryInWAList"] = sortedDfAll.Brewery.apply(lambda x: matches_object[x.lower()] if x.lower() in matches_object else None)
sortedDfAll = sortedDfAll.groupby('BreweryInWAList').sum(min_count=1).reset_index()

In [24]:
sortedDfAll.shape

(388, 15)

In [25]:
sortedDfAll.head()

Unnamed: 0,BreweryInWAList,Brewery,4,8,12,2,1,7,6,3,5,11,10,9,total
0,192 brewing company,192 BREWING COMPANY,,0.33,,0.5,,,,,,,,,0.83
1,20 corners brewing company,20 CORNERS BREWING,72.33,209.93,106.5,104.64,,,,147.98,,75.86,,132.3,849.54
2,23rd ave brewery,23RD AVE BREWERY,,,,,,5.0,4.78,,2.0,,14.0,1.0,26.78
3,4 stitch brewing company,4 STITCH BREWING CO.,,3.4,,2.0,,1.46,,1.25,4.72,,,,12.83
4,45 degree brewhouse,45 DEGREE BREWHOUSE,,,31.13,,,,,,,,,7.09,38.22


In [26]:
sortedDfAll.to_csv('brewery_monthly_production_2022.csv', index=False, header=True)

In [27]:
# this provides the missing breweries that are in production data but not in WA list
missing_breweries = [brewery for brewery in unmatched_breweries if brewery not in should_match]
missing_breweries

['BODACIOUS BERRIES FRUITS AND BREWS',
 'MCMENAMINS',
 'GLOBAL BEER COMPANY',
 'LAGUNITAS TAP ROOM AND BEER SANCTUARY',
 'TIMBER CITY GINGER BEER',
 'GLORYBUCHA KOMBUCHA',
 'GOOD. HARD SELTZER',
 'COMMUNITEA KOMBUCHA',
 'SQUEEZE HARD SELTZER',
 'SMOKING MOS',
 'MT BREW',
 'TILTED TREE HARD CIDER',
 'JM CELLARS COMPANY',
 'PAPAS CASINO RESTAURANT & LOUNGE-TEN PIN BRE',
 'YAKIMA VALLEY HOPS',
 'YAKIMA CHIEF HOPS',
 'HAAS INNOVATIONS BREWING',
 'BREAKING WAVES BREWING',
 'ZEN ZYMURGIST FERMENTATION',
 'AT',
 'FIREFIGHTER BREWING COMPANY',
 'PLAIN BREWING',
 'PEAR UP CIDER',
 'KINE KOMBUCHA',
 'ENCHANTMENT BREWING',
 'THE SHED',
 'GREAT WESTERN MALTING',
 'EIGHTY-TWO BEVERAGES',
 'NEIGEL VINTNERS']