In [42]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd 
from fuzzywuzzy import fuzz

# this gets the tables from the list of all Belgian beers, and does some cleaning 

In [43]:
url = "https://fr.wikipedia.org/wiki/Liste_de_bi%C3%A8res_belges"

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

tables = soup.find_all("table", {"class": "wikitable"})

def remove_wikipedia_ref(text):
	return re.sub(r"\[\d+\]", "", text)

def remove_language_tags(text):
	return re.sub(r"\s*\((nl|en)\)", "", text)

def remove_trailing_commas(text):
	count_trailing = re.search(r",+$", text)
	if count_trailing:
		return text[:-len(count_trailing.group())]
	return text

def remove_pour_and_following_text(text): # need to adjust to remove the parenthesis
	if "pour" in text:
		return text.split("pour")[0][:-1]
	return text

dataframes = []
for i, table in enumerate(tables):
    headers = [header.text.strip() for header in table.find_all("th")]
    rows = []
    for row in table.find_all("tr")[1:]:  # Skip header row
        cols = row.find_all(["td", "th"])
        cols = [col.text.strip() for col in cols]
        cols = [remove_wikipedia_ref(col) for col in cols]
        cols = [remove_trailing_commas(col) for col in cols]
        cols = [remove_language_tags(col) for col in cols]
        cols = [remove_pour_and_following_text(col) for col in cols]
        if cols:
            rows.append(cols)
    df = pd.DataFrame(rows, columns=headers)
    dataframes.append(df)
	
beers = pd.concat(dataframes, ignore_index=True)

In [44]:
beers

Unnamed: 0,Bières,Type,Teneur en alcool,Brasserie
0,3 Schténg,"Fermentation haute, Vieille Brune Flamande",6 %,"Brasserie Grain d'Orge , 4852 Hombourg, Liège ..."
1,IV Saison,Saison blonde,"6,5 %",Brasserie de Jandrain-Jandrenouille
2,V Cense,"Fermentation haute, Spéciale","7,5 %",Brasserie de Jandrain-Jandrenouille
3,VI Wheat,"Fermentation haute, Blanche",6 %,Brasserie de Jandrain-Jandrenouille
4,7-PK,Fermentation haute blonde,7 %,Brouwerij Anders!
...,...,...,...,...
1855,Zulte,"Fermentation haute, Brune","4,7 %",Alken-Maes (Heineken)
1856,Zwalmse Tripel,Triple,8 %,De Proefbrouwerij
1857,Zwarte Pol,Stout,"6,5 %",Brasserie Inter-Pol
1858,Zwet.be,Stout,7 %,De Proefbrouwerij


# this gets the data from list of Belgian breweries, as well as some additional beer

In [45]:
url = "https://fr.wikipedia.org/wiki/Liste_de_brasseries_belges"

reponse = requests.get(url)
soup = BeautifulSoup(reponse.text, "html.parser")

tables = soup.find_all("table", {"class": "wikitable"})

dataframes_breweries = []
for i, table in enumerate(tables):
    headers = [header.text.strip() for header in table.find_all("th")]
    rows = []
    for row in table.find_all("tr")[1:]:
        cols = row.find_all(["td", "th"])
        cols = [col.text.strip() for col in cols]
        cols = [remove_wikipedia_ref(col) for col in cols]
        cols = [remove_trailing_commas(col) for col in cols]
        cols = [remove_language_tags(col) for col in cols]
        cols = cols[:6]
        if cols:
            rows.append(cols)
    df = pd.DataFrame(rows, columns=headers)
    dataframes_breweries.append(df.iloc[:, :4])
    
breweries = pd.concat(dataframes_breweries, ignore_index=True)

# this block just removes some useless columns and rows
breweries = breweries.iloc[:250, :4]

In [49]:
breweries

Unnamed: 0,Brasserie,Commune,Province,Principales bières
0,Brasserie de l'abbaye de Saint-Ghislain,Saint-Ghislain,Hainaut,Abbaye de Saint-Ghislain
1,Brasserie de l'abbaye de Saint-Sixte,Vleteren,Flandre-Occidentale,Westvleteren
2,Brasserie de l'abbaye des Rocs,Honnelles,Hainaut,Abbaye des Rocs
3,Brasserie de l'abbaye du Val-Dieu,Aubel,Liège,Val-Dieu
4,Brasserie de l'abbaye de Villers-la-Ville,Villers-la-Ville,Brabant wallon,Abbaye de Villers
...,...,...,...,...
245,Brasserie de Warsage,Dalhem,Liège,Warsage
246,Microbrasserie de Waterloo (fait partie de Joh...,Waterloo,Brabant wallon,Waterloo
247,Brasserie Weldebrouck,Willebroek,Anvers,Weldebrouck
248,Brasserie Wilderen,Saint-Trond,Limbourg,"Tripel Kanunnik, Wilderen"


# we need to align the breweries from both datasets with the format of one of the two
for this we will use the transformers -- sentence_transformers
or fuzzy thing, idk which one will work best




In [None]:
import pandas as pd
from rapidfuzz import process, fuzz

def rename_by_similarity(df_source, col_source, df_reference, col_reference, threshold=80):
    ref_values = df_reference[col_reference].dropna().unique() # this gets unique brewery names
    df_source = df_source.copy()

    def get_best_match(value):
        if pd.isna(value) or value == "":
            return value
        match, score, _ = process.extractOne(value, ref_values, scorer=fuzz.ratio)
        return match if score >= threshold else value # renaming if above threshold

    df_source[col_source] = df_source[col_source].apply(get_best_match)
    return df_source

In [56]:
new = rename_by_similarity(beers, "Brasserie", breweries, "Brasserie")

# here i'm going to add some coordinates using geopy library

In [None]:
from geopy.geocoders import Nominatim
import time

geolocator = Nominatim(user_agent="myGeocoder", timeout=1) # have to try using different timeout values

coordinates = []

for index, row in breweries.iterrows():
    brewery = row["Brasserie"]
    commune = row["Commune"]
    province = row["Province"]
    
    retries = 3
    for i in range(retries):
        try:
            location = geolocator.geocode(brewery) # first we try getting coords of the brewery (most precise)
            if location:
                coordinates.append((location.latitude, location.longitude))
                break # when found coordinates break loop
            else:
                location = geolocator.geocode(commune) # moving onto commune
                if location:
                    coordinates.append((location.latitude, location.longitude))
                    break
                else:
                    coordinates.append(None) # nothing found, might add manually
                    break
        except Exception as e:
            print(f"Error processing {brewery}: {e}")
            time.sleep(5)

breweries["coordinates"] = coordinates
print(breweries.head())

Error processing Brasserie de l'abbaye de Saint-Sixte: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=Brasserie+de+l%27abbaye+de+Saint-Sixte&format=json&limit=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=1)"))
Error processing Brasserie de l'abbaye de Villers-la-Ville: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=Brasserie+de+l%27abbaye+de+Villers-la-Ville&format=json&limit=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=1)"))
Error processing Brasserie de l'abbaye Notre-Dame de Saint-Benoît: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=Brasserie+de+l%27abbaye+Notre-Dame+de+Saint-Beno%C3%AEt&format=json&limit=1 (Caused by ReadTimeoutError("HT

KeyboardInterrupt: 