In [None]:
import pandas as pd
import os
from geopy.geocoders import Nominatim
import time
from tqdm import tqdm

# Enable tqdm
tqdm.pandas()

USE_SAMPLE = False  # Set to False to run on the full dataset
SAMPLE_SIZE = 200
RANDOM_STATE = 700

BATCH_SIZE = 5000
BATCHES_OUTFOLDER = r"C:\Users\maxmo\Dropbox\GDS\Dissertation\discogs\data\geocoding_batches"

INPATH = r"C:\Users\maxmo\Dropbox\GDS\Dissertation\discogs\data\L2_clean_labels.csv"
OUTPATH_FULL = r"C:\Users\maxmo\Dropbox\GDS\Dissertation\discogs\data\L3_geocoded_labels.csv"
OUTPATH_MISSING = r"C:\Users\maxmo\Dropbox\GDS\Dissertation\discogs\data\geocoded_MISSINGS.csv"
OUTPATH_SAMPLE = r"C:\Users\maxmo\Dropbox\GDS\Dissertation\discogs\data\tests\sample_with_coordinates.csv"

df = pd.read_csv(INPATH)

BASE GEOCODING FUNCTION


This code provides the basic cleaning as part of the first step in the geocoding process. We scrub the "contact_info" for confounding characters and send the results through OSM. A secondary method, utilising REGEX on the "profile" text is tried subsequently. Where both fail, the results are passed into a missings csv for further study.

In [24]:
import re

def extract_location(text):
    if pd.isnull(text):
        return None
    # Match "located", "venue", "studio", or "based" followed by "in", "on", or "near"
    match = re.search(
        r'(located|venue|studio|based)\s+(in|on|near)\s+([^\.\!\?\n\r]+)',
        text,
        re.IGNORECASE
    )
    if match:
        return match.group(3).strip()
    return None

def clean_location(location):
    if not isinstance(location, str):
        return location
    # Remove "the" if it is the first word
    location = re.sub(r'^\s*the\s+', '', location, flags=re.IGNORECASE)
    # Remove "city of" if it is the first words
    location = re.sub(r'^\s*city of\s+', '', location, flags=re.IGNORECASE)
    # Truncate at first stopping punctuation
    location = re.split(r'[:;!\.\?\n\r]', location)[0].strip()
    # Truncate before "___ by"
    location = re.split(r'\s+\w+\s+by\b', location)[0].strip()
    # Truncate before "in YEAR" (YEAR = 4 digits)
    location = re.split(r'\s+in\s+\d{4}\b', location, flags=re.IGNORECASE)[0].strip()
    # Truncate before "established"
    location = re.split(r'\s+established\b', location, flags=re.IGNORECASE)[0].strip()
    # Cut everything after the second comma
    comma_parts = location.split(',')
    if len(comma_parts) > 2:
        location = ','.join(comma_parts[:2]).strip()
    return location

df['regex_profile_location'] = df['profile'].apply(extract_location)
df['regex_profile_location'] = df['regex_profile_location'].apply(clean_location)
print(df[['profile', 'regex_profile_location']].sample(10, random_state=68))

                                                 profile  \
17548                 Recording studio.  Opened in 2013.   
23791  Roman-Catholic pillar basilica in Romanesque s...   
11752  The Sonopress Studios in Berlin was located at...   
25969      Center for art & music in Huntsville, Alabama   
12808                     Live venue in Shinjuku, Tokyo.   
19006  Studio Royal St Joseph was active between 1976...   
4757   Club / venue situated in the City of Bremen, G...   
27514  Cultural center, theater and concert venue in ...   
2574   Sunshine Music, Vienna is an Austrian record l...   
29345  Series related to the organ of [l675162].\r\nR...   

                                  regex_profile_location  
17548                                               None  
23791  district [i]Niederzell[/i] of the Isle of Reic...  
11752                                               None  
25969                                               None  
12808                                    Shi

In [25]:
geolocator = Nominatim(user_agent="m.read2@lse.ac.uk", timeout=5)

def geocode_address(address):
    if not isinstance(address, str) or not address.strip():
        return None, None
    time.sleep(0.01)  # rate limiting
    try:
        location = geolocator.geocode(address)
        if location:
            return location.latitude, location.longitude
    except Exception as e:
        print(f"Failed for {address}: {e}")
    return None, None

# Select dataframe based on USE_SAMPLE
working_df = df.sample(n=SAMPLE_SIZE, random_state=RANDOM_STATE) if USE_SAMPLE else df

start = time.time()

# Apply geocoding with tqdm progress bar
coords = working_df['text_block'].progress_apply(geocode_address)
working_df['latitude'], working_df['longitude'] = zip(*coords)

end = time.time()
print(f"Total time: {end - start:.2f} seconds")

 55%|█████▌    | 16263/29382 [4:35:57<18:53:59,  5.19s/it]

Failed for i n f o @ c o t t a g e s o u n d . c o m: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=i+n+f+o+%40+c+o+t+t+a+g+e+s+o+u+n+d+.+c+o+m&format=json&limit=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=5)"))


 77%|███████▋  | 22531/29382 [6:22:32<7:08:02,  3.75s/it] 

Failed for 0758-461 85 (1983 to 1992-04-01)
+46 8 582 461 85 (1992-04-01 to mid-2005)
 0171-510 55 (mid-2005 to ca 2015)
0171-46 70 90 (ca 2015 to ): HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=0758-461+85+%281983+to+1992-04-01%29%0A%2B46+8+582+461+85+%281992-04-01+to+mid-2005%29%0A+0171-510+55+%28mid-2005+to+ca+2015%29%0A0171-46+70+90+%28ca+2015+to+%29&format=json&limit=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=5)"))


100%|██████████| 29382/29382 [8:18:24<00:00,  1.02s/it]  

Total time: 29904.55 seconds





In [26]:
start = time.time()

# Apply geocoding with tqdm progress bar
# Only geocode non-blank regex_profile_location
coords = working_df['regex_profile_location'].progress_apply(
    lambda x: geocode_address(x) if isinstance(x, str) else (None, None)
)
working_df['reg_prof_latitude'], working_df['reg_prof_longitude'] = zip(*coords)

end = time.time()

  6%|▋         | 1845/29382 [15:38<3:05:57,  2.47it/s]

Failed for Paris, related to label [l=Delphine]: Non-successful status code 500


 10%|▉         | 2816/29382 [23:46<7:07:49,  1.03it/s]

Failed for NYC as [l=Waterworks], the studio relocated to Tucson: Non-successful status code 500


 19%|█▉        | 5670/29382 [47:31<2:54:01,  2.27it/s] 

Failed for [l=Grenslandhallen] complex in Hasselt (Belgium): Non-successful status code 500


 19%|█▉        | 5715/29382 [47:50<3:58:17,  1.66it/s]

Failed for premises of the [l=Musikbrauerei] in Berlin, Germany: Non-successful status code 500


 19%|█▉        | 5716/29382 [47:51<4:28:34,  1.47it/s]

Failed for premises of the [l=Musikbrauerei] in Berlin, Germany: Non-successful status code 500


 23%|██▎       | 6824/29382 [56:13<2:30:38,  2.50it/s]

Failed for Stockholm, that runs the [l=Atlantis Grammofon] and [l=Alternativ] labels: Non-successful status code 500


 28%|██▊       | 8267/29382 [1:06:32<2:08:17,  2.74it/s]

Failed for Naples within [l=Polosud] records: Non-successful status code 500


 38%|███▊      | 11042/29382 [1:27:53<2:07:48,  2.39it/s]

Failed for California connected to [a=Bill Cuomo] before he relocated to Tenessee and there started the studio [l=Manzanita]: Non-successful status code 500


 43%|████▎     | 12601/29382 [1:39:04<3:01:26,  1.54it/s]

Failed for same building as [l=Hafenklang], [l=Hafenklang Studio] and [l=Sounds Of Subterrania]: Non-successful status code 500


 43%|████▎     | 12602/29382 [1:39:05<3:13:30,  1.45it/s]

Failed for same building as [l=Hafenklang], [l=Hafenklang Studio] and [l=Sounds Of Subterrania]: Non-successful status code 500


 61%|██████▏   | 18004/29382 [2:14:51<1:03:07,  3.00it/s]

Failed for Nice, France co-founded by singer/songwriter/producer [a=Medi]: Non-successful status code 500


 68%|██████▊   | 19958/29382 [2:26:36<1:21:18,  1.93it/s]

Failed for Limbourg (Belgium) associated with Belgian label [l=Rodel]: Non-successful status code 500


 68%|██████▊   | 19959/29382 [2:26:37<1:34:55,  1.65it/s]

Failed for Limbourg (Belgium) associated with Belgian label [l=Rodel]: Non-successful status code 500


 97%|█████████▋| 28363/29382 [3:12:29<07:09,  2.37it/s]  

Failed for [a=OrangeGuyProductions]' home under the independent label AP DP MP Records that mostly records all the records from the label: Non-successful status code 500


100%|██████████| 29382/29382 [3:17:40<00:00,  2.48it/s]


In [27]:
# Save all geocoded data
working_df.to_csv(OUTPATH_FULL, index=False)

# Filter rows with no valid coordinates in either set
mask = (
    (working_df['latitude'].isnull() | working_df['longitude'].isnull()) &
    (working_df['reg_prof_latitude'].isnull() | working_df['reg_prof_longitude'].isnull())
)
working_df[mask].to_csv(OUTPATH_MISSING, index=False)