### Libraries needed in this project

In [1]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
import httpx
import pandas as pd
from geopy.adapters import AioHTTPAdapter
from geopy.geocoders import Nominatim
import pyap
from aiohttp import ClientSession
import time

#### Extracting the domains from the .parquet file

In [2]:
df_parquet = pd.read_parquet('D:/list of company websites.snappy.parquet')

## US addresses

In [3]:
# Domains list
domains = df_parquet.head(850)['domain']  
domains

0        umbrawindowtinting.com
1                embcmonroe.org
2               caffeygroup.com
3                sk4designs.com
4            draftingdesign.com
                 ...           
845                  joylux.com
846        kupferbergcenter.org
847    flstatewideinsurance.com
848      solarlightsandmore.com
849         mysticalpoodles.com
Name: domain, Length: 850, dtype: object

#### Function that from a domain verifiy if it's https or http

In [99]:
async def get_url_scheme(domain, timeout=5):
    # finding out if the domain is https or http
    try:
        async with httpx.AsyncClient(verify=False) as client:
            # a get request to the https address
            response = await client.get('https://' + domain, timeout=timeout)
            
            # extract the url where the request was redirected
            url_string = str(response.url)
            
            # decide if the url it's https or http (if it is not https we asume it is http)
            if url_string.startswith('https://'):
                return domain, 'https'
            else:
                return domain, 'http'
    # if an exception occured we assume it is an http and we will check later if it is valid
    except httpx.RequestError as e:
        return domain, 'http'
    except httpx.HTTPError as e:
        # treating the cases where we have a SSL error
        if isinstance(e, httpx.HTTPError) and isinstance(e.__cause__, httpx.ConnectError):
            return domain, 'http'
        else:
            raise e

- Asynchronous main function where for the domains list we obtain the url scheme

In [5]:
async def main(domains):
    tasks = [get_url_scheme(domain) for domain in domains]
    results = await asyncio.gather(*tasks, return_exceptions=True)
    return results

In [6]:
# Here the main function is runned
results = await main(domains)

#### Printing the results

In [7]:
# just to check if everything is ok
complete_urls = []
for result in results:
    if isinstance(result, tuple):
        domain, scheme = result
        complete_url = f"{scheme}://{domain}"
        print(f"Url complet: {complete_url}")
        complete_urls.append(complete_url)
    else:
        print(f"Rezultat neașteptat: {result}")

Url complet: https://umbrawindowtinting.com
Url complet: https://embcmonroe.org
Url complet: https://caffeygroup.com
Url complet: https://sk4designs.com
Url complet: https://draftingdesign.com
Url complet: http://truesdail.com
Url complet: https://seedsourceag.com
Url complet: https://romebeerfest.com
Url complet: https://beerock.com
Url complet: https://cabwhp.org
Url complet: http://saintmlc.com
Url complet: https://dillonmusic.com
Url complet: https://societyfortheblind.org
Url complet: https://perfectsearchinc.com
Url complet: https://wyandottewinery.com
Url complet: http://thespiritofblackjackmountain.com
Url complet: https://kingdomtel.com
Url complet: http://triadstage.org
Url complet: http://savagecbd.com
Url complet: https://clubk-9.com
Url complet: https://katerisyracuse.com
Url complet: https://triplecrownconstruction.com
Url complet: https://greyhackle.com
Url complet: https://wildandwanderful.com
Url complet: https://coastalpaddlecompany.com
Url complet: https://casepaper.

In [8]:
len(complete_urls)

848

_____
#### Function that checks if an URL is valid

In [105]:
import httpx

async def check_url_async(url, timeout=5):
    try:
        async with httpx.AsyncClient(verify = False) as client:
            response = await client.head(url, timeout=timeout)
            # If the response has the status code  = 200 then it is successful
            return response.status_code == 200
    except httpx.HTTPStatusError:
        return False
    except httpx.RequestError:
        return False


####  Function that filters asynchronously the list of URLS with the check_url_async function

In [106]:
import asyncio

async def filter_urls_async(urls):
    valid_urls = []
    tasks = [check_url_async(url) for url in urls]
    results = await asyncio.gather(*tasks)
    for url, result in zip(urls, results):
        if result:
            valid_urls.append(url)
    return valid_urls

- Asynchronously main function where we apply the filter to the list of urls and obtain the valid ones

In [11]:
async def main():
    valid_urls = await filter_urls_async(complete_urls)
    return valid_urls

In [12]:
valid_urls = await main()

In [13]:
len(valid_urls)

226

#### Function that loads an URL ignoring the ssl errors
- in general it's not recommended, but in this case we don't have a choice

In [112]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

async def fetch(session, url):
    try:
        async with session.get(url, ssl=False, timeout=10) as response:
            # If the response has the status code  = 200 then it is successful
            if response.status == 200:
                body = await response.read()
                # we use latin-1 because if we use utf-8 we will have errors
                return body.decode('latin-1', errors='ignore')
            else:
                print(f"This URL is not accessible: {url}")
                return None
    except asyncio.TimeoutError:
        print(f"Timeout occurred for URL: {url}")
        return None


####  Function that returns URLs starting from an URL
- the URLs are filtered with keywords (on that URLs it's most probable to have an address)

In [113]:
async def find_urls_with_keywords(url, keywords):
    found_urls = set()
    
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, url)
        if html:
            found_urls.add(url)
            # extracting the html content from the page
            soup = BeautifulSoup(html, 'html.parser')
            # we will keep only the <a href> tags where we can find links to other pages of the website
            links = soup.find_all('a', href=True)
            tasks = []
            for link in links:
                href = link['href']
                # ignoring if the link is to the mail
                if href.startswith('mailto:'):
                    continue
                parsed_url = urlparse(href)
                path = parsed_url.path
                # we keep the urls that have the keywords in it (where is most probably to find an address)
                for keyword in keywords:
                    if re.search(r'\b{}\b'.format(re.escape(keyword)), path, re.IGNORECASE):
                        full_url = urljoin(url, href)
                        found_urls.add(full_url)
                        break
            await asyncio.gather(*tasks)
    return found_urls

- Async main function that for a list of urls generate a bigger list with the urls that have a high chance to have an address

In [16]:
async def main(valid_urls):
    # that is the list with the keywords where is a chance to find an address
    keywords = ['connect', 'contact', 'kontakt', 'impressum', 'adresse', 'reach', 'datenschutz', 'home']
    tasks = [find_urls_with_keywords(url, keywords) for url in valid_urls]
    final_urls = await asyncio.gather(*tasks)
    return final_urls

In [18]:
final_urls = await main(valid_urls)

Timeout occurred for URL: https://floridaorchestra.org
Timeout occurred for URL: http://sonomastarparty.com


In [19]:
urls_list = []
for set_elem in final_urls:
    for elem in set_elem:
        urls_list.append(elem)

In [20]:
len(urls_list)

417

#### Checking the new list of URLS and the invalid ones will be eliminated

In [21]:
async def main():
    valid_urls = await filter_urls_async(urls_list)
    return valid_urls

In [22]:
urls_list = await main()

In [23]:
len(urls_list)

391

#### Making a DataFrame to save the valid URLS into a .csv to using them later

In [24]:
import pandas as pd

df = pd.DataFrame(columns = ["URL"])

df

Unnamed: 0,URL


In [25]:
# building the rows
for url in urls_list:
    individual_row_data = [url]
    length = len(df)
    df.loc[length] = individual_row_data

In [26]:
df

Unnamed: 0,URL
0,https://embcmonroe.org
1,https://embcmonroe.org/connect-with-us/
2,https://sk4designs.com/contact
3,https://sk4designs.com
4,https://draftingdesign.com
...,...
386,http://greenelectricsolarsolutions.com
387,http://merchantportfolios.com
388,http://synpoly.com
389,http://vkhall-law.com


#### Exporting the final urls into a .csv

In [27]:
df.to_csv("US_URLS.csv", index=False)

#### Reading the urls from the CSV

In [28]:
import pandas as pd
urls = pd.read_csv("US_URLS.csv")

In [29]:
urls

Unnamed: 0,URL
0,https://embcmonroe.org
1,https://embcmonroe.org/connect-with-us/
2,https://sk4designs.com/contact
3,https://sk4designs.com
4,https://draftingdesign.com
...,...
386,http://greenelectricsolarsolutions.com
387,http://merchantportfolios.com
388,http://synpoly.com
389,http://vkhall-law.com


In [30]:
print(urls['URL'][0])

https://embcmonroe.org


#### Function that extracts the address from a page if it's possible

In [39]:
async def extract_addresses_from_url_async(url):
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url, ssl=False, timeout=30) as response:
                # If the response has not the status code  = 200 then it is unsuccessful
                if response.status != 200:
                    return None
                
                binary_data = await response.read()
                page_text = binary_data.decode('latin-1')
                
        except asyncio.TimeoutError:
            print(f"Timeout occurred while fetching URL: {url}")
            return None
        except aiohttp.ClientError as e:
            print(f"Error occurred while fetching URL '{url}': {e}")
            return None
        except aiohttp.ClientConnectorError as e:
            print(f"Error connecting to server: {e}")
            return None
        
        # We verify if we received the status code 429 (Too Many Requests)
        if response.status == 429:
            print(f"Too Many Requests received. Retrying after waiting.")
            await asyncio.sleep(5)  # We wait 5 sec before to retry it
            return await extract_addresses_from_url_async(url)  # Recall the function
        
        try:
            # extracting the html content from the page
            soup = BeautifulSoup(page_text, 'html.parser')
        except:
            return None
        
        try:
            # extracting the text from the page
            page_text = soup.get_text()
        except:
            return None
        
        try:
            # find an US address in the text using pyap.parse function (from the pyap library)
            addresses = pyap.parse(page_text, country='US')
        except:
            return None
        
        return [url, addresses]


- Asynchronous main function where with the list of URLS we extract addresses asynchronously

In [40]:
async def main(urls_list):
    tasks = [extract_addresses_from_url_async(url) for url in urls_list]
    final_addresses = await asyncio.gather(*tasks, return_exceptions=True)
    return final_addresses

In [41]:
final_addresses = await main(urls['URL'])

In [43]:
final_addresses

[['https://embcmonroe.org',
  [503 Maurice St. Monroe, NC 28112, 503 Maurice Street Monroe, NC 28112]],
 ['https://embcmonroe.org/connect-with-us/',
  [503 Maurice Street Monroe, NC 28112]],
 ['https://sk4designs.com/contact',
  [295 Mahoney Drive, Unit K Telluride, CO 81432]],
 ['https://sk4designs.com', []],
 ['https://draftingdesign.com', []],
 ['https://draftingdesign.com/contact.htm', []],
 ['https://beerock.com', []],
 ['http://saintmlc.com/contact-us.html', []],
 ['http://saintmlc.com', []],
 ['https://societyfortheblind.org/about-us/contact-us/', []],
 ['https://societyfortheblind.org/events/sip-coffee-and-connect-a-virtual-support-group-for-seniors-319/',
  []],
 ['https://clubk-9.com', []],
 ['https://clubk-9.com/home', []],
 ['https://katerisyracuse.com', []],
 ['https://wildandwanderful.com', []],
 ['https://coastalpaddlecompany.com', []],
 ['https://casepaper.com/contact-us/',
  [3333 N.W. 116th Street Miami, FL 33167,
   499 East Tioga Street Philadelphia, PA 19134,
   91

In [44]:
addresses_list = [element for element in final_addresses if element is not None and element[1]]


In [58]:
addresses_list[0][1]

[503 Maurice St. Monroe, NC 28112, 503 Maurice Street Monroe, NC 28112]

#### Function that checks if an extracted address is valid and complete the address with the missing elements (for example: country)

In [141]:
def extract_address(text):
    # we use the geopy library to check if the address is real and parsing the address
    geolocator = Nominatim(user_agent="Jupyter Notebook")
    location = geolocator.geocode(text, addressdetails=True, timeout = 5)
    try:
        if location:
            print("Locatia este corecta")
            return location.raw['address']
        else:
            print("The address is not valid")
            return None
    except Exception as e:
        print(f"Location failed: {e}")
        return None

#### Saving the valid addresses into a .CSV

In [49]:
real_addresses = []

#### Making the DataFrame for the Addresses List

In [63]:
import pandas as pd

df = pd.DataFrame(columns = ["URL", "Country", "Region", "City", "Postcode", "Road", "Road number"])

df

Unnamed: 0,URL,Country,Region,City,Postcode,Road,Road number


#### For each URL we will check if the found address is real and put the valid ones into the CSV
- we can't make this step asynchronous because the Geopy library allows only 1 request per second

In [64]:
# elem is a list, where elem[0] is the url of the page and elem[1] is a list that contains all the addresses found at the elem[0]
    # print(elem[0])
    # elem [1] contains the list of the addresses
    for adresa in elem[1]:
        # print(adresa)
        
        # we extract the address with geopy if it is real
        real_address = extract_address(adresa)
        if real_address != None:
            # parsing the address that was found
            if 'country' in real_address:
                country = real_address['country']
            else:
                country = ""
                
            if 'county' in real_address:
                region = real_address['county']
            else:
                region = ""
                
            if 'city' in real_address:
                city = real_address['city']
            elif 'village' in real_address:
                city = real_address['village']
            else:
                city = ""
                
            if 'postcode' in real_address:
                postcode = real_address['postcode']
            else:
                postcode = ""
                
            if 'road' in real_address:
                road = real_address['road']
            else:
                road = ""
                
            if 'house_number' in real_address:
                no_road = real_address['house_number']
            else:
                no_road = ""
            
            # build the row for the frame
            individual_row_data = [elem[0], country, region, city, postcode, road, no_road]
            lengthh = len(df)
            df.loc[lengthh] = individual_row_data
            print("valid")
        else:
            print("invalid")
            

Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid


The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid


In [65]:
df

Unnamed: 0,URL,Country,Region,City,Postcode,Road,Road number
0,https://embcmonroe.org,United States,Union County,Monroe,28112,Maurice Street,503
1,https://embcmonroe.org,United States,Union County,Monroe,28112,Maurice Street,503
2,https://embcmonroe.org/connect-with-us/,United States,Union County,Monroe,28112,Maurice Street,503
3,https://casepaper.com/contact-us/,United States,Philadelphia County,Philadelphia,19134,East Tioga Street,499
4,https://casepaper.com/contact-us/,United States,San Bernardino County,Rancho Cucamonga,91730,Hermosa Avenue,9168
...,...,...,...,...,...,...,...
102,http://pswrehab.com/home/department,United States,Tulare County,Porterville,93257,West Poplar Avenue,194
103,http://pswrehab.com/home#services,United States,Tulare County,Porterville,93257,West Poplar Avenue,194
104,http://lnlconstruction.com/contact.html,United States,,Zillah,98953,Concord Street,231
105,https://weberortho.com/contact-weber-orthodont...,United States,DuPage County,,60187,West Willow Avenue,210


In [66]:
df.to_csv("Addresses.csv", index=False)

## Deutsch Addresses

In [72]:
domainsD = df_parquet.iloc[850:1622]['domain']

In [73]:
# the Deutsch domains are in the range (850->1621)
domainsD

850                 kuk24.de
851      tremco-illbruck.com
852           cantus-bahn.de
853            pastisani.com
854       akzent-personal.de
                ...         
1617    fertighaus-keitel.de
1618          theveryend.net
1619     boerse-stuttgart.de
1620      startupjoblist.com
1621          ecco-reisen.de
Name: domain, Length: 772, dtype: object

- Asynchronous main function where for the domains list we obtain the url scheme

In [74]:
async def main(domains):
    tasks = [get_url_scheme(domain) for domain in domains]
    results = await asyncio.gather(*tasks, return_exceptions=True)
    return results

In [75]:
resultsD = await main(domainsD)

In [76]:
resultsD

[('kuk24.de', 'https'),
 ('tremco-illbruck.com', 'https'),
 ('cantus-bahn.de', 'https'),
 ('pastisani.com', 'https'),
 ('akzent-personal.de', 'https'),
 ('eurofighter.com', 'https'),
 ('marry-jim.com', 'https'),
 ('weinanzeiger.com', 'https'),
 ('teleclinic.com', 'https'),
 ('infinity-reisen.com', 'https'),
 ('rtc-c.com', 'http'),
 ('freiberuflervermittlung.de', 'https'),
 ('scheurer-arbeitsbuehnen.de', 'https'),
 ('aycan.de', 'https'),
 ('cimt.eu', 'https'),
 ('hopegala.de', 'https'),
 ('binect.de', 'https'),
 ('antenne-bethel.de', 'https'),
 ('gross-fuchs.com', 'https'),
 ('cortec-neuro.com', 'https'),
 ('schuhhause.de', 'https'),
 ('biosicherheit.de', 'https'),
 ('twt-online-marketing.de', 'https'),
 ('audiopro.de', 'https'),
 ssl.SSLError(1,
              '[SSL: SSLV3_ALERT_HANDSHAKE_FAILURE] sslv3 alert handshake failure (_ssl.c:1006)'),
 ('hostnet.de', 'https'),
 ('wissensschule.de', 'https'),
 ('bdia.de', 'https'),
 ('projektadler.com', 'http'),
 ('deutscheip.de', 'https'),
 ('e

In [77]:
# Printing the results
complete_urlsD = []
for result in resultsD:
    if isinstance(result, tuple):
        domain, scheme = result
        complete_url = f"{scheme}://{domain}"
        print(f"Url complet: {complete_url}")
        complete_urlsD.append(complete_url)
    else:
        print(f"Rezultat neașteptat: {result}")

Url complet: https://kuk24.de
Url complet: https://tremco-illbruck.com
Url complet: https://cantus-bahn.de
Url complet: https://pastisani.com
Url complet: https://akzent-personal.de
Url complet: https://eurofighter.com
Url complet: https://marry-jim.com
Url complet: https://weinanzeiger.com
Url complet: https://teleclinic.com
Url complet: https://infinity-reisen.com
Url complet: http://rtc-c.com
Url complet: https://freiberuflervermittlung.de
Url complet: https://scheurer-arbeitsbuehnen.de
Url complet: https://aycan.de
Url complet: https://cimt.eu
Url complet: https://hopegala.de
Url complet: https://binect.de
Url complet: https://antenne-bethel.de
Url complet: https://gross-fuchs.com
Url complet: https://cortec-neuro.com
Url complet: https://schuhhause.de
Url complet: https://biosicherheit.de
Url complet: https://twt-online-marketing.de
Url complet: https://audiopro.de
Rezultat neașteptat: [SSL: SSLV3_ALERT_HANDSHAKE_FAILURE] sslv3 alert handshake failure (_ssl.c:1006)
Url complet: ht

In [78]:
len(complete_urlsD)

760

- Asynchronously main function where we apply the filter to the list of urls and obtain the valid ones

In [79]:
async def main():
    valid_urls = await filter_urls_async(complete_urlsD)
    return valid_urls

In [80]:
valid_urlsD = await main()

In [81]:
len(valid_urlsD)

225

- Async main function that for a list of urls generate a bigger list with the urls that have a high chance to have an address

In [82]:
async def main(valid_urls):
    keywords = ['connect', 'contact', 'kontakt', 'impressum', 'adresse', 'reach', 'datenschutz', 'home']
    tasks = [find_urls_with_keywords(url, keywords) for url in valid_urls]
    final_urls = await asyncio.gather(*tasks)
    return final_urls

In [83]:
final_urlsD = await main(valid_urlsD)

In [84]:
urls_listD = []
for set_elem in final_urlsD:
    for elem in set_elem:
        urls_listD.append(elem)

In [85]:
len(urls_listD)

620

#### Checking the new list of URLS and the invalid ones will be eliminated

In [86]:
async def main():
    valid_urls = await filter_urls_async(urls_listD)
    return valid_urls

In [87]:
urls_listD = await main()

In [88]:
len(urls_listD)

554

#### Making a DataFrame to save the valid URLS into a .csv to using them later

In [1]:
import pandas as pd

df = pd.DataFrame(columns = ["URL"])

df

Unnamed: 0,URL


In [None]:
# building the rows
for url in urls_listD:
    individual_row_data = [url]
    length = len(df)
    df.loc[length] = individual_row_data

In [91]:
df

Unnamed: 0,URL
0,https://www.cantus-bahn.de/kontakt
1,https://cantus-bahn.de
2,https://www.cantus-bahn.de/metamenu/impressum
3,https://www.cantus-bahn.de/metamenu/datenschutz
4,https://pastisani.com
...,...
549,http://neuromerchandisinggroup.com
550,http://theblankswebsite.com
551,http://theblankswebsite.com/contact
552,http://gletschercola.de


In [92]:
df.to_csv("Deutsch_URLS.csv", index=False)

#### Reading the urls from the CSV

In [3]:
import pandas as pd
urlsD = pd.read_csv("Deutsch_URLS.csv")

In [4]:
urlsD['URL']

0                   https://www.cantus-bahn.de/kontakt
1                               https://cantus-bahn.de
2        https://www.cantus-bahn.de/metamenu/impressum
3      https://www.cantus-bahn.de/metamenu/datenschutz
4                                https://pastisani.com
                            ...                       
549                 http://neuromerchandisinggroup.com
550                        http://theblankswebsite.com
551                http://theblankswebsite.com/contact
552                            http://gletschercola.de
553                                http://fernbus24.de
Name: URL, Length: 554, dtype: object

#### Function that extracts the address from a page if it's possible
- for the deutsch addresses we use 2 regexes
- one is for the Road + Road Number
- the other one is for the City + Postcode
- to obtain a real address we will make the cartesian product (it's not the fast way but we will be sure the real address is in it)
- to reduce the list we put the condition that an address have maximum 7 words

In [75]:
async def extract_addresses_from_url_deutsch_async(url, regex_pattern, regex_pattern2):
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url, ssl=False, timeout=30) as response:
                if response.status != 200:
                    return None
                
                binary_data = await response.read()
                try:
                    page_text = binary_data.decode('utf-8')
                except:
                    page_text = binary_data.decode('latin-1')
                
        except asyncio.TimeoutError:
            print(f"Timeout occurred while fetching URL: {url}")
            return None
        except aiohttp.ClientError as e:
            print(f"Error occurred while fetching URL '{url}': {e}")
            return None
        except aiohttp.ClientConnectorError as e:
            print(f"Error connecting to server: {e}")
            return None
        
        # if we have the status code 429 (Too Many Requests)
        if response.status == 429:
            print(f"Too Many Requests received. Retrying after waiting.")
            await asyncio.sleep(5)  # we wait 5 second to retry
            return await extract_addresses_from_url_async(url)  # Recall the function
        
        try:
            # extracts the html content
            soup = BeautifulSoup(page_text, 'html.parser')
        except:
            return None
        
        try:
            # extracts all the paragraphs (all the addresses are in a <p> tag *from my experience)
            paragraphs = soup.find_all('p')
        except:
            return None
        
        try:
            for paragraph in paragraphs:
                # Finding all the <br> or <br/> tags
                br_tags = paragraph.find_all(['br', 'br/'])
        
                for br_tag in br_tags:
                    # Replace <br> or <br/> with newline character (\n)
                    br_tag.replace_with('\n ')
                if paragraph.string == None:
                    paragraph.append('\n')
        except:
            return None
        
        # Store all paragraph text in a single variable
        all_text = ""
        for paragraph in paragraphs:
            all_text += paragraph.text
            all_text += '\n'
            
         # Initialize city list
        cities = []
        
        # Initialize street list
        streets = []
        
        # Search for regex pattern for the street & street number
        matches = re.finditer(regex_pattern, all_text, re.MULTILINE)
        
        for match in matches:
            # print(match.group(0))
            streets.append(match.group(0))
        # print(streets)
        
        # Search for regex pattern for the postal code & city
        matches = re.finditer(regex_pattern2, all_text, re.MULTILINE)
        for match in matches:
            # print(match.group(0))
            cities.append(match.group(0))
        # print(cities)
        
        # a list that retain all the combinations between cities & streets
        addresses = []
        
        for i in range(0, len(streets)):
            for j in range(0, len(cities)):
                address = streets[i] + ' ' + cities[j]
                words = address.split()
                # print(words)
                if len(words) <= 7 and '\n' not in address and '\t' not in address:
                    addresses.append(address)
                
        addresses = set(addresses)
        addresses = list(addresses)
        print(addresses)        
        return [url, addresses]


In [65]:
# Regex pattern
# street + street number regex (Deutsch Address)
regex_pattern = r"((Ober|Unter den|An |Im |Platz |Berg |Am |Alt\-).+|(?:([A-Z][a-zäüö-]+){1,2})).([Cc]haussee|[Aa]llee|[sS]tr(\.|(a(ss|ß)e))|[Rr]ing|berg|moos|gasse|grund|hörn| Nord|graben|[mM]arkt|[Uu]fer|[Ss]tieg|[Ll]inden|[Dd]amm|[pP]latz|brücke|Steinbüchel|Burg|stiege|[Ww]eg|rain|park|[Ww]eide|[Hh][oö]f|pfad|garten|bogen|passage).+?(\d{1,4})([a-zäöüß]+)?(\-?\d{1,4}[a-zäöüß]?)?"

# Postal Code + City/Town (Deutsch Address)
regex_pattern2 = r"(?<![\.\d])(\d{4,5})\b\s*?([A-Za-zÄÖÜäöüß]+(?:[A-Za-zÄÖÜäöüß -]*[A-Za-zÄÖÜäöüß]+)?)\b"

- Asynchronous main function where with the list of URLS we extract addresses asynchronously

In [60]:
async def main(urls_list, r1, r2):
    tasks = [extract_addresses_from_url_deutsch_async(url, r1, r2) for url in urls_list]
    final_addresses = await asyncio.gather(*tasks, return_exceptions=True)
    return final_addresses

In [76]:
final_addressesD = await main(urlsD['URL'], regex_pattern, regex_pattern2)

['Maffeistr. 12 2023 Weltkonferenz AHK', 'Maffeistrasse 12 93059 Regensburg', 'Augustenstr. 3 93049 Regensburg', 'Augustenstr. 3 2023 Weltkonferenz AHK', 'Maffeistr. 12 93049 Regensburg', 'Maffeistrasse 12 2023 Weltkonferenz AHK', 'Augustenstr. 3 93059 Regensburg', 'Maffeistrasse 12 8568  Handelsregister', 'Maffeistrasse 12 93049 Regensburg', 'Maffeistr. 12 93059 Regensburg', 'Augustenstr. 3 8568  Handelsregister', 'Maffeistr. 12 8568  Handelsregister']
['Orber Straße 30 60386 Frankfurt']
[]
['Schorndorfer Straße 27 70734 Fellbach', 'Schorndorfer Straße 27 70736 Fellbach', 'Sonnenbühlstr. 23 70736 Fellbach', 'Sonnenbühlstr. 23 70734 Fellbach', 'Schorndorfer Straße 27 1600 Amphitheatre Parkway', 'Sonnenbühlstr. 23 1600 Amphitheatre Parkway']
['Franz-Ulrich-Straße 18 34117 Kassel']
['Schorndorfer Straße 27 70736 Fellbach']
[]
['Rechtsgrundlage hierfür ist Art. 6 60386 Frankfurt', 'Elgendorfer Str. 57 56410 Montabaur', 'Elgendorfer Str. 57 60386 Frankfurt', 'Orber Straße 30 56410 Montabau

[]
[]
[]
[]
[]
[]
[]
['Sonnenstr. 6 90537 Feucht']
[]
[]
['Rechtsgrundlage hierfür ist Art. 6 10719 Berlin', 'Rechtsgrundlage ist Art. 6 10719 Berlin', 'Kurfürstendamm 212 10719 Berlin', 'Kurfürstendamm 21 10719 Berlin']
[]
[]
['Sonnenstr. 6 90537 Feucht bei Nürnberg']
[]
[]
[]
[]
['Passauer Str. 8-9 10789 Berlin']
[]
[]
['Leopoldstr. 21 6011 Wellington', 'Leopoldstr. 21 80802 München']
[]
['Wiesenhüttenplatz 25 60329 Frankfurt am Main']
['Lindenstraße.5 27419 Sittensen', 'Lindenstraße 5 27419 Sittensen']
['Lindenstraße.5 27419 Sittensen', 'Lindenstraße 5 27419 Sittensen', 'Lindenstraße.5 1600 Amphitheatre Parkway', 'Lindenstraße 5 1600 Amphitheatre Parkway']
['Wiesenhüttenplatz 25 60329 Frankfurt am Main', 'Wiesenhüttenplatz 25 08523 Plauen', 'Wettiner Platz 10 08523 Plauen', 'Reichsstraße 3 60329 Frankfurt am Main', 'Reichsstraße 3 08523 Plauen', 'Wiesenhüttenplatz 25 01067 Dresden', 'Wettiner Platz 10 01067 Dresden', 'Reichsstraße 3 01067 Dresden', 'Wettiner Platz 10 60329 Frankfurt

['Landstr. 146 1601 S', 'Landstr. 146 2029 Stierlin Court', 'Landstr. 146 60314 Frankfurt', 'Gänsemarkt 43 1601 S', 'Landstr. 146 1600 Amphitheatre Parkway', 'Gänsemarkt 43 60314 Frankfurt', 'Gänsemarkt 43 20354 Hamburg', 'Landstr. 146 20354 Hamburg', 'Gänsemarkt 43 1600 Amphitheatre Parkway', 'Gänsemarkt 43 2029 Stierlin Court']
['Burgstrasse, 27 60316 Frankfurt', 'Landstr. 146 60314 Frankfurt', 'Burgstrasse, 27 2017 fame creative lab', 'Burgstrasse, 27 60314 Frankfurt', 'Landstr. 146 60316 Frankfurt', 'Landstr. 146 2017 fame creative lab']
['Landstr. 146 60314 Frankfurt', 'Landstr. 146 2222 islands', 'Landstr. 146 2023 World Travel Awards']
[]
[]
['Rechtsgrundlage ist Art. 6 1600 Amphitheater Parkway', 'Rechtsgrundlage ist Art. 6 65232 Taunusstein', 'Rechtsgrundlage ist Art. 6 1355 Market St', 'Mainstraße 6 2029 Stierlin Court', 'Gänsemarkt 43 21079 Hamburg', 'Gänsemarkt 43 1595 Spring Hill Road', 'Mainstraße 6 1601 S California Ave', 'Gänsemarkt 43 65232 Taunusstein', 'Gänsemarkt 43

['Seehofstr. 15 14169 Berlin']
['Junkersring 5 53844 Troisdorf']
[]
['Junkersring 5 53844 Troisdorf']
['Binger Straße 25 2024 aktualisiert']
['Schmilinskystr. 45 20099 Hamburg']
['Heider Weg 42 27777 Ganderkesee', 'Heider Weg 42 77694 Kehl am Rhein']
[]
[]
[]
[]
['Schwamberger/Volkergasse 4 1150 Wien']
['Seehofstr. 15 14169 Berlin']
[]
['Wehrstraße 24-26 53773 Hennef Telefon', 'Wehrstraße 24-26 2029 Stierlin Court', 'Dammtorstraße 29-32 1600 Amphitheatre Parkway', 'Wehrstraße 24-26 1600 Amphitheatre Parkway', 'Wehrstr. 26 53773 Hennef Telefon', 'Wehrstraße 24-26 53773 Hennef', 'Wehrstr. 26 2029 Stierlin Court', 'Dammtorstraße 29-32 53773 Hennef Telefon', 'Dammtorstraße 29-32 53773 Hennef', 'Wehrstr. 26 1355 Market Street', 'Wehrstr. 26 1601 Willow Road', 'Wehrstr. 26 20354 Hamburg', 'Wehrstr. 26 53773 Hennef', 'Wehrstr. 26 1600 Amphitheatre Parkway', 'Dammtorstraße 29-32 20354 Hamburg', 'Dammtorstraße 29-32 2029 Stierlin Court', 'Wehrstraße 24-26 1355 Market Street', 'Dammtorstraße 29-

['Magnusstr. 11 12489 Berlin']
['Stadtstraße 2 10900 Berlin', 'Stadtstraße 2 79104 Freiburg']
[]
[]
[]
['Kornstraße 31 30167 Hannover']
[]
['Brunnenstr. 153 10115 Berlin']
['Paderborner Straße 33 33161 Hövelhof']
['Beta-Straße 1 85774 Unterföhring', 'Beta-Straße 1 1600 Amphitheatre Parkway', 'Beta-Straße 1 2023 CinePostproduction GmbH']
[]
['Friedrichstraße 95 1027 Budapest', 'Friedrichstraße 95 80339 Munich', 'Heimeranstrasse 35 80339 Munich', 'Friedrichstraße 95 10117 Berlin', 'Heimeranstrasse 35 1027 Budapest', 'Heimeranstrasse 35 10117 Berlin']
['Am Staudamm 24 91710 Gunzenhausen', 'Am Staudamm 24 02906 Quitzdorf am See', 'Industriestr. 25 94043 USA', 'Am Staudamm 24 1600 Amphitheatre Parkway', 'Am Staudamm 24 94043 USA', 'Industriestr. 25 1600 Amphitheatre Parkway', 'Industriestr. 25 91710 Gunzenhausen', 'Industriestr. 25 02906 Quitzdorf am See']
['Hoheluftbrücke\xa0+ 6 20253 Hamburg', 'Hoheluftchaussee 40 20253 Hamburg', 'Hoheluftbrücke\xa0+ 5 20253 Hamburg', 'Eppendorfer Weg (Os

[]
[]
['Johann-Krane-Weg 37 2022 TRELOCK GmbH', 'Johann-Krane-Weg 37 48149 Münster', 'Johann-Krane-Weg 37 48042 Münster']
['Johann-Krane-Weg 37 2022 TRELOCK GmbH', 'Johann-Krane-Weg 37 48149 Münster', 'Johann-Krane-Weg 37 48042 Münster']
['Johann-Krane-Weg 37 2022 TRELOCK GmbH', 'Johann-Krane-Weg 37 48149 Münster', 'Johann-Krane-Weg 37 48042 Münster']
['Johann-Krane-Weg 37 2022 TRELOCK GmbH', 'Johann-Krane-Weg 37 48149 Münster', 'Johann-Krane-Weg 37 48042 Münster']
['Johann-Krane-Weg 37 2022 TRELOCK GmbH', 'Johann-Krane-Weg 37 48149 Münster', 'Johann-Krane-Weg 37 48042 Münster']
Timeout occurred while fetching URL: https://berlinlofts.com/en/contact


In [77]:
final_addressesD

[['https://www.cantus-bahn.de/kontakt', []],
 ['https://cantus-bahn.de', []],
 ['https://www.cantus-bahn.de/metamenu/impressum',
  ['Franz-Ulrich-Straße 18 34117 Kassel']],
 ['https://www.cantus-bahn.de/metamenu/datenschutz',
  ['Rechtsgrundlage Richtlinie 2004 34369 Hofgeismar',
   'Neue Straße 22 34369 Hofgeismar',
   'Rechtsgrundlage ist Art. 6 34369 Hofgeismar']],
 ['https://pastisani.com', []],
 ['https://pastisani.com/impressum/', ['Orber Straße 30 60386 Frankfurt']],
 ['https://pastisani.com/datenschutz/',
  ['Rechtsgrundlage hierfür ist Art. 6 60386 Frankfurt',
   'Elgendorfer Str. 57 56410 Montabaur',
   'Elgendorfer Str. 57 60386 Frankfurt',
   'Orber Straße 30 56410 Montabaur',
   'Orber Straße 30 60386 Frankfurt',
   'Rechtsgrundlage hierfür ist Art. 6 56410 Montabaur']],
 ['https://marry-jim.com/impressum/', ['Am Wacholderberg 9 61462 Königstein']],
 ['https://marry-jim.com/contact/', []],
 ['https://marry-jim.com', []],
 ['https://weinanzeiger.com/contact', []],
 ['https:

In [78]:
addresses_listD = [element for element in final_addressesD if element is not None and element[1]]

In [81]:
addresses_listD

[['https://www.cantus-bahn.de/metamenu/impressum',
  ['Franz-Ulrich-Straße 18 34117 Kassel']],
 ['https://www.cantus-bahn.de/metamenu/datenschutz',
  ['Rechtsgrundlage Richtlinie 2004 34369 Hofgeismar',
   'Neue Straße 22 34369 Hofgeismar',
   'Rechtsgrundlage ist Art. 6 34369 Hofgeismar']],
 ['https://pastisani.com/impressum/', ['Orber Straße 30 60386 Frankfurt']],
 ['https://pastisani.com/datenschutz/',
  ['Rechtsgrundlage hierfür ist Art. 6 60386 Frankfurt',
   'Elgendorfer Str. 57 56410 Montabaur',
   'Elgendorfer Str. 57 60386 Frankfurt',
   'Orber Straße 30 56410 Montabaur',
   'Orber Straße 30 60386 Frankfurt',
   'Rechtsgrundlage hierfür ist Art. 6 56410 Montabaur']],
 ['https://marry-jim.com/impressum/', ['Am Wacholderberg 9 61462 Königstein']],
 ['https://weinanzeiger.com', ['Denkmalgasse 18 2172 Schrattenbe']],
 ['https://weinanzeiger.com/impressum-p32', ['Kraus Straße 16 2123 Kronberg']],
 ['https://infinity-reisen.com/impressum/',
  ['Schorndorferstr. 27 70734 Fellbach',
 

#### Loading the existing data (previous found addresses)

In [56]:
existing_data = pd.read_csv('Addresses.csv')

#### For each URL we will check if the found address is real and put the valid ones into the CSV
- we can't make this step asynchronous because the Geopy library allows only 1 request per second

In [82]:
real_addressesD = []

In [83]:
for elem in addresses_listD:
    # print(elem[0])
    for adresa in elem[1]:
        # print(adresa)
        real_address = extract_address(adresa)
        if real_address != None:
            print("valid")
            if 'country' in real_address:
                country = real_address['country']
            else:
                country = ""
                
            if 'county' in real_address:
                region = real_address['county']
            else:
                region = ""
                
            if 'city' in real_address:
                city = real_address['city']
            elif 'village' in real_address:
                city = real_address['village']
            else:
                city = ""
                
            if 'postcode' in real_address:
                postcode = real_address['postcode']
            else:
                postcode = ""
                
            if 'road' in real_address:
                road = real_address['road']
            else:
                road = ""
                
            if 'house_number' in real_address:
                no_road = real_address['house_number']
            else:
                no_road = ""
            # Adding the new data rows in the list
            real_addressesD.append([elem[0], country, region, city, postcode, road, no_road])
            
        else:
            print("invalid")
            

Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corect

The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
The addres

The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corect

Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
The add

The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este 

In [88]:
len(real_addressesD)

479

#### Making a DataFrame with the extracted addresses

In [89]:
new_data_df = pd.DataFrame(real_addressesD, columns=['URL', 'Country', 'Region', 'City', 'Postcode', 'Road', 'Road number'])

In [90]:
new_data_df

Unnamed: 0,URL,Country,Region,City,Postcode,Road,Road number
0,https://www.cantus-bahn.de/metamenu/impressum,Deutschland,,Kassel,34117,Franz-Ulrich-Straße,18
1,https://www.cantus-bahn.de/metamenu/datenschutz,Deutschland,Landkreis Kassel,,34369,Neue Straße,22
2,https://pastisani.com/impressum/,Deutschland,,Frankfurt am Main,60386,Orber Straße,30
3,https://pastisani.com/datenschutz/,Deutschland,Westerwaldkreis,,56410,Elgendorfer Straße,57
4,https://pastisani.com/datenschutz/,Deutschland,,Frankfurt am Main,60386,Orber Straße,30
...,...,...,...,...,...,...,...
474,https://pyrotechnik24.com/produkte/datenschutz,Deutschland,Landkreis Weilheim-Schongau,Hohenpeißenberg,82383,Rigistraße,
475,https://pyrotechnik24.com/kontakt/,Deutschland,Landkreis Weilheim-Schongau,Hohenpeißenberg,82383,Rigistraße,
476,https://pyrotechnik24.com/kontakt/,Deutschland,,Berlin,12277,Rigistraße,
477,https://pyrotechnik24.com,Deutschland,,Berlin,12277,Rigistraße,10


#### Adding them to the existent data (from the US)

In [91]:
final_data = pd.concat([existing_data, new_data_df], ignore_index=True)

In [92]:
final_data

Unnamed: 0,URL,Country,Region,City,Postcode,Road,Road number
0,https://embcmonroe.org,United States,Union County,Monroe,28112.0,Maurice Street,503
1,https://embcmonroe.org,United States,Union County,Monroe,28112.0,Maurice Street,503
2,https://embcmonroe.org/connect-with-us/,United States,Union County,Monroe,28112.0,Maurice Street,503
3,https://casepaper.com/contact-us/,United States,Philadelphia County,Philadelphia,19134.0,East Tioga Street,499
4,https://casepaper.com/contact-us/,United States,San Bernardino County,Rancho Cucamonga,91730.0,Hermosa Avenue,9168
...,...,...,...,...,...,...,...
581,https://pyrotechnik24.com/produkte/datenschutz,Deutschland,Landkreis Weilheim-Schongau,Hohenpeißenberg,82383,Rigistraße,
582,https://pyrotechnik24.com/kontakt/,Deutschland,Landkreis Weilheim-Schongau,Hohenpeißenberg,82383,Rigistraße,
583,https://pyrotechnik24.com/kontakt/,Deutschland,,Berlin,12277,Rigistraße,
584,https://pyrotechnik24.com,Deutschland,,Berlin,12277,Rigistraße,10


#### Export the data to the CSV

In [93]:
final_data.to_csv('Addresses.csv', index=False)

## United Kingdom

In [95]:
# the Deutsch domains are from to 1622 to the end
domainsUK = df_parquet.iloc[1622:]['domain']

In [96]:
domainsUK

1622                      liamjcurtin.com
1623                           trinyx.org
1624                       crystalage.com
1625                       datasafexl.com
1626                         enigmacs.com
                      ...                
2474          northernirelandscreen.co.uk
2475                      birdbrand.co.uk
2476    thorntonlodgenorthyorkshire.co.uk
2477              peasholmecharity.org.uk
2478                         mackay.co.uk
Name: domain, Length: 857, dtype: object

- Asynchronous main function where for the domains list we obtain the url scheme

In [97]:
async def main(domains):
    tasks = [get_url_scheme(domain) for domain in domains]
    results = await asyncio.gather(*tasks, return_exceptions=True)
    return results

In [100]:
# Rulează funcția principală
resultsUK = await main(domainsUK)

In [101]:
len(resultsUK)

857

In [104]:
# Printing the results
complete_urlsUK = []
for result in resultsUK:
    if isinstance(result, tuple):
        domain, scheme = result
        complete_url = f"{scheme}://{domain}"
        print(f"Url complet: {complete_url}")
        complete_urlsUK.append(complete_url)
    else:
        print(f"Rezultat neașteptat: {result}")

Url complet: https://liamjcurtin.com
Url complet: https://trinyx.org
Url complet: https://crystalage.com
Url complet: https://datasafexl.com
Url complet: https://enigmacs.com
Url complet: https://pennyanderson.com
Url complet: https://inovix.net
Url complet: https://rosybee.com
Url complet: https://newfield.co.uk
Url complet: https://dscsoundandlighting.co.uk
Url complet: https://neseafood.com
Url complet: https://web-design-studios.com
Url complet: https://oppobrothers.com
Url complet: http://lionheart-security.co.uk
Url complet: http://miltonkeynesphotography.com
Url complet: https://classichandtools.com
Url complet: https://pcacoach.com
Url complet: https://carinaskinnermanagement.co.uk
Url complet: https://sentiment.io
Url complet: https://urbanstrides.com
Url complet: https://easthams.co.uk
Url complet: https://bergsonandeaton.com
Url complet: https://vasek.co.uk
Url complet: https://truebeautyaesthetic.com
Url complet: https://kentscp.com
Url complet: https://snaresbrookprep.org


In [108]:
len(complete_urlsUK)

854

- Asynchronously main function where we apply the filter to the list of urls and obtain the valid ones

In [109]:
async def main():
    valid_urls = await filter_urls_async(complete_urlsUK)
    return valid_urls

In [110]:
valid_urlsUK = await main()

In [111]:
len(valid_urlsUK)

229

- Async main function that for a list of urls generate a bigger list with the urls that have a high chance to have an address

In [114]:
async def main(valid_urls):
    keywords = ['connect', 'contact', 'kontakt', 'impressum', 'adresse', 'reach', 'datenschutz', 'home']
    tasks = [find_urls_with_keywords(url, keywords) for url in valid_urls]
    final_urls = await asyncio.gather(*tasks)
    return final_urls

In [115]:
final_urlsUK = await main(valid_urlsUK)

In [116]:
urls_listUK = []
for set_elem in final_urlsUK:
    for elem in set_elem:
        urls_listUK.append(elem)

In [117]:
len(urls_listUK)

427

#### Checking the new list of URLS and the invalid ones will be eliminated

In [118]:
async def main():
    valid_urls = await filter_urls_async(urls_listUK)
    return valid_urls

In [119]:
urls_listUK = await main()

In [120]:
len(urls_listUK)

400

#### Making a DataFrame to save the valid URLS into a .csv to using them later

In [121]:
import pandas as pd

df = pd.DataFrame(columns = ["URL"])

df

Unnamed: 0,URL


In [122]:
for url in urls_listUK:
    individual_row_data = [url]
    length = len(df)
    df.loc[length] = individual_row_data

In [123]:
df

Unnamed: 0,URL
0,https://trinyx.org
1,https://enigmacs.com
2,https://enigmacs.com/contact/
3,https://www.pennyanderson.com/contact
4,https://pennyanderson.com
...,...
395,http://missionlimited.com
396,http://bill-medical.co.uk/contact
397,http://bill-medical.co.uk
398,http://affordablecleaning.co.uk


#### Reading the urls from the CSV

In [124]:
df.to_csv("UK_URLS.csv", index=False)

In [126]:
import pandas as pd
urlsUK = pd.read_csv("UK_URLS.csv")

In [127]:
print(urlsUK['URL'])

0                               https://trinyx.org
1                             https://enigmacs.com
2                    https://enigmacs.com/contact/
3            https://www.pennyanderson.com/contact
4                        https://pennyanderson.com
                          ...                     
395                      http://missionlimited.com
396              http://bill-medical.co.uk/contact
397                      http://bill-medical.co.uk
398                http://affordablecleaning.co.uk
399    http://affordablecleaning.co.uk/contact.php
Name: URL, Length: 400, dtype: object


#### Function that extracts the address from a page if it's possible

In [132]:
async def extract_addresses_from_url_uk_async(url):
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url, ssl=False, timeout=60) as response:
                if response.status != 200:
                    return None
                
                binary_data = await response.read()
                page_text = binary_data.decode('latin-1')
                
        except asyncio.TimeoutError:
            print(f"Timeout occurred while fetching URL: {url}")
            return None
        except aiohttp.ClientError as e:
            print(f"Error occurred while fetching URL '{url}': {e}")
            return None
        except aiohttp.ClientConnectorError as e:
            print(f"Error connecting to server: {e}")
            return None
        
        # State Code 429 (Too Many Requests)
        if response.status == 429:
            print(f"Too Many Requests received. Retrying after waiting.")
            await asyncio.sleep(5)  # Wait 5 seconds to retry
            return await extract_addresses_from_url_async(url)  # Recall the function
        
        try:
            # extracts all the html content from the page
            soup = BeautifulSoup(page_text, 'html.parser')
        except:
            return None
        
        try:
            # extracts all the text from the page
            page_text = soup.get_text()
        except:
            return None
        
        try:
            # find an UK address in the text using pyap.parse function (from the pyap library)
            addresses = pyap.parse(page_text, country='GB')
        except:
            return None
        
        return [url, addresses]


- Asynchronous main function where with the list of URLS we extract addresses asynchronously

In [129]:
async def main(urls_list):
    tasks = [extract_addresses_from_url_uk_async(url) for url in urls_list]
    final_addresses = await asyncio.gather(*tasks, return_exceptions=True)
    return final_addresses

In [133]:
final_addressesUK = await main(urlsUK['URL'])

Error occurred while fetching URL 'https://twentytwenty.co': Cannot write to closing transport
Error occurred while fetching URL 'https://seatingmatters.com': Cannot write to closing transport
Error occurred while fetching URL 'https://seatingmatters.com/contact': Cannot write to closing transport
Error occurred while fetching URL 'https://highperformanceproductions.net/contact.html': Cannot connect to host highperformanceproductions.net:443 ssl:False [An established connection was aborted by the software in your host machine]
Error occurred while fetching URL 'https://highperformanceproductions.net': Cannot connect to host highperformanceproductions.net:443 ssl:False [An established connection was aborted by the software in your host machine]
Error occurred while fetching URL 'https://thelionatfarnsfield.com/contact-us/': Cannot connect to host thelionatfarnsfield.com:443 ssl:False [An established connection was aborted by the software in your host machine]
Error occurred while fetchi

Error occurred while fetching URL 'https://imajica.com': [WinError 10053] An established connection was aborted by the software in your host machine
Error occurred while fetching URL 'https://pandr.online': [WinError 10053] An established connection was aborted by the software in your host machine
Error occurred while fetching URL 'https://thestove.org': [WinError 10053] An established connection was aborted by the software in your host machine
Error occurred while fetching URL 'https://athertonstone.com': [WinError 10053] An established connection was aborted by the software in your host machine
Error occurred while fetching URL 'https://freedomdestinations.co.uk': [WinError 10053] An established connection was aborted by the software in your host machine
Error occurred while fetching URL 'https://class.agency/contact/': [WinError 10053] An established connection was aborted by the software in your host machine
Error occurred while fetching URL 'https://azizisearch.com': [WinError 100

In [134]:
final_addressesUK

[['https://trinyx.org', [160 Road City, London, United Kingdom EC1V 2NX]],
 ['https://enigmacs.com',
  [UK StudioSt. Stephens House, Arthur Road, Windsor Berkshire, SL4 1RU, United Kingdom]],
 ['https://enigmacs.com/contact/',
  [se Arthur Road WindsorBerkshire, SL4 1RU, United Kingdom,
   se Arthur Road WindsorBerkshire, SL4 1RU, United Kingdom]],
 ['https://www.pennyanderson.com/contact',
  [60 Park Road, Buxton, Derbyshire, SK17 6SN]],
 ['https://pennyanderson.com', []],
 ['https://inovix.net/contact-us/',
  [Wyvern Business Park, Stanier Way, Derby. DE21 6BF]],
 ['https://inovix.net', [83 Friar Gate, Derby, DE1 1FL]],
 ['https://dscsoundandlighting.co.uk', []],
 ['https://neseafood.com',
  [48 Cox Lane, Chessington, Surrey, KT9 1TW, United Kingdom]],
 ['https://neseafood.com/pages/contact',
  [48 Cox Lane, Chessington, Surrey, KT9 1TW, Great Britain,
   Europarc, Healing, Grimsby, DN37 9TU, Great Britain,
   48 Cox Lane, Chessington, Surrey, KT9 1TW, United Kingdom]],
 ['https://we

In [137]:
addresses_listUK = [element for element in final_addressesUK if element is not None and element[1]]


In [138]:
len(addresses_listUK)

187

#### Loading the existing data (previous found addresses)

In [139]:
existing_data = pd.read_csv('Addresses.csv')

#### For each URL we will check if the found address is real and put the valid ones into the CSV
- we can't make this step asynchronous because the Geopy library allows only 1 request per second

In [140]:
real_addressesUK = []

In [142]:
for elem in addresses_listUK:
    # print(elem[0])
    for adresa in elem[1]:
        # print(adresa)
        real_address = extract_address(adresa)
        if real_address != None:
            print("valid")
            if 'country' in real_address:
                country = real_address['country']
            else:
                country = ""
                
            if 'county' in real_address:
                region = real_address['county']
            else:
                region = ""
                
            if 'city' in real_address:
                city = real_address['city']
            elif 'village' in real_address:
                city = real_address['village']
            else:
                city = ""
                
            if 'postcode' in real_address:
                postcode = real_address['postcode']
            else:
                postcode = ""
                
            if 'road' in real_address:
                road = real_address['road']
            else:
                road = ""
                
            if 'house_number' in real_address:
                no_road = real_address['house_number']
            else:
                no_road = ""
            # Adaugă noile date în listă
            real_addressesUK.append([elem[0], country, region, city, postcode, road, no_road])
            
        else:
            print("invalid")
            

The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not 

Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
The address is not valid
invalid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
The address is not valid
invalid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid
Locatia este corecta
valid


In [144]:
len(real_addressesUK)

115

#### Making a DataFrame with the extracted addresses

In [145]:
new_data_df = pd.DataFrame(real_addressesUK, columns=['URL', 'Country', 'Region', 'City', 'Postcode', 'Road', 'Road number'])

In [146]:
new_data_df

Unnamed: 0,URL,Country,Region,City,Postcode,Road,Road number
0,https://www.pennyanderson.com/contact,United Kingdom,Derbyshire,High Peak,SK17 6NS,Park Road,
1,https://inovix.net,United Kingdom,Derby,Derby,DE1 1DF,Friar Gate,83
2,https://web-design-studios.com/gainsborough/we...,United Kingdom,Lincolnshire,West Lindsey,DN21 1DU,Strafford Street,
3,https://snaresbrookprep.org/about-us/contact-us,United Kingdom,,London,E18 2EB,Woodford Road,
4,https://snaresbrookprep.org,United Kingdom,,London,E18 2EB,Woodford Road,
...,...,...,...,...,...,...,...
110,http://arcovohotelloyalty.com/contact-us/,United Kingdom,,London,EC1V 2NX,City Road,152
111,http://bill-medical.co.uk/contact,United Kingdom,Buckinghamshire,Chesham Bois,HP6 5AE,Chiltern Avenue,
112,http://bill-medical.co.uk/contact,United Kingdom,Buckinghamshire,Chesham Bois,HP6 5AE,Chiltern Avenue,
113,http://bill-medical.co.uk,United Kingdom,Buckinghamshire,Chesham Bois,HP6 5AE,Chiltern Avenue,


#### Adding them to the existent data (from the US and Deutsch)

In [147]:
final_data = pd.concat([existing_data, new_data_df], ignore_index=True)

In [148]:
final_data

Unnamed: 0,URL,Country,Region,City,Postcode,Road,Road number
0,https://embcmonroe.org,United States,Union County,Monroe,28112.0,Maurice Street,503
1,https://embcmonroe.org,United States,Union County,Monroe,28112.0,Maurice Street,503
2,https://embcmonroe.org/connect-with-us/,United States,Union County,Monroe,28112.0,Maurice Street,503
3,https://casepaper.com/contact-us/,United States,Philadelphia County,Philadelphia,19134.0,East Tioga Street,499
4,https://casepaper.com/contact-us/,United States,San Bernardino County,Rancho Cucamonga,91730.0,Hermosa Avenue,9168
...,...,...,...,...,...,...,...
696,http://arcovohotelloyalty.com/contact-us/,United Kingdom,,London,EC1V 2NX,City Road,152
697,http://bill-medical.co.uk/contact,United Kingdom,Buckinghamshire,Chesham Bois,HP6 5AE,Chiltern Avenue,
698,http://bill-medical.co.uk/contact,United Kingdom,Buckinghamshire,Chesham Bois,HP6 5AE,Chiltern Avenue,
699,http://bill-medical.co.uk,United Kingdom,Buckinghamshire,Chesham Bois,HP6 5AE,Chiltern Avenue,


#### Export the data to the CSV

In [149]:
final_data.to_csv('Addresses.csv', index=False)