In [None]:
pip install -r requirements.txt

# Load the .parquet file of the combined data set

In [2]:
import pandas as pd

# Read the combined, filtered data set from the Parquet file (path is relative
# to the notebook's working directory).
df = pd.read_parquet(path="combined_filtered_dataset.parquet")

# Plain-text preview of the first five rows.
print(df.head(n=5))

   EntityNumber                                       OfficialName ZipCode  \
0  0201.310.929                                                IGL    3600   
1  0202.239.951                                           PROXIMUS    1030   
2  0203.201.340                          Nationale Bank van België    1000   
3  0206.460.639  Intergemeentelijk Samenwerkingsverband van het...    9100   
4  0206.653.946  Rijksinstituut voor Ziekte- en Invaliditeitsve...    1210   

          Municipality                Street HouseNumber  \
0                 Genk            Klotstraat         125   
1           Schaarbeek  Koning AlbertII laan          27   
2              Brussel     de Berlaimontlaan          14   
3         Sint-Niklaas             Lamstraat         113   
4  Sint-Joost-ten-Node           Galileelaan           5   

                      URL  
0  extranet.iglimburg.be/  
1        www.proximus.com  
2              www.nbb.be  
3        www.interwaas.be  
4       www.inami.fgov.be  


# Create search query

In [3]:

def generate_query(row):
    """
    Build the web-search query string for one business record.

    Parameters:
    - row: A pandas Series (or any mapping) providing the keys 'OfficialName',
      'ZipCode', 'Municipality', 'Street' and 'HouseNumber'.

    Returns:
    - A string of the form
      "Website <name> <zip code> <municipality> <street> <house number>".
      Fields that are missing (None / NaN / pd.NA) or blank are left out, so
      the query never contains literal "None"/"nan" tokens or doubled spaces.

    Raises:
    - KeyError: if one of the expected keys is absent from the row.
    """
    field_names = ('OfficialName', 'ZipCode', 'Municipality', 'Street', 'HouseNumber')

    parts = ["Website"]
    for field_name in field_names:
        value = row[field_name]
        # pd.isna treats None, float NaN and pd.NA uniformly as "missing"
        if pd.isna(value):
            continue
        text = str(value).strip()
        if text:
            parts.append(text)

    return " ".join(parts)

# Build one search query per record by running generate_query over every row
df['SearchQuery'] = df.apply(generate_query, axis='columns')


# Lift pandas' display limits (number of columns, cell width, overall width)
# by setting each option to None, so long queries are printed in full
for _display_option in ('display.max_columns', 'display.max_colwidth', 'display.width'):
    pd.set_option(_display_option, None)

# Preview the first five generated queries, then the query at row position 1600
print(df.loc[:, ['SearchQuery']].head())
print(df['SearchQuery'].iloc[1600])

                                                                                              SearchQuery
0                                                                    Website IGL 3600 Genk Klotstraat 125
1                                                Website PROXIMUS 1030 Schaarbeek Koning AlbertII laan 27
2                                     Website Nationale Bank van België 1000 Brussel de Berlaimontlaan 14
3    Website Intergemeentelijk Samenwerkingsverband van het Land van Waas 9100 Sint-Niklaas Lamstraat 113
4  Website Rijksinstituut voor Ziekte- en Invaliditeitsverzekering 1210 Sint-Joost-ten-Node Galileelaan 5
Website DATACTION 9320 Aalst Ninovesteenweg 198


# Web scraping of search engine results

## Google

In [4]:
import requests
import pandas as pd
import json
# Assuming 'df' is your DataFrame loaded with the necessary data

# Function to use Google's Custom Search JSON API
def google_search(query, api_key, cse_id, **kwargs):
    """
    Run a single query against Google's Custom Search JSON API.

    Parameters:
    - query: The search string; sent as the `q` query parameter.
    - api_key: The API key; sent as the `key` query parameter.
    - cse_id: The custom search engine id; sent as the `cx` query parameter.
    - **kwargs: Additional query-string parameters forwarded to the API
      (for example num=10).

    Returns:
    - The decoded JSON body of the response.

    Raises:
    - requests.HTTPError: if the API answers with an error status code.
    - requests.Timeout: if no response arrives within 30 seconds.
    """
    # Pass every value through `params` so that requests URL-encodes it.
    # Interpolating the raw query into the URL breaks on characters such as
    # '&', '#' or '+', which are common in company names.
    # The explicit arguments are placed last so they win over same-named kwargs.
    params = {**kwargs, "q": query, "key": api_key, "cx": cse_id}
    response = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params=params,
        timeout=30,  # never let a stalled connection hang the notebook
    )
    response.raise_for_status()
    return response.json()

def scrape_top_urls(search_query, search_engine, skip_domains, max_results=10, top_n=5):
    """
    Return the top search-result URLs for a query, leaving out unwanted domains.

    Only search_engine == "Google" is implemented; any other value yields an
    empty list.

    The Google credentials are read from a config.json file in the working
    directory. Create that file with the following content:
        {
            "GOOGLE_API_KEY": "your_api_key_here",
            "GOOGLE_CSE_ID": "your_cse_id_here"
        }

    Parameters:
    - search_query: The query string to send to the search engine.
    - search_engine: Name of the search engine to use.
    - skip_domains: Collection of strings; a result is dropped when its URL
      contains any of them as a substring.
    - max_results: Number of results requested from the API (sent as `num`).
      Asking for more than top_n leaves room for results that get filtered out.
    - top_n: Maximum number of URLs to return (default 5).

    Returns:
    - A list of at most top_n URLs, in the order the search engine returned them.
    """
    top_urls = []

    if search_engine == "Google":
        # The file is only needed while parsing; close it before going online
        with open('config.json') as config_file:
            config = json.load(config_file)
        api_key = config['GOOGLE_API_KEY']
        cse_id = config['GOOGLE_CSE_ID']

        # Fetch more results than needed to account for skipped URLs
        results = google_search(search_query, api_key, cse_id, num=max_results)

        # Keep collecting non-skipped URLs until top_n are found (or results run out)
        for item in results.get('items', []):
            if len(top_urls) >= top_n:
                break
            url = item.get('link')  # tolerate result items that carry no link
            if url and not any(skip_domain in url for skip_domain in skip_domains):
                top_urls.append(url)

    return top_urls

# Domains to leave out of the results; each entry is matched as a substring
# of the result URL
skip_domains = ['trendstop.knack.be', 'fincheck.be', 'bizzy.org', 'trendstop.levif.be']

# Fetch URLs from Google, skipping the specified domains
# Example: using the first search query in the DataFrame for demonstration
first_search_query = df.iloc[0]['SearchQuery']
filtered_urls = scrape_top_urls(first_search_query, "Google", skip_domains)

# Print the filtered URLs, one per line
print("Filtered URLs:")
for url in filtered_urls:
    print(url)


Filtered URLs:
https://iglimburg.be/
https://www.terheide.be/
https://iglimburg.be/contact/
https://app.akov.be/pls/pakov/f?p=INSP_PUBLIEK:VERSLAGEN::DOWNLOAD:::P1000_DLSEC_BLOB_ID:5596
https://www.desocialekaart.be/fiches/7241847ec66a49801840a21ebb5c9eca8f8ac8b6bf162a15f978edbe96ab1789/administratieve-gegevens


## DuckDuckGo

In [18]:
# First, you'd install the package, usually via pip. Check the repository for the latest instructions.
#%pip install duckduckgo_search
# Uses https://github.com/deedy5/duckduckgo_search
from duckduckgo_search import DDGS

def scrape_top_urls_ddg(search_query, skip_domains, max_results=5):
    """
    Collect result URLs for a query from DuckDuckGo, leaving out unwanted domains.

    Parameters:
    - search_query: The query string to search for.
    - skip_domains: Collection of strings; a result is dropped when its URL
      contains any of them as a substring.
    - max_results: Collection stops as soon as this many URLs have been kept.

    Returns:
    - A list of the kept URLs, in the order the search returned them.
    """
    client = DDGS()
    # Ask for a few extra hits (one per skip entry) to make up for filtered ones
    hits = client.text(
        keywords=search_query,
        max_results=max_results + len(skip_domains),
    )

    kept = []
    for hit in hits:
        link = hit.get('href')
        if not link:
            # Result without a usable URL
            continue
        if any(blocked in link for blocked in skip_domains):
            continue
        kept.append(link)
        if len(kept) == max_results:
            break

    return kept

# Demo: run the first record's query through DuckDuckGo, keeping at most five URLs
first_search_query = df['SearchQuery'].iloc[0]
filtered_urls = scrape_top_urls_ddg(first_search_query, skip_domains, max_results=5)

# Heading followed by one URL per line
print("\n".join(["Filtered URLs:", *filtered_urls]))



Filtered URLs:
https://iglimburg.be/contact/
https://www.linkedin.com/company/iglimburg
https://iglimburg.be/
https://www.terheide.be/
https://opencorporates.com/companies/be/0201310929
