In [1]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


# Load the .parquet file of the combined data set

In [2]:
import pandas as pd
df = pd.read_parquet("combined_filtered_dataset.parquet")
print(df.head())

   EntityNumber                                       OfficialName ZipCode  \
0  0201.310.929                                                IGL    3600   
1  0202.239.951                                           PROXIMUS    1030   
2  0203.201.340                          Nationale Bank van België    1000   
3  0206.460.639  Intergemeentelijk Samenwerkingsverband van het...    9100   
4  0206.653.946  Rijksinstituut voor Ziekte- en Invaliditeitsve...    1210   

          Municipality                Street HouseNumber  \
0                 Genk            Klotstraat         125   
1           Schaarbeek  Koning AlbertII laan          27   
2              Brussel     de Berlaimontlaan          14   
3         Sint-Niklaas             Lamstraat         113   
4  Sint-Joost-ten-Node           Galileelaan           5   

                      URL  
0  extranet.iglimburg.be/  
1        www.proximus.com  
2              www.nbb.be  
3        www.interwaas.be  
4       www.inami.fgov.be  


# Create search query

In [3]:

def generate_query(row):
    """
    Generates a search query string for a given row of the DataFrame.
    
    Parameters:
    - row: A pandas Series representing a row in the DataFrame.
    
    Returns:
    - A string representing the search query.
    """
    # Construct the query using the business information
    query = f"Website {row['OfficialName']} {row['ZipCode']} {row['Municipality']} {row['Street']} {row['HouseNumber']}"
    return query

# Apply the function to each row in the DataFrame to create the queries
df['SearchQuery'] = df.apply(generate_query, axis=1)


# Set display options
pd.set_option('display.max_columns', None)  # Ensure all columns are displayed
pd.set_option('display.max_colwidth', None)  # Ensure full content of each cell is displayed
pd.set_option('display.width', None)  # Adjust the display width for readability

# Show the DataFrame with the generated queries
print(df[['SearchQuery']].head())
print(df.iloc[1600]['SearchQuery'])

                                                                                              SearchQuery
0                                                                    Website IGL 3600 Genk Klotstraat 125
1                                                Website PROXIMUS 1030 Schaarbeek Koning AlbertII laan 27
2                                     Website Nationale Bank van België 1000 Brussel de Berlaimontlaan 14
3    Website Intergemeentelijk Samenwerkingsverband van het Land van Waas 9100 Sint-Niklaas Lamstraat 113
4  Website Rijksinstituut voor Ziekte- en Invaliditeitsverzekering 1210 Sint-Joost-ten-Node Galileelaan 5
Website DATACTION 9320 Aalst Ninovesteenweg 198


# Webscraping of search engines results

In [14]:
skip_domains = ['trendstop.knack.be', 'fincheck.be', 'bizzy.org', 'trendstop.levif.be', 'www.companyweb.be', 'www.linkedin.com', 'https://www.companyweb.be', 'https://bizzy.org', 'https://www.linkedin.com', 'https://fincheck.be', 'https://en.wikipedia.org']

## Google
API

In [18]:
import requests
import pandas as pd
import json
import time
from ratelimit import limits, sleep_and_retry

# Load API key and CSE ID from config
with open('config.json') as config_file:
    config = json.load(config_file)
api_key = config['GOOGLE_API_KEY']
cse_id = config['GOOGLE_CSE_ID']

# Decorator to enforce rate limiting
@sleep_and_retry
@limits(calls=20, period=1)
def google_search(query, api_key, cse_id, **kwargs):
    url = f"https://www.googleapis.com/customsearch/v1?q={query}&key={api_key}&cx={cse_id}"
    response = requests.get(url, params=kwargs)
    response.raise_for_status()
    return response.json()

def scrape_top_urls_google(search_query, skip_domains, max_results=5):
    top_urls = []
    
    results = google_search(search_query, api_key, cse_id, num=max_results)
    for item in results.get('items', []):
        url = item['link']
        if not any(skip_domain in url for skip_domain in skip_domains):
            top_urls.append(url)
            if len(top_urls) == max_results:
                break
                
    return top_urls

# Function to integrate all functionalities
def main():
    # Load or initialize progress tracking
    try:
        progress_df = pd.read_csv('search_results_Google.csv')
        collected_data = progress_df.to_dict('records')
        query_count = len(collected_data)
    except FileNotFoundError:
        collected_data = []
        query_count = 0

    # Define total_rows
    total_rows = len(df)

    for index, row in df.iterrows():
        if query_count >= 10000:
            print("Daily query limit reached. Please resume tomorrow.")
            break
        
        if any(d['EntityNumber'] == row['EntityNumber'] for d in collected_data):
            continue
        
        search_query = row['SearchQuery']
        entity_number = row['EntityNumber']
        filtered_urls = scrape_top_urls_google(search_query, [], 5)
        
        collected_data.append({"EntityNumber": entity_number, "URL1": filtered_urls[0] if len(filtered_urls) > 0 else "",
                               "URL2": filtered_urls[1] if len(filtered_urls) > 1 else "",
                               "URL3": filtered_urls[2] if len(filtered_urls) > 2 else "",
                               "URL4": filtered_urls[3] if len(filtered_urls) > 3 else "",
                               "URL5": filtered_urls[4] if len(filtered_urls) > 4 else ""})
        
        query_count += 1
        if (index % 100 == 0 or index == total_rows - 1) and collected_data:  # Save progress intermittently and at the end
            pd.DataFrame(collected_data).to_csv('search_results_Google.csv', index=False)
        
        # Optional: Display progress
        print(f"Processed {index + 1} / {total_rows}. Queries made: {query_count}")
        
    if query_count < 10000:
        print("All data has been processed and saved.")
    else:
        print("Reached the daily limit of queries. Please resume later.")



# Uncomment and call main() when ready
main()




Processed 5102 / 17239. Queries made: 4694
Processed 5103 / 17239. Queries made: 4695
Processed 5104 / 17239. Queries made: 4696
Processed 5105 / 17239. Queries made: 4697
Processed 5106 / 17239. Queries made: 4698
Processed 5107 / 17239. Queries made: 4699
Processed 5109 / 17239. Queries made: 4700
Processed 5110 / 17239. Queries made: 4701
Processed 5111 / 17239. Queries made: 4702
Processed 5112 / 17239. Queries made: 4703
Processed 5113 / 17239. Queries made: 4704
Processed 5114 / 17239. Queries made: 4705
Processed 5115 / 17239. Queries made: 4706
Processed 5116 / 17239. Queries made: 4707
Processed 5117 / 17239. Queries made: 4708
Processed 5118 / 17239. Queries made: 4709
Processed 5119 / 17239. Queries made: 4710
Processed 5120 / 17239. Queries made: 4711


HTTPError: 403 Client Error: Forbidden for url: https://www.googleapis.com/customsearch/v1?q=Website%20&num=5#NotOnlyIdeas%203090%20Overijse%20Vlierbeekberg%2090&key=AIzaSyCUgsOA7l1UF5FLqf8HcYGj77_1fbvMjCo&cx=547b992564d834d45

## DuckDuckGo
Api

In [19]:
# First, you'd install the package, usually via pip. Check the repository for the latest instructions.
#%pip install duckduckgo_search
# Uses https://github.com/deedy5/duckduckgo_search

import pandas as pd
import time
from duckduckgo_search import DDGS
from ratelimit import limits, sleep_and_retry

# Assuming df is your DataFrame

# Decorator to enforce rate limiting
@sleep_and_retry
@limits(calls=1, period=2)
def scrape_top_urls_ddg(search_query, skip_domains, max_results=5):
    top_urls = []
    ddgs = DDGS()
    try:
        results = ddgs.text(keywords=search_query, max_results=max_results + len(skip_domains))
        
        for result in results:
            url = result.get('href')
            if url and not any(skip_domain in url for skip_domain in skip_domains):
                top_urls.append(url)
                if len(top_urls) == max_results:
                    break
    except Exception as e:
        print(f"Encountered an error: {e}")
        raise  # Re-raise the exception to handle it outside
    
    return top_urls


# List to collect rows, or load existing progress if restarting script
try:
    # Try loading existing progress if this script is being restarted
    result_df = pd.read_csv('search_results_DDG.csv')
    collected_data = result_df.to_dict('records')
except FileNotFoundError:
    # If no existing data, start fresh
    collected_data = []

# Get the total number of rows for progress calculation
total_rows = len(df)

for index, row in df.iterrows():
    # Check if this query has already been processed
    if any(d['EntityNumber'] == row['EntityNumber'] for d in collected_data):
        continue  # Skip this row if already processed
    
    search_query = row['SearchQuery']
    entity_number = row['EntityNumber']
    try:
        filtered_urls = scrape_top_urls_ddg(search_query, skip_domains=[], max_results=5)
        # Append entity number and URLs to collected_data
        collected_data.append({"EntityNumber": entity_number, "URL1": filtered_urls[0] if len(filtered_urls) > 0 else "", 
                               "URL2": filtered_urls[1] if len(filtered_urls) > 1 else "", 
                               "URL3": filtered_urls[2] if len(filtered_urls) > 2 else "", 
                               "URL4": filtered_urls[3] if len(filtered_urls) > 3 else "", 
                               "URL5": filtered_urls[4] if len(filtered_urls) > 4 else ""})
    except Exception as e:
        print(f"Rate limit or other error encountered: {e}. Waiting for 1 minute before retrying...")
        time.sleep(60)  # Wait for 1 minute
        continue  # Retry the current iteration
        
    # Convert collected data to DataFrame and save after each successful retrieval
    result_df = pd.DataFrame(collected_data)
    result_df.to_csv('search_results_DDG.csv', index=False)
    
    # Progress feedback
    progress_percentage = ((index + 1) / total_rows) * 100
    print(f"Progress: {progress_percentage:.2f}% ({index + 1}/{total_rows})")

print("All data has been processed and saved.")





Progress: 0.09% (15/17239)
Progress: 0.09% (16/17239)
Progress: 0.10% (17/17239)
Progress: 0.10% (18/17239)
Progress: 0.12% (20/17239)
Progress: 0.12% (21/17239)
Progress: 0.13% (22/17239)
Progress: 0.13% (23/17239)
Progress: 0.15% (25/17239)
Progress: 0.16% (27/17239)
Progress: 0.16% (28/17239)
Progress: 0.17% (29/17239)
Progress: 0.17% (30/17239)
Progress: 0.19% (32/17239)
Progress: 0.19% (33/17239)
Progress: 0.20% (34/17239)
Progress: 0.21% (37/17239)
Progress: 0.23% (40/17239)
Progress: 0.31% (54/17239)
Progress: 0.32% (56/17239)
Progress: 0.33% (57/17239)
Progress: 0.34% (58/17239)
Progress: 0.46% (79/17239)
Progress: 0.48% (82/17239)
Progress: 0.49% (84/17239)
Progress: 0.53% (92/17239)
Progress: 0.54% (93/17239)
Progress: 0.56% (96/17239)
Progress: 0.56% (97/17239)
Progress: 0.58% (100/17239)
Progress: 0.60% (103/17239)
Progress: 0.60% (104/17239)
Progress: 0.68% (117/17239)
Progress: 0.69% (119/17239)
Progress: 0.70% (121/17239)
Progress: 0.73% (125/17239)
Progress: 0.73% (126/

## Bing search
API

In [None]:
import requests
import json
import pandas as pd

def read_config():
    with open('config.json') as config_file:
        return json.load(config_file)

def bing_search(query, api_key, endpoint):
    headers = {"Ocp-Apim-Subscription-Key": api_key}
    params = {"q": query, "count": 5}  # Adjust count as necessary
    response = requests.get(endpoint, headers=headers, params=params)
    response.raise_for_status()
    return response.json()

def scrape_top_urls_bing(search_query, skip_domains, max_results=5):
    config = read_config()
    top_urls = []

    results = bing_search(search_query, config['BING_API_KEY'], config['BING_ENDPOINT'])

    # Filter and collect URLs
    for result in results.get('webPages', {}).get('value', []):
        url = result.get('url')
        if not any(skip_domain in url for skip_domain in skip_domains):
            top_urls.append(url)
            if len(top_urls) == max_results:
                break
            
    return top_urls

# Example usage
# Assume 'df' is your DataFrame and 'skip_domains' are defined as shown earlier
if 'df' in locals():
    first_search_query = df.iloc[0]['SearchQuery']
    filtered_urls = scrape_top_urls_bing(first_search_query, skip_domains)

    print("Filtered URLs from Bing:")
    for url in filtered_urls:
        print(url)


Filtered URLs from Bing:
https://iglimburg.be/contact/
https://iglimburg.be/
https://www.terheide.be/over-ons
https://www.terheide.be/
https://publiek.departementzorg.be/Cobrha/Institutions/Institution/WVG_VAPH/201330/


## Mutli-search

Displays all unfiltered URLs results. Has duplicates regarding same website but to different pages on same website.

In [None]:
from collections import defaultdict

def perform_multi_search_with_scores_and_penalty(query, skip_domains, max_results=5):
    # Perform searches
    google_results = scrape_top_urls_google(query, "Google", skip_domains, max_results)
    bing_results = scrape_top_urls_bing(query, skip_domains, max_results)
    ddg_results = scrape_top_urls_ddg(query, skip_domains, max_results)

    # Initialize storage for aggregated results
    all_results = {}
    for url in set(google_results + bing_results + ddg_results):
        all_results[url] = {
            'ranks': [],
            'appearances': 0
        }
    
    # Assign ranks and count appearances
    for rank, url in enumerate(google_results, start=1):
        if url in all_results:
            all_results[url]['ranks'].append(rank)
            all_results[url]['appearances'] += 1
    for rank, url in enumerate(bing_results, start=1):
        if url in all_results:
            all_results[url]['ranks'].append(rank)
            all_results[url]['appearances'] += 1
    for rank, url in enumerate(ddg_results, start=1):
        if url in all_results:
            all_results[url]['ranks'].append(rank)
            all_results[url]['appearances'] += 1

    # Calculate scores with penalties
    scored_urls = {}
    for url, data in all_results.items():
        mean_rank = sum(data['ranks']) / len(data['ranks'])
        # Apply penalty based on the number of search engines the URL appeared in
        penalty = 0
        if data['appearances'] == 2:
            penalty = 5  # Missing in one search engine
        elif data['appearances'] == 1:
            penalty = 10  # Missing in two search engines
        scored_urls[url] = mean_rank + penalty

    # Sort URLs by their score
    sorted_urls = sorted(scored_urls.items(), key=lambda item: item[1])

    # Take the top N results and prepare them for display
    top_urls_with_scores = [(url, round(score, 2)) for url, score in sorted_urls][:max_results]

    return top_urls_with_scores

# Usage
query = df.iloc[1456]['SearchQuery']
max_results = 5  # Adjust as necessary
top_urls_with_scores = perform_multi_search_with_scores_and_penalty(query, skip_domains, max_results)

print("Top URLs from multi-search with scores and penalties:")
for url, score in top_urls_with_scores:
    print(f"{url} - Score: {score}")


Top URLs from multi-search with scores and penalties:
https://www.alkover.be/ - Score: 1.33
https://www.eventplanner.net/directory/2800_alkover.html - Score: 11.0
https://www.alkover.be/chalets - Score: 12.0
https://www.alkover.be/tenten - Score: 12.0
https://www.dnb.com/business-directory/company-profiles.alkover.015d6270d338219ba53647f45045714b.html - Score: 13.0


Only displays unique website URLs. Not the same websites leading to different pages.

In [None]:
from urllib.parse import urlparse
import pandas as pd
import collections
import requests
import json

def get_domain(url):
    """Extracts domain from a URL."""
    parsed_url = urlparse(url)
    return parsed_url.netloc

def perform_multi_search_with_aggregation(query, skip_domains, max_results=5):
    config = read_config()
    
    # Conduct searches across all engines
    google_results = scrape_top_urls_google(query, "Google", skip_domains, max_results)
    bing_results = scrape_top_urls_bing(query, skip_domains, max_results)
    ddg_results = scrape_top_urls_ddg(query, skip_domains, max_results)
    
    # Aggregate all URLs and their source ranks
    urls_info = collections.defaultdict(lambda: {"ranks": [], "appearances": 0})
    for index, url in enumerate(google_results + bing_results + ddg_results):
        domain = get_domain(url)
        urls_info[domain]['ranks'].append(index % max_results + 1)
        urls_info[domain]['appearances'] += 1
        if 'url' not in urls_info[domain] or index % max_results == 0:
            urls_info[domain]['url'] = url  # Prioritize URLs by first occurrence

    # Calculate scores and apply penalties
    for domain, info in urls_info.items():
        base_score = sum(info['ranks']) / len(info['ranks'])
        penalty = 0
        if info['appearances'] < 3:  # Apply penalty if not found by all engines
            penalty = 5 * (3 - info['appearances'])
        info['score'] = base_score + penalty

    # Sort domains by their scores
    sorted_domains = sorted(urls_info.values(), key=lambda x: x['score'])

    # Prepare top N results
    top_urls_with_scores = [(info['url'], info['score']) for info in sorted_domains][:max_results]

    return top_urls_with_scores


query = df.iloc[1456]['SearchQuery'] 
top_urls_with_scores = perform_multi_search_with_aggregation(query, skip_domains, 5)

print("Top URLs from multi-search with domain aggregation and scores:")
for url, score in top_urls_with_scores:
    print(f"{url} - Score: {score}")


Top URLs from multi-search with domain aggregation and scores:
https://www.alkover.be/tenten - Score: 2.7142857142857144
https://www.openingsuren.vlaanderen/alkover/8610-kortemark/ieperstraat-17 - Score: 8.5
https://www.eventplanner.net/directory/2800_alkover.html - Score: 11.0
https://www.dnb.com/business-directory/company-profiles.alkover.015d6270d338219ba53647f45045714b.html - Score: 12.0
https://www.eventplanner.be/bedrijven/2800_alkover.html - Score: 13.0
