In [None]:
pip install -r requirements.txt

# Load the .parquet file of the combined data set

In [46]:
import pandas as pd
df = pd.read_parquet("combined_filtered_dataset.parquet")
print(df.head())

   EntityNumber                                                  OfficialName  \
0  0201.310.929                                                           IGL   
1  0202.239.951                                                      PROXIMUS   
2  0203.201.340                                     Nationale Bank van België   
3  0206.460.639  Intergemeentelijk Samenwerkingsverband van het Land van Waas   
4  0206.653.946       Rijksinstituut voor Ziekte- en Invaliditeitsverzekering   

  ZipCode         Municipality                Street HouseNumber  \
0    3600                 Genk            Klotstraat         125   
1    1030           Schaarbeek  Koning AlbertII laan          27   
2    1000              Brussel     de Berlaimontlaan          14   
3    9100         Sint-Niklaas             Lamstraat         113   
4    1210  Sint-Joost-ten-Node           Galileelaan           5   

                      URL  
0  extranet.iglimburg.be/  
1        www.proximus.com  
2              www.n

# Create search query

In [47]:

def generate_query(row):
    """
    Generates a search query string for a given row of the DataFrame.
    
    Parameters:
    - row: A pandas Series representing a row in the DataFrame.
    
    Returns:
    - A string representing the search query.
    """
    # Construct the query using the business information
    query = f"Website {row['OfficialName']} {row['ZipCode']} {row['Municipality']} {row['Street']} {row['HouseNumber']}"
    return query

# Apply the function to each row in the DataFrame to create the queries
df['SearchQuery'] = df.apply(generate_query, axis=1)


# Set display options
pd.set_option('display.max_columns', None)  # Ensure all columns are displayed
pd.set_option('display.max_colwidth', None)  # Ensure full content of each cell is displayed
pd.set_option('display.width', None)  # Adjust the display width for readability

# Show the DataFrame with the generated queries
print(df[['SearchQuery']].head())
print(df.iloc[1600]['SearchQuery'])

                                                                                              SearchQuery
0                                                                    Website IGL 3600 Genk Klotstraat 125
1                                                Website PROXIMUS 1030 Schaarbeek Koning AlbertII laan 27
2                                     Website Nationale Bank van België 1000 Brussel de Berlaimontlaan 14
3    Website Intergemeentelijk Samenwerkingsverband van het Land van Waas 9100 Sint-Niklaas Lamstraat 113
4  Website Rijksinstituut voor Ziekte- en Invaliditeitsverzekering 1210 Sint-Joost-ten-Node Galileelaan 5
Website DATACTION 9320 Aalst Ninovesteenweg 198


# Webscraping of search engines results

## Google
API

In [48]:
import requests
import pandas as pd
import json

# Function to use Google's Custom Search JSON API
def google_search(query, api_key, cse_id, **kwargs):
    url = f"https://www.googleapis.com/customsearch/v1?q={query}&key={api_key}&cx={cse_id}"
    response = requests.get(url, params=kwargs)
    response.raise_for_status()
    return response.json()

def scrape_top_urls_google(search_query, search_engine, skip_domains, max_results=10):
    top_urls = []
    
    if search_engine == "Google":
        with open('config.json') as config_file:
            config = json.load(config_file)
            api_key = config['GOOGLE_API_KEY']
            cse_id = config['GOOGLE_CSE_ID']
            
            """ Create a config.json file where you copy paste these lines: 
            {  
            "GOOGLE_API_KEY": "your_api_key_here",
            "GOOGLE_CSE_ID": "your_cse_id_here"
            }
            """
        
        # Fetch more results initially to account for skipped URLs
        results = google_search(search_query, api_key, cse_id, num=max_results)
        
        # Filter out URLs from domains you want to skip and keep collecting until you have 5 (or run out)
        for item in results.get('items', []):
            url = item['link']
            # Check if URL is from a domain to skip
            if not any(skip_domain in url for skip_domain in skip_domains):
                top_urls.append(url)
                # Stop once you have 5 URLs after filtering
                if len(top_urls) == 5:
                    break
            
    return top_urls

# List of domains to skip
skip_domains = ['trendstop.knack.be', 'fincheck.be', 'bizzy.org', 'trendstop.levif.be', 'www.companyweb.be', 'www.linkedin.com']

# Fetch URLs from Google, skipping the specified domains
# Example: Using the first search query in your DataFrame for demonstration
first_search_query = df.iloc[0]['SearchQuery']
filtered_urls = scrape_top_urls_google(first_search_query, "Google", skip_domains)

# Print the filtered URLs
print("Filtered URLs:")
for url in filtered_urls:
    print(url)


Filtered URLs:
https://iglimburg.be/
https://www.terheide.be/
https://iglimburg.be/contact/
https://app.akov.be/pls/pakov/f?p=INSP_PUBLIEK:VERSLAGEN::DOWNLOAD:::P1000_DLSEC_BLOB_ID:5596
https://www.desocialekaart.be/fiches/7241847ec66a49801840a21ebb5c9eca8f8ac8b6bf162a15f978edbe96ab1789/administratieve-gegevens


## DuckDuckGo
Api

In [49]:
# First, you'd install the package, usually via pip. Check the repository for the latest instructions.
#%pip install duckduckgo_search
# Uses https://github.com/deedy5/duckduckgo_search
from duckduckgo_search import DDGS

def scrape_top_urls_ddg(search_query, skip_domains, max_results=5):
    top_urls = []
    ddgs = DDGS()
    results = ddgs.text(keywords=search_query, max_results=max_results + len(skip_domains))
    
    for result in results:
        url = result.get('href')
        if url and not any(skip_domain in url for skip_domain in skip_domains):
            top_urls.append(url)
            if len(top_urls) == max_results:
                break

    return top_urls

first_search_query = df.iloc[0]['SearchQuery']
filtered_urls = scrape_top_urls_ddg(first_search_query, skip_domains, 5)

print("Filtered URLs:")
for url in filtered_urls:
    print(url)



Filtered URLs:
https://iglimburg.be/
https://publiek.departementzorg.be/Cobrha/Institutions/Institution/WVG_VAPH/203001/
https://opencorporates.com/companies/be/0201310929
https://www.creditsafe.com/business-index/en-gb/company/igl-be00000028
https://b2bhint.com/en/company/be/igl--0201.310.929


## Bing search
API

In [50]:
import requests
import json
import pandas as pd

def read_config():
    with open('config.json') as config_file:
        return json.load(config_file)

def bing_search(query, api_key, endpoint):
    headers = {"Ocp-Apim-Subscription-Key": api_key}
    params = {"q": query, "count": 5}  # Adjust count as necessary
    response = requests.get(endpoint, headers=headers, params=params)
    response.raise_for_status()
    return response.json()

def scrape_top_urls_bing(search_query, skip_domains, max_results=5):
    config = read_config()
    top_urls = []

    results = bing_search(search_query, config['BING_API_KEY'], config['BING_ENDPOINT'])

    # Filter and collect URLs
    for result in results.get('webPages', {}).get('value', []):
        url = result.get('url')
        if not any(skip_domain in url for skip_domain in skip_domains):
            top_urls.append(url)
            if len(top_urls) == max_results:
                break
            
    return top_urls

# Example usage
# Assume 'df' is your DataFrame and 'skip_domains' are defined as shown earlier
if 'df' in locals():
    first_search_query = df.iloc[0]['SearchQuery']
    filtered_urls = scrape_top_urls_bing(first_search_query, skip_domains)

    print("Filtered URLs from Bing:")
    for url in filtered_urls:
        print(url)


Filtered URLs from Bing:
https://iglimburg.be/contact/
https://iglimburg.be/
https://www.terheide.be/over-ons
https://www.terheide.be/
https://publiek.departementzorg.be/Cobrha/Institutions/Institution/WVG_VAPH/201330/


## Mutli-search

Displays all unfiltered URLs results. Has duplicates regarding same website but to different pages on same website.

In [56]:
from collections import defaultdict

def perform_multi_search_with_scores_and_penalty(query, skip_domains, max_results=5):
    # Perform searches
    google_results = scrape_top_urls_google(query, "Google", skip_domains, max_results)
    bing_results = scrape_top_urls_bing(query, skip_domains, max_results)
    ddg_results = scrape_top_urls_ddg(query, skip_domains, max_results)

    # Initialize storage for aggregated results
    all_results = {}
    for url in set(google_results + bing_results + ddg_results):
        all_results[url] = {
            'ranks': [],
            'appearances': 0
        }
    
    # Assign ranks and count appearances
    for rank, url in enumerate(google_results, start=1):
        if url in all_results:
            all_results[url]['ranks'].append(rank)
            all_results[url]['appearances'] += 1
    for rank, url in enumerate(bing_results, start=1):
        if url in all_results:
            all_results[url]['ranks'].append(rank)
            all_results[url]['appearances'] += 1
    for rank, url in enumerate(ddg_results, start=1):
        if url in all_results:
            all_results[url]['ranks'].append(rank)
            all_results[url]['appearances'] += 1

    # Calculate scores with penalties
    scored_urls = {}
    for url, data in all_results.items():
        mean_rank = sum(data['ranks']) / len(data['ranks'])
        # Apply penalty based on the number of search engines the URL appeared in
        penalty = 0
        if data['appearances'] == 2:
            penalty = 5  # Missing in one search engine
        elif data['appearances'] == 1:
            penalty = 10  # Missing in two search engines
        scored_urls[url] = mean_rank + penalty

    # Sort URLs by their score
    sorted_urls = sorted(scored_urls.items(), key=lambda item: item[1])

    # Take the top N results and prepare them for display
    top_urls_with_scores = [(url, round(score, 2)) for url, score in sorted_urls][:max_results]

    return top_urls_with_scores

# Usage
query = df.iloc[1456]['SearchQuery']
max_results = 5  # Adjust as necessary
top_urls_with_scores = perform_multi_search_with_scores_and_penalty(query, skip_domains, max_results)

print("Top URLs from multi-search with scores and penalties:")
for url, score in top_urls_with_scores:
    print(f"{url} - Score: {score}")




Top URLs from multi-search with scores and penalties:
https://www.alkover.be/ - Score: 1.33
https://www.alkover.be/tenten - Score: 7.5
https://www.eventplanner.net/directory/2800_alkover.html - Score: 11.0
https://www.alkover.be/chalets - Score: 12.0
https://www.dnb.com/business-directory/company-profiles.alkover.015d6270d338219ba53647f45045714b.html - Score: 13.0


Only displays unique website URLs. Not the same websites leading to different pages.

In [57]:
from urllib.parse import urlparse
import pandas as pd
import collections
import requests
import json

def get_domain(url):
    """Extracts domain from a URL."""
    parsed_url = urlparse(url)
    return parsed_url.netloc

def perform_multi_search_with_aggregation(query, skip_domains, max_results=5):
    config = read_config()
    
    # Conduct searches across all engines
    google_results = scrape_top_urls_google(query, "Google", skip_domains, max_results)
    bing_results = scrape_top_urls_bing(query, skip_domains, max_results)
    ddg_results = scrape_top_urls_ddg(query, skip_domains, max_results)
    
    # Aggregate all URLs and their source ranks
    urls_info = collections.defaultdict(lambda: {"ranks": [], "appearances": 0})
    for index, url in enumerate(google_results + bing_results + ddg_results):
        domain = get_domain(url)
        urls_info[domain]['ranks'].append(index % max_results + 1)
        urls_info[domain]['appearances'] += 1
        if 'url' not in urls_info[domain] or index % max_results == 0:
            urls_info[domain]['url'] = url  # Prioritize URLs by first occurrence

    # Calculate scores and apply penalties
    for domain, info in urls_info.items():
        base_score = sum(info['ranks']) / len(info['ranks'])
        penalty = 0
        if info['appearances'] < 3:  # Apply penalty if not found by all engines
            penalty = 5 * (3 - info['appearances'])
        info['score'] = base_score + penalty

    # Sort domains by their scores
    sorted_domains = sorted(urls_info.values(), key=lambda x: x['score'])

    # Prepare top N results
    top_urls_with_scores = [(info['url'], info['score']) for info in sorted_domains][:max_results]

    return top_urls_with_scores


query = df.iloc[1456]['SearchQuery'] 
top_urls_with_scores = perform_multi_search_with_aggregation(query, skip_domains, 5)

print("Top URLs from multi-search with domain aggregation and scores:")
for url, score in top_urls_with_scores:
    print(f"{url} - Score: {score}")


Top URLs from multi-search with domain aggregation and scores:
https://www.alkover.be/tenten - Score: 2.7142857142857144
https://www.openingsuren.vlaanderen/alkover/8610-kortemark/ieperstraat-17 - Score: 8.5
https://www.eventplanner.net/directory/2800_alkover.html - Score: 11.0
https://www.dnb.com/business-directory/company-profiles.alkover.015d6270d338219ba53647f45045714b.html - Score: 12.0
https://www.eventplanner.be/bedrijven/2800_alkover.html - Score: 13.0
