In [1]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


# Load the .parquet file of the combined data set

In [2]:
import pandas as pd
# Load the combined data set into `df`, the DataFrame used by the cells below.
df = pd.read_parquet("combined_filtered_dataset.parquet")
# Preview the first rows to confirm the file loaded with the expected columns.
print(df.head())

   EntityNumber                                       OfficialName ZipCode  \
0  0201.310.929                                                IGL    3600   
1  0202.239.951                                           PROXIMUS    1030   
2  0203.201.340                          Nationale Bank van België    1000   
3  0206.460.639  Intergemeentelijk Samenwerkingsverband van het...    9100   
4  0206.653.946  Rijksinstituut voor Ziekte- en Invaliditeitsve...    1210   

          Municipality                Street HouseNumber  \
0                 Genk            Klotstraat         125   
1           Schaarbeek  Koning AlbertII laan          27   
2              Brussel     de Berlaimontlaan          14   
3         Sint-Niklaas             Lamstraat         113   
4  Sint-Joost-ten-Node           Galileelaan           5   

                      URL  
0  extranet.iglimburg.be/  
1        www.proximus.com  
2              www.nbb.be  
3        www.interwaas.be  
4       www.inami.fgov.be  


# Create search query

In [3]:

def generate_query(row):
    """
    Generates a search query string for a given row of the DataFrame.

    The query is the official name, zip code and municipality joined by
    single spaces. Fields that are missing (None/NaN) or blank are left out,
    so the literal text "nan" or "None" never ends up in a search query.

    Parameters:
    - row: A pandas Series (or any mapping) with the keys 'OfficialName',
      'ZipCode' and 'Municipality'.

    Returns:
    - A string representing the search query (empty if every field is missing).

    Raises:
    - KeyError if one of the three keys is absent from the row.
    """
    parts = []
    for field in ('OfficialName', 'ZipCode', 'Municipality'):
        value = row[field]
        # pd.isna is True for None, NaN and pd.NA alike
        if pd.isna(value):
            continue
        text = str(value)
        # Skip whitespace-only values, but keep the original text untouched otherwise
        if text.strip():
            parts.append(text)
    return " ".join(parts)

# Build one search query per company: axis=1 makes apply() pass each row to generate_query
df['SearchQuery'] = df.apply(generate_query, axis=1)


# Set display options (pd.set_option changes them for the rest of the session)
pd.set_option('display.max_columns', None)  # Ensure all columns are displayed
pd.set_option('display.max_colwidth', None)  # Ensure full content of each cell is displayed
pd.set_option('display.width', None)  # No fixed display width, so long queries are not wrapped at a preset column

# Show the first rows of the generated queries as a quick sanity check
print(df[['SearchQuery']].head())

                                                                        SearchQuery
0                                                                     IGL 3600 Genk
1                                                          PROXIMUS 1030 Schaarbeek
2                                            Nationale Bank van België 1000 Brussel
3    Intergemeentelijk Samenwerkingsverband van het Land van Waas 9100 Sint-Niklaas
4  Rijksinstituut voor Ziekte- en Invaliditeitsverzekering 1210 Sint-Joost-ten-Node


# Web scraping of search engine results

In [4]:
# Result URLs containing any of these strings are dropped by the scrapers.
# Entries are matched as plain substrings of the URL, which is why a
# host-plus-path entry such as 'vlaanderen.be/organisaties' also works.
skip_domains = [
    'trendstop.knack.be',
    'fincheck.be',
    'bizzy.org',
    'trendstop.levif.be',
    'companyweb.be',
    'linkedin.com',
    'en.wikipedia.org',
    'facebook.com',
    'be.linkedin.com',
    'instagram.com',
    'werkenbijdeoverheid.be',
    'dnb.com',
    'nl.wikipedia.org',
    'youtube.com',
    'staatsbladmonitor.be',
    'werkenvoor.be',
    'twitter.com',
    'vlaanderen.be/organisaties',
    'jobat.be',
    'vdab.be',
]


In [5]:
#skip_domains=[]

## Google API

In [7]:
import requests
import pandas as pd
import json
import time
from requests.exceptions import ConnectionError, HTTPError
from urllib3.exceptions import ProtocolError, NewConnectionError

# Load API key and CSE ID from config.
# The encoding is stated explicitly: JSON files are UTF-8, whereas open()
# would otherwise fall back to the platform's locale encoding.
with open('config.json', encoding='utf-8') as config_file:
    config = json.load(config_file)
api_key = config['GOOGLE_API_KEY']
cse_id = config['GOOGLE_CSE_ID']

def google_search(query, api_key, cse_id, start=1, **kwargs):
    """
    Send one request to the Google Custom Search JSON API.

    All values are handed to `requests` through `params`, so they are
    URL-encoded properly. Pasting the raw query into the URL breaks for
    names containing characters such as '&', '#' or '+'.

    Parameters:
    - query: The search terms.
    - api_key: Google API key (sent as `key`).
    - cse_id: Custom Search Engine ID (sent as `cx`).
    - start: Index of the first result to return (sent as `start`).
    - **kwargs: Extra query-string parameters, e.g. num=10.

    Returns:
    - The decoded JSON response, or None if the request failed. Failures
      are printed, never raised.
    """
    delay_between_requests = 0.1
    url = "https://www.googleapis.com/customsearch/v1"
    # The explicit arguments win over same-named entries in kwargs
    params = {**kwargs, 'q': query, 'key': api_key, 'cx': cse_id, 'start': start}
    try:
        # A timeout keeps one stalled connection from hanging the whole run
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()  # Check for HTTP-level issues
        data = response.json()  # Parse JSON response
        return data
    except (ConnectionError, ProtocolError, HTTPError) as e:
        print(f"Encountered a network error: {e}.")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Runs on success and on failure, so requests are always spaced out
        time.sleep(delay_between_requests)

    return None

def scrape_top_urls_google(search_query, skip_domains, min_results=5, max_retries=1):
    """
    Collect up to `min_results` distinct Google result URLs for a query.

    Result pages are requested ten at a time. A URL is kept when it
    contains none of the `skip_domains` strings and was not collected
    before. Paging stops once enough URLs are found or after `max_retries`
    requests that came back without an 'items' list.

    Parameters:
    - search_query: The query string passed to google_search.
    - skip_domains: Strings that disqualify a URL when found inside it.
    - min_results: Number of URLs to collect.
    - max_retries: Number of empty/failed responses tolerated.

    Returns:
    - A list with at most `min_results` URLs, in result order.
    """
    collected = []
    failed_attempts = 0
    page_start = 1

    while failed_attempts < max_retries and len(collected) < min_results:
        page = google_search(search_query, api_key, cse_id, start=page_start, num=10)

        # Guard clause: a failed request or a page without items counts as one attempt
        if not page or 'items' not in page:
            print(f"No results found in attempt {failed_attempts + 1}. Retrying...")
            failed_attempts += 1
            time.sleep(0.1)  # Short delay before retrying
            continue

        for entry in page['items']:
            link = entry['link']
            is_blocked = any(skip_domain in link for skip_domain in skip_domains)
            if not is_blocked and link not in collected:
                collected.append(link)
            if len(collected) == min_results:
                break  # Found enough URLs
        page_start += 10  # Next page of results

    return collected[:min_results]


def main():
    """
    Run the Google search for every row of `df` and save the top URLs to CSV.

    Results go to 'search_results_Google.csv' with the columns EntityNumber
    and URL1..URL5. The run is resumable: entity numbers already present in
    that file are skipped. Skipping is based on membership, so it does not
    depend on `df` being sorted by EntityNumber.

    Rows are written in batches of `batch_size`. Any partial batch is also
    written when the loop ends or is interrupted, so at most the row in
    flight is lost.
    """
    file_path = 'search_results_Google.csv'
    placeholder = ""  # Placeholder for unfound URLs
    batch_size = 10  # Number of queries to process before saving to CSV
    url_columns = ['URL1', 'URL2', 'URL3', 'URL4', 'URL5']
    processed_batch = []  # Rows collected since the last save

    try:
        # Read entity numbers as text so leading zeros survive the round trip
        progress_df = pd.read_csv(file_path, dtype={'EntityNumber': str})
    except FileNotFoundError:
        progress_df = pd.DataFrame(columns=['EntityNumber'] + url_columns)

    already_processed = set(progress_df['EntityNumber'])

    # Assuming 'df' is your DataFrame with the search queries
    total_queries = len(df)

    def flush_batch():
        """Append the pending rows to the progress frame and rewrite the CSV."""
        nonlocal progress_df
        if not processed_batch:
            return
        new_rows_df = pd.DataFrame(processed_batch)
        if progress_df.empty:
            # Avoid concatenating with an empty frame
            progress_df = new_rows_df
        else:
            progress_df = pd.concat([progress_df, new_rows_df], ignore_index=True)
        progress_df.to_csv(file_path, index=False)
        processed_batch.clear()

    try:
        # Count positions ourselves: index labels need not be 0..n-1
        for processed_entries, (_, row) in enumerate(df.iterrows(), start=1):
            entity_number = row['EntityNumber']
            if str(entity_number) in already_processed:
                continue

            # Perform search and pad the result to exactly five URL slots
            filtered_urls = scrape_top_urls_google(row['SearchQuery'], skip_domains, len(url_columns))
            record = {"EntityNumber": entity_number}
            for slot, column in enumerate(url_columns):
                record[column] = filtered_urls[slot] if slot < len(filtered_urls) else placeholder
            processed_batch.append(record)
            already_processed.add(str(entity_number))

            if len(processed_batch) >= batch_size:
                flush_batch()

            # Update progress
            percentage_completed = (processed_entries / total_queries) * 100
            print(f"Processed EntityNumber {entity_number}. Completion: {percentage_completed:.2f}% [{processed_entries}/{total_queries}]")
    finally:
        # Save whatever is pending, both at the normal end and on interruption
        flush_batch()

    print("All data has been processed and saved.")

# Run the Google search over all rows of df; progress is printed per processed entity
main()

Processed EntityNumber 0400.626.529. Completion: 0.78% [135/17239]
Processed EntityNumber 0400.630.685. Completion: 0.79% [136/17239]
Processed EntityNumber 0400.655.431. Completion: 0.79% [137/17239]
Processed EntityNumber 0400.699.773. Completion: 0.80% [138/17239]
Processed EntityNumber 0400.747.580. Completion: 0.81% [139/17239]
Processed EntityNumber 0400.764.210. Completion: 0.81% [140/17239]
Processed EntityNumber 0400.789.251. Completion: 0.82% [141/17239]
Processed EntityNumber 0400.797.169. Completion: 0.82% [142/17239]
Processed EntityNumber 0400.823.794. Completion: 0.83% [143/17239]
No results found in attempt 1. Retrying...
Processed EntityNumber 0400.837.454. Completion: 0.84% [144/17239]
Processed EntityNumber 0400.867.643. Completion: 0.84% [145/17239]
Processed EntityNumber 0400.886.350. Completion: 0.85% [146/17239]
Processed EntityNumber 0400.902.582. Completion: 0.85% [147/17239]
Processed EntityNumber 0400.945.837. Completion: 0.86% [148/17239]
Processed EntityNum

## DuckDuckGo API

In [None]:
# First, you'd install the package, usually via pip. Check the repository for the latest instructions. 
%pip install -U duckduckgo_search
# Uses https://github.com/deedy5/duckduckgo_search

import pandas as pd
import time
from duckduckgo_search import DDGS

# Assuming df is your DataFrame

# Rate limiting is not handled here; the calling loop sleeps between queries.
def scrape_top_urls_ddg(search_query, skip_domains, max_results=10):
    """
    Collect up to `max_results` distinct DuckDuckGo result URLs for a query.

    A URL is kept when it is non-empty, contains none of the `skip_domains`
    strings and was not collected before. The de-duplication matches what
    scrape_top_urls_google does.

    Parameters:
    - search_query: The query string passed to DDGS.text.
    - skip_domains: Strings that disqualify a URL when found inside it.
    - max_results: Number of URLs to collect.

    Returns:
    - A list of URLs in result order.

    Raises:
    - Any exception from the search is printed and re-raised to the caller.
    """
    top_urls = []
    ddgs = DDGS()
    try:
        # Fetch results with potentially more than needed to account for skipped domains
        results = ddgs.text(keywords=search_query, max_results=max_results + len(skip_domains))

        for result in results:
            url = result.get('href')
            # Ignore results without a link and links we already have
            if not url or url in top_urls:
                continue
            # Check if URL should be skipped
            if any(skip_domain in url for skip_domain in skip_domains):
                continue
            top_urls.append(url)
            # Break if enough URLs have been collected
            if len(top_urls) >= max_results:
                break
    except Exception as e:
        print(f"Encountered an error: {e}")
        raise  # Re-raise the exception to handle it outside

    return top_urls

# Resume support: reload whatever an earlier run already saved
try:
    result_df = pd.read_csv('search_results_DDG.csv')
    collected_data = result_df.to_dict('records')
except FileNotFoundError:
    collected_data = []

# Entity numbers already handled, kept in a set so each row's check is a
# constant-time lookup instead of a scan over every collected record
processed_entities = {d['EntityNumber'] for d in collected_data}

total_rows = len(df)

for index, row in df.iterrows():
    entity_number = row['EntityNumber']
    if entity_number in processed_entities:
        continue  # Skip already processed

    search_query = row['SearchQuery']
    try:
        filtered_urls = scrape_top_urls_ddg(search_query, skip_domains, max_results=5)
        time.sleep(1)  # Enforce a simple rate limit

        collected_data.append({
            "EntityNumber": entity_number, 
            "URL1": filtered_urls[0] if len(filtered_urls) > 0 else "", 
            "URL2": filtered_urls[1] if len(filtered_urls) > 1 else "", 
            "URL3": filtered_urls[2] if len(filtered_urls) > 2 else "", 
            "URL4": filtered_urls[3] if len(filtered_urls) > 3 else "", 
            "URL5": filtered_urls[4] if len(filtered_urls) > 4 else ""
        })
    except Exception as e:
        print(f"Error encountered: {e}. Waiting before retrying...")
        time.sleep(20)  # Back off before sending the next query
        # The failed row is not recorded, so this loop moves on to the next
        # row; the failed one is picked up when the cell is run again
        continue

    processed_entities.add(entity_number)

    # Save after every row so an interruption loses no completed work
    result_df = pd.DataFrame(collected_data)
    result_df.to_csv('search_results_DDG.csv', index=False)

    progress_percentage = ((index + 1) / total_rows) * 100
    print(f"Progress: {progress_percentage:.2f}% ({index + 1}/{total_rows})")

print("All data has been processed and saved.")

Note: you may need to restart the kernel to use updated packages.


    Proactor event loop does not implement add_reader family of methods required.
    Registering an additional selector thread for add_reader support.
        asyncio.set_event_loop_policy(WindowsSelectorEventLoopPolicy())
    


Progress: 19.72% (3400/17239)
Progress: 19.83% (3419/17239)
Progress: 19.84% (3420/17239)
Progress: 19.84% (3421/17239)




Progress: 19.85% (3422/17239)
Progress: 19.86% (3423/17239)
Progress: 19.86% (3424/17239)
Progress: 19.87% (3425/17239)
Progress: 19.87% (3426/17239)
Progress: 19.88% (3427/17239)
Progress: 19.89% (3428/17239)
Progress: 19.89% (3429/17239)
Progress: 19.90% (3430/17239)
Progress: 19.90% (3431/17239)
Progress: 19.91% (3432/17239)
Progress: 19.91% (3433/17239)
Progress: 19.92% (3434/17239)
Progress: 19.93% (3435/17239)
Progress: 19.93% (3436/17239)
Progress: 19.94% (3437/17239)
Progress: 19.94% (3438/17239)
Progress: 19.95% (3439/17239)
Progress: 19.96% (3441/17239)
Progress: 19.97% (3442/17239)
Progress: 19.98% (3444/17239)
Progress: 19.98% (3445/17239)
Progress: 19.99% (3446/17239)
Progress: 20.00% (3447/17239)
Progress: 20.00% (3448/17239)
Progress: 20.01% (3449/17239)
Progress: 20.01% (3450/17239)
Progress: 20.02% (3451/17239)
Progress: 20.02% (3452/17239)
Progress: 20.03% (3453/17239)
Progress: 20.04% (3454/17239)
Progress: 20.04% (3455/17239)
Progress: 20.05% (3456/17239)
Progress: 

KeyboardInterrupt: 

## Multi-search

Displays all unfiltered URL results. The results can contain several URLs that point to different pages of the same website.

In [None]:
import requests
import pandas as pd
import json
import time
import os
from duckduckgo_search import DDGS
from requests.exceptions import ConnectionError, HTTPError
from urllib3.exceptions import ProtocolError, NewConnectionError

# Load API key and CSE ID from config.
# The encoding is stated explicitly: JSON files are UTF-8, whereas open()
# would otherwise fall back to the platform's locale encoding.
with open('config.json', encoding='utf-8') as config_file:
    config = json.load(config_file)
api_key = config['GOOGLE_API_KEY']
cse_id = config['GOOGLE_CSE_ID']

# Rate limiting delay: seconds slept before each combined Google + DuckDuckGo search
DELAY_BETWEEN_REQUESTS = 1  # Adjust as per your rate limit settings

def google_search(query, api_key, cse_id, start=1, **kwargs):
    """
    Send one request to the Google Custom Search JSON API.

    All values are handed to `requests` through `params`, so they are
    URL-encoded properly. Pasting the raw query into the URL breaks for
    names containing characters such as '&', '#' or '+'.

    Parameters:
    - query: The search terms.
    - api_key: Google API key (sent as `key`).
    - cse_id: Custom Search Engine ID (sent as `cx`).
    - start: Index of the first result to return (sent as `start`).
    - **kwargs: Extra query-string parameters, e.g. num=10.

    Returns:
    - The decoded JSON response, or None if the request failed. Failures
      are printed, never raised.
    """
    delay_between_requests = 0.1
    url = "https://www.googleapis.com/customsearch/v1"
    # The explicit arguments win over same-named entries in kwargs
    params = {**kwargs, 'q': query, 'key': api_key, 'cx': cse_id, 'start': start}
    try:
        # A timeout keeps one stalled connection from hanging the whole run
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()  # Check for HTTP-level issues
        data = response.json()  # Parse JSON response
        return data
    except (ConnectionError, ProtocolError, HTTPError) as e:
        print(f"Encountered a network error: {e}.")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Runs on success and on failure, so requests are always spaced out
        time.sleep(delay_between_requests)

    return None

def scrape_top_urls_google(search_query, skip_domains, min_results=5, max_retries=1):
    """
    Collect up to `min_results` distinct Google result URLs for a query.

    Result pages are requested ten at a time. A URL is kept when it
    contains none of the `skip_domains` strings and was not collected
    before. Paging stops once enough URLs are found or after `max_retries`
    requests that came back without an 'items' list.

    Parameters:
    - search_query: The query string passed to google_search.
    - skip_domains: Strings that disqualify a URL when found inside it.
    - min_results: Number of URLs to collect.
    - max_retries: Number of empty/failed responses tolerated.

    Returns:
    - A list with at most `min_results` URLs, in result order.
    """
    collected = []
    failed_attempts = 0
    page_start = 1

    while failed_attempts < max_retries and len(collected) < min_results:
        page = google_search(search_query, api_key, cse_id, start=page_start, num=10)

        # Guard clause: a failed request or a page without items counts as one attempt
        if not page or 'items' not in page:
            print(f"No results found in attempt {failed_attempts + 1}. Retrying...")
            failed_attempts += 1
            time.sleep(0.1)  # Short delay before retrying
            continue

        for entry in page['items']:
            link = entry['link']
            is_blocked = any(skip_domain in link for skip_domain in skip_domains)
            if not is_blocked and link not in collected:
                collected.append(link)
            if len(collected) == min_results:
                break  # Found enough URLs
        page_start += 10  # Next page of results

    return collected[:min_results]

def scrape_top_urls_ddg(search_query, skip_domains, max_results=10):
    """
    Collect up to `max_results` distinct DuckDuckGo result URLs for a query.

    A URL is kept when it is non-empty, contains none of the `skip_domains`
    strings and was not collected before. The de-duplication matches what
    scrape_top_urls_google does.

    Parameters:
    - search_query: The query string passed to DDGS.text.
    - skip_domains: Strings that disqualify a URL when found inside it.
    - max_results: Number of URLs to collect.

    Returns:
    - A list of URLs in result order.

    Raises:
    - Any exception from the search is printed and re-raised to the caller.
    """
    top_urls = []
    ddgs = DDGS()
    try:
        # Fetch results with potentially more than needed to account for skipped domains
        results = ddgs.text(keywords=search_query, max_results=max_results + len(skip_domains))

        for result in results:
            url = result.get('href')
            # Ignore results without a link and links we already have
            if not url or url in top_urls:
                continue
            # Check if URL should be skipped
            if any(skip_domain in url for skip_domain in skip_domains):
                continue
            top_urls.append(url)
            # Break if enough URLs have been collected
            if len(top_urls) >= max_results:
                break
    except Exception as e:
        print(f"Encountered an error: {e}")
        raise  # Re-raise the exception to handle it outside

    return top_urls

def perform_multi_search_with_scores_and_penalty(query, skip_domains, max_results=5):
    """
    Query Google and DuckDuckGo and merge both result lists into one ranking.

    Each URL is scored as the mean of its rank in every engine that returned
    it, plus a penalty of 10 when only one engine returned it. Lower scores
    are better.

    A URL that one engine lists more than once is counted once for that
    engine, at its best rank. Otherwise a duplicate inside one engine's list
    would look like agreement between both engines. URLs with equal scores
    keep a fixed order: Google's results first, then DuckDuckGo's, each in
    rank order.

    Parameters:
    - query: The search query sent to both engines.
    - skip_domains: Strings that disqualify a URL (passed to both scrapers).
    - max_results: Results requested per engine and maximum length of the
      returned ranking.

    Returns:
    - A list of (url, score) tuples sorted by ascending score, with the
      score rounded to two decimals.
    """
    time.sleep(DELAY_BETWEEN_REQUESTS)
    google_results = scrape_top_urls_google(query, skip_domains, max_results)
    ddg_results = scrape_top_urls_ddg(query, skip_domains, max_results)

    # url -> list with one rank per engine that returned the URL.
    # A dict is used because it preserves insertion order, which makes the
    # order of tied URLs reproducible from run to run.
    ranks_by_url = {}
    for engine_results in (google_results, ddg_results):
        best_rank = {}
        for rank, url in enumerate(engine_results, start=1):
            best_rank.setdefault(url, rank)  # First occurrence is the best rank
        for url, rank in best_rank.items():
            ranks_by_url.setdefault(url, []).append(rank)

    scored_urls = {}
    for url, ranks in ranks_by_url.items():
        mean_rank = sum(ranks) / len(ranks)
        penalty = 0 if len(ranks) == 2 else 10  # Penalty for appearance in one engine only
        scored_urls[url] = mean_rank + penalty

    # sorted() is stable, so ties keep the insertion order established above
    sorted_urls = sorted(scored_urls.items(), key=lambda item: item[1])
    top_urls_with_scores = [(url, round(score, 2)) for url, score in sorted_urls[:max_results]]

    return top_urls_with_scores

def save_last_processed_entity(entity_number):
    """
    Store the given entity number as the resume checkpoint.

    Overwrites 'last_processed_entity.txt' in the current working directory
    with the string form of `entity_number`.
    """
    with open('last_processed_entity.txt', mode='w') as checkpoint:
        checkpoint_text = str(entity_number)
        checkpoint.write(checkpoint_text)

def get_last_processed_entity():
    """
    Read the resume checkpoint written by save_last_processed_entity.

    The value is returned as text. Entity numbers look like '0201.310.929',
    which cannot be converted with int(); converting would also drop the
    leading zero needed to match the EntityNumber column.

    Returns:
    - The stored entity number as a string, or None when the checkpoint
      file does not exist or is empty.
    """
    try:
        with open('last_processed_entity.txt', 'r') as file:
            content = file.read().strip()
    except FileNotFoundError:
        return None
    # Treat an empty checkpoint file the same as a missing one
    return content or None

def append_to_csv(new_row, file_name='search_results_Multi.csv'):
    """
    Append the rows of a DataFrame to a CSV file.

    The header line is written only when the file is empty at the time it
    is opened, so repeated calls produce one header followed by data rows.

    Parameters:
    - new_row: DataFrame holding the row(s) to append.
    - file_name: Path of the CSV file to append to.
    """
    with open(file_name, 'a', newline='') as handle:
        # Position 0 right after opening in append mode means nothing was written yet
        is_empty_file = handle.tell() == 0
        new_row.to_csv(handle, header=is_empty_file, index=False)
def main():
    """
    Run the combined Google + DuckDuckGo search for every remaining row of `df`.

    Resumes after the entity stored by save_last_processed_entity. The resume
    point is found by row position, so it works even when the index labels
    of `df` are not 0..n-1. Entity numbers are compared as text, so a
    checkpoint that was read back as a different type still matches.

    For each row the top five URLs are appended to the results CSV and the
    checkpoint is updated, so an interrupted run loses at most the row in
    flight. No domains are skipped here (skip_domains=[]).

    Raises:
    - ValueError if the checkpointed entity number is not present in `df`.
    """
    last_processed = get_last_processed_entity()
    if last_processed is None:
        start_index = 0
    else:
        is_checkpoint = df['EntityNumber'].astype(str) == str(last_processed)
        positions = is_checkpoint.to_numpy().nonzero()[0]
        if len(positions) == 0:
            raise ValueError(
                f"Checkpoint entity {last_processed!r} was not found in df['EntityNumber']; "
                "delete last_processed_entity.txt to start from the first row."
            )
        # Continue with the row after the first match
        start_index = int(positions[0]) + 1

    total_rows = len(df)
    remaining_rows = total_rows - start_index

    for done_in_run, (_, row) in enumerate(df.iloc[start_index:].iterrows(), start=1):
        query = row['SearchQuery']
        entity_number = row['EntityNumber']

        top_urls_with_scores = perform_multi_search_with_scores_and_penalty(query, skip_domains=[], max_results=5)
        top_urls = [url for url, score in top_urls_with_scores]

        new_row = pd.DataFrame([{
            "EntityNumber": entity_number,
            "URL1": top_urls[0] if len(top_urls) > 0 else "",
            "URL2": top_urls[1] if len(top_urls) > 1 else "",
            "URL3": top_urls[2] if len(top_urls) > 2 else "",
            "URL4": top_urls[3] if len(top_urls) > 3 else "",
            "URL5": top_urls[4] if len(top_urls) > 4 else ""
        }])

        # Write the result first, then move the checkpoint
        append_to_csv(new_row)
        save_last_processed_entity(entity_number)

        # Percentage is relative to this run; the counter is the absolute row position
        progress_percentage = (done_in_run / remaining_rows) * 100
        print(f"Progress: {progress_percentage:.2f}% ({start_index + done_in_run}/{total_rows})")

    print("All data has been processed and saved.")

# Start the multi-search run when this cell is executed (the guard holds in the notebook, as the progress output below shows)
if __name__ == "__main__":
    main()


Progress: 0.01% (1/17239)
Progress: 0.01% (2/17239)
Progress: 0.02% (3/17239)
Progress: 0.02% (4/17239)
Progress: 0.03% (5/17239)
Progress: 0.03% (6/17239)
Progress: 0.04% (7/17239)
Progress: 0.05% (8/17239)
Progress: 0.05% (9/17239)
Progress: 0.06% (10/17239)
Progress: 0.06% (11/17239)
Progress: 0.07% (12/17239)
Progress: 0.08% (14/17239)
Progress: 0.09% (15/17239)
Progress: 0.09% (16/17239)
Progress: 0.10% (17/17239)
Progress: 0.10% (18/17239)
Progress: 0.11% (19/17239)
Progress: 0.12% (20/17239)
Progress: 0.12% (21/17239)
Progress: 0.13% (22/17239)


KeyboardInterrupt: 