In [None]:
pip install -r requirements.txt

# Load the .parquet file of the combined data set

In [None]:
import pandas as pd
df = pd.read_parquet("combined_filtered_dataset.parquet")
print(df.head())

# Create search query

In [None]:
def generate_query(row):
    """
    Build the search-engine query text for a single business record.

    Parameters:
    - row: A pandas Series (any mapping works) exposing the keys
      'OfficialName', 'ZipCode' and 'Municipality'.

    Returns:
    - A string holding those three values, in that order, separated by
      single spaces.
    """
    # Name first, then the two location hints.
    pieces = (row['OfficialName'], row['ZipCode'], row['Municipality'])
    return " ".join(f"{piece}" for piece in pieces)

# Apply the function to each row in the DataFrame to create the queries
df['SearchQuery'] = df.apply(generate_query, axis=1)


# Set display options
pd.set_option('display.max_columns', None)  # Ensure all columns are displayed
pd.set_option('display.max_colwidth', None)  # Ensure full content of each cell is displayed
pd.set_option('display.width', None)  # Adjust the display width for readability

# Show the DataFrame with the generated queries
print(df[['SearchQuery']].head())

# Save dataset with query

In [4]:
df.to_csv('dataset_incl_query.csv', index=True)

# Web scraping of search engine results

In [5]:
# Domains (and one domain/path prefix) whose pages must never be kept as a
# search hit. The search helpers in this notebook test each entry with
# `entry in url`, i.e. as a plain substring of the full result URL.
skip_domains = ['trendstop.knack.be', 'fincheck.be', 'bizzy.org'
                , 'trendstop.levif.be', 'companyweb.be', 'linkedin.com'
                , 'en.wikipedia.org', 'facebook.com', 'be.linkedin.com'
                , 'instagram.com', 'werkenbijdeoverheid.be', 'dnb.com', 'nl.wikipedia.org'
                , 'youtube.com', 'staatsbladmonitor.be', 'werkenvoor.be'
                , 'twitter.com', 'vlaanderen.be/organisaties', 'jobat.be'
                , 'vdab.be', 'opencorporates.com','www.goldenpages.be',
                'www.immoweb.be', 'be.kompass.com','www.infobel.com',
                'www.bsearch.be', 'www.creditsafe.com','openthebox.be',
                'bedrijvengids.cybo.com','data.be','www.yelp.com',
                'www.goudengids.be','gb.kompass.com','www.cylex-belgie.be',
                'local.infobel.be','www.cybo.com','www.viamichelin.com','lokaal.infobel.be',
                'www.northdata.com','www.tripadvisor.com','www.zoominfo.com',
                'fr.kompass.com','www.openingsuren.vlaanderen','www.info-clipper.com',
                'www.northdata.de','b2bhint.com','www.realo.be',
                'www.pagesdor.be','www.worldpostalcodes.org','www.openingsurengids.be',
                'open-winkel.be','opencorpdata.com','lemariagedelouise.be',
                'www.signalhire.com','www.faillissementsdossier.be','www.bizique.be',
                'www.booking.com','www.hours.be','www.handelsgids.be',
                'foursquare.com','zaubee.com','be.top10place.com',
                'restaurantguru.com','www.zimmo.be','guide.michelin.com',
                'selfcity.be','belgium.worldplaces.me','www.boekhoudkantoren.be',
                'jaarrekening.be']
# Convert to a set so duplicate entries collapse. NOTE(review): the callers
# iterate over the entries and substring-match each one, so no hashed lookup
# takes place; the set mainly guarantees uniqueness.
skip_domains = set(skip_domains)


## Google
API

In [None]:
import requests
import pandas as pd
import json
import time
from requests.exceptions import ConnectionError, HTTPError
from urllib3.exceptions import ProtocolError, NewConnectionError

# Load API key and CSE ID from config
with open('config_search.json') as config_file:
    config = json.load(config_file)
api_key = config['GOOGLE_API_KEY']
cse_id = config['GOOGLE_CSE_ID']


def _hide_key(message, api_key):
    """Return `message` with the API key masked so it never lands in notebook output."""
    secret = str(api_key) if api_key else ""
    return message.replace(secret, "***") if secret else message


def google_search(query, api_key, cse_id, start=1, **kwargs):
    """
    Call the Google Custom Search JSON API, retrying transient failures.

    Parameters:
    - query: Free-text search query. It is sent through `params=` so that
      characters such as '&', '#' or '+' in company names are URL-encoded
      instead of corrupting the query string.
    - api_key: Google API key.
    - cse_id: Custom Search Engine id (the `cx` parameter).
    - start: 1-based index of the first result to return.
    - **kwargs: Extra query parameters (e.g. `num=10`); they override the
      defaults below when the same name is given.

    Returns:
    - The decoded JSON response as a dict, or None when the request was
      rejected with a permanent client error, hit an unexpected exception,
      or still failed after `max_attempts` tries.
    """
    base_delay = 0.1   # Base delay (seconds) for exponential backoff
    max_delay = 20     # Upper bound on the delay between retries
    max_attempts = 8   # Hard stop so a permanent failure cannot loop forever
    endpoint = "https://www.googleapis.com/customsearch/v1"

    params = {
        'q': query,
        'key': api_key,
        'cx': cse_id,
        'start': start,
        'cr': 'countryBE',   # restrict to Belgian results
        'lr': 'lang_nl',     # restrict to Dutch-language results
    }
    params.update(kwargs)

    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(endpoint, params=params, timeout=30)
            response.raise_for_status()  # Check for HTTP-level issues
            return response.json()
        except HTTPError as e:
            status = e.response.status_code if e.response is not None else None
            # 4xx (other than 429 "too many requests") will not succeed on a
            # retry: bad request, invalid key, exhausted quota, ...
            if status is not None and 400 <= status < 500 and status != 429:
                print(f"Request rejected with HTTP {status}; not retrying.")
                return None
            print(f"Encountered a network error: {_hide_key(str(e), api_key)}. Retrying...")
        except (ConnectionError, ProtocolError, requests.exceptions.Timeout) as e:
            print(f"Encountered a network error: {_hide_key(str(e), api_key)}. Retrying...")
        except Exception as e:
            print(f"An unexpected error occurred: {_hide_key(str(e), api_key)}")
            return None

        if attempt < max_attempts:
            # Exponential backoff, capped at max_delay
            time.sleep(min(max_delay, base_delay * 2 ** attempt))

    print(f"Giving up after {max_attempts} attempts.")
    return None

def scrape_top_urls_google(search_query, skip_domains_set, min_results=5, max_retries=3):
    """
    Collect up to `min_results` distinct Google result URLs that do not
    contain any of the skip entries.

    Parameters:
    - search_query: Query text passed to `google_search`.
    - skip_domains_set: Iterable of strings; a URL is discarded when any of
      them occurs as a substring of it. (The caller's argument is honoured;
      it is no longer replaced by the global `skip_domains`.)
    - min_results: Number of URLs to collect before stopping.
    - max_retries: How many failed requests (`google_search` returning None)
      are tolerated before giving up.

    Returns:
    - A list with at most `min_results` URLs, in result order.
    """
    page_size = 10
    last_start = 91  # the API never serves results beyond the 100th
    skip_domains_set = set(skip_domains_set)
    top_urls = []
    retries = 0
    current_start = 1

    while len(top_urls) < min_results and retries < max_retries and current_start <= last_start:
        results = google_search(search_query, api_key, cse_id, start=current_start, num=page_size)
        if results is None:
            # Request failed: this is the only case worth retrying.
            print(f"No results found in attempt {retries + 1}. Retrying...")
            retries += 1
            time.sleep(1)  # Longer delay before retrying could be beneficial here
            continue

        items = results.get('items') or []
        for item in items:
            url = item['link']
            if url in top_urls or any(skip_domain in url for skip_domain in skip_domains_set):
                continue
            top_urls.append(url)
            if len(top_urls) >= min_results:
                break  # Found enough URLs, exit loop

        # A successful answer without items, or without a `nextPage` entry,
        # means the result list is exhausted; asking again only burns quota.
        if not items or not results.get('queries', {}).get('nextPage'):
            break
        current_start += len(items)  # Adjust based on actual returned results

    return top_urls[:min_results]


def _flush_batch(progress_df, processed_batch, file_path):
    """Append the pending records to `progress_df`, rewrite the CSV, return the new frame."""
    new_rows_df = pd.DataFrame(processed_batch)
    progress_df = pd.concat([progress_df, new_rows_df], ignore_index=True)
    progress_df.to_csv(file_path, index=False)
    return progress_df


def main():
    """
    Run the Google search for every row of the global `df` and checkpoint the
    top-5 URLs per entity to 'search_results_Google_2.csv'.

    Resuming: entities already present in the CSV are skipped by membership,
    so the result does not depend on `df` being sorted by EntityNumber. An
    EntityNumber that occurs more than once in `df` is searched only once.

    Saving: results are written every `batch_size` entities and once more
    after the loop, so a trailing partial batch is always persisted regardless
    of how `df` is indexed.
    """
    file_path = 'search_results_Google_2.csv'
    placeholder = ""  # Placeholder for unfound URLs
    batch_size = 20  # Number of queries to process before saving to CSV
    url_columns = ['URL1', 'URL2', 'URL3', 'URL4', 'URL5']
    processed_batch = []  # Records collected since the last save

    try:
        progress_df = pd.read_csv(file_path)
    except FileNotFoundError:
        progress_df = pd.DataFrame(columns=['EntityNumber'] + url_columns)
    processed_entities = set(progress_df['EntityNumber'])

    total_queries = len(df)

    # `position` counts rows 1..n independently of the DataFrame's index labels.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        entity_number = row['EntityNumber']
        if entity_number in processed_entities:
            continue

        # Perform search and pad the result to exactly five URL slots
        filtered_urls = scrape_top_urls_google(row['SearchQuery'], skip_domains, len(url_columns))
        urls_to_add = (list(filtered_urls) + [placeholder] * len(url_columns))[:len(url_columns)]

        record = {"EntityNumber": entity_number}
        record.update(zip(url_columns, urls_to_add))
        processed_batch.append(record)
        processed_entities.add(entity_number)

        if len(processed_batch) >= batch_size:
            progress_df = _flush_batch(progress_df, processed_batch, file_path)
            processed_batch = []  # Clear the batch

        # Update progress
        percentage_completed = (position / total_queries) * 100
        print(f"Processed EntityNumber {entity_number}. Completion: {percentage_completed:.2f}% [{position}/{total_queries}]")

    # Persist whatever is left over from the last, possibly partial, batch.
    if processed_batch:
        progress_df = _flush_batch(progress_df, processed_batch, file_path)

    print("All data has been processed and saved.")

main()

## DuckDuckGo
API

In [None]:
# First, you'd install the package, usually via pip. Check the repository for the latest instructions. 
#%pip install -U duckduckgo_search
# Uses https://github.com/deedy5/duckduckgo_search

import pandas as pd
import time
from duckduckgo_search import DDGS


def scrape_top_urls_ddg(search_query, skip_domains, max_results=10):
    """
    Return up to `max_results` DuckDuckGo result URLs that contain none of
    the entries in `skip_domains`.

    Parameters:
    - search_query: Text handed to `DDGS.text` as `keywords`.
    - skip_domains: Sized iterable of strings; a URL is dropped when any of
      them is a substring of it.
    - max_results: Number of URLs to keep.

    Returns:
    - List of accepted URLs in result order.

    Raises:
    - Any exception raised while searching is printed and re-raised.
    """
    client = DDGS()
    accepted = []
    try:
        # Over-fetch by the size of the skip list so filtering still leaves enough hits.
        hits = client.text(keywords=search_query, max_results=max_results + len(skip_domains))

        for hit in hits:
            link = hit.get('href')
            if not link:
                continue

            # Reject the link as soon as one skip entry matches.
            blocked = False
            for entry in skip_domains:
                if entry in link:
                    blocked = True
                    break
            if blocked:
                continue

            accepted.append(link)
            if len(accepted) == max_results:
                break
    except Exception as err:
        print(f"Encountered an error: {err}")
        raise  # Let the caller decide how to recover

    return accepted

# Resume from an earlier run when a results file already exists.
try:
    result_df = pd.read_csv('search_results_DDG.csv')
    collected_data = result_df.to_dict('records')
except FileNotFoundError:
    collected_data = []

# Hashed lookup of finished entities instead of scanning `collected_data`
# for every row (which made the resume check quadratic).
processed_entities = {record['EntityNumber'] for record in collected_data}

total_rows = len(df)
max_attempts = 3  # tries per entity before it is left for a later run

# `position` counts rows 1..n independently of the DataFrame's index labels.
for position, (index, row) in enumerate(df.iterrows(), start=1):
    entity_number = row['EntityNumber']
    if entity_number in processed_entities:
        continue  # Skip already processed

    search_query = row['SearchQuery']

    # Actually retry the same entity after a failure; previously the loop
    # announced a retry but moved on to the next row.
    filtered_urls = None
    for attempt in range(1, max_attempts + 1):
        try:
            filtered_urls = scrape_top_urls_ddg(search_query, skip_domains, max_results=5)
            break
        except Exception as e:
            print(f"Error encountered: {e}. Waiting before retrying...")
            time.sleep(20)

    if filtered_urls is None:
        print(f"Skipping EntityNumber {entity_number} after {max_attempts} failed attempts; it will be picked up on the next run.")
        continue

    time.sleep(1)  # Enforce a simple rate limit

    collected_data.append({
        "EntityNumber": entity_number,
        "URL1": filtered_urls[0] if len(filtered_urls) > 0 else "",
        "URL2": filtered_urls[1] if len(filtered_urls) > 1 else "",
        "URL3": filtered_urls[2] if len(filtered_urls) > 2 else "",
        "URL4": filtered_urls[3] if len(filtered_urls) > 3 else "",
        "URL5": filtered_urls[4] if len(filtered_urls) > 4 else ""
    })
    processed_entities.add(entity_number)

    # Checkpoint after every entity so an interruption loses no finished work.
    result_df = pd.DataFrame(collected_data)
    result_df.to_csv('search_results_DDG.csv', index=False)

    processed_entries = position
    percentage_completed = (processed_entries / total_rows) * 100
    print(f"Processed EntityNumber {entity_number}. Completion: {percentage_completed:.2f}% [{processed_entries}/{total_rows}]")

print("All data has been processed and saved.")

## Multi-search

In [None]:
#%pip install -U duckduckgo_search

import requests
import pandas as pd
import json
import time
import os
from duckduckgo_search import DDGS
from requests.exceptions import ConnectionError, HTTPError
from urllib3.exceptions import ProtocolError, NewConnectionError

# Load API key and CSE ID from config
with open('config.json') as config_file:
    config = json.load(config_file)
api_key = config['GOOGLE_API_KEY']
cse_id = config['GOOGLE_CSE_ID']

# Rate limiting delay
DELAY_BETWEEN_REQUESTS = 0.5 

def _hide_key(message, api_key):
    """Return `message` with the API key masked so it never lands in notebook output."""
    secret = str(api_key) if api_key else ""
    return message.replace(secret, "***") if secret else message


def google_search(query, api_key, cse_id, start=1, **kwargs):
    """
    Perform one Google Custom Search JSON API request (no retries).

    Parameters:
    - query: Free-text search query. It is sent through `params=` so that
      characters such as '&', '#' or '+' are URL-encoded instead of
      corrupting the query string.
    - api_key: Google API key.
    - cse_id: Custom Search Engine id (the `cx` parameter).
    - start: 1-based index of the first result to return.
    - **kwargs: Extra query parameters (e.g. `num=10`).

    Returns:
    - The decoded JSON response as a dict, or None if the request failed.

    The function always sleeps `DELAY_BETWEEN_REQUESTS` seconds before
    returning, whether the request succeeded or not.
    """
    endpoint = "https://www.googleapis.com/customsearch/v1"
    params = {'q': query, 'key': api_key, 'cx': cse_id, 'start': start}
    params.update(kwargs)

    try:
        # The timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()  # Check for HTTP-level issues
        return response.json()
    except (ConnectionError, ProtocolError, HTTPError, requests.exceptions.Timeout) as e:
        print(f"Encountered a network error: {_hide_key(str(e), api_key)}.")
    except Exception as e:
        print(f"An error occurred: {_hide_key(str(e), api_key)}")
    finally:
        time.sleep(DELAY_BETWEEN_REQUESTS)

    return None

def scrape_top_urls_google(search_query, skip_domains, min_results=5, max_retries=1):
    """
    Collect up to `min_results` distinct Google result URLs that do not
    contain any of the skip entries.

    Parameters:
    - search_query: Query text passed to `google_search`.
    - skip_domains: Iterable of strings; a URL is discarded when any of them
      occurs as a substring of it.
    - min_results: Number of URLs to collect before stopping.
    - max_retries: How many failed requests (`google_search` returning None)
      are tolerated before giving up.

    Returns:
    - A list with at most `min_results` URLs, in result order.
    """
    page_size = 10
    last_start = 91  # the API never serves results beyond the 100th
    top_urls = []
    retries = 0
    current_start = 1

    while len(top_urls) < min_results and retries < max_retries and current_start <= last_start:
        results = google_search(search_query, api_key, cse_id, start=current_start, num=page_size)
        if results is None:
            # Request failed: this is the only case worth retrying.
            print(f"No results found in attempt {retries + 1}. Retrying...")
            retries += 1
            time.sleep(DELAY_BETWEEN_REQUESTS)  # Short delay before retrying
            continue

        items = results.get('items') or []
        for item in items:
            url = item['link']
            if url in top_urls or any(skip_domain in url for skip_domain in skip_domains):
                continue
            top_urls.append(url)
            if len(top_urls) >= min_results:
                break  # Found enough URLs, exit loop

        # A successful answer without items, or without a `nextPage` entry,
        # means the result list is exhausted; asking again only burns quota.
        if not items or not results.get('queries', {}).get('nextPage'):
            break
        current_start += page_size  # Prepare to query next page of results

    return top_urls[:min_results]

def scrape_top_urls_ddg(search_query, skip_domains, max_results=10):
    """
    Return up to `max_results` DuckDuckGo result URLs that contain none of
    the entries in `skip_domains`.

    Parameters:
    - search_query: Text handed to `DDGS.text` as `keywords`.
    - skip_domains: Sized iterable of strings; a URL is dropped when any of
      them is a substring of it.
    - max_results: Number of URLs to keep.

    Returns:
    - List of accepted URLs in result order.

    Raises:
    - Any exception raised while searching is printed and re-raised.
    """
    searcher = DDGS()
    kept_urls = []
    try:
        # Ask for extra hits: some will be filtered out by the skip list.
        wanted = max_results + len(skip_domains)
        for entry in searcher.text(keywords=search_query, max_results=wanted):
            href = entry.get('href')
            if not href:
                continue
            if any(skipped in href for skipped in skip_domains):
                continue
            kept_urls.append(href)
            if len(kept_urls) == max_results:
                break
    except Exception as failure:
        print(f"Encountered an error: {failure}")
        raise  # Propagate so the caller can handle it

    return kept_urls

def perform_multi_search_with_scores_and_penalty(query, skip_domains, max_results=5):
    """
    Query Google and DuckDuckGo for `query` and merge both rankings.

    Sleeps `DELAY_BETWEEN_REQUESTS` seconds first, then fetches up to
    `max_results` filtered URLs from each engine and scores them with
    `calculate_combined_rank_score`.

    Returns:
    - The first `max_results` (url, score) pairs of the combined ranking.
    """
    time.sleep(DELAY_BETWEEN_REQUESTS)

    from_google = scrape_top_urls_google(query, skip_domains, max_results)
    from_ddg = scrape_top_urls_ddg(query, skip_domains, max_results)

    ranking = calculate_combined_rank_score(from_google, from_ddg)
    return ranking[:max_results]

def calculate_combined_rank_score(google_results, ddg_results):
    """
    Merge two ranked URL lists into one list of (url, score) pairs.

    Each URL's rank is its 1-based position of first occurrence in a list;
    a URL missing from a list gets the penalty rank
    `max(len(google_results), len(ddg_results)) + 1` for that list. The score
    is `1.0 / (google_rank * ddg_rank)`, so higher is better.

    Parameters:
    - google_results: Ordered sequence of URLs from Google.
    - ddg_results: Ordered sequence of URLs from DuckDuckGo.

    Returns:
    - List of (url, score) tuples sorted by descending score. URLs with equal
      scores keep a deterministic order: Google results first (in their
      order), then DuckDuckGo-only results. Previously ties were ordered by
      set iteration, which varies between interpreter runs for strings.
    """
    def first_positions(urls):
        # 1-based rank of the first occurrence of every URL.
        positions = {}
        for rank, url in enumerate(urls, start=1):
            positions.setdefault(url, rank)
        return positions

    google_rank = first_positions(google_results)
    ddg_rank = first_positions(ddg_results)
    max_rank = max(len(google_results), len(ddg_results)) + 1

    # dict.fromkeys de-duplicates while preserving first-seen order.
    ordered_urls = list(dict.fromkeys(list(google_results) + list(ddg_results)))

    ranked = [
        (url, google_rank.get(url, max_rank) * ddg_rank.get(url, max_rank))
        for url in ordered_urls
    ]
    ranked.sort(key=lambda pair: pair[1])  # stable: ties keep ordered_urls order

    # Rank products are always >= 1, so the division is safe.
    return [(url, 1.0 / product) for url, product in ranked]


def save_last_processed_entity(entity_number):
    """Overwrite 'last_processed_entity.txt' with the string form of `entity_number`."""
    with open('last_processed_entity.txt', 'w') as checkpoint:
        print(entity_number, end="", file=checkpoint)

def get_last_processed_entity():
    """
    Read the checkpoint written by `save_last_processed_entity`.

    Returns:
    - The content of 'last_processed_entity.txt' with surrounding whitespace
      stripped, or None when the file does not exist.
    """
    try:
        checkpoint = open('last_processed_entity.txt', 'r')
    except FileNotFoundError:
        return None
    with checkpoint:
        content = checkpoint.read()
    return content.strip()

def find_missing_entities(df, csv_path='search_results_Multi.csv'):
    """
    Return the entity numbers of `df` that have no row in the results CSV.

    Parameters:
    - df: DataFrame with an 'EntityNumber' column.
    - csv_path: Path of the results CSV, which must also have an
      'EntityNumber' column.

    Returns:
    - A set of entity numbers; every entity of `df` when the CSV is absent.
    """
    if os.path.exists(csv_path):
        saved_entities = set(pd.read_csv(csv_path)['EntityNumber'])
        wanted_entities = set(df['EntityNumber'])
        return wanted_entities - saved_entities

    # No results file yet: nothing has been processed.
    return set(df['EntityNumber'])

def process_entity(entity_number, query, skip_domains):
    """
    Search for one entity and package its best URLs as a one-row DataFrame.

    Runs `perform_multi_search_with_scores_and_penalty` with `max_results=5`
    and discards the scores.

    Returns:
    - DataFrame with the columns 'EntityNumber', 'URL1' ... 'URL5'; slots
      without a result hold an empty string.
    """
    scored_urls = perform_multi_search_with_scores_and_penalty(query, skip_domains, max_results=5)
    urls = [url for url, _ in scored_urls]

    record = {"EntityNumber": entity_number}
    for slot in range(5):
        # Fill unused slots with "" so every row has the same five columns.
        record[f"URL{slot + 1}"] = urls[slot] if slot < len(urls) else ""

    return pd.DataFrame([record])

def append_to_csv(new_row, file_name='search_results_Multi.csv'):
    """
    Append the rows of `new_row` to the CSV at `file_name`.

    The header line is written only when the append position is 0, i.e. the
    file is new or empty. The index is never written.
    """
    with open(file_name, mode='a', newline='', encoding='utf-8') as sink:
        needs_header = sink.tell() == 0
        new_row.to_csv(sink, header=needs_header, index=False)
        
def main():
    """
    Run the combined Google + DuckDuckGo search for every entity of the
    global `df` that is not yet present in 'search_results_Multi.csv'.

    For each pending entity the search is executed exactly once; the same
    result set feeds both the URL row appended to the results CSV and the
    per-URL score records. (Previously every entity was searched twice, which
    doubled the API usage and allowed the saved URLs and the saved scores to
    come from different result sets.)

    Side effects:
    - Appends one row per entity to 'search_results_Multi.csv'.
    - Updates 'last_processed_entity.txt' after each entity.
    - Writes the scores collected during this run to 'final_scores.csv',
      overwriting the file.
    """
    url_slots = 5
    missing_entities = find_missing_entities(df)

    # One row per pending entity (the first occurrence), in the order of `df`.
    pending = (
        df[df['EntityNumber'].isin(list(missing_entities))]
        .drop_duplicates(subset='EntityNumber')
    )

    score_records = []  # collected as plain dicts; turned into a frame once at the end

    for entity_number, query in zip(pending['EntityNumber'], pending['SearchQuery']):
        # Single search per entity, reused for both outputs below
        top_urls_with_scores = perform_multi_search_with_scores_and_penalty(query, skip_domains, max_results=url_slots)

        top_urls = [url for url, _ in top_urls_with_scores]
        padded_urls = (top_urls + [""] * url_slots)[:url_slots]
        record = {"EntityNumber": entity_number}
        for slot, url in enumerate(padded_urls, start=1):
            record[f"URL{slot}"] = url

        append_to_csv(pd.DataFrame([record]))
        save_last_processed_entity(entity_number)

        for url, score in top_urls_with_scores:
            score_records.append({'EntityNumber': entity_number, 'URL': url, 'Score': score})

        print(f"Processed entity number: {entity_number}")

    # Explicit columns keep the header intact even when nothing was processed.
    scores_df = pd.DataFrame(score_records, columns=['EntityNumber', 'URL', 'Score'])
    scores_df.to_csv('final_scores.csv', index=False)

    print("All data has been processed and saved.")



if __name__ == "__main__":
    main()


# Check that all entity numbers are present and rearrange the entities into the same order

In [None]:
import os
import pandas as pd

def check_entity_numbers_in_csv(df, csv_path):
    """
    Report which entity numbers of `df` have no row in a results CSV.

    Parameters:
    - df: DataFrame with an 'EntityNumber' column.
    - csv_path: Path of the CSV file to compare against; it must also have an
      'EntityNumber' column.

    Returns:
    - (bool, list): True plus an empty list when every entity is present;
      otherwise False plus the missing entity numbers. When the CSV does not
      exist, a notice is printed and all entity numbers of `df` are returned
      as missing.
    """
    if os.path.exists(csv_path):
        on_disk = pd.read_csv(csv_path)
        absent = set(df['EntityNumber']) - set(on_disk['EntityNumber'])
        return (not absent), list(absent)

    print("CSV file does not exist.")
    return False, list(df['EntityNumber'])


def reorder_csv_to_match_df(df, csv_path):
    """
    Rewrite a results CSV so that its rows follow the entity order of `df`.

    Parameters:
    - df: DataFrame whose 'EntityNumber' column defines the desired order.
    - csv_path: Path of the CSV file; it is overwritten in place.

    The CSV is left-joined onto `df['EntityNumber']`, so entities of `df`
    without a CSV row appear with empty values, and CSV rows whose entity is
    not in `df` are dropped. If the file does not exist a notice is printed
    and nothing is written.
    """
    if os.path.exists(csv_path):
        stored = pd.read_csv(csv_path)

        # The left side of the join dictates the row order of the output.
        target_order = df[['EntityNumber']]
        aligned = target_order.merge(stored, how='left', on='EntityNumber')

        aligned.to_csv(csv_path, index=False)
        print(f"CSV has been reordered and saved to {csv_path}.")
    else:
        print("CSV file does not exist.")

# Use of functions
all_present, missing_entities = check_entity_numbers_in_csv(df, 'search_results_Multi.csv')
if not all_present:
    # List the gaps and how many there are so the search cell can be re-run.
    print(f"Missing entity numbers: {missing_entities}")
    print(f"Are missing: ({len(missing_entities)})")
else:
    print("All entity numbers are present in the CSV.")
    
#Only execute if the dataset is full
#reorder_csv_to_match_df(df, 'search_results_Multi.csv') 
