======= 1. requirements matching  > 0.85
            1. semantic analysis > 0.9
                a. similarity keywords
            2. Nace Code match > 1.0
        2. Location matching
            1. exact location matching
            2. Geocoding matching 

In [2]:
import json
import pandas as pd
import spacy
import nltk
import numpy as np
import re
import shelve
import json
import os
import logging
import gc
import torch

from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, CrossEncoder
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel
from geopy.distance import geodesic 

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [3]:
# DATA IMPORTS
dataDIR = './data/'
originalSalesNexxtChangeData = f'{dataDIR}branche_nexxt_change_sales_listings_scrape.csv'
dejunaPurchases = './data/dejuna_buyer_latest.csv'

nacecode_josn =  './data/nace_codes.json' 
nacecode_array_josn =  './data/nace_codes_array.json' 
nacecode_array_obj =  './data/nace_codes_object.json' 
nacecode_array_obj_ext =  './data/nace_codes_object_ext.json' 
nacecode_array_obj_du =  './data/nace_codes_object_du.json'
 
dataFile =  './data/nexxt_change_sales_listings_geocoded_short_test.csv' 
# sales_file_nace =  './data/nexxt_change_sales_listings_geocoded.csv' 
sales_file_brachen =  './data/branche_nexxt_change_sales_listings.csv' 
sales_file_nace =  './data/dub_listings_geo.csv'
buyer_file_nace =  './data/nexxt_change_purchase_listings_geocoded.csv' 

In [4]:
# Ensure necessary NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')

# -------------------------------------------------------------------------
# 1. Load SpaCy's German model (for tokenization, NER, POS tagging)
# -------------------------------------------------------------------------
try:
    nlp = spacy.load('de_core_news_lg')
except OSError:
    from spacy.cli import download
    download('de_core_news_lg')
    nlp = spacy.load('de_core_news_lg')

[nltk_data] Downloading package punkt to /Users/abbasm1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abbasm1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# -------------------------------------------------------------------------
# 2. Preprocessing function (with German NER, POS, etc.)
# -------------------------------------------------------------------------
def preprocess_text(text, nlp_model):
    """
    - Lowercases the text.
    - Removes URLs, emails, & large digit sequences.
    - Filters out non-alphabetic chars except German Umlauts/ß.
    - Uses SpaCy to keep only NOUN, PROPN, VERB tokens not in stopwords.
    - Applies Snowball stemming on remaining tokens.
    - Also includes certain named entities (ORG, PRODUCT, GPE).
    """
    if pd.isnull(text):
        return ''
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs, emails, large numbers
    text = re.sub(r'http\S+|www.\S+|\S+@\S+|\b\d{10,}\b', '', text)
    
    # Keep only letters and German characters
    text = re.sub(r'[^a-zA-ZäöüÄÖÜß\s]', '', text)
    
    # Compact multiple spaces
    text = ' '.join(text.split())
    

    # # Initialize German stopwords and Snowball stemmer
    stop_words = set(stopwords.words('german'))
    stemmer = SnowballStemmer('german')

    # Process text with SpaCy
    doc = nlp_model(text)

    tokens = []
    for token in doc:
        # Keep nouns, proper nouns, and verbs
        if token.pos_ in {'NOUN', 'PROPN', 'VERB'} and token.text not in stop_words:
            stemmed = stemmer.stem(token.text)
            tokens.append(stemmed)

    # Extract named entities (ORG, PRODUCT, GPE) and include them
    entities = [
        ent.text for ent in doc.ents 
        if ent.label_ in {'ORG', 'PRODUCT', 'GPE'}
    ]
    # Stem and remove stopwords from entities
    entities = [
        stemmer.stem(ent.lower()) 
        for ent in entities 
        if ent.lower() not in stop_words
    ]
    tokens.extend(entities)

    return ' '.join(tokens)

def split_compounds_regex(text):
    """
    Basic German compound word splitting using regex rules.
    Example: "Kundenzufriedenheit" → ["kunden", "zufriedenheit"]
    """
    # Split at common compound connectors (e.g., -s-, -en-, -n-)
    parts = re.split(r'(s\b|en\b|n\b|e\b)(?=\w{3,})', text)
    return [p for p in parts if p and len(p) > 2]

import re
import spacy
from spacy.lang.de.stop_words import STOP_WORDS

# Load SpaCy model CORRECTLY (without disabling components during initial load)
nlp = spacy.load("de_core_news_lg")  # Load first
nlp.disable_pipes("parser", "ner")   # Disable components AFTER loading

def preprocess_text(text, nlp_model):
    """
    Preprocesses German text for similarity tasks.
    - Disables static vectors to avoid RuntimeError.
    - Uses regex-based compound splitting.
    """
    if pd.isnull(text):
        return ''
    
    # Clean text
    text = text.lower()
    text = re.sub(r'http\S+|www.\S+|\S+@\S+|\b\d{10,}\b', '', text)
    text = re.sub(r'[^a-zA-ZäöüÄÖÜß\s\'\-]', '', text)
    text = ' '.join(text.split())
    
    doc = nlp_model(text)
    tokens = []
    custom_stopwords = {"unternehmen", "firma", "dienstleistung", "kunde"}
    stop_words = STOP_WORDS.union(custom_stopwords)
    
    # Token processing (lemmatization + POS filtering)
    for token in doc:
        if token.pos_ in {'NOUN', 'PROPN', 'VERB', 'ADJ', 'NUM'} and token.text not in stop_words:
            tokens.append(token.lemma_.lower())
    
    # Regex-based compound splitting (no external dependencies)
    compounds = []
    for token in doc:
        if token.pos_ == 'NOUN' and len(token.text) > 8:
            parts = re.findall(r'\b\w{4,}(?=\w{4,})', token.text)  # Split long nouns
            compounds.extend([p.lower() for p in parts])
    tokens.extend(compounds)
    
    # Filter short tokens
    tokens = [token for token in tokens if len(token) > 2]
    
    return ' '.join(tokens)

In [None]:
import re
import spacy
from spacy.lang.de.stop_words import STOP_WORDS

# # Load SpaCy model CORRECTLY (without disabling components during initial load)
# nlp = spacy.load("de_core_news_lg")  # Load first
# nlp.disable_pipes("parser", "ner")   # Disable components AFTER loading

def preprocess_text(text, nlp_model):
    """
    Preprocesses German text for similarity tasks.
    - Disables static vectors to avoid RuntimeError.
    - Uses regex-based compound splitting.
    """
    if pd.isnull(text):
        return ''
    
    # Clean text
    text = text.lower()
    text = re.sub(r'http\S+|www.\S+|\S+@\S+|\b\d{10,}\b', '', text)
    text = re.sub(r'[^a-zA-ZäöüÄÖÜß\s\'\-]', '', text)
    text = ' '.join(text.split())
    
    doc = nlp_model(text)
    tokens = []
    custom_stopwords = {"unternehmen", "firma", "dienstleistung", "kunde"}
    stop_words = STOP_WORDS.union(custom_stopwords)
    
    # Token processing (lemmatization + POS filtering)
    for token in doc:
        if token.pos_ in {'NOUN', 'PROPN', 'VERB', 'ADJ', 'NUM'} and token.text not in stop_words:
            tokens.append(token.lemma_.lower())
    
    # Regex-based compound splitting (no external dependencies)
    compounds = []
    for token in doc:
        if token.pos_ == 'NOUN' and len(token.text) > 8:
            parts = re.findall(r'\b\w{4,}(?=\w{4,})', token.text)  # Split long nouns
            compounds.extend([p.lower() for p in parts])
    tokens.extend(compounds)
    
    # Filter short tokens
    tokens = [token for token in tokens if len(token) > 2]
    
    return ' '.join(tokens)


from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np

def evaluate_similarity(test_cases, nlp_model, embedding_model):
    similarities = []
    labels = []
    
    for text1, text2, is_similar in test_cases:
        preprocessed1 = preprocess_text(text1, nlp_model)
        preprocessed2 = preprocess_text(text2, nlp_model)
        
        embedding1 = embedding_model.encode(preprocessed1)
        embedding2 = embedding_model.encode(preprocessed2)
        
        similarity = cosine_similarity([embedding1], [embedding2])[0][0]
        similarities.append(similarity)
        labels.append(is_similar)
    
    similarities = np.array(similarities)
    labels = np.array(labels)
    
    # Calculate metrics
    threshold = 0.7
    true_positives = np.sum((similarities > threshold) & (labels == True))
    false_positives = np.sum((similarities > threshold) & (labels == False))
    false_negatives = np.sum((similarities <= threshold) & (labels == True))
    
    precision = true_positives / (true_positives + false_positives + 1e-8)  # Avoid division by zero
    recall = true_positives / (true_positives + false_negatives + 1e-8)
    f1_score = 2 * (precision * recall) / (precision + recall + 1e-8)
    
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1_score:.2f}")
    print(f"Avg Similarity (Matches): {np.mean(similarities[labels]):.2f}")
    print(f"Avg Similarity (Non-Matches): {np.mean(similarities[~labels]):.2f}")



# Load embedding model
embedding_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# Define test cases
test_cases = [
    ("Industriemaschinen", "Maschinenbau", True),
    ("Kundenbetreuung", "Autoreparatur", False),
    ("Logistikdienstleister", "Transportunternehmen", True),
]

# Evaluate
evaluate_similarity(test_cases, nlp, embedding_model)


In [141]:
# def _extract_location_parts(location):
#     """Extract and categorize location parts into states and cities."""
#     locations = set()
#     german_states = {
#         'baden-württemberg', 'bayern', 'berlin', 'brandenburg', 'bremen',
#         'hamburg', 'hessen', 'mecklenburg-vorpommern', 'niedersachsen',
#         'nordrhein-westfalen', 'rheinland-pfalz', 'saarland', 'sachsen',
#         'sachsen-anhalt', 'schleswig-holstein', 'thüringen'
#     }


#     if not location or not isinstance(location, str):
#         return locations

#     try:
#         # Split on common delimiters
#         parts = re.split(r'[>/\n]\s*', location)
#         split_locations = []

#         for part in parts:
#             part = part.strip().lower()
#             if part:
#                 # Further split by space if multiple states are concatenated
#                 words = part.split()
#                 temp = []
#                 current = ""
#                 for word in words:
#                     if word.lower() in german_states:
#                         if current:
#                             temp.append(current.strip())
#                         current = word
#                     else:
#                         current += " " + word if current else word
#                 if current:
#                     temp.append(current.strip())
#                 split_locations.extend(temp)

#         for loc in split_locations:
#             loc = loc.strip().lower()
#             if loc:
#                 if loc in german_states:
#                     locations.add(loc.title())  # Capitalize for better geocoding
#                 else:
#                     # Clean up common prefixes like "region"
#                     clean_part = re.sub(r'^region\s+', '', loc)
#                     if clean_part:
#                         locations.add(clean_part.title())
#     except Exception as e:
#         logging.error(f"Error extracting location parts: {e}")

#     return locations


def _extract_location_parts(location):
    """Extract and categorize location parts into states, districts, or cities."""
    locations = set()

    if not location or not isinstance(location, str):
        return locations

    try:
        # Split location string by " > ", handling the hierarchical structure
        parts = re.split(r'[\n]\s*', location)

        for part in parts:
            part = part.split(">")
            locations.add(part[-1].strip())

    except Exception as e:
        print(f"Error extracting location parts: {e}")

    return list(locations)


In [None]:
# _extract_location_parts('''Sachsen / Leipzig / Leipzig, Stadt''')
_extract_location_parts('''Berlin
Sachsen > Leipzig
Sachsen > Dresden
Brandenburg
Niedersachsen > Hannover
Hessen > Frankfurt am Main
                        Hamburg
                        Bayern > München
                        Bayern > Nürnberg
                        Bayern > Augsburg
                        A
                        B > C''')

NACE CODE

In [13]:
# -------------------------------------------------------------------------
# 3. Load NACE codes from JSON
# -------------------------------------------------------------------------
def load_nace_codes(filepath):
    """
    Expects a JSON file where keys = NACE code, values = textual descriptions.
    Example:
      {
        "01.1": "Growing of non-perennial crops",
        "01.2": "Growing of perennial crops",
        ...
      }
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        nace_codes = json.load(file)
    return nace_codes

In [7]:
# -------------------------------------------------------------------------
# 4. Create embeddings with sentence-transformers
# -------------------------------------------------------------------------
def get_embedding_batch(texts, model, batch_size=64):
    """
    Encode texts in batches to optimize memory usage.
    """
    embeddings = model.encode(texts, batch_size=batch_size, show_progress_bar=True, 
                              convert_to_numpy=True, normalize_embeddings=True)
    return embeddings.astype('float32')  # Use float32 to save memory

In [None]:

# -------------------------------------------------------------------------
# 5. Load your Seller/Branchen data and NACE codes
# -------------------------------------------------------------------------
def load_data(nace_codes_filepath):
    nace_codes = load_nace_codes(nace_codes_filepath)
    return nace_codes

# -------------------------------------------------------------------------
# 6. MAIN LOGIC
# -------------------------------------------------------------------------
    # Filepaths (update to your actual paths)
# sellers_filepath = originalSalesNexxtChangeData       # CSV with a column 'branchen'
sellers_filepath =  dejunaPurchases    # CSV with a column 'branchen'
nace_codes_filepath = nacecode_array_obj_du

# Load data
sellers_df = pd.read_csv(sellers_filepath) 
nace_codes = load_data(nace_codes_filepath)
print("🚀 Sellers and NACE codes loaded.")

# Initialize the Sentence Transformer model
# For German or multilingual, consider e.g.: 
#    model_name = 'paraphrase-multilingual-mpnet-base-v2'
model_name = 'all-MiniLM-L12-v2'
model = SentenceTransformer(model_name)
print(f"🚀 Loaded SentenceTransformer model: {model_name}")

# ---------------------------------------------------------------------
# 6a. Preprocess NACE descriptions
# ---------------------------------------------------------------------
# Convert each NACE code's description with the same text preprocessing
nace_descriptions = [preprocess_text(desc, nlp) for desc in nace_codes.values()]
# Create embeddings for these descriptions
nace_embeddings = get_embedding_batch(nace_descriptions, model)
# We'll keep a list of NACE codes in the same order
nace_code_list = list(nace_codes.keys())
print("🚀 Created embeddings for NACE descriptions.")

# ---------------------------------------------------------------------
# 6b. Preprocess 'branchen' column in sellers data
# ---------------------------------------------------------------------
# We'll store it in a new column 'preprocessed_branchen'
# sellers_df['preprocessed_branchen'] = sellers_df['branchen'].apply(lambda x: preprocess_text(x, nlp))
# Split 'branchen' on '>' and join each part


logging.info('Preprocessing buyers\' text fields...')
sellers_df['title_preprocessed'] = sellers_df['title'].apply(lambda x:  preprocess_text(x, nlp))
sellers_df['description_preprocessed'] = sellers_df['description'].apply(lambda x:  preprocess_text(x, nlp))
sellers_df['long_description_preprocessed'] = sellers_df['long_description'].apply(lambda x:  preprocess_text(x, nlp))
sellers_df['preprocessed_branchen'] = sellers_df.apply(
    lambda row: f"{row['Industrie']} {row['Sub-Industrie']}", axis=1
)

# sellers_df['preprocessed_branchen'] = sellers_df['branchen'].apply(
#     lambda x: ' '.join(x.split('>'))
# )

def combine_text_fields(row):
    return ' '.join([
        row.get('title_preprocessed', ''),
        row.get('description_preprocessed', ''),
        row.get('long_description_preprocessed', ''),
        row.get('branchen_preprocessed', '')
    ])
sellers_df['combined_text'] = sellers_df.apply(combine_text_fields, axis=1)

# # Join 'Sub-Industrie' and 'Industrie' columns for buyer data
# sellers_df['preprocessed_branchen'] = sellers_df.apply(
#     lambda row: f"{row['Industrie']} {row['Sub-Industrie']}", axis=1
# )

# Create embeddings for all sellers' branchen
branchen_embeddings = get_embedding_batch(sellers_df['combined_text'].tolist(), model)
print("🚀 Created embeddings for sellers' 'branchen' field.")

# ---------------------------------------------------------------------
# 6c. Compute similarity and assign best-match NACE code
# ---------------------------------------------------------------------
similarities = cosine_similarity(branchen_embeddings, nace_embeddings)
best_match_indices = similarities.argmax(axis=1)

# For each seller row, pick the NACE code with highest similarity
sellers_df['assigned_nace_code'] = [nace_code_list[idx] for idx in best_match_indices]
sellers_df['assigned_nace_similarity'] = [similarities[i][idx] for i, idx in enumerate(best_match_indices)]
print("🚀 Assigned preliminary NACE codes based on 'branchen' similarity.")

# Optionally set a threshold. If similarity < threshold => 'Unassigned'
similarity_threshold = 0.4
sellers_df['nace_code'] = sellers_df.apply(
    lambda row: row['assigned_nace_code'] 
                if row['assigned_nace_similarity'] >= similarity_threshold 
                else 'Unassigned',
    axis=1
)

# ---------------------------------------------------------------------
# 6d. Save and review
# ---------------------------------------------------------------------
output_file = sellers_filepath.replace(".csv", "_nace.csv")
sellers_df.to_csv(output_file, index=False)
print(f"🚀 Saved sellers data with assigned NACE codes to: {output_file}\n")

# Print a small sample
print("Sample of NACE Code Assignments:")
print(sellers_df[['assigned_nace_code', 'assigned_nace_similarity', 'nace_code']].head(10))


Another NACE code

In [None]:

# -------------------------------------------------------------------------
# 4. Create embeddings using Hugging Face
# -------------------------------------------------------------------------
def create_hf_embeddings(texts, tokenizer, model):
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
            cls_embedding = outputs.last_hidden_state[:, 0, :]  # CLS token embedding
            embeddings.append(cls_embedding.squeeze().numpy())
    return np.vstack(embeddings)

# -------------------------------------------------------------------------
# 5. Load Sellers and NACE Data
# -------------------------------------------------------------------------
def load_data(sellers_filepath, nace_codes_filepath):
    sellers_df = pd.read_csv(sellers_filepath)
    nace_codes = load_nace_codes(nace_codes_filepath)
    return sellers_df, nace_codes

# -------------------------------------------------------------------------
# 6. MAIN LOGIC
# -------------------------------------------------------------------------
# Filepaths (update to your actual paths)
# sellers_filepath = './data/dejuna_buyer_latest.csv'
sellers_filepath = originalSalesNexxtChangeData
nace_codes_filepath = './data/nace_codes_object_du.json'

# Load data
sellers_df, nace_codes = load_data(sellers_filepath, nace_codes_filepath)
print("🚀 Sellers and NACE codes loaded.")

# Initialize Hugging Face model and tokenizer
# model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
print(f"🚀 Loaded Hugging Face model: {model_name}")

# ---------------------------------------------------------------------
# 6a. Preprocess NACE descriptions
# ---------------------------------------------------------------------
nace_descriptions = [preprocess_text(desc, nlp) for desc in nace_codes.values()]
nace_embeddings = create_hf_embeddings(nace_descriptions, tokenizer, model)
nace_code_list = list(nace_codes.keys())
print("🚀 Created embeddings for NACE descriptions.")

# ---------------------------------------------------------------------
# 6b. Preprocess 'branchen' column in sellers data
# ---------------------------------------------------------------------

# sellers_df['preprocessed_branchen'] = sellers_df.apply(
#  lambda row: f"{row['Industrie']} {row['Sub-Industrie']}", axis=1
# )
sellers_df['preprocessed_branchen'] = sellers_df['branchen'].apply(lambda x: preprocess_text(x, nlp))
branchen_embeddings = create_hf_embeddings(sellers_df['preprocessed_branchen'].tolist(), tokenizer, model)
print("🚀 Created embeddings for sellers' 'branchen' field.")

# ---------------------------------------------------------------------
# 6c. Compute similarity and assign best-match NACE code
# ---------------------------------------------------------------------
similarities = cosine_similarity(branchen_embeddings, nace_embeddings)
best_match_indices = similarities.argmax(axis=1)

sellers_df['assigned_nace_code'] = [nace_code_list[idx] for idx in best_match_indices]
sellers_df['assigned_nace_similarity'] = [similarities[i][idx] for i, idx in enumerate(best_match_indices)]

similarity_threshold = 0.7
sellers_df['nace_code'] = sellers_df.apply(
    lambda row: row['assigned_nace_code'] if row['assigned_nace_similarity'] >= similarity_threshold else 'Unassigned',
    axis=1
)

# ---------------------------------------------------------------------
# 6d. Save and review
# ---------------------------------------------------------------------
output_file = sellers_filepath.replace(".csv", "_hf_nace.csv")
sellers_df.to_csv(output_file, index=False)
print(f"🚀 Saved sellers data with assigned NACE codes to: {output_file}\n")

print("Sample of NACE Code Assignments:")
print(sellers_df[[ 'assigned_nace_code', 'assigned_nace_similarity', 'nace_code']].head(10))


GEOCODING LOCATION

In [None]:
def get_all_unique_locations(buyers_df, sellers_df):
    """Extract all unique locations from buyers and sellers dataframes."""
    unique_locations = set()

    for df, name in [(buyers_df, 'buyers'), (sellers_df, 'sellers')]:
        for idx, location in df['location'].items():
            locations = _extract_location_parts(location)
            # logging.info(f'Extracted locations: {location.lower().split('\n')}')
            unique_locations.update(locations)
            

    logging.info(f'Total unique locations found: {len(unique_locations)}')
    return unique_locations

def geocode_locations(unique_locations, cache_path='geocode_cache.db'):
    """Geocode unique locations with caching."""
    geolocator = Nominatim(user_agent="buyer_seller_matching")
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1, max_retries=3, error_wait_seconds=10.0)

    # Ensure cache directory exists
    cache_dir = os.path.dirname(cache_path)
    if cache_dir and not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    with shelve.open(cache_path) as geocode_cache:
        for location in unique_locations:

            if location in geocode_cache:
                continue  # Already cached
            try:
                logging.info(f'Geocoding location: {location}')
                loc = geocode(location + ", Germany")
                if loc:
                    geocode_cache[location] = {'latitude': loc.latitude, 'longitude': loc.longitude}
                    logging.info(f'Geocoded {location}: ({loc.latitude}, {loc.longitude})')
                else:
                    geocode_cache[location] = {'latitude': None, 'longitude': None}
                    logging.warning(f'Geocoding failed for location: {location}')
            except Exception as e:
                logging.error(f"Geocoding error for location '{location}': {e}")
                geocode_cache[location] = {'latitude': None, 'longitude': None}

def update_dataframe_with_geocodes(df, cache_path='geocode_cache.db'):
    """Add latitude and longitude columns to the dataframe based on locations."""
    with shelve.open(cache_path) as geocode_cache:
        latitudes = []
        longitudes = []

        for idx, location in df['location'].items():
            locations = _extract_location_parts(location)
            lat_list = []
            lon_list = []
            for loc in locations:
                geocode_info = geocode_cache.get(loc, {'latitude': None, 'longitude': None})
                if geocode_info['latitude'] is not None and geocode_info['longitude'] is not None:
                    lat_list.append(geocode_info['latitude'])
                    lon_list.append(geocode_info['longitude'])
                else:
                    # If geocoding failed, append None
                    lat_list.append(None)
                    lon_list.append(None)
            # Convert lists to JSON strings for CSV compatibility
            latitudes.append(json.dumps(lat_list))
            longitudes.append(json.dumps(lon_list))

    df['latitude'] = latitudes
    df['longitude'] = longitudes
    return df

# Paths to input and output files
buyer_filepath = './data/dejuna_buyer_latest_hf_nace.csv'
sellers_filepath = './data/branche_nexxt_change_sales_listings_scrape_hf_nace.csv'

cache_path = './geocode_cache.db'

# Load buyer and seller datasets

sellers_df = pd.read_csv(sellers_filepath)
buyers_df = pd.read_csv(buyer_filepath)
unique_locations = get_all_unique_locations(buyers_df, sellers_df)
unique_locations
# Geocode locations with caching
geocode_locations(unique_locations, cache_path=cache_path)

# Update dataframes with geocodes

# # logging.info('Updating sellers dataframe with geocodes...')
sellers_df = update_dataframe_with_geocodes(sellers_df, cache_path=cache_path)
buyers_df = update_dataframe_with_geocodes(buyers_df, cache_path=cache_path)

# # Save updated dataframes to new CSV files
logging.info('Saving updated sellers dataframe...')
sellers_output_file = sellers_filepath.replace(".csv", "_geocoded.csv")
buyers_output_file = buyer_filepath.replace(".csv", "_geocoded.csv")
sellers_df.to_csv(sellers_output_file, index=False)
buyers_df.to_csv(buyers_output_file, index=False)
logging.info('Geocoding process completed successfully.')
print(f"🚀 Saved sellers data with assigned geocodes to: {buyers_output_file}\n {sellers_output_file}\n")



In [None]:
from geopy.geocoders import Nominatim
from geopy.distance import geodesic

def geocode(location):
    """
    Geocodes a location string to latitude and longitude.
    
    Args:
        location (str): The location string to geocode.
    
    Returns:
        tuple: Latitude and longitude of the location, or None if not found.
    """
    geolocator = Nominatim(user_agent="geoapi")
    location_data = geolocator.geocode(location)
    if location_data:
        return location_data.latitude, location_data.longitude
    else:
        return None
'''Hamburg
Schleswig-Holstein
Berlin

'''
location1 = 'Berlin'
location2 = 'Brandenburg'

coordinates1 = geocode(location1 + ", Germany")
coordinates2 = geocode(location2 + ", Germany")
if coordinates1 and coordinates2:
    distance = geodesic(coordinates1, coordinates2).kilometers
    print(f"Coordinates for '{location1}': {coordinates1}")
    print(f"Coordinates for '{location2}': {coordinates2}")
    print(f"Distance between '{location1}' and '{location2}': {distance:.2f} km")
else:
    print("One or both locations could not be geocoded.")


Location Matching

In [None]:

sales = pd.read_csv('./data/branche_nexxt_change_sales_listings_scrape_hf_nace_geocoded.csv')[10:30]
purchase = pd.read_csv('./data/dejuna_buyer_latest_hf_nace_geocoded.csv')[10:30]


sales['processed_location'] = sales['location'].apply(lambda x: list(_extract_location_parts(x)))
purchase['processed_location'] = purchase['location'].apply(lambda x: list(_extract_location_parts(x)))

# # Check if any element of sales processed_location is in purchase processed_location
purchase['latitude'] = purchase['latitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])
purchase['longitude'] = purchase['longitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])

sales['latitude'] = sales['latitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])
sales['longitude'] = sales['longitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])


def match_locations(sales_locations, purchase_locations):
    return any(loc in purchase_locations for loc in sales_locations)

# Create a dataframe to store matched locations
matched_locations = []

for idx, row in sales.iterrows():
    for idx2, row2 in purchase.iterrows():
        if match_locations(row.processed_location, row2.processed_location):
            matched_locations.append({
                'sales_id': idx,
                'sales_location': row.location,
                'sales_processed_location': row.processed_location,
                'purchase_id': idx2,
                'purchase_location': row2.location,
                'sales_longitude': s_lon,
                'sales_latitude': s_lat,
                'purchase_longitude': p_lon,
                'purchase_latitude': p_lat,
                'purchase_processed_location': row2.processed_location,
                'distance_km': None
            })
        else:
            # Calculate distance between sales and purchase locations
            sales_lat_lon = zip(row.latitude, row.longitude)
            purchase_lat_lon = zip(row2.latitude, row2.longitude)
            for s_lat, s_lon in sales_lat_lon:
                for p_lat, p_lon in purchase_lat_lon:
                    if s_lat is not None and s_lon is not None and p_lat is not None and p_lon is not None:
                        print(f"Calculating distance between ({s_lat}, {s_lon}) and ({p_lat}, {p_lon})")
                        distance = geodesic((s_lat, s_lon), (p_lat, p_lon)).km
                        print(f"Distance: {distance}")
                        if distance <= 50:
                            matched_locations.append({
                                'sales_id': idx,
                                'sales_processed_location': row.processed_location,
                                'sales_location': row.location,
                                'sales_longitude': s_lon,
                                'sales_latitude': s_lat,
                                'purchase_longitude': p_lon,
                                'purchase_latitude': p_lat,
                                'purchase_id': idx2,
                                'purchase_location': row2.location,
                                'purchase_processed_location': row2.processed_location,
                                'distance_km': distance
                            })
matched_locations_df = pd.DataFrame(matched_locations)
print(matched_locations_df)


Semantic Analysis

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
german_stop_words = stopwords.words('german')

# Load synonyms CSV
synonyms_df = pd.read_csv('./data/Updated_Keywords_and_Synonyms.csv')
synonym_dict = {}
for _, row in synonyms_df.iterrows():
    keyword = row['Keyword'].lower()
    synonyms = row.dropna().tolist()[1:]
    synonyms = [syn.lower() for syn in synonyms]
    synonym_dict[keyword] = synonyms

# Define augmentation functions
def extract_keywords(text, top_n=5):
    # vectorizer = TfidfVectorizer(stop_words='german', max_features=top_n)
    vectorizer = CountVectorizer(stop_words = german_stop_words) # Now use this in your pipeline

    tfidf_matrix = vectorizer.fit_transform([text])
    return vectorizer.get_feature_names_out()

def augment_text_with_synonyms(text, top_n=5):
    keywords = extract_keywords(text, top_n)
    synonyms = []
    for word in keywords:
        synonyms.extend(synonym_dict.get(word, []))
    synonyms = ' '.join(synonyms)
    return f"{text} {synonyms}"

# Initialize spaCy
nlp_de = spacy.load('de_core_news_sm')

def preprocess_text_de(text):
    doc = nlp_de(text.lower())
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return ' '.join(tokens)


def combine_text_fields(row):
    return ' '.join([
        row.get('title_preprocessed', ''),
        row.get('description_preprocessed', ''),
        row.get('long_description_preprocessed', ''),
        row.get('branchen_preprocessed', '')
    ])

def analyze_matches(matches_df, buyers_df, sellers_df):
    """Analyze matching results and print key metrics."""
    logging.info("\n=== Matching Analysis ===")
    
    total_buyers = len(buyers_df)
    total_sellers = len(sellers_df)
    total_matches = len(matches_df)
    
    logging.info(f"Total buyers: {total_buyers}")
    logging.info(f"Total sellers: {total_sellers}") 
    logging.info(f"Total matches found: {total_matches}")
    if total_buyers > 0:
        logging.info(f"Average matches per buyer: {total_matches/total_buyers:.2f}")
    else:
        logging.info("No buyers to match against.")

    # Save top matches for manual review
    top_matches = matches_df.head(10)
    top_matches.to_csv('./matches/top_matches_for_review.csv', index=False)
    logging.info("Saved top 10 matches for manual review")
    
    return {
        'total_matches': total_matches,
        'matches_per_buyer': total_matches/total_buyers if total_buyers else 0,
        'buyer_match_rate': len(matches_df['buyer_title'].unique())/total_buyers if total_buyers else 0
    }


# Load updated datasets
logging.info('Loading datasets...')
# buyers_df = pd.read_csv('./data/dejuna_buyer_latest_hf_nace.csv')
# sellers_df = pd.read_csv('./data/branche_nexxt_change_sales_listings_scrape_hf_nace.csv')
buyers_df = pd.read_csv('./data/dejuna_buyer_latest_hf_nace_geocoded.csv')
sellers_df = pd.read_csv('./data/branche_nexxt_change_sales_listings_scrape_hf_nace_geocoded.csv')
# Preprocess buyers' text fields
logging.info('Preprocessing buyers\' text fields...')
buyers_df['title_preprocessed'] = buyers_df['title'].apply(lambda x:  preprocess_text(x, nlp))
buyers_df['description_preprocessed'] = buyers_df['description'].apply(lambda x:  preprocess_text(x, nlp))
buyers_df['long_description_preprocessed'] = buyers_df['long_description'].apply(lambda x:  preprocess_text(x, nlp))
buyers_df['preprocessed_branchen'] = buyers_df.apply(
    lambda row: f"{row['Industrie']} {row['Sub-Industrie']}", axis=1
)
# Preprocess sellers' text fields
logging.info('Preprocessing sellers\' text fields...')
sellers_df['title_preprocessed'] = sellers_df['title'].apply(lambda x:  preprocess_text(x, nlp))
sellers_df['description_preprocessed'] = sellers_df['description'].apply(lambda x:  preprocess_text(x, nlp))
sellers_df['long_description_preprocessed'] = sellers_df['long_description'].apply(lambda x:  preprocess_text(x, nlp))
sellers_df['branchen_preprocessed'] = sellers_df['branchen'].apply(lambda x:  preprocess_text(x, nlp))

#Location processing
logging.info('Processing locations...')
sellers_df['processed_location'] = sellers_df['location'].apply(lambda x: list(_extract_location_parts(x)) if pd.notnull(x) else [])
buyers_df['processed_location'] = buyers_df['location'].apply(lambda x: list(_extract_location_parts(x)) if pd.notnull(x) else [])

# Parse latitude and longitude from JSON strings
sellers_df['latitude'] = sellers_df['latitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])
sellers_df['longitude'] = sellers_df['longitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])

buyers_df['latitude'] = buyers_df['latitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])
buyers_df['longitude'] = buyers_df['longitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])

# Combine text fields
logging.info('Combining text fields...')
buyers_df['combined_text'] = buyers_df.apply(combine_text_fields, axis=1)
sellers_df['combined_text'] = sellers_df.apply(combine_text_fields, axis=1)


# Augment text with synonyms
# logging.info('Augmenting buyers\' text with synonyms...')
# buyers_df['augmented_text'] = buyers_df['combined_text'].apply(lambda x: augment_text_with_synonyms(x, top_n=5))
# logging.info('Augmenting sellers\' text with synonyms...')
# sellers_df['augmented_text'] = sellers_df['combined_text'].apply(lambda x: augment_text_with_synonyms(x, top_n=5))

# # Preprocess augmented text
# buyers_df['final_text'] = buyers_df['combined_text'].apply(preprocess_text_de) + ' ' + buyers_df['augmented_text'].apply(preprocess_text_de)
# sellers_df['final_text'] = sellers_df['combined_text'].apply(preprocess_text_de) + ' ' + sellers_df['augmented_text'].apply(preprocess_text_de)


# Load the Sentence Transformer model
logging.info('Loading the Sentence Transformer model...')
# model_name = 'paraphrase-multilingual-mpnet-base-v2'
# model_name = 'all-MiniLM-L12-v2'
model_name ='aari1995/German_Semantic_STS_V2'

# model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

try:
    model = SentenceTransformer(model_name)
except Exception as e:
    logging.error(f"Error loading model {model_name}: {e}")

# Encode sellers' combined_text
logging.info("Encoding sellers' text...")
seller_texts = sellers_df['combined_text'].tolist()
seller_embeddings = get_embedding_batch(seller_texts, model, batch_size=64)
logging.info("Sellers' embeddings generated.")

# Set similarity threshold
similarity_threshold = 0.93
# Optional text length filter

matches = []
confidence_scores = []

total_buyers = len(buyers_df)
logging.info('Starting matching process...')
for i, buyer_row in buyers_df.iterrows():
    buyer_text = buyer_row['combined_text']
    

    # Encode buyer text
    buyer_embedding = model.encode(buyer_text, convert_to_numpy=True, normalize_embeddings=True).reshape(1, -1)

    # Calculate similarity scores to all sellers
    sim_scores = cosine_similarity(buyer_embedding, seller_embeddings)[0]

    # Indices above threshold
    matching_indices = np.where(sim_scores >= similarity_threshold)[0]

    for seller_idx in matching_indices:
        seller_row = sellers_df.iloc[seller_idx]

        confidence_score = sim_scores[seller_idx]
        if confidence_score < similarity_threshold:
            continue

        confidence_scores.append(confidence_score)
        
        match = {
            'buyer_date': buyer_row.get('date', ''),
            'buyer_title': buyer_row.get('title', ''),
            'buyer_description': buyer_row.get('description', ''),
            'buyer_long_description': buyer_row.get('long_description', ''),
            'buyer_location': buyer_row.get('location', ''),
            'buyer_nace_code': buyer_row.get('nace_code', ''), 

            'seller_date': seller_row.get('date', ''),
            'seller_title': seller_row.get('title', ''),
            'seller_description': seller_row.get('description', ''),
            'seller_long_description': seller_row.get('long_description', ''),
            'seller_location': seller_row.get('location', ''),
            'seller_nace_code': seller_row.get('nace_code', ''),
            
            'similarity_score': confidence_score
        }
        matches.append(match)

    # Progress logging
    if (i + 1) % 50 == 0:
        logging.info(f"Processed {i+1}/{total_buyers} buyers.")

logging.info('Creating matches DataFrame...')
matches_df = pd.DataFrame(matches)

if not matches_df.empty:
    matches_df['confidence_score'] = confidence_scores

    # Sort by confidence score
    matches_df = matches_df.sort_values('confidence_score', ascending=False)

    # Save all matches
    timestamp = datetime.now().strftime("%d_%H-%M")
    output_all = f'./matches/nlp_business_all_matches_{timestamp}.csv'
    matches_df.to_csv(output_all, index=False)
    logging.info(f'Saved all matches: {len(matches_df)} records => {output_all}')

    # Analyze results
    metrics = analyze_matches(matches_df, buyers_df, sellers_df)
    
    # Optionally filter for high confidence
    high_conf_df = matches_df[matches_df['confidence_score'] >= 0.95]
    output_high_conf = f'./matches/nlp_business_high_conf_{timestamp}.csv'
    high_conf_df.to_csv(output_high_conf, index=False)
    logging.info(f'Saved high confidence matches: {len(high_conf_df)} records => {output_high_conf}')
else:
    logging.info('No matches found.')

# Final memory cleanup
del seller_embeddings
gc.collect()


In [None]:
matches_df

Another Semantic analysis

In [None]:


# Load updated datasets
logging.info('Loading datasets...')
buyers_df = pd.read_csv('./data/dejuna_buyer_latest_hf_nace_geocoded.csv')
sellers_df = pd.read_csv('./data/branche_nexxt_change_sales_listings_scrape_hf_nace_geocoded.csv')

# Preprocess buyers' text fields
logging.info('Preprocessing buyers\' text fields...')
buyers_df['title_preprocessed'] = buyers_df['title'].apply(lambda x:  preprocess_text(x, nlp))
buyers_df['description_preprocessed'] = buyers_df['description'].apply(lambda x:  preprocess_text(x, nlp))
buyers_df['long_description_preprocessed'] = buyers_df['long_description'].apply(lambda x:  preprocess_text(x, nlp))
buyers_df['preprocessed_branchen'] = buyers_df.apply(
    lambda row: f"{row['Industrie']} {row['Sub-Industrie']}", axis=1
)

# Preprocess sellers' text fields
logging.info('Preprocessing sellers\' text fields...')
sellers_df['title_preprocessed'] = sellers_df['title'].apply(lambda x:  preprocess_text(x, nlp))
sellers_df['description_preprocessed'] = sellers_df['description'].apply(lambda x:  preprocess_text(x, nlp))
sellers_df['long_description_preprocessed'] = sellers_df['long_description'].apply(lambda x:  preprocess_text(x, nlp))
sellers_df['branchen_preprocessed'] = sellers_df['branchen'].apply(lambda x:  preprocess_text(x, nlp))



In [None]:

#Location processing
logging.info('Processing locations...')
sellers_df['processed_location'] = sellers_df['location'].apply(lambda x: list(_extract_location_parts(x)) if pd.notnull(x) else [])
buyers_df['processed_location'] = buyers_df['location'].apply(lambda x: list(_extract_location_parts(x)) if pd.notnull(x) else [])

# Parse latitude and longitude from JSON strings
sellers_df['latitude'] = sellers_df['latitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])
sellers_df['longitude'] = sellers_df['longitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])

buyers_df['latitude'] = buyers_df['latitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])
buyers_df['longitude'] = buyers_df['longitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])

def combine_text_fields(row):
    return ' '.join([
        row.get('title_preprocessed', ''),
        row.get('description_preprocessed', ''),
        row.get('long_description_preprocessed', ''),
        row.get('branchen_preprocessed', '')
    ])

# Combine text fields
logging.info('Combining text fields...')
buyers_df['combined_text'] = buyers_df.apply(combine_text_fields, axis=1)
sellers_df['combined_text'] = sellers_df.apply(combine_text_fields, axis=1)

In [186]:
def match_locations(sales_locations, purchase_locations):
    """
    Check if any element of sales_locations is in purchase_locations.
    """
    print(f'purchase_locations: {purchase_locations}, sales_locations: {sales_locations}, {any(loc in purchase_locations for loc in sales_locations)}')
    
    return any(loc in purchase_locations for loc in sales_locations)
# def match_locations(locations_input, target_location):
#     # Step 1: Normalize and parse the locations into a list of location parts
#     def parse_location(location):
#         return [part.strip().lower() for part in location.split(">")]

#     # Parse the target location
#     target_parts = parse_location(target_location)
    
#     # Split the multiple input locations by line breaks and parse them
#     locations = locations_input.strip().split("\n")
#     parsed_locations = [parse_location(loc) for loc in locations]

#     # Step 2: Function to compare the parsed locations with the target
#     def compare_location(loc1_parts, loc2_parts):
#         max_level = max(len(loc1_parts), len(loc2_parts))
#         match_score = 0

#         for level in range(max_level):
#             loc1_value = loc1_parts[level] if level < len(loc1_parts) else None
#             loc2_value = loc2_parts[level] if level < len(loc2_parts) else None
            
#             if loc1_value and loc2_value:
#                 # Exact match at this level
#                 if loc1_value == loc2_value:
#                     match_score += 1
#                 else:
#                     break  # If any level does not match, break early
#             elif loc1_value is None and loc2_value is None:
#                 continue  # Both are missing, no issue
#             else:
#                 match_score += 0.5  # Partial match (one location is more specific)

#         return match_score

#     # Step 3: Check if any location matches (either exact or partial)
#     # for loc_parts in parsed_locations:
#     #     match_score = compare_location(loc_parts, target_parts)
#     #     if match_score > 0:  # If there's a partial match or exact match
#     #         return True
#     for loc_parts in parsed_locations:
#         match_score = compare_location(loc_parts, target_parts)
#         print(" > ".join(loc_parts))
#         if match_score > 0:  
#             if target_coords and location_coordinates.get(" > ".join(loc_parts)):
#                 loc_coords = location_coordinates.get(" > ".join(loc_parts))
#                 if loc_coords:
#                     lat1, lon1 = target_coords
#                     lat2, lon2 = loc_coords
#                     distance = haversine(lat1, lon1, lat2, lon2)
#                     if distance <= max_distance_km:
#                         return True 
#             else:
#                 return True

#     # If no matches found, return False
#     return False

In [None]:
# # Augment text with synonyms
# logging.info('Augmenting buyers\' text with synonyms...')
# buyers_df['augmented_text'] = buyers_df['combined_text'].apply(lambda x: augment_text_with_synonyms(x, top_n=5))
# logging.info('Augmenting sellers\' text with synonyms...')
# sellers_df['augmented_text'] = sellers_df['combined_text'].apply(lambda x: augment_text_with_synonyms(x, top_n=5))

# # Preprocess augmented text
# buyers_df['final_text'] = buyers_df['combined_text'].apply(preprocess_text_de) + ' ' + buyers_df['augmented_text'].apply(preprocess_text_de)
# sellers_df['final_text'] = sellers_df['combined_text'].apply(preprocess_text_de) + ' ' + sellers_df['augmented_text'].apply(preprocess_text_de)

# -----------------------------------
# 4. Load Models
# -----------------------------------
logging.info("Loading SentenceTransformer and CrossEncoder models...")
# bi_encoder = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
# bi_encoder = SentenceTransformer("xlm-roberta-base")
# bi_encoder = SentenceTransformer('./fine_tuned_models/fine_tuned_all-MiniLM-L12-v2')
# bi_encoder = SentenceTransformer('aari1995/German_Semantic_STS_V2')
# bi_encoder = SentenceTransformer('./fine_tuned_models/fine_tuned_paraphrase-multilingual-MiniLM-L12-v2')
bi_encoder = SentenceTransformer('all-MiniLM-L12-v2')
cross_encoder = CrossEncoder('cross-encoder/stsb-roberta-large')

# -----------------------------------
# 5. Encode Sellers' Text
# -----------------------------------
logging.info("Encoding sellers' text...")
seller_texts = sellers_df['combined_text'].tolist()
seller_embeddings = bi_encoder.encode(seller_texts, convert_to_numpy=True, normalize_embeddings=True)
logging.info("Sellers' embeddings generated.")

In [None]:

# -----------------------------------
# 6. Matching Parameters
# -----------------------------------
similarity_threshold = 0.60
cross_encoder_threshold = 0.65
top_n = 100  # Number of top candidates to re-rank per buyer

matches = []
confidence_scores = []

total_buyers = len(buyers_df)
logging.info("Starting matching process...")

# -----------------------------------
# 7. Matching Loop
# -----------------------------------
matched_locations = []

for i, buyer_row in buyers_df.iterrows():
    buyer_text = buyer_row['combined_text']
    buyer_latitudes = buyers_df.iloc[i]['latitude']
    buyer_longitudes = buyers_df.iloc[i]['longitude']
    buyer_locations = buyers_df.iloc[i]['location']


    # Encode buyer text
    buyer_embedding = bi_encoder.encode([buyer_text], convert_to_numpy=True, normalize_embeddings=True)
    
    # Compute cosine similarities with all sellers
    sim_scores = cosine_similarity(buyer_embedding, seller_embeddings)[0]
    
    # Get indices of sellers with similarity >= threshold
    matching_indices = np.where(sim_scores >= similarity_threshold)[0]
    
    if len(matching_indices) == 0:
        continue  # No matches above threshold
    
    # Select top N matches based on similarity scores
    top_indices = matching_indices[np.argsort(sim_scores[matching_indices])[::-1][:top_n]]
    
    # Prepare pairs for cross-encoder
    buyer_texts = [buyer_text] * len(top_indices)
    seller_texts_top = [sellers_df.iloc[idx]['combined_text'] for idx in top_indices]
    pairs = list(zip(buyer_texts, seller_texts_top))
    
    # Get cross-encoder scores
    cross_scores = cross_encoder.predict(pairs)
    
    # Filter based on cross-encoder threshold
    for seller_idx, cross_score in zip(top_indices, cross_scores):


        if cross_score >= cross_encoder_threshold:
            seller_row = sellers_df.iloc[seller_idx]
            seller_latitudes = sellers_df.iloc[seller_idx]['latitude']
            seller_longitudes = sellers_df.iloc[seller_idx]['longitude']
            seller_locations = sellers_df.iloc[seller_idx]['location']

            location_match = False
            distance_km = None

            # if match_locations(seller_locations, buyer_locations):
            #     location_match = True
            # else:
            # Calculate distance between all combinations of seller and buyer coordinates
            for s_lat, s_lon in zip(seller_latitudes, seller_longitudes):
                for p_lat, p_lon in zip(buyer_latitudes, buyer_longitudes):
                    if None in [s_lat, s_lon, p_lat, p_lon]:
                        continue
                    distance = geodesic((s_lat, s_lon), (p_lat, p_lon)).km
                    if distance <= 50:  # 50 km threshold
                        distance_km = distance
                        location_match = True
                        break
                if location_match:
                    break
            if location_match:
                logging.info(f"Match found: Buyer location: {buyer_locations}, Seller location: {seller_locations}")
                match = {
                    'buyer_id': buyer_row.get('id', ''),
                    'buyer_date': buyer_row.get('date', ''),
                    'buyer_title': buyer_row.get('title', ''),
                    'buyer_description': buyer_row.get('description', ''),
                    'buyer_long_description': buyer_row.get('long_description', ''),
                    'buyer_location': buyer_row.get('location', ''),
                    'buyer_nace_code': buyer_row.get('nace_code', ''), 

                    'seller_id': seller_row.get('id', ''),
                    'seller_date': seller_row.get('date', ''),
                    'seller_title': seller_row.get('title', ''),
                    'seller_description': seller_row.get('description', ''),
                    'seller_long_description': seller_row.get('long_description', ''),
                    'seller_location': seller_row.get('location', ''),
                    'seller_source': seller_row.get('url', ''),
                    'seller_nace_code': seller_row.get('nace_code', ''),
                    
                    'similarity_score': cross_score,
                    'distance_km': distance_km if distance_km else 'Within processed locations'

                }
                matches.append(match)
                confidence_scores.append(cross_score)
    
    # Progress Logging
    if (i + 1) % 50 == 0:
        logging.info(f"Processed {i+1}/{total_buyers} buyers.")

# -----------------------------------
# 8. Create Matches DataFrame
# -----------------------------------
logging.info("Creating matches DataFrame...")
matches_df = pd.DataFrame(matches)
        
if not matches_df.empty:
    matches_df['confidence_score'] = confidence_scores
    matches_df = matches_df.sort_values('confidence_score', ascending=False)
    
    # Save All Matches
    timestamp = datetime.now().strftime("%d_%H-%M")
    output_all = f'./matches/nlp_business_all_matches_{timestamp}.csv'
    matches_df.to_csv(output_all, index=False)
    matches_df.to_excel(f'./matches/nlp_business_all_matches_{timestamp}.xlsx', index=False)
    logging.info(f"Saved all matches: {len(matches_df)} records => {output_all}")
    
    # Save High Confidence Matches
    high_conf_df = matches_df[matches_df['confidence_score'] >= cross_encoder_threshold]
    output_high_conf = f'./matches/nlp_business_high_conf_{timestamp}.csv'
    high_conf_df.to_csv(output_high_conf, index=False)
    logging.info(f"Saved high confidence matches: {len(high_conf_df)} => {output_high_conf}")
else:
    logging.info("No matches found.")

# -----------------------------------
# 9. Memory Cleanup
# -----------------------------------
# del seller_embeddings
# gc.collect()


In [None]:
from geopy.geocoders import Nominatim
from geopy.distance import geodesic

def geocode(location):
    """
    Geocodes a location string to latitude and longitude.
    
    Args:
        location (str): The location string to geocode.
    
    Returns:
        tuple: Latitude and longitude of the location, or None if not found.
    """
    geolocator = Nominatim(user_agent="geoapi")
    location_data = geolocator.geocode(location)
    if location_data:
        return location_data.latitude, location_data.longitude
    else:
        return None

# Example usage
# 2025-01-21 12:59:12,950 - INFO - Match found: Buyer location: ['Baden-Württemberg', 'Bayern'], Seller location: ['Rhein-Neckar-Kreis']

# purchase_locations: ['Baden-Württemberg', 'Bayern'], sales_locations: ['Rhein-Neckar-Kreis'], False
location1 = 'Brandenburg > Brandenburg > Potsdam'
location2 = 'Osnabrück'
#  {'Berlin',
#  'Brandenburg',
#  'Dresden',
#  'Hannover',
#  'Leipzig',
#  'Niedersachsen',
#  'Sachsen'}
coordinates1 = geocode(location1 + ", Germany")
coordinates2 = geocode(location2 + ", Germany")

if coordinates1 and coordinates2:
    distance = geodesic(coordinates1, coordinates2).kilometers
    print(f"Coordinates for '{location1}': {coordinates1}")
    print(f"Coordinates for '{location2}': {coordinates2}")
    print(f"Distance between '{location1}' and '{location2}': {distance:.2f} km")
else:
    print("One or both locations could not be geocoded.")


In [None]:
# from geopy.geocoders import Nominatim
# from geopy.distance import geodesic
# import math

# # Function to geocode a location string into latitude and longitude
# def geocode(location):
#     """
#     Geocodes a location string to latitude and longitude.
    
#     Args:
#         location (str): The location string to geocode.
    
#     Returns:
#         tuple: Latitude and longitude of the location, or None if not found.
#     """
#     geolocator = Nominatim(user_agent="geoapi")
#     location_data = geolocator.geocode(location)
#     if location_data:
#         return location_data.latitude, location_data.longitude
#     else:
#         return None


# # Haversine formula to calculate distance between two points (lat1, lon1) and (lat2, lon2)
# def haversine(lat1, lon1, lat2, lon2):
#     R = 6371  # Radius of the Earth in kilometers
#     phi1 = math.radians(lat1)
#     phi2 = math.radians(lat2)
#     delta_phi = math.radians(lat2 - lat1)
#     delta_lambda = math.radians(lon2 - lon1)

#     a = math.sin(delta_phi / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda / 2)**2
#     c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

#     distance = R * c  # Distance in kilometers
#     return distance


# def match_locations(locations_input, target_location, max_distance_km=50):
#     # Step 1: Normalize and parse the locations into a list of location parts
#     def parse_location(location):
#         return [part.strip().lower() for part in location.split(">")]

#     # Parse the target location
#     target_parts = parse_location(target_location)
#     target_coords = geocode(target_location)  # Get coordinates of the target location
    
#     if not target_coords:
#         return False  # If target location couldn't be geocoded, return False

#     # Split the multiple input locations by line breaks and parse them
#     locations = locations_input.strip().split("\n")
#     parsed_locations = [parse_location(loc) for loc in locations]

#     # Step 2: Function to compare the parsed locations with the target
#     def compare_location(loc1_parts, loc2_parts):
#         max_level = max(len(loc1_parts), len(loc2_parts))
#         match_score = 0

#         for level in range(max_level):
#             loc1_value = loc1_parts[level] if level < len(loc1_parts) else None
#             loc2_value = loc2_parts[level] if level < len(loc2_parts) else None
            
#             if loc1_value and loc2_value:
#                 # Exact match at this level
#                 if loc1_value == loc2_value:
#                     match_score += 1
#                 else:
#                     break  # If any level does not match, break early
#             elif loc1_value is None and loc2_value is None:
#                 continue  # Both are missing, no issue
#             else:
#                 match_score += 0.5  # Partial match (one location is more specific)

#         return match_score>0

    # # Step 3: Check if any location matches (either exact or partial) and is within proximity
    # for loc_parts in parsed_locations:
    #     match_score = compare_location(loc_parts, target_parts)

    #     # If there's a partial or exact match, we also check the geographical proximity
    #     if match_score > 0:  # If there's a name match
    #         location_str = " > ".join(loc_parts)
    #         loc_coords = geocode(location_str)  # Get coordinates of the input location
            
    #         if loc_coords:
    #             lat1, lon1 = target_coords
    #             lat2, lon2 = loc_coords
    #             distance=geodesic(target_coords, loc_coords).kilometers
    #             logging.info(f"Distance between '{target_location}' and '{location_str}': {distance:.2f} km")
    #             # distance = haversine(lat1, lon1, lat2, lon2)
    #             if distance <= max_distance_km:
    #                 return True  # Match found within the proximity
    #         else:
    #             continue  # If geocoding failed for the input location, skip it

    # # If no matches found, return False
    # return False

# Example usage:

# locations_input = '''Berlin
# Sachsen > Leipzig
# Sachsen > Dresden
# Brandenburg
# Niedersachsen > Hannover'''

# target_location = '''Brandenburg > Brandenburg'''

# # Call the function
# result = match_locations(locations_input, target_location)
# print(result)  # True if any match (name + proximity), False otherwise


In [None]:
# Read sellers and buyers files into dataframes
sellers_df = pd.read_csv(sellers_filepath)
buyers_df = pd.read_csv(buyer_filepath)

# Add origin column to both dataframes
sellers_df['origin'] = 'seller'
buyers_df['origin'] = 'buyer'
buyers_df['branchen'] = buyers_df.apply(
    lambda row: f"{row['Industrie']} > {row['Sub-Industrie']}", axis=1
)


# Concatenate both dataframes
combined_df = pd.concat([sellers_df, buyers_df], ignore_index=True)

# Display the first few rows of the combined dataframe

NACE Labels for Combined data

In [None]:

# -------------------------------------------------------------------------
# 5. Load your Seller/Branchen data and NACE codes
# -------------------------------------------------------------------------
def load_data(nace_codes_filepath):
    nace_codes = load_nace_codes(nace_codes_filepath)
    return nace_codes

# -------------------------------------------------------------------------
# 6. MAIN LOGIC
# -------------------------------------------------------------------------
    # Filepaths (update to your actual paths)
# sellers_filepath = originalSalesNexxtChangeData       # CSV with a column 'branchen'
sellers_filepath =  dejunaPurchases    # CSV with a column 'branchen'
nace_codes_filepath = nacecode_array_obj_du

# Load data
sellers_df = combined_df 
nace_codes = load_data(nace_codes_filepath)
print("🚀 Sellers and NACE codes loaded.")

# Initialize the Sentence Transformer model
# For German or multilingual, consider e.g.: 
#    model_name = 'paraphrase-multilingual-mpnet-base-v2'
model_name = 'all-MiniLM-L12-v2'
model = SentenceTransformer(model_name)
print(f"🚀 Loaded SentenceTransformer model: {model_name}")

# ---------------------------------------------------------------------
# 6a. Preprocess NACE descriptions
# ---------------------------------------------------------------------
# Convert each NACE code's description with the same text preprocessing
nace_descriptions = [preprocess_text(desc, nlp) for desc in nace_codes.values()]
# Create embeddings for these descriptions
nace_embeddings = get_embedding_batch(nace_descriptions, model)
# We'll keep a list of NACE codes in the same order
nace_code_list = list(nace_codes.keys())
print("🚀 Created embeddings for NACE descriptions.")

# ---------------------------------------------------------------------
# 6b. Preprocess 'branchen' column in sellers data
# ---------------------------------------------------------------------
# We'll store it in a new column 'preprocessed_branchen'
# sellers_df['preprocessed_branchen'] = sellers_df['branchen'].apply(lambda x: preprocess_text(x, nlp))
# Split 'branchen' on '>' and join each part


logging.info('Preprocessing buyers\' text fields...')
sellers_df['title_preprocessed'] = sellers_df['title'].apply(lambda x:  preprocess_text(x, nlp))
sellers_df['description_preprocessed'] = sellers_df['description'].apply(lambda x:  preprocess_text(x, nlp))
# sellers_df['long_description_preprocessed'] = sellers_df['long_description'].apply(lambda x:  preprocess_text(x, nlp))
# sellers_df['preprocessed_branchen'] = sellers_df.apply(
#     lambda row: f"{row['Industrie']} {row['Sub-Industrie']}", axis=1
# )

sellers_df['preprocessed_branchen'] = sellers_df['branchen'].apply(
    lambda x: ' '.join(x.split('>'))
)

def combine_text_fields(row):
    return ' '.join([
        row.get('title_preprocessed', ''),
        row.get('description_preprocessed', ''),
        # row.get('long_description_preprocessed', ''),
        row.get('branchen_preprocessed', '')
    ])
sellers_df['combined_text'] = sellers_df.apply(combine_text_fields, axis=1)

# # Join 'Sub-Industrie' and 'Industrie' columns for buyer data
# sellers_df['preprocessed_branchen'] = sellers_df.apply(
#     lambda row: f"{row['Industrie']} {row['Sub-Industrie']}", axis=1
# )

# Create embeddings for all sellers' branchen
branchen_embeddings = get_embedding_batch(sellers_df['combined_text'].tolist(), model)
print("🚀 Created embeddings for sellers' 'branchen' field.")

# ---------------------------------------------------------------------
# 6c. Compute similarity and assign best-match NACE code
# ---------------------------------------------------------------------
similarities = cosine_similarity(branchen_embeddings, nace_embeddings)
best_match_indices = similarities.argmax(axis=1)

# For each seller row, pick the NACE code with highest similarity
sellers_df['assigned_nace_code'] = [nace_code_list[idx] for idx in best_match_indices]
sellers_df['assigned_nace_similarity'] = [similarities[i][idx] for i, idx in enumerate(best_match_indices)]
print("🚀 Assigned preliminary NACE codes based on 'branchen' similarity.")

# Optionally set a threshold. If similarity < threshold => 'Unassigned'
similarity_threshold = 0.4
sellers_df['nace_code'] = sellers_df.apply(
    lambda row: row['assigned_nace_code'] 
                if row['assigned_nace_similarity'] >= similarity_threshold 
                else 'Unassigned',
    axis=1
)

# ---------------------------------------------------------------------
# 6d. Save and review
# ---------------------------------------------------------------------
output_file = "buyers_sellers_combined_nace.csv"
sellers_df.to_csv(output_file, index=False)
print(f"🚀 Saved sellers data with assigned NACE codes to: {output_file}\n")

# Print a small sample
print("Sample of NACE Code Assignments:")
print(sellers_df[['assigned_nace_code', 'assigned_nace_similarity', 'nace_code']].head(10))


In [None]:
temp1= pd.read_csv("./matches/nlp_business_all_matches_16_19-31.csv")
temp2= pd.read_csv("./matches/nlp_business_all_matches_15_13-11.csv")

# Concatenate the dataframes
combined_df = pd.concat([temp1, temp2])
# Get all duplicates based on specific columns
# Get all rows that only exist once in the dataframe
unique_rows = combined_df.drop_duplicates(subset=['buyer_date', 'buyer_title', 'buyer_description', 'buyer_location', 'seller_date', 'seller_title', 'seller_description', 'seller_location'], keep=False)
duplicates = unique_rows[unique_rows.duplicated(subset=['buyer_date', 'buyer_title', 'buyer_description', 'buyer_location', 'seller_date', 'seller_title', 'seller_description', 'seller_location'], keep=False)]

# unique_rows.to_csv('./matches/unique_rows.csv', index=False)
# Display the unique rows
# print(unique_rows)
print(duplicates)
# Display the duplicates
# print(duplicates)

In [None]:
from geopy.distance import geodesic

def extract_locations(location_string):
    """
    Extracts locations from a string with nested structure.
    
    Args:
        location_string (str): Input string containing locations in nested format.

    Returns:
        list: A list of extracted locations.
    """
    locations = []

    # Split the input into lines
    lines = location_string.splitlines()

    for line in lines:
        # Strip whitespace from the line
        line = line.strip()
        
        # Ignore empty lines
        if not line:
            continue

        # If the line contains '>', it indicates nested locations
        if '>' in line:
            # Extract the specific location after the last '>'
            nested_location = line.split('>')[-1].strip()
            locations.append(nested_location)
        else:
            # Append the standalone location
            locations.append(line)

    return locations

def get_location_coordinates(location):
    """
    Mock function to return coordinates for a given location.
    Replace this with an actual geocoding API in production.

    Args:
        location (str): The location name.

    Returns:
        tuple: Latitude and longitude of the location.
    """
    # Mock coordinates for demonstration purposes
    coordinates = {
        "Celle": (52.6226, 10.0815),
        "Hannover": (52.3759, 9.7320),
        "Karlsruhe": (49.0069, 8.4037),
        "Mannheim": (49.4875, 8.4660),
        "Goettingen": (51.5413, 9.9158),
    }
    return coordinates.get(location, None)

def locations_match(s1, s2):
    """
    Checks if the locations extracted from two strings have any matches.
    If no exact match, it calculates the distance between locations.

    Args:
        s1 (str): First input string containing locations.
        s2 (str): Second input string containing locations.

    Returns:
        bool: True if there are matching locations or distance < 50KM, False otherwise.
    """
    # Extract locations from both strings
    locations1 = set(_extract_location_parts(s1))
    locations2 = set(_extract_location_parts(s2))

    print(f"Locations 1: {locations1}")
    print(f"Locations 2: {locations2}")
    # Check for exact match first
    if not locations1.isdisjoint(locations2):
        return True

    # If no exact match, calculate distances
    for loc1 in locations1:
        for loc2 in locations2:
            coord1 = get_location_coordinates(loc1)
            coord2 = get_location_coordinates(loc2)

            if coord1 and coord2:
                distance = geodesic(coord1, coord2).kilometers
                print(f"Distance between {loc1} and {loc2}: {distance} KM")
                if distance < 50:
                    return True

    return False

# Example usage
s1 = "Niedersachen > Celle"
s2 = "Niedersachsen > Goettingen"

output = locations_match(s1, s2)
print(output)

In [None]:
import pandas as pd
import numpy as np
import re
import logging
import json
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from geopy.distance import geodesic
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
import faiss
import os
import shelve
from unidecode import unidecode
import gc
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# -------------------------------------------------------------------------
# 1. Location Processing Functions
# -------------------------------------------------------------------------
def _extract_location_parts(location):
    """Extract hierarchical location components"""
    if pd.isna(location):
        return []
    try:
        return [part.strip() for part in re.split(r'>|\n', location) if part.strip()]
    except Exception as e:
        logging.error(f"Location parsing error: {e}")
        return []

def geocode_locations(unique_locations, cache_path='geocode_cache.db'):
    """Geocode locations with hierarchical fallback"""
    geolocator = Nominatim(user_agent="business_matcher")
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
    
    with shelve.open(cache_path) as cache:
        for loc in unique_locations:
            if loc not in cache:
                try:
                    # Try full location first, then fallback through hierarchy
                    parts = _extract_location_parts(loc)
                    for i in range(len(parts)):
                        query = ", ".join(parts[-i-1:]) + ", Germany"
                        result = geocode(query)
                        if result:
                            cache[loc] = (result.latitude, result.longitude)
                            break
                    if loc not in cache:
                        cache[loc] = (None, None)
                except Exception as e:
                    logging.error(f"Geocoding failed for {loc}: {e}")
                    cache[loc] = (None, None)

# -------------------------------------------------------------------------
# 2. Text Processing Functions (German-optimized)
# -------------------------------------------------------------------------
def clean_german_text(text):
    """Basic cleaning for German business text"""
    if pd.isna(text):
        return ''
    
    # Remove URLs and special characters
    text = re.sub(r'http\S+|www\.\S+|\b\d{10,}\b|[^\wäöüßÄÖÜ\s]', ' ', text)
    
    # Handle common business abbreviations
    replacements = {
        r'\bMio\b': 'millionen',
        r'\bTsd\b': 'tausend',
        r'\bca\.': 'circa',
        r'\bz\.B\.': 'zum beispiel'
    }
    for pattern, repl in replacements.items():
        text = re.sub(pattern, repl, text)
    
    return unidecode(text).lower().strip()

# -------------------------------------------------------------------------
# 3. Core Matching Logic
# -------------------------------------------------------------------------
# Load data
buyers_df = pd.read_csv('./data/dejuna_buyer_latest_hf_nace.csv')
sellers_df = pd.read_csv('./data/branche_nexxt_change_sales_listings_scrape_hf_nace.csv')[:100]

# Process locations
logging.info("Processing locations...")
for df in [buyers_df, sellers_df]:
    df['location_parts'] = df['location'].apply(_extract_location_parts)
    df['primary_location'] = df['location_parts'].apply(
        lambda x: x[-1] if x else None
    )

# Geocode locations
unique_locations = set(
    buyers_df['primary_location'].dropna().tolist() +
    sellers_df['primary_location'].dropna().tolist()
)
geocode_locations(unique_locations)

# Load geocodes
with shelve.open('geocode_cache.db') as cache:
    sellers_df['coords'] = sellers_df['primary_location'].apply(
        lambda x: cache.get(x, (None, None))
    )
    buyers_df['coords'] = buyers_df['primary_location'].apply(
        lambda x: cache.get(x, (None, None))
    )

# Combine text fields
def combine_text(row):
    return ' '.join([
        str(row.get('title', '')),
        str(row.get('description', '')),
        str(row.get('long_description', ''))
    ])
from sentence_transformers import SentenceTransformer, models

def initialize_models():
    """Explicit model/tokenizer configuration"""
    # Configure transformer with separate tokenizer args
    bi_encoder= SentenceTransformer(
    'T-Systems-onsite/german-roberta-sentence-transformer-v2',
    device='cuda',  # Use GPU if available
    tokenizer_kwargs={
        'use_fast': True,
        'model_max_length': 512,
        'truncation': True
    }
)
    # pooling = models.Pooling(transformer.get_word_embedding_dimension())
    
    # bi_encoder = SentenceTransformer(modules=[transformer, pooling])
    cross_encoder = CrossEncoder('cross-encoder/stsb-roberta-base')
    return bi_encoder, cross_encoder
sellers_df['combined_text'] = sellers_df.apply(combine_text, axis=1)
buyers_df['combined_text'] = buyers_df.apply(combine_text, axis=1)

# Clean text
logging.info("Cleaning text...")
sellers_df['clean_text'] = sellers_df['combined_text'].apply(clean_german_text)
buyers_df['clean_text'] = buyers_df['combined_text'].apply(clean_german_text)
sellers_df['id'] = sellers_df.index
# Initialize models
logging.info("Initializing models...")
# bi_encoder = SentenceTransformer('T-Systems-onsite/german-roberta-sentence-transformer-v2')
# cross_encoder = CrossEncoder('cross-encoder/stsb-roberta-large')
bi_encoder, cross_encoder = initialize_models()
# Encode sellers
logging.info("Encoding sellers...")
seller_embeddings = bi_encoder.encode(
    sellers_df['clean_text'].tolist(),
    batch_size=64,
    show_progress_bar=True,
    normalize_embeddings=True
)

# Create FAISS index
dimension = seller_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(seller_embeddings)

# Matching parameters
SEMANTIC_THRESHOLD = 0.60
CROSS_ENCODER_THRESHOLD = 0.65
MAX_DISTANCE_KM = 50

# Process buyers
matches = []
for buyer_idx, buyer_row in buyers_df.iterrows():
    try:
        # Semantic search
        buyer_embedding = bi_encoder.encode([buyer_row['clean_text']])
        distances, indices = index.search(buyer_embedding, 100)
        # Filter candidates
        candidates = []
        for seller_idx, score in zip(indices[0], distances[0]):
            if score >= SEMANTIC_THRESHOLD:
                seller = sellers_df.iloc[seller_idx]
                candidates.append((seller, score))
        
        # Location filtering
        valid_matches = []
        for seller, score in candidates:
            # Hierarchical location match
            location_match = any(
                loc in buyer_row['location_parts']
                for loc in seller['location_parts']
            )
            

            # Distance check
            distance = None
            if not location_match and None not in [buyer_row['coords'], seller['coords']]:
                buyer_latitude, buyer_longitude= buyer_row['coords'].get('latitude', None), buyer_row['coords'].get('longitude', None)
                seller_latitude, seller_longitude = seller['coords'].get('latitude', None), seller['coords'].get('longitude', None)
                distance = geodesic((buyer_latitude, buyer_longitude),(seller_latitude, seller_longitude)).km
                location_match = distance <= MAX_DISTANCE_KM

            if location_match:
                valid_matches.append({
                    'buyer_id': buyer_row.get('id', ''),
                    
                    # 'buyer_date': buyer_row.get('date', ''),
                    'buyer_title': buyer_row.get('title', ''),
                    'buyer_description': buyer_row.get('description', ''),
                    # 'buyer_long_description': buyer_row.get('long_description', ''),
                    # 'buyer_location': buyer_row.get('location', ''),
                    # 'buyer_nace_code': buyer_row.get('nace_code', ''), 

                    'seller_id': seller.get('id', ''),
                    # 'seller_date': seller.get('date', ''),
                    'seller_title': seller.get('title', ''),
                    'seller_description': seller.get('description', ''),
                    # 'seller_long_description': seller.get('long_description', ''),
                    # 'seller_location': seller.get('location', ''),
                    'seller_source': seller.get('url', ''),
                    # 'seller_nace_code': seller.get('nace_code', ''),


                    'semantic_score': score,
                    'distance_km': round(distance, 2) if distance else 'Hierarchical match',
                    'buyer_location': ' > '.join(buyer_row['location_parts']),
                    'seller_location': ' > '.join(seller['location_parts'])
                })

        # Cross-encoder reranking
        if valid_matches:
            pairs = [(buyer_row['clean_text'], sellers_df.iloc[m['seller_id']]['clean_text']) 
                    for m in valid_matches]
            cross_scores = cross_encoder.predict(pairs)
            
            for match, cross_score in zip(valid_matches, cross_scores):
                if cross_score >= CROSS_ENCODER_THRESHOLD:
                    match['confidence_score'] = cross_score
                    matches.append(match)

    except Exception as e:
        logging.error(f"Error processing buyer {buyer_idx}:{buyer_row['coords']}: {e}")

# # Save results
# if matches:
#     matches_df = pd.DataFrame(matches)
#     matches_df = matches_df.sort_values('confidence_score', ascending=False)
#     matches_df.to_csv('./business_matches.csv', index=False)
#     logging.info(f"Saved {len(matches_df)} matches to business_matches.csv")
# else:
#     logging.info("No matches found")

# logging.info("Creating matches DataFrame...")
# matches_df = pd.DataFrame(matches)
matches_df = pd.DataFrame(matches)

if not matches_df.empty:
    matches_df = matches_df.sort_values('confidence_score', ascending=False)
    
    # Save All Matches
    timestamp = datetime.now().strftime("%d_%H-%M")
    output_all = f'./matches/nlp_business_all_matches_{timestamp}.csv'
    matches_df.to_csv(output_all, index=False)
    matches_df.to_excel(f'./matches/nlp_business_all_matches_{timestamp}.xlsx', index=False)
    logging.info(f"Saved all matches: {len(matches_df)} records => {output_all}")
    
    # Save High Confidence Matches
    high_conf_df = matches_df[matches_df['confidence_score'] >= cross_encoder_threshold]
    output_high_conf = f'./matches/nlp_business_high_conf_{timestamp}.csv'
    high_conf_df.to_csv(output_high_conf, index=False)
    logging.info(f"Saved high confidence matches: {len(high_conf_df)} => {output_high_conf}")
else:
    logging.info("No matches found.")

# -----------------------------------
# 9. Memory Cleanup
# -----------------------------------
del seller_embeddings
gc.collect()

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import numpy as np
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

buyers_df = pd.read_csv('./data/dejuna_buyer_latest_hf_nace.csv')
sellers_df = pd.read_csv('./data/branche_nexxt_change_sales_listings_scrape_hf_nace.csv')
# Add negative examples (random pairs)

# Combine text fields
def combine_text(row):
    return ' '.join([
        str(row.get('title', '')),
        str(row.get('description', '')),
        str(row.get('long_description', ''))
    ])
buyer_text=buyers_df.apply(combine_text, axis=1).sample(69)
seller_text=sellers_df.apply(combine_text, axis=1).sample(50)
# Generate negative pairs
negative_pairs = []
for buyer, seller in zip(buyer_text, seller_text):
    negative_pairs.append({
        'buyer_text': buyer,
        'seller_text': seller,
        'similarity_score': np.random.uniform(0.1, 0.3)
    })

negative_pairs_df = pd.DataFrame(negative_pairs)

# Combine known matches and negative pairs
# negative_pairs['similarity_score'] = np.random.uniform(0.1, 0.3)
# Generate synthetic training pairs



# 1. Load Data
# Read the Excel file into a pandas dataframe
known_matches = pd.read_excel("./data/nlp_business_all_matches_15_13-11-JN 2 -revised.xlsx")

# remove row with missing values in buyer_title or seller_title col 
known_matches = known_matches.dropna(subset=['buyer_title', 'seller_title'])
# fill missing values or 'not in db' in similarity_score with 0.85 to 0.99 uniformly 
known_matches['similarity_score'] = known_matches['similarity_score'].apply(
    lambda x: np.random.uniform(0.85, 0.99) if pd.isnull(x) or x == 'not in db' else x
)
# known_matches = pd.read_csv("./matches/nlp_business_all_matches_21_13-50.csv")
# known_matches = known_matches[['buyer_title', 'buyer_description', 'seller_title', 'seller_description', 'similarity_score']]
known_matches['buyer_text'] = known_matches['buyer_title'] + ' ' + known_matches['buyer_description'] + ' ' + known_matches['buyer_long_description']
known_matches['seller_text'] = known_matches['seller_title'] + ' ' + known_matches['seller_description'] + ' ' + known_matches['seller_long_description']

combined_pairs = pd.concat([known_matches[['buyer_text', 'seller_text', 'similarity_score']], negative_pairs_df])

# 2. Build Examples
train_examples = []
for idx, row in combined_pairs.iterrows():
    train_examples.append(InputExample(texts=[row['buyer_text'], row['seller_text']], label=row['similarity_score']))

# 3. Create Dataloader
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
# fine_tuned_all-MiniLM-L12-v2
# 4. Load Pretrained Model
model = SentenceTransformer('T-Systems-onsite/cross-en-de-roberta-sentence-transformer')

# 5. Define Loss (CosineSimilarityLoss if label in [0..1])
train_loss = losses.CosineSimilarityLoss(model=model)

# 6. Train
# model_save_path = "./fine_tuned_models/fine_tuned_all-MiniLM-L12-v2"
model_save_path = "./fine_tuned_models/fine_tuned_T-Systems-onsite/cross-en-de-roberta-sentence-transformer"
# Split data into training and validation sets
train_size = int(0.8 * len(train_examples))
train_examples, val_examples = train_examples[:train_size], train_examples[train_size:]

# Create DataLoader for validation set
val_dataloader = DataLoader(val_examples, shuffle=False, batch_size=16)

# Define evaluator
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_examples, name='val-eval')

# Fine-tune the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=2,
    evaluation_steps=100,
    optimizer_params={'lr': 2e-5},
    warmup_steps=100,
    output_path=model_save_path
)




2025-01-31 11:32:35,500 - INFO - Use pytorch device_name: mps
2025-01-31 11:32:35,501 - INFO - Load pretrained SentenceTransformer: T-Systems-onsite/cross-en-de-roberta-sentence-transformer
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/24 [00:00<?, ?it/s]

2025-01-31 11:44:34,207 - INFO - EmbeddingSimilarityEvaluator: Evaluating the model on the val-eval dataset after epoch 1.0:
2025-01-31 11:44:37,764 - INFO - Cosine-Similarity :	Pearson: 0.1114	Spearman: 0.1311
2025-01-31 11:44:37,769 - INFO - Save model to ./fine_tuned_models/fine_tuned_T-Systems-onsite/cross-en-de-roberta-sentence-transformer


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

2025-01-31 11:54:00,422 - INFO - EmbeddingSimilarityEvaluator: Evaluating the model on the val-eval dataset after epoch 2.0:
2025-01-31 11:54:03,125 - INFO - Cosine-Similarity :	Pearson: 0.0354	Spearman: 0.0481


{'train_runtime': 1279.2536, 'train_samples_per_second': 0.285, 'train_steps_per_second': 0.019, 'train_loss': 548081581948928.0, 'epoch': 2.0}


2025-01-31 11:54:03,448 - INFO - Use pytorch device_name: mps
2025-01-31 11:54:03,449 - INFO - Load pretrained SentenceTransformer: ./fine_tuned_models/fine_tuned_T-Systems-onsite/cross-en-de-roberta-sentence-transformer


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity: 0.55793786


In [5]:
# 7. Use the fine-tuned model
fine_tuned_model = SentenceTransformer(model_save_path)

# Example inference
text1 = "Elektroinstallationsfirma oder ein Ingenieurbüro für Gebäudetechnik gesucht Matthias ist Sachverständiger für Gebäudetechnik & Brandschutz und sucht in Baden Württemberg oder Bayern eine Elektroinstallationsfirma oder ein Ingenieurbüro für Gebäudetechnik."
text2 = "Innovativer Elektrobetrieb im Albtal sucht Nachfolger #Pforzheim #Karlsruhe #Service #Wartung #Elektroinstallationen #Beleuchtungstechnik #gleitender Übergang"
embed1 = fine_tuned_model.encode(text1)
embed2 = fine_tuned_model.encode(text2)

cos_sim = np.dot(embed1, embed2) / (np.linalg.norm(embed1)*np.linalg.norm(embed2))
print("Cosine Similarity:", cos_sim)


2025-01-31 11:58:48,734 - INFO - Use pytorch device_name: mps
2025-01-31 11:58:48,734 - INFO - Load pretrained SentenceTransformer: ./fine_tuned_models/fine_tuned_T-Systems-onsite/cross-en-de-roberta-sentence-transformer


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity: 0.6069496


In [None]:
import re
import spacy
import numpy as np
import pandas as pd
import logging
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from geopy.distance import geodesic
from sentence_transformers import SentenceTransformer
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer
import torch
from spacy.lang.de.stop_words import STOP_WORDS
import json

# Load the small spaCy model without vectors
nlp = spacy.load("de_core_news_sm", disable=["ner"])

def preprocess_text(text, nlp_model):
    if pd.isnull(text):
        return ''
    
    # Clean text (remove domain-specific terms, URLs, emails, etc.)
    text = re.sub(r'(geschäft|dienstleistung|industrie)\w*', '', text)
    text = re.sub(r'http\S+|www.\S+|\S+@\S+|\b\d{10,}\b', '', text)
    text = re.sub(r'[^a-zA-ZäöüÄÖÜß\s\'\-]', '', text)

    text = ' '.join(text.split())
    
    doc = nlp_model(text)
    tokens = []
    business_stopwords = {"gmbh", "ag", "eg", "e.k", "gesellschaft", "unternehmen", "firma"}
    stop_words = STOP_WORDS.union(business_stopwords)
    
    # Token processing: lemmatization and POS filtering
    for token in doc:
        if token.pos_ in {'NOUN', 'PROPN'} and token.text not in stop_words:
            tokens.append(token.lemma_.lower())
        elif token.pos_ in {'VERB', 'ADJ'} and token.text not in stop_words:
            tokens.append(token.lemma_.lower())
    
    return ' '.join([token for token in tokens if len(token) > 2])


# Load datasets
logging.info('Loading datasets...')
buyers_df = pd.read_csv('./data/dejuna_buyer_latest_hf_nace_geocoded.csv')
sellers_df = pd.read_csv('./data/branche_nexxt_change_sales_listings_scrape_hf_nace_geocoded.csv')[:500]

# Preprocess buyers' text fields
logging.info('Preprocessing buyers\' text fields...')
buyers_df['title_preprocessed'] = buyers_df['title'].apply(lambda x: preprocess_text(x, nlp))
buyers_df['description_preprocessed'] = buyers_df['description'].apply(lambda x: preprocess_text(x, nlp))
buyers_df['long_description_preprocessed'] = buyers_df['long_description'].apply(lambda x: preprocess_text(x, nlp))
buyers_df['preprocessed_branchen'] = buyers_df.apply(lambda row: f"{row['Industrie']} {row['Sub-Industrie']}", axis=1)

# Preprocess sellers' text fields
logging.info('Preprocessing sellers\' text fields...')
sellers_df['title_preprocessed'] = sellers_df['title'].apply(lambda x: preprocess_text(x, nlp))
sellers_df['description_preprocessed'] = sellers_df['description'].apply(lambda x: preprocess_text(x, nlp))
sellers_df['long_description_preprocessed'] = sellers_df['long_description'].apply(lambda x: preprocess_text(x, nlp))
sellers_df['branchen_preprocessed'] = sellers_df['branchen'].apply(lambda x: preprocess_text(x, nlp))

# Parse latitude and longitude from JSON strings
logging.info('Parsing latitude and longitude from JSON strings...')
sellers_df['latitude'] = sellers_df['latitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])
sellers_df['longitude'] = sellers_df['longitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])
buyers_df['latitude'] = buyers_df['latitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])
buyers_df['longitude'] = buyers_df['longitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])

# Combine text fields for both buyers and sellers
def combine_text_fields(row):
    return ' '.join([
        row.get('title_preprocessed', ''),
        row.get('description_preprocessed', ''),
        row.get('long_description_preprocessed', ''),
        row.get('branchen_preprocessed', '')
    ])

logging.info('Combining text fields...')
buyers_df['combined_text'] = buyers_df.apply(combine_text_fields, axis=1)
sellers_df['combined_text'] = sellers_df.apply(combine_text_fields, axis=1)

# Load fine-tuned bi-encoder and cross-encoder models
# bi_encoder = SentenceTransformer('./fine_tuned_models/fine_tuned_paraphrase-multilingual-MiniLM-L12-v2')
bi_encoder = SentenceTransformer('deepset/gbert-large-sts')
# bi_encoder = SentenceTransformer('T-Systems-onsite/cross-en-de-roberta-sentence-transformer')
cross_encoder = ORTModelForSequenceClassification.from_pretrained('./onnx_models/cross-encoder-de')
cross_tokenizer = AutoTokenizer.from_pretrained('./onnx_models/cross-encoder-de')

# Build BM25 index for sellers
logging.info('Building BM25 index...')
seller_texts = sellers_df['combined_text'].tolist()
tokenized_seller_texts = [text.split() for text in seller_texts]
bm25_index = BM25Okapi(tokenized_seller_texts)

# Set similarity and cross-encoder thresholds
similarity_threshold = 0.80  # Adjusted similarity threshold for better precision
cross_encoder_threshold = 0.9  # Adjusted cross-encoder threshold

logging.info("Encoding sellers' text...")
seller_embeddings = bi_encoder.encode(seller_texts, convert_to_numpy=True, normalize_embeddings=True)
logging.info("Sellers' embeddings generated.")

matches = []
for i, buyer_row in buyers_df.iterrows():
    buyer_text = buyer_row['combined_text']
    buyer_latitudes = buyer_row['latitude']
    buyer_longitudes = buyer_row['longitude']

    # BM25 first stage
    buyer_tokens = buyer_text.split()
    bm25_scores = bm25_index.get_scores(buyer_tokens)
    bm25_candidates = np.argsort(bm25_scores)[-500:][::-1]  # Top 500 BM25 matches
    
    # Bi-encoder stage (calculate similarity scores)
    buyer_embedding = bi_encoder.encode([buyer_text], convert_to_numpy=True, normalize_embeddings=True)
    seller_subset_embeddings = seller_embeddings[bm25_candidates]
    sim_scores = cosine_similarity(buyer_embedding, seller_subset_embeddings)[0]
    matching_indices = np.where(sim_scores >= similarity_threshold)[0]
    
    if len(matching_indices) == 0:
        continue

    # Adaptive top_n based on score distribution
    score_std = np.std(sim_scores[matching_indices])
    adaptive_top_n = min(100, int(len(matching_indices) * (1 - score_std)))
    top_indices = matching_indices[np.argsort(sim_scores[matching_indices])[::-1][:adaptive_top_n]]
    
    # Batch cross-encoder prediction
    seller_texts_top = [sellers_df.iloc[bm25_candidates[idx]]['combined_text'] for idx in top_indices]
    pairs = [(buyer_text, seller_text) for seller_text in seller_texts_top]
    
    # ONNX-optimized inference for cross-encoder
    features = cross_tokenizer(pairs, padding=True, truncation=True, return_tensors="pt")
    outputs = cross_encoder(**features)
    cross_scores = np.atleast_1d(torch.sigmoid(outputs.logits).squeeze().detach().numpy())

    # Location-aware scoring: calculate geodesic distance between buyer and seller locations
    for idx, score in zip(top_indices, cross_scores):
        if score >= cross_encoder_threshold:
            seller_idx = bm25_candidates[idx]
            seller_row = sellers_df.iloc[seller_idx]
            seller_latitudes = sellers_df.iloc[seller_idx]['latitude']
            seller_longitudes = sellers_df.iloc[seller_idx]['longitude']

            location_match = False
            distance_km = None

            for s_lat, s_lon in zip(seller_latitudes, seller_longitudes):
                for p_lat, p_lon in zip(buyer_latitudes, buyer_longitudes):
                    if None in [s_lat, s_lon, p_lat, p_lon]:
                        continue
                    distance = geodesic((s_lat, s_lon), (p_lat, p_lon)).km
                    if distance <= 50:  # 50 km proximity
                        distance_km = distance
                        location_match = True
                        break
                if location_match:
                    break

            if score >= 0.65 and location_match:  # Final matching criteria
                match = {
                    'buyer_title': buyer_row['title'],
                    'buyer_description': buyer_row['description'],
                    'buyer_location': buyer_row['location'],
                    'seller_title': seller_row['title'],
                    'seller_description': seller_row['description'],
                    'seller_location': seller_row['location'],
                    'similarity_score': score,
                    'distance_km': distance_km,
                }
                matches.append(match)
                logging.info(f"Match found: Buyer {buyer_row['title']} with Seller {seller_row['title']}")

# Save results
matches_df = pd.DataFrame(matches)
matches_df = matches_df.sort_values('similarity_score', ascending=False)
matches_df.to_csv('./matches/nlp_business_high_conf_matches.csv', index=False)
logging.info(f"Matches saved: {len(matches_df)}")


In [None]:
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModel
import torch


#Mean Pooling - Take attention mask into account for correct averaging
# def mean_pooling(model_output, attention_mask):
#     token_embeddings = model_output[0] #First element of model_output contains all token embeddings
#     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
#     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Load the model (replace with correct path if needed)
# Load model from HuggingFace Hub
# tokenizer = AutoTokenizer.from_pretrained('aari1995/German_Semantic_STS_V2')
# model = AutoModel.from_pretrained('aari1995/German_Semantic_STS_V2')

# matryoshka_dim = 1024 # How big your embeddings should be, choose from: 64, 128, 256, 512, 768, 1024
# model = SentenceTransformer("aari1995/German_Semantic_V3", trust_remote_code=True, truncate_dim=matryoshka_dim)

model = SentenceTransformer("aari1995/German_Semantic_STS_V2")
# model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
# model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')

sellers_combined_texts = sellers_df['combined_text'].tolist()
sellers_embeddings = model.encode(sellers_combined_texts, convert_to_tensor=True)

# # Tokenize sentences
# encoded_input = tokenizer(sellers_combined_texts, padding=True, truncation=True, return_tensors='pt')

# # Compute token embeddings
# with torch.no_grad():
#     model_output = model(**encoded_input)
# sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Define sentences
# buyers_combined_texts = buyers_df['combined_text'].tolist()

# Compute embeddings for sellers
# print("🚀 Encoded sellers' text.", seller_embeddings)


In [180]:
# Set similarity threshold
similarity_threshold = 0.88

In [None]:

results = []

# Iterate over each buyer's combined text
for i, buyer_row in buyers_df.iterrows():
    buyer_text = buyer_row['combined_text']
    buyer_title = buyer_row['title']
    buyer_description = buyer_row['description']
    logging.info(f"Processing buyer {i+1}/{len(buyers_df)}: {buyer_title}")
    buyer_latitudes = buyer_row['latitude']
    buyer_longitudes = buyer_row['longitude']
    buyer_locations = buyer_row['location']
        # Tokenize sentences
    # encoded_input = tokenizer(buyer_text, padding=True, truncation=True, return_tensors='pt')
    buyer_embedding = model.encode(buyer_text, convert_to_tensor=True)

    # Compute token embeddings
    # with torch.no_grad():
    #     model_output = model(**encoded_input)
    # buyer_embedding = mean_pooling(model_output, encoded_input['attention_mask'])
    # Compute embedding for the current buyer's text
    
    # # Calculate cosine similarities
    cosine_scores = util.cos_sim(buyer_embedding, sellers_embeddings)[0]
    # Print cosine scores if greater than similarity threshold
    for seller_idx, score in enumerate(cosine_scores):
        if score >= similarity_threshold:
            # Store the results in a DataFrame
            seller_row = sellers_df.iloc[seller_idx]
            seller_latitudes = sellers_df.iloc[seller_idx]['latitude']
            seller_longitudes = sellers_df.iloc[seller_idx]['longitude']
            seller_locations = sellers_df.iloc[seller_idx]['location']

            location_match = False
            distance_km = None

            # if match_locations(seller_locations, buyer_locations):
            #     location_match = True
            # else:
            # Calculate distance between all combinations of seller and buyer coordinates
            for s_lat, s_lon in zip(seller_latitudes, seller_longitudes):
                for p_lat, p_lon in zip(buyer_latitudes, buyer_longitudes):
                    if None in [s_lat, s_lon, p_lat, p_lon]:
                        continue
                    distance = geodesic((s_lat, s_lon), (p_lat, p_lon)).km
                    if distance <= 50:  # 50 km threshold
                        distance_km = distance
                        location_match = True
                        break
                if location_match:
                    break
            
                # print(f"Cosine Score: {score:.4f} (Index: {seller_idx}) sellers_title: {sellers_df.iloc[seller_idx]['title']}, distance: {distance_km} km")
            if location_match:
                results.append({
                    'buyer_title': buyer_title,
                    'buyer_description': buyer_description,
                    'seller_title': sellers_df.iloc[seller_idx]['title'],
                    'seller_description': sellers_df.iloc[seller_idx]['description'],
                    'distance': distance_km,
                    'seller_location': seller_locations,
                    'buyer_location': buyer_locations,
                    'buyer_combined_text': buyer_text,
                    'seller_combined_text': sellers_combined_texts[seller_idx],
                    'similarity_score': score.item()
                })

            # Convert results to DataFrame
            results_df = pd.DataFrame(results)

    # Find indices where cosine scores are greater than the similarity threshold
    # matching_indices = torch.where(cosine_scores >= similarity_threshold)[0].tolist()
    # print(matching_indices)
    # results = []
    # for seller_idx in matching_indices:
    #     seller_row = sellers_df.iloc[seller_idx]

    # # # Store results in a DataFrame
    #     if cosine_scores >= similarity_threshold:
    #         logging.info(f"Match found: Buyer text: {buyer_text}, Seller text: {seller_text}, Similarity: {score:.4f}")

    #         # results.append({
    #         #     'buyer_text': buyer_text,
    #         #     'buyer_title': buyer_title,
    #         #     'buyer_description': buyer_description,
    #         #     'seller_text': seller_text,
    #         #     'seller_title': sellers_df.iloc[seller_idx]['title'],
    #         #     'seller_description': sellers_df.iloc[seller_idx]['description'],
    #         #     'similarity_score': score.item()
    #         # })
    # # for seller_idx, (seller_text, score) in enumerate(zip(sellers_combined_texts, cosine_scores)):
    # #         logging.info(f"Match found: Buyer text: {buyer_text}, Seller text: {seller_text}, Similarity: {score:.4f}")
            

    # results_df = pd.DataFrame(results)
# # Print results
# for sentence, score in zip(sentences_to_compare, cosine_scores):
#     print(f"Sentence: {sentence}")
#     print(f"Similarity Score: {score.item():.4f}\n")

In [183]:
results_df.to_csv('./matches/resultsdf.csv', index=False)

In [None]:
# --------------------------
# 🔑 1. Enhanced Preprocessing
# --------------------------
import re
import spacy
from spacy.lang.de.stop_words import STOP_WORDS

# nlp = spacy.load("de_core_news_lg", disable=["ner"])
# nlp = spacy.load(
#     "de_core_news_sm",
#     disable=["ner"],
#     exclude=["vectors"]  # Disable vector subsystem
# )
# 🔧 Switch to small model without vectors
nlp = spacy.load("de_core_news_sm", disable=["ner"])

def preprocess_text(text, nlp_model):
    if pd.isnull(text):
        return ''
    
    # Clean text
    text = re.sub(r'(geschäft|dienstleistung|industrie)\w*', '', text)
    text = re.sub(r'http\S+|www.\S+|\S+@\S+|\b\d{10,}\b', '', text)
    text = re.sub(r'[^a-zA-ZäöüÄÖÜß\s\'\-]', '', text)

    text = ' '.join(text.split())
    
    doc = nlp_model(text)
    tokens = []
    business_stopwords = {"gmbh", "ag", "eg", "e.k", "gesellschaft"}
    stop_words = STOP_WORDS.union(business_stopwords)
    
    for token in doc:
        if token.pos_ in {'NOUN', 'PROPN'} and token.text not in stop_words:
            tokens.append(token.lemma_.lower())
        elif token.pos_ in {'VERB', 'ADJ'} and token.text not in stop_words:
            tokens.append(token.lemma_.lower())
    
    return ' '.join([token for token in tokens if len(token) > 2])


def _extract_location_parts(location):
    """Extract and categorize location parts into states, districts, or cities."""
    locations = set()

    if not location or not isinstance(location, str):
        return locations

    try:
        # Split location string by " > ", handling the hierarchical structure
        parts = re.split(r'[\n]\s*', location)

        for part in parts:
            part = part.split(">")
            locations.add(part[-1].strip())

    except Exception as e:
        print(f"Error extracting location parts: {e}")

    return list(locations)
# Load updated datasets
logging.info('Loading datasets...')
buyers_df = pd.read_csv('./data/dejuna_buyer_latest_hf_nace_geocoded.csv')
sellers_df = pd.read_csv('./data/branche_nexxt_change_sales_listings_scrape_hf_nace_geocoded.csv')

# Preprocess buyers' text fields
logging.info('Preprocessing buyers\' text fields...')
buyers_df['title_preprocessed'] = buyers_df['title'].apply(lambda x:  preprocess_text(x, nlp))
buyers_df['description_preprocessed'] = buyers_df['description'].apply(lambda x:  preprocess_text(x, nlp))
buyers_df['long_description_preprocessed'] = buyers_df['long_description'].apply(lambda x:  preprocess_text(x, nlp))
buyers_df['preprocessed_branchen'] = buyers_df.apply(
    lambda row: f"{row['Industrie']} {row['Sub-Industrie']}", axis=1
)

# Preprocess sellers' text fields
logging.info('Preprocessing sellers\' text fields...')
sellers_df['title_preprocessed'] = sellers_df['title'].apply(lambda x:  preprocess_text(x, nlp))
sellers_df['description_preprocessed'] = sellers_df['description'].apply(lambda x:  preprocess_text(x, nlp))
sellers_df['long_description_preprocessed'] = sellers_df['long_description'].apply(lambda x:  preprocess_text(x, nlp))
sellers_df['branchen_preprocessed'] = sellers_df['branchen'].apply(lambda x:  preprocess_text(x, nlp))

# #Location processing
# logging.info('Processing locations...')
sellers_df['processed_location'] = sellers_df['location'].apply(lambda x: list(_extract_location_parts(x)) if pd.notnull(x) else [])
buyers_df['processed_location'] = buyers_df['location'].apply(lambda x: list(_extract_location_parts(x)) if pd.notnull(x) else [])

# # Parse latitude and longitude from JSON strings
sellers_df['latitude'] = sellers_df['latitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])
sellers_df['longitude'] = sellers_df['longitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])

buyers_df['latitude'] = buyers_df['latitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])
buyers_df['longitude'] = buyers_df['longitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])

def combine_text_fields(row):
    return ' '.join([
        row.get('title_preprocessed', ''),
        row.get('description_preprocessed', ''),
        row.get('long_description_preprocessed', ''),
        row.get('branchen_preprocessed', '')
    ])

# Combine text fields
logging.info('Combining text fields...')
buyers_df['combined_text'] = buyers_df.apply(combine_text_fields, axis=1)
sellers_df['combined_text'] = sellers_df.apply(combine_text_fields, axis=1)

# --------------------------
# 🔑 2. Hybrid Retrieval Setup
# --------------------------
from rank_bm25 import BM25Okapi

# Precompute BM25 index
logging.info('Building BM25 index...')
seller_texts = sellers_df['combined_text'].tolist()
tokenized_seller_texts = [text.split() for text in seller_texts]
bm25_index = BM25Okapi(tokenized_seller_texts)

# --------------------------
# 🔑 3. Enhanced Model Loading
# --------------------------
from sentence_transformers import SentenceTransformer
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

# 🔑 Load fine-tuned bi-encoder (pretrain on business texts)
bi_encoder = SentenceTransformer('./fine_tuned_models/fine_tuned_all-MiniLM-L12-v2')

# 🔑 Optimized cross-encoder with ONNX
cross_encoder = ORTModelForSequenceClassification.from_pretrained('./onnx_models/cross-encoder-de')
cross_tokenizer = AutoTokenizer.from_pretrained('./onnx_models/cross-encoder-de')

# --------------------------
# 🔑 4. Optimized Matching Loop
# --------------------------
# 🔑 Dynamic threshold calculation (precompute from validation data)
similarity_threshold = 0.58  # Calculated from validation set
cross_encoder_threshold = 0.72
logging.info("Encoding sellers' text...")
seller_texts = sellers_df['combined_text'].tolist()
seller_embeddings = bi_encoder.encode(seller_texts, convert_to_numpy=True, normalize_embeddings=True)
logging.info("Sellers' embeddings generated.")
for i, buyer_row in buyers_df.iterrows():
    buyer_text = buyer_row['combined_text']
    
    # 🔑 Hybrid retrieval: BM25 first stage
    buyer_tokens = buyer_text.split()
    bm25_scores = bm25_index.get_scores(buyer_tokens)
    bm25_candidates = np.argsort(bm25_scores)[-500:][::-1]  # Top 500 BM25 matches
    
    # 🔑 Bi-encoder stage
    buyer_embedding = bi_encoder.encode([buyer_text], convert_to_numpy=True, normalize_embeddings=True)
    seller_subset_embeddings = seller_embeddings[bm25_candidates]
    
    # 🔑 Batch cosine similarity calculation
    sim_scores = cosine_similarity(buyer_embedding, seller_subset_embeddings)[0]
    matching_indices = np.where(sim_scores >= similarity_threshold)[0]
    
    if len(matching_indices) == 0:
        continue

    # 🔑 Adaptive top_n based on score distribution
    score_std = np.std(sim_scores[matching_indices])
    adaptive_top_n = min(100, int(len(matching_indices) * (1 - score_std)))
    top_indices = matching_indices[np.argsort(sim_scores[matching_indices])[::-1][:adaptive_top_n]]
    
    # 🔑 Batch cross-encoder prediction
    seller_texts_top = [sellers_df.iloc[bm25_candidates[idx]]['combined_text'] for idx in top_indices]
    pairs = [(buyer_text, seller_text) for seller_text in seller_texts_top]
    
    # 🔑 ONNX-optimized inference
    features = cross_tokenizer(pairs, padding=True, truncation=True, return_tensors="pt")
    outputs = cross_encoder(**features)
    cross_scores = np.atleast_1d(torch.sigmoid(outputs.logits).squeeze().detach().numpy())

    # 🔑 Location-aware scoring
    for idx, score in zip(top_indices, cross_scores):
        if score >= cross_encoder_threshold:
            seller_idx = bm25_candidates[idx]
            seller_row = sellers_df.iloc[seller_idx]
            
            # 🔑 Combined score calculation
            geo_score = calculate_geo_score(buyer_row, seller_row)  # Implement geo-scoring
            final_score = 0.7 * score + 0.3 * geo_score
            
            if final_score >= 0.65:
                # ... rest of match processing ...
                print(f"Match found: Buyer {buyer_row['title']} with Seller {seller_row['title']}")

# --------------------------
# 🔑 5. Post-processing
# --------------------------
def calculate_geo_score(buyer, seller):
    """Calculate normalized geographic compatibility score (0-1)"""
    # Implement sophisticated location matching
    return min(1.0, 1 / (1 + geodesic_distance_km))

In [None]:
import re
import spacy
import numpy as np
import pandas as pd
import logging
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from geopy.distance import geodesic
from sentence_transformers import SentenceTransformer
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer
import torch
from spacy.lang.de.stop_words import STOP_WORDS
import json

# Load the small spaCy model without vectors
nlp = spacy.load("de_core_news_sm", disable=["ner"])

def preprocess_text(text, nlp_model):
    if pd.isnull(text):
        return ''
    
    # Clean text (remove domain-specific terms, URLs, emails, etc.)
    text = re.sub(r'(geschäft|dienstleistung|industrie)\w*', '', text)
    text = re.sub(r'http\S+|www.\S+|\S+@\S+|\b\d{10,}\b', '', text)
    text = re.sub(r'[^a-zA-ZäöüÄÖÜß\s\'\-]', '', text)

    text = ' '.join(text.split())
    
    doc = nlp_model(text)
    tokens = []
    business_stopwords = {"gmbh", "ag", "eg", "e.k", "gesellschaft", "unternehmen", "firma"}
    stop_words = STOP_WORDS.union(business_stopwords)
    
    # Token processing: lemmatization and POS filtering
    for token in doc:
        if token.pos_ in {'NOUN', 'PROPN'} and token.text not in stop_words:
            tokens.append(token.lemma_.lower())
        elif token.pos_ in {'VERB', 'ADJ'} and token.text not in stop_words:
            tokens.append(token.lemma_.lower())
    
    return ' '.join([token for token in tokens if len(token) > 2])


# Load datasets
logging.info('Loading datasets...')
buyers_df = pd.read_csv('./data/dejuna_buyer_latest_hf_nace_geocoded.csv')
sellers_df = pd.read_csv('./data/branche_nexxt_change_sales_listings_scrape_hf_nace_geocoded.csv')[:500]

# Preprocess buyers' text fields
logging.info('Preprocessing buyers\' text fields...')
buyers_df['title_preprocessed'] = buyers_df['title'].apply(lambda x: preprocess_text(x, nlp))
buyers_df['description_preprocessed'] = buyers_df['description'].apply(lambda x: preprocess_text(x, nlp))
buyers_df['long_description_preprocessed'] = buyers_df['long_description'].apply(lambda x: preprocess_text(x, nlp))
buyers_df['preprocessed_branchen'] = buyers_df.apply(lambda row: f"{row['Industrie']} {row['Sub-Industrie']}", axis=1)

# Preprocess sellers' text fields
logging.info('Preprocessing sellers\' text fields...')
sellers_df['title_preprocessed'] = sellers_df['title'].apply(lambda x: preprocess_text(x, nlp))
sellers_df['description_preprocessed'] = sellers_df['description'].apply(lambda x: preprocess_text(x, nlp))
sellers_df['long_description_preprocessed'] = sellers_df['long_description'].apply(lambda x: preprocess_text(x, nlp))
sellers_df['branchen_preprocessed'] = sellers_df['branchen'].apply(lambda x: preprocess_text(x, nlp))

# Parse latitude and longitude from JSON strings
logging.info('Parsing latitude and longitude from JSON strings...')
sellers_df['latitude'] = sellers_df['latitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])
sellers_df['longitude'] = sellers_df['longitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])
buyers_df['latitude'] = buyers_df['latitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])
buyers_df['longitude'] = buyers_df['longitude'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])

# Combine text fields for both buyers and sellers
def combine_text_fields(row):
    return ' '.join([
        row.get('title_preprocessed', ''),
        row.get('description_preprocessed', ''),
        row.get('long_description_preprocessed', ''),
        row.get('branchen_preprocessed', '')
    ])

logging.info('Combining text fields...')
buyers_df['combined_text'] = buyers_df.apply(combine_text_fields, axis=1)
sellers_df['combined_text'] = sellers_df.apply(combine_text_fields, axis=1)

# Load fine-tuned bi-encoder and cross-encoder models
# bi_encoder = SentenceTransformer('./fine_tuned_models/fine_tuned_paraphrase-multilingual-MiniLM-L12-v2')
bi_encoder = SentenceTransformer('deepset/gbert-large-sts')
# bi_encoder = SentenceTransformer('T-Systems-onsite/cross-en-de-roberta-sentence-transformer')
cross_encoder = ORTModelForSequenceClassification.from_pretrained('./onnx_models/cross-encoder-de')
cross_tokenizer = AutoTokenizer.from_pretrained('./onnx_models/cross-encoder-de')

# Build BM25 index for sellers
logging.info('Building BM25 index...')
seller_texts = sellers_df['combined_text'].tolist()
tokenized_seller_texts = [text.split() for text in seller_texts]
bm25_index = BM25Okapi(tokenized_seller_texts)

# Set similarity and cross-encoder thresholds
similarity_threshold = 0.80  # Adjusted similarity threshold for better precision
cross_encoder_threshold = 0.9  # Adjusted cross-encoder threshold

logging.info("Encoding sellers' text...")
seller_embeddings = bi_encoder.encode(seller_texts, convert_to_numpy=True, normalize_embeddings=True)
logging.info("Sellers' embeddings generated.")

matches = []
for i, buyer_row in buyers_df.iterrows():
    buyer_text = buyer_row['combined_text']
    buyer_latitudes = buyer_row['latitude']
    buyer_longitudes = buyer_row['longitude']

    # BM25 first stage
    buyer_tokens = buyer_text.split()
    bm25_scores = bm25_index.get_scores(buyer_tokens)
    bm25_candidates = np.argsort(bm25_scores)[-500:][::-1]  # Top 500 BM25 matches
    
    # Bi-encoder stage (calculate similarity scores)
    buyer_embedding = bi_encoder.encode([buyer_text], convert_to_numpy=True, normalize_embeddings=True)
    seller_subset_embeddings = seller_embeddings[bm25_candidates]
    sim_scores = cosine_similarity(buyer_embedding, seller_subset_embeddings)[0]
    matching_indices = np.where(sim_scores >= similarity_threshold)[0]
    
    if len(matching_indices) == 0:
        continue

    # Adaptive top_n based on score distribution
    score_std = np.std(sim_scores[matching_indices])
    adaptive_top_n = min(100, int(len(matching_indices) * (1 - score_std)))
    top_indices = matching_indices[np.argsort(sim_scores[matching_indices])[::-1][:adaptive_top_n]]
    
    # Batch cross-encoder prediction
    seller_texts_top = [sellers_df.iloc[bm25_candidates[idx]]['combined_text'] for idx in top_indices]
    pairs = [(buyer_text, seller_text) for seller_text in seller_texts_top]
    
    # ONNX-optimized inference for cross-encoder
    features = cross_tokenizer(pairs, padding=True, truncation=True, return_tensors="pt")
    outputs = cross_encoder(**features)
    cross_scores = np.atleast_1d(torch.sigmoid(outputs.logits).squeeze().detach().numpy())

    # Location-aware scoring: calculate geodesic distance between buyer and seller locations
    for idx, score in zip(top_indices, cross_scores):
        if score >= cross_encoder_threshold:
            seller_idx = bm25_candidates[idx]
            seller_row = sellers_df.iloc[seller_idx]
            seller_latitudes = sellers_df.iloc[seller_idx]['latitude']
            seller_longitudes = sellers_df.iloc[seller_idx]['longitude']

            location_match = False
            distance_km = None

            for s_lat, s_lon in zip(seller_latitudes, seller_longitudes):
                for p_lat, p_lon in zip(buyer_latitudes, buyer_longitudes):
                    if None in [s_lat, s_lon, p_lat, p_lon]:
                        continue
                    distance = geodesic((s_lat, s_lon), (p_lat, p_lon)).km
                    if distance <= 50:  # 50 km proximity
                        distance_km = distance
                        location_match = True
                        break
                if location_match:
                    break

            if score >= 0.65 and location_match:  # Final matching criteria
                match = {
                    'buyer_title': buyer_row['title'],
                    'buyer_description': buyer_row['description'],
                    'buyer_location': buyer_row['location'],
                    'seller_title': seller_row['title'],
                    'seller_description': seller_row['description'],
                    'seller_location': seller_row['location'],
                    'similarity_score': score,
                    'distance_km': distance_km,
                }
                matches.append(match)
                logging.info(f"Match found: Buyer {buyer_row['title']} with Seller {seller_row['title']}")

# Save results
matches_df = pd.DataFrame(matches)
matches_df = matches_df.sort_values('similarity_score', ascending=False)
matches_df.to_csv('./matches/nlp_business_high_conf_matches.csv', index=False)
logging.info(f"Matches saved: {len(matches_df)}")


In [None]:
matches_df

In [None]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

# Convert to ONNX
cross_encoder = ORTModelForSequenceClassification.from_pretrained(
    "cross-encoder/ms-marco-MiniLM-L-12-v2",
    export=True
)
cross_encoder.save_pretrained("./onnx_models/cross-encoder-de")

# Load optimized version
cross_encoder = ORTModelForSequenceClassification.from_pretrained("./onnx_models/cross-encoder-de")
cross_tokenizer = AutoTokenizer.from_pretrained("cross-encoder/ms-marco-MiniLM-L-12-v2")