In [6]:
# --- Listing CSV files ------------------------------------------------------
dataFile = './data/nexxt_change_sales_listings_geocoded_short_test.csv'
# sales_file_nace = './data/nexxt_change_sales_listings_geocoded.csv'
sales_file_brachen = './data/branche_nexxt_change_sales_listings.csv'
sales_file_nace = './data/dub_listings_geo.csv'
buyer_file_nace = './data/nexxt_change_purchase_listings_geocoded.csv'

# --- NACE code reference files (JSON) ---------------------------------------
nacecode_josn = './data/nace_codes.json'
nacecode_array_josn = './data/nace_codes_array.json'
nacecode_array_obj = './data/nace_codes_object.json'
nacecode_array_obj_ext = './data/nace_codes_object_ext.json'
nacecode_array_obj_du = './data/nace_codes_object_du.json'

In [None]:
import json
import pandas as pd
import spacy
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
from nltk.corpus import stopwords
import re
from nltk.stem import SnowballStemmer

# Fetch the NLTK resources requested by this cell
for _nltk_resource in ('punkt', 'stopwords'):
    nltk.download(_nltk_resource)

# Load spaCy's German pipeline; if it is not installed yet (OSError),
# download it once and load again
_spacy_model_name = 'de_core_news_sm'
try:
    nlp = spacy.load(_spacy_model_name)
except OSError:
    from spacy.cli import download
    download(_spacy_model_name)
    nlp = spacy.load(_spacy_model_name)

# Preprocess text function with NER and POS tagging
_GERMAN_TEXT_TOOLS = {}


def _german_text_tools():
    """Return `(stop_words, stemmer)` for German, building both only once."""
    if not _GERMAN_TEXT_TOOLS:
        _GERMAN_TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('german'))
        _GERMAN_TEXT_TOOLS['stemmer'] = SnowballStemmer('german')
    return _GERMAN_TEXT_TOOLS['stop_words'], _GERMAN_TEXT_TOOLS['stemmer']


def preprocess_text(text, nlp_model):
    """Normalise a free-text field into a space-separated string of stems.

    Steps: lowercase, strip URLs / e-mail addresses / 10+ digit numbers, drop
    every character that is not a letter (ASCII or German umlaut/ß) or
    whitespace, collapse whitespace, run `nlp_model`, keep NOUN/PROPN/VERB
    tokens that are not German stop words, Snowball-stem them, then append the
    stemmed text of entities labelled ORG, PRODUCT or GPE.

    Args:
        text: Raw value; null values (per `pd.isnull`) yield ''. Non-string
            values are converted with `str()` first.
        nlp_model: Callable returning a document whose tokens expose `.pos_`
            and `.text` and whose `.ents` expose `.text` and `.label_`.

    Returns:
        str: The stems joined by single spaces ('' when nothing is kept).
    """
    if pd.isnull(text):
        return ''

    # str() guards against numeric cells that pandas did not parse as text
    # NOTE(review): lowercasing before tagging may hurt German POS/NER quality
    # (German nouns are capitalised) -- confirm before relying on the tags.
    text = str(text).lower()
    # Remove URLs, emails, and long numbers ("www\." matches a literal dot)
    text = re.sub(r'http\S+|www\.\S+|\S+@\S+|\b\d{10,}\b', '', text)
    # Remove non-alphabetic characters (keeping German characters)
    text = re.sub(r'[^a-zA-ZäöüÄÖÜß\s]', '', text)
    text = ' '.join(text.split())

    # Shared across calls: loading the stop-word list per row is wasted work
    stop_words, stemmer = _german_text_tools()

    doc = nlp_model(text)

    # Retain nouns, proper nouns, and verbs
    tokens = [
        stemmer.stem(token.text)
        for token in doc
        if token.pos_ in {'NOUN', 'PROPN', 'VERB'} and token.text not in stop_words
    ]

    # Extract named entities and include them
    # NOTE(review): spaCy's German pipelines document the labels LOC/MISC/ORG/PER,
    # so 'PRODUCT' and 'GPE' presumably never match -- confirm.
    entities = [ent.text for ent in doc.ents if ent.label_ in {'ORG', 'PRODUCT', 'GPE'}]
    tokens.extend(
        stemmer.stem(entity.lower())
        for entity in entities
        if entity.lower() not in stop_words
    )

    return ' '.join(tokens)

# Load NACE codes
def load_nace_codes(filepath):
    """Parse the UTF-8 JSON file at `filepath` and return its content."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# Create embeddings
def create_embeddings(texts, model):
    """Encode `texts` via `model.encode`, asking for normalized NumPy output with a progress bar."""
    encode_options = {
        'show_progress_bar': True,
        'convert_to_numpy': True,
        'normalize_embeddings': True,
    }
    return model.encode(texts, **encode_options)

# Load Data
def load_data(sellers_filepath, nace_codes_filepath):
    """Return `(sellers_df, nace_codes)`: the sellers CSV as a DataFrame and the parsed NACE JSON."""
    return pd.read_csv(sellers_filepath), load_nace_codes(nace_codes_filepath)

# Filepaths (replace with your actual file paths)
sellers_filepath = sales_file_brachen  # Path to your sellers CSV file
nace_codes_filepath = nacecode_array_obj_du  # Path to your NACE codes JSON file

# Load data
sellers_df, nace_codes = load_data(sellers_filepath, nace_codes_filepath)
print("🚀 ~ Sellers and NACE codes loaded.")

# Initialize the Sentence Transformer model
# NOTE(review): 'all-MiniLM-L6-v2' is presumably an English-focused model while
# the 'branchen' texts are German -- consider the multilingual alternative below.
model_name = 'all-MiniLM-L6-v2'
# model_name = 'paraphrase-multilingual-mpnet-base-v2'
model = SentenceTransformer(model_name)
print(f"🚀 ~ Loaded SentenceTransformer model: {model_name}")

# Preprocess NACE descriptions; codes and embeddings share the dict's order
nace_descriptions = [preprocess_text(desc, nlp) for desc in nace_codes.values()]
nace_embeddings = create_embeddings(nace_descriptions, model)
nace_code_list = list(nace_codes.keys())
print("🚀 ~ Created embeddings for NACE descriptions.")

# Preprocess 'branchen' field in sellers data
sellers_df['preprocessed_branchen'] = sellers_df['branchen'].apply(lambda x: preprocess_text(x, nlp))

# Create embeddings for 'branchen'
branchen_embeddings = create_embeddings(sellers_df['preprocessed_branchen'].tolist(), model)
print("🚀 ~ Created embeddings for 'branchen' field.")

# Compute cosine similarity between 'branchen' embeddings and NACE embeddings
# (one row per seller, one column per NACE code)
similarities = cosine_similarity(branchen_embeddings, nace_embeddings)

# Assign the NACE code with the highest similarity
best_match_indices = similarities.argmax(axis=1)
sellers_df['assigned_nace_code'] = [nace_code_list[idx] for idx in best_match_indices]
sellers_df['assigned_nace_similarity'] = [similarities[i][idx] for i, idx in enumerate(best_match_indices)]
print("🚀 ~ Assigned preliminary NACE codes based on 'branchen' similarity.")

# Optional: Set a similarity threshold to filter uncertain assignments
similarity_threshold = 0.2
sellers_df['nace_code'] = sellers_df.apply(
    lambda row: row['assigned_nace_code'] if row['assigned_nace_similarity'] >= similarity_threshold else 'Unassigned',
    axis=1
)

# Save the preliminary assignments.
# The suffix goes before the extension so the result is "x_nace.csv" rather
# than "x.csv_nace.csv"; other extensions simply get the suffix appended.
if sellers_filepath.lower().endswith('.csv'):
    nace_output_path = sellers_filepath[:-len('.csv')] + '_nace.csv'
else:
    nace_output_path = f'{sellers_filepath}_nace.csv'
sellers_df.to_csv(nace_output_path, index=False)
# Report the path that was actually written
print(f"🚀 ~ Saved sellers data with preliminary NACE code assignments to '{nace_output_path}'.")

# Review the assignments
print("\nSample of NACE Code Assignments:")
print(sellers_df[['branchen', 'assigned_nace_code', 'assigned_nace_similarity', 'nace_code']].head())




Updated, improved version of the cell above:


In [8]:
import json
import pandas as pd
import spacy
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
from nltk.corpus import stopwords
import re
from nltk.stem import SnowballStemmer

# Fetch the NLTK resources requested by this cell
for _nltk_resource in ('punkt', 'stopwords'):
    nltk.download(_nltk_resource)

# -------------------------------------------------------------------------
# 1. Load SpaCy's German model (for tokenization, NER, POS tagging)
# -------------------------------------------------------------------------
# If the model is not installed yet (OSError), download it once and retry.
_spacy_model_name = 'de_core_news_sm'
try:
    nlp = spacy.load(_spacy_model_name)
except OSError:
    from spacy.cli import download
    download(_spacy_model_name)
    nlp = spacy.load(_spacy_model_name)

# -------------------------------------------------------------------------
# 2. Preprocessing function (with German NER, POS, etc.)
# -------------------------------------------------------------------------
_GERMAN_TEXT_TOOLS = {}


def _german_text_tools():
    """Return `(stop_words, stemmer)` for German, building both only once."""
    if not _GERMAN_TEXT_TOOLS:
        _GERMAN_TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('german'))
        _GERMAN_TEXT_TOOLS['stemmer'] = SnowballStemmer('german')
    return _GERMAN_TEXT_TOOLS['stop_words'], _GERMAN_TEXT_TOOLS['stemmer']


def preprocess_text(text, nlp_model):
    """
    Normalise a free-text field into a space-separated string of stems.

    - Lowercases the text (non-string values are converted with `str()`).
    - Removes URLs, emails, & digit sequences of 10 or more digits.
    - Filters out non-alphabetic chars except German Umlauts/ß.
    - Uses `nlp_model` to keep only NOUN, PROPN, VERB tokens not in the
      German stop-word list.
    - Applies Snowball stemming on the remaining tokens.
    - Appends the stemmed text of entities labelled ORG, PRODUCT or GPE.

    Args:
        text: Raw value; null values (per `pd.isnull`) yield ''.
        nlp_model: Callable returning a document whose tokens expose `.pos_`
            and `.text` and whose `.ents` expose `.text` and `.label_`.

    Returns:
        str: The stems joined by single spaces ('' when nothing is kept).
    """
    if pd.isnull(text):
        return ''

    # str() guards against numeric cells that pandas did not parse as text
    # NOTE(review): lowercasing before tagging may hurt German POS/NER quality
    # (German nouns are capitalised) -- confirm before relying on the tags.
    text = str(text).lower()

    # Remove URLs, emails, large numbers ("www\." matches a literal dot)
    text = re.sub(r'http\S+|www\.\S+|\S+@\S+|\b\d{10,}\b', '', text)

    # Keep only letters and German characters
    text = re.sub(r'[^a-zA-ZäöüÄÖÜß\s]', '', text)

    # Compact multiple spaces
    text = ' '.join(text.split())

    # Shared across calls: loading the stop-word list per row is wasted work
    stop_words, stemmer = _german_text_tools()

    # Process text with SpaCy
    doc = nlp_model(text)

    # Keep nouns, proper nouns, and verbs
    tokens = [
        stemmer.stem(token.text)
        for token in doc
        if token.pos_ in {'NOUN', 'PROPN', 'VERB'} and token.text not in stop_words
    ]

    # Extract named entities (ORG, PRODUCT, GPE), drop stop words, stem them
    # NOTE(review): spaCy's German pipelines document the labels LOC/MISC/ORG/PER,
    # so 'PRODUCT' and 'GPE' presumably never match -- confirm.
    entities = [
        ent.text for ent in doc.ents
        if ent.label_ in {'ORG', 'PRODUCT', 'GPE'}
    ]
    tokens.extend(
        stemmer.stem(entity.lower())
        for entity in entities
        if entity.lower() not in stop_words
    )

    return ' '.join(tokens)

# -------------------------------------------------------------------------
# 3. Load NACE codes from JSON
# -------------------------------------------------------------------------
def load_nace_codes(filepath):
    """
    Parse the UTF-8 JSON file at `filepath` and return its content.

    The file is expected to map NACE code -> textual description, e.g.:
      {
        "01.1": "Growing of non-perennial crops",
        "01.2": "Growing of perennial crops",
        ...
      }
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# -------------------------------------------------------------------------
# 4. Create embeddings with sentence-transformers
# -------------------------------------------------------------------------
def create_embeddings(texts, model):
    """
    Encode a list of texts with `model.encode`, requesting normalized
    NumPy embeddings and a progress bar, and return the result.
    """
    encode_options = {
        'show_progress_bar': True,
        'convert_to_numpy': True,
        'normalize_embeddings': True,
    }
    return model.encode(texts, **encode_options)

# -------------------------------------------------------------------------
# 5. Load your Seller/Branchen data and NACE codes
# -------------------------------------------------------------------------
def load_data(sellers_filepath, nace_codes_filepath):
    """Return `(sellers_df, nace_codes)`: the sellers CSV as a DataFrame and the parsed NACE JSON."""
    return pd.read_csv(sellers_filepath), load_nace_codes(nace_codes_filepath)

# -------------------------------------------------------------------------
# 6. MAIN LOGIC
# -------------------------------------------------------------------------
    # Filepaths (update to your actual paths)
# Inputs: sellers CSV (needs a 'branchen' column) and the NACE code JSON
sellers_filepath, nace_codes_filepath = sales_file_brachen, nacecode_array_obj_du

sellers_df, nace_codes = load_data(sellers_filepath, nace_codes_filepath)
print("🚀 Sellers and NACE codes loaded.")

# Sentence-embedding model.
# For German or multilingual text, consider e.g.:
#    model_name = 'paraphrase-multilingual-mpnet-base-v2'
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)
print(f"🚀 Loaded SentenceTransformer model: {model_name}")

# ---------------------------------------------------------------------
# 6a. NACE side: clean every description and embed it
# ---------------------------------------------------------------------
# Codes and descriptions come from the same dict, so positions line up.
nace_code_list = list(nace_codes.keys())
nace_descriptions = [preprocess_text(description, nlp) for description in nace_codes.values()]
nace_embeddings = create_embeddings(nace_descriptions, model)
print("🚀 Created embeddings for NACE descriptions.")

# ---------------------------------------------------------------------
# 6b. Seller side: clean the 'branchen' column and embed it
# ---------------------------------------------------------------------
def _clean_branchen(value):
    """Run the shared text preprocessing on one 'branchen' cell."""
    return preprocess_text(value, nlp)


sellers_df['preprocessed_branchen'] = sellers_df['branchen'].apply(_clean_branchen)
branchen_embeddings = create_embeddings(sellers_df['preprocessed_branchen'].tolist(), model)
print("🚀 Created embeddings for sellers' 'branchen' field.")

# ---------------------------------------------------------------------
# 6c. Similarity matrix (sellers x NACE codes) and best match per seller
# ---------------------------------------------------------------------
similarities = cosine_similarity(branchen_embeddings, nace_embeddings)
best_match_indices = similarities.argmax(axis=1)

sellers_df['assigned_nace_code'] = [nace_code_list[position] for position in best_match_indices]
sellers_df['assigned_nace_similarity'] = [
    seller_scores[position]
    for seller_scores, position in zip(similarities, best_match_indices)
]
print("🚀 Assigned preliminary NACE codes based on 'branchen' similarity.")

# Matches scoring below the threshold are reported as 'Unassigned'
similarity_threshold = 0.2


def _code_or_unassigned(row):
    """Keep the best-match code only when its similarity reaches the threshold."""
    if row['assigned_nace_similarity'] >= similarity_threshold:
        return row['assigned_nace_code']
    return 'Unassigned'


sellers_df['nace_code'] = sellers_df.apply(_code_or_unassigned, axis=1)

# ---------------------------------------------------------------------
# 6d. Persist the result and show a sample
# ---------------------------------------------------------------------
output_file = sellers_filepath.replace(".csv", "_nace.csv")
sellers_df.to_csv(output_file, index=False)
print(f"🚀 Saved sellers data with assigned NACE codes to: {output_file}\n")

print("Sample of NACE Code Assignments:")
print(sellers_df[['branchen', 'assigned_nace_code', 'assigned_nace_similarity', 'nace_code']].head(10))


[nltk_data] Downloading package punkt to /Users/abbasm1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abbasm1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


🚀 Sellers and NACE codes loaded.
🚀 Loaded SentenceTransformer model: all-MiniLM-L6-v2


Batches:   0%|          | 0/31 [00:00<?, ?it/s]

🚀 Created embeddings for NACE descriptions.


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

🚀 Created embeddings for sellers' 'branchen' field.
🚀 Assigned preliminary NACE codes based on 'branchen' similarity.
🚀 Saved sellers data with assigned NACE codes to: ./data/branche_nexxt_change_sales_listings_nace.csv

Sample of NACE Code Assignments:
                                            branchen assigned_nace_code  \
0              Verarbeitendes Gewerbe > Maschinenbau            C28.9.2   
1                          Gastgewerbe > Gastronomie                I56   
2  Dienstleistung > Sonstige Dienstleistungen > S...            C18.1.4   
3  Handel > Großhandel und Handelsvermittlung; Ve...                G46   
4  Baugewerbe > Bauinstallation > Elektroinstalla...            F43.2.1   
5       Handwerk > Handwerke für den privaten Bedarf            C28.2.4   
6  Handwerk > Ausbaugewerbe > Installateur und He...            G46.7.4   
7                     Grundstücks- und Wohnungswesen            Q86.9.0   
8                                             Handel              G45.1

In [3]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import logging
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import shelve
import json
import os

# Fetch the NLTK stop-word corpus
nltk.download('stopwords')

# Timestamped, INFO-level logging for the geocoding run
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def _extract_location_parts(location):
    """Extract and categorize location parts into states and cities."""
    locations = set()
    german_states = {
        'baden-württemberg', 'bayern', 'berlin', 'brandenburg', 'bremen',
        'hamburg', 'hessen', 'mecklenburg-vorpommern', 'niedersachsen',
        'nordrhein-westfalen', 'rheinland-pfalz', 'saarland', 'sachsen',
        'sachsen-anhalt', 'schleswig-holstein', 'thüringen'
    }

    if pd.isna(location):
        return locations

    try:
        # Split on common delimiters
        parts = re.split(r'[>/,\n]\s*', str(location))
        split_locations = []
        for part in parts:
            part = part.strip().lower()
            if part:
                # Further split by space if multiple states are concatenated
                words = part.split()
                temp = []
                current = ""
                for word in words:
                    if word in german_states:
                        if current:
                            temp.append(current.strip())
                        current = word
                    else:
                        current += " " + word if current else word
                if current:
                    temp.append(current.strip())
                split_locations.extend(temp)
        
        for loc in split_locations:
            loc = loc.strip().lower()
            if loc:
                if loc in german_states:
                    locations.add(loc.title())  # Capitalize for better geocoding
                else:
                    # Remove "region" and other common prefixes
                    clean_part = re.sub(r'^region\s+', '', loc)
                    if clean_part:
                        locations.add(clean_part.title())
    except Exception as e:
        logging.error(f"Error extracting location parts: {e}")

    return locations

# def get_all_unique_locations(buyers_df, sellers_df):
#     """Extract all unique locations from buyers and sellers dataframes."""
#     unique_locations = set()

#     for df, name in [(buyers_df, 'buyers'), (sellers_df, 'sellers')]:
#         logging.info(f'Extracting locations from {name} dataframe...')
#         for idx, location in df['location'].items():
#             locations = _extract_location_parts(location)
#             unique_locations.update(locations)

#     logging.info(f'Total unique locations found: {len(unique_locations)}')
#     return unique_locations

def geocode_locations(unique_locations, cache_path='geocode_cache.db'):
    """Geocode each location once and persist the result in a shelve cache.

    Every location is looked up as "<location>, Germany". Found coordinates
    are stored as {'latitude': ..., 'longitude': ...}; a lookup that returns
    no result is stored with both values None. A lookup that raises (for
    example a network error after all retries) is logged and NOT cached, so
    the next run tries it again instead of treating it as permanently unknown.

    Args:
        unique_locations: Iterable of location name strings.
        cache_path: Path of the shelve database; its directory is created
            when missing.

    Returns:
        None. Results are written to the cache only.
    """
    geolocator = Nominatim(user_agent="buyer_seller_matching")
    # swallow_exceptions=False: let failures surface after the retries instead
    # of being turned into None, which would look like "location not found".
    geocode = RateLimiter(
        geolocator.geocode,
        min_delay_seconds=1,
        max_retries=3,
        error_wait_seconds=10.0,
        swallow_exceptions=False,
    )

    # Ensure cache directory exists
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    with shelve.open(cache_path) as geocode_cache:
        for location in unique_locations:
            if location in geocode_cache:
                continue  # Already cached
            try:
                logging.info(f'Geocoding location: {location}')
                loc = geocode(location + ", Germany")
            except Exception as e:
                # Leave the cache untouched so a later run can retry
                logging.error(f"Geocoding error for location '{location}': {e}")
                continue

            if loc:
                geocode_cache[location] = {'latitude': loc.latitude, 'longitude': loc.longitude}
                logging.info(f'Geocoded {location}: ({loc.latitude}, {loc.longitude})')
            else:
                geocode_cache[location] = {'latitude': None, 'longitude': None}
                logging.warning(f'Geocoding failed for location: {location}')

def update_dataframe_with_geocodes(df, cache_path='geocode_cache.db'):
    """Add 'latitude' and 'longitude' columns built from the geocode cache.

    For every row, the 'location' value is split with
    `_extract_location_parts`; each part is looked up in the shelve cache at
    `cache_path`. Both new columns hold JSON-encoded lists with one entry per
    part (in sorted part order); an entry is null in both lists when the part
    is not cached or has no coordinates.

    Args:
        df: DataFrame with a 'location' column. It is modified in place.
        cache_path: Path of the shelve database to read.

    Returns:
        The same `df` object with the two columns added.
    """
    not_found = {'latitude': None, 'longitude': None}
    latitudes = []
    longitudes = []

    with shelve.open(cache_path) as geocode_cache:
        for location in df['location']:
            lat_list = []
            lon_list = []
            # sorted(): the parts come back as a set, whose iteration order for
            # strings can change between interpreter runs; sorting keeps the
            # saved lists reproducible.
            for loc in sorted(_extract_location_parts(location)):
                geocode_info = geocode_cache.get(loc, not_found)
                lat = geocode_info['latitude']
                lon = geocode_info['longitude']
                if lat is None or lon is None:
                    # Incomplete coordinates count as a failed lookup
                    lat, lon = None, None
                lat_list.append(lat)
                lon_list.append(lon)
            # Convert lists to JSON strings for CSV compatibility
            latitudes.append(json.dumps(lat_list))
            longitudes.append(json.dumps(lon_list))

    df['latitude'] = latitudes
    df['longitude'] = longitudes
    return df

# Paths to input and output files
sellers_input_path = './data/branche_nexxt_change_sales_listings_nace.csv'
sellers_output_path = './data/branche_nexxt_change_sales_listings_nace_geocoded.csv'

# buyers_input_path = './data/nexxt_change_purchase_listings.csv'
# buyers_output_path = './data/nexxt_change_purchase_listings_geocoded.csv'
cache_path = './geocode_cache.db'

# Load the sellers dataset (buyers handling is currently disabled)
logging.info('Loading sellers dataset...')
# buyers_df = pd.read_csv(buyers_input_path)
sellers_df = pd.read_csv(sellers_input_path)

# Extract unique locations.
# geocode_locations expects location strings; iterating the DataFrame itself
# would yield its column names instead.
unique_locations = set()
for location in sellers_df['location']:
    unique_locations.update(_extract_location_parts(location))
logging.info(f'Total unique locations found: {len(unique_locations)}')

# Geocode locations with caching (sorted for a stable request order)
geocode_locations(sorted(unique_locations), cache_path=cache_path)

# Update dataframes with geocodes
# buyers_df = update_dataframe_with_geocodes(buyers_df, cache_path=cache_path)

logging.info('Updating sellers dataframe with geocodes...')
sellers_df = update_dataframe_with_geocodes(sellers_df, cache_path=cache_path)

# Save updated dataframes to new CSV files
# buyers_df.to_csv(buyers_output_path, index=False)

logging.info('Saving updated sellers dataframe...')
sellers_df.to_csv(sellers_output_path, index=False)

logging.info('Geocoding process completed successfully.')


KeyboardInterrupt: 

In [2]:
import pandas as pd
# Load the CSV datasets used for inspection/comparison in this notebook.
# Each read is independent; a missing file stops the cell at that line and
# leaves the names above it bound.
# combined = pd.read_csv('./data/combined.csv')
training_updated_branchen = pd.read_csv('./data/nexxt_change_data_for_model_training_updated_branchen.csv')
# Same path the geocoding cell writes to (sellers_output_path)
nexxt_updated_branche = pd.read_csv('./data/branche_nexxt_change_sales_listings_nace_geocoded.csv')
dejuna = pd.read_csv('./data/buyer_dejuna_geocoded_test-.csv')
dub = pd.read_csv('./data/dub_listings.csv')
branche_nexxt_change_sales_listings = pd.read_csv('./data/branche_nexxt_change_sales_listings_scrape.csv')


In [None]:
import pandas as pd

source1_path = "./data/nexxt_change_sales_listings_update.csv"
source2_path = "./data/dub_listings.csv"
output_path = "./data/combined.csv"

# Column layout shared by both sources in the combined file
standard_columns = ['title', 'long_description', 'description', 'branchen', 'location']

temp = pd.read_csv(source1_path, sep=',')

# --- Source1 ---
# Its columns already carry the standardized names, so this mapping changes
# nothing; it is kept as the place to add renames if the source changes.
source1_renames = {
    # 'date': 'some_other_name',            # Not needed in final, so not renamed
    'title': 'title',
    'description': 'description',
    'long_description': 'long_description',
    'branchen': 'branchen',
    'location': 'location',
}
# Columns such as date or url are dropped by the selection
df1 = pd.read_csv(source1_path, sep=',').rename(columns=source1_renames)[standard_columns]

# --- Source2 ---
# Map its column headers onto the standardized names
source2_renames = {
    'Title': 'title',
    'Beschreibung des Verkaufsangebots': 'long_description',
    'Anforderungen an den Käufer': 'description',
    'Branchen': 'branchen',
    'Region': 'location',
}
df2 = pd.read_csv(source2_path, sep=',').rename(columns=source2_renames)[standard_columns]

# --- Stack both sources with a fresh 0..n-1 index and write the result ---
df_combined = pd.concat([df1, df2], ignore_index=True)
df_combined.to_csv(output_path, index=False)
print(f"Combined data has been saved to {output_path}")



NOT USEFUL

In [None]:
import csv
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Scraper input/output locations
# input_file: listings CSV; the scraping loop reads its "url" column
input_file = './data/nexxt_change_sales_listings_update.csv'
# output_file: CSV receiving one (URL, Branche) row per listing
output_file = './data/nexxt_change_sales_listings_udpate_branche.csv'
def get_branche_text(url):
    """
    Load `url` in a fresh headless Chrome session and return the text of the
    first <dd> that follows an element whose text contains "Branche".

    Returns the stripped text, or "" (after printing a notice) when the
    element is missing or the lookup times out. The browser is always closed.
    """
    # Headless Chrome with sandbox/GPU/shared-memory features switched off
    chrome_options = webdriver.ChromeOptions()
    for flag in ('--headless', '--disable-gpu', '--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)

    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)

        # contains(...) instead of an exact match tolerates label variants
        # such as "Branche :" and surrounding whitespace; the value sits in
        # the first <dd> sibling after the label element.
        value_element = driver.find_element(
            By.XPATH,
            "//*[contains(normalize-space(text()), 'Branche')]/following-sibling::dd[1]"
        )
        return value_element.text.strip()

    except (NoSuchElementException, TimeoutException):
        print(f"Could not find 'Branche :' text on page: {url}")
        return ""

    finally:
        driver.quit()


# Column names of the output CSV; the row dicts below must use exactly these
# keys, otherwise csv.DictWriter rejects the rows with a ValueError.
fieldnames = ['URL', 'Branche']

results = []

# newline='' is the csv module's recommended mode for reading and writing
with open(input_file, mode='r', newline='', encoding='utf-8') as infile:
    reader = csv.DictReader(infile)
    for row in reader:
        url = row['url'].strip()
        print(f"Processing: {url}")
        branche_text = get_branche_text(url)
        results.append({
            'URL': url,
            'Branche': branche_text
        })

# Write results to the output CSV
with open(output_file, mode='w', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results)




In [None]:
import json
import pandas as pd
import spacy
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
from nltk.corpus import stopwords

# -----------------------------
# 1. Setup and Initialization
# -----------------------------

# Download the NLTK stopword corpus, then build the German stopword set used
# by preprocess_text below
nltk.download('stopwords')
german_stopwords = set(stopwords.words('german'))

# Initialize spaCy German model (unlike the earlier cells, there is no
# download fallback here: a missing model raises)
nlp = spacy.load('de_core_news_sm')

# Initialize Sentence Transformer Model
# The model loaded here is 'paraphrase-multilingual-mpnet-base-v2'.
# NOTE(review): the earlier comment named 'paraphrase-multilingual-MiniLM-L12-v2'
# -- confirm which model is intended.
embedding_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# -----------------------------
# 2. Parsing and Flattening NACE JSON with Hierarchical Descriptions
# -----------------------------

def extract_nace_codes(nace_list, parent_descriptions=None, parent_code=''):
    """
    Recursively flatten a NACE tree into records with full codes and descriptions.

    Args:
        nace_list (list): NACE entries; each is a dict with 'code',
            'description' and optionally 'children' (a list of the same shape).
        parent_descriptions (list | None): Descriptions of the ancestors, outermost
            first. None (the default) or an empty sequence means "no ancestors".
        parent_code (str): Prefix put in front of each entry's code; the
            recursion passes the parent's full code followed by '.'.

    Returns:
        list: One dict per entry, in depth-first order, with
            'code' (prefix + own code) and
            'full_description' (ancestor descriptions + own, space-joined).
    """
    # None instead of a shared mutable [] default; copy so the caller's
    # sequence is never aliased.
    ancestors = list(parent_descriptions) if parent_descriptions else []

    records = []
    for item in nace_list:
        code = item['code']
        description = item['description']
        full_code = f"{parent_code}{code}" if parent_code else code
        lineage = ancestors + [description]
        records.append({'code': full_code, 'full_description': ' '.join(lineage)})

        # Recurse into child codes, if any
        children = item.get('children')
        if children:
            records.extend(extract_nace_codes(
                children,
                lineage,
                parent_code=full_code + '.'
            ))
    return records

# Load NACE codes from JSON file
with open(nacecode_array_obj_du, 'r', encoding='utf-8') as file:
    nace_data = json.load(file)

# The file is used as a flat {code: description} mapping, so the recursive
# flattening is not applied here:
# nace_records = extract_nace_codes(nace_data)

# One row per NACE code: 'code' and its 'full_description'
nace_df = pd.DataFrame(list(nace_data.items()), columns=['code', 'full_description'])
# nace_df['full_description'] = nace_df[['description','synonyms']].astype(str).agg(' '.join, axis=1)

# Display first few NACE codes
print("Flattened NACE Codes with Hierarchical Descriptions:")
print(nace_df.head())

# -----------------------------
# 3. Loading and Preprocessing the Business Dataset
# -----------------------------

# Load your CSV data
data = pd.read_csv(dataFile, delimiter=',', encoding='utf-8')

# Display first few rows of the dataset
print("\nSample Business Data:")
print(data.head())

# Combine relevant text columns into a single 'combined_text' column.
# Missing cells are replaced by '' first: a plain astype(str) would turn them
# into the literal word "nan", which would then be embedded as if it were text.
text_columns = ['title', 'description', 'long_description', 'branchen']
data['combined_text'] = (
    data[text_columns]
    .fillna('')
    .astype(str)
    .apply(lambda parts: ' '.join(part for part in parts if part), axis=1)
)

# -----------------------------
# 4. Text Preprocessing Function
# -----------------------------

def preprocess_text(text):
    """
    Lowercase `text`, run it through the spaCy pipeline `nlp`, and return the
    lemmas of the remaining tokens joined by single spaces.

    A token is dropped when spaCy flags it as a stop word or punctuation, or
    when its lemma appears in `german_stopwords`.

    Args:
        text (str): Input text.

    Returns:
        str: Preprocessed text.
    """
    kept_lemmas = []
    for token in nlp(text.lower()):
        if token.is_stop or token.is_punct:
            continue
        lemma = token.lemma_
        if lemma in german_stopwords:
            continue
        kept_lemmas.append(lemma)
    return ' '.join(kept_lemmas)

# Clean the business texts and the NACE descriptions with the same function
data['processed_text'] = data['combined_text'].apply(preprocess_text)
nace_df['processed_description'] = nace_df['full_description'].apply(preprocess_text)

# Show a sample of both cleaned texts
print("\nPreprocessed Business Text:")
print(data['processed_text'].head())

print("\nPreprocessed NACE Descriptions:")
print(nace_df[['code', 'processed_description']].head())

# -----------------------------
# 5. Generating Embeddings
# -----------------------------

def _embed(texts):
    """Encode a list of texts with `embedding_model` and return a NumPy array."""
    vectors = embedding_model.encode(texts, convert_to_tensor=False, show_progress_bar=True)
    return np.array(vectors)


# NACE descriptions first, then the business texts
nace_embeddings = _embed(nace_df['processed_description'].tolist())
business_embeddings = _embed(data['processed_text'].tolist())

# -----------------------------
# 6. Dimensionality Reduction (Optional)
# -----------------------------

# If you wish to reduce the dimensionality of embeddings (e.g., for visualization or to speed up computations),
# you can use techniques like PCA, t-SNE, or UMAP. However, for similarity computations, it's often best to keep
# embeddings in their original high-dimensional space.

# Example: Using PCA to reduce to 100 dimensions
# from sklearn.decomposition import PCA
# pca = PCA(n_components=100)
# nace_embeddings_reduced = pca.fit_transform(nace_embeddings)
# business_embeddings_reduced = pca.transform(business_embeddings)

# For this script, we'll proceed without dimensionality reduction.

# -----------------------------
# 7. Computing Similarity and Assigning NACE Codes
# -----------------------------

# Compute cosine similarity between each business embedding and all NACE embeddings
# This can be memory-intensive for large datasets. Consider processing in batches if needed.

# To optimize memory usage, process in smaller batches
def assign_nace_codes(business_embeds, nace_embeds, nace_codes, nace_dec, batch_size=1000, similarity_threshold=0.1, top_k=3):
    """
    Assign the best-matching NACE codes to each business embedding.

    For every business row the cosine similarity against all NACE embeddings
    is computed. Candidates whose similarity is at least
    ``similarity_threshold`` are ranked by descending similarity and the best
    ``top_k`` of them are kept. Rows without any candidate are marked as
    'Unclassified'.

    Args:
        business_embeds (np.array): Array of business embeddings (one row per business).
        nace_embeds (np.array): Array of NACE embeddings (one row per NACE code).
        nace_codes (list): List of NACE codes corresponding to nace_embeds.
        nace_dec (list): List of NACE descriptions corresponding to nace_embeds.
        batch_size (int): Number of business rows compared per batch; must be >= 1.
        similarity_threshold (float): Minimum similarity score to consider a match.
        top_k (int): Maximum number of matches kept per business; must be >= 1.

    Returns:
        tuple: Three lists, each holding one inner list per business row:
            - assigned NACE codes (``['Unclassified']`` when nothing matched),
            - the corresponding similarity scores (``[0]`` when nothing matched),
            - the corresponding NACE descriptions (``['Unclassified']`` when nothing matched).

    Raises:
        ValueError: If ``batch_size`` or ``top_k`` is smaller than 1.
    """
    # A non-positive batch size would either crash inside range() or silently
    # produce no results at all, so reject it up front.
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    assigned_codes = []
    similarity_scores = []
    nace_des = []
    num_business = business_embeds.shape[0]

    # Work in batches so the similarity matrix never exceeds
    # batch_size x len(nace_embeds) entries.
    for start in range(0, num_business, batch_size):
        end = min(start + batch_size, num_business)
        similarity = cosine_similarity(business_embeds[start:end], nace_embeds)

        for sim in similarity:
            # Candidate NACE rows that reach the threshold
            candidates = np.where(sim >= similarity_threshold)[0]
            # Best first; a stable sort keeps equal scores in NACE list order
            ranked = candidates[np.argsort(-sim[candidates], kind='stable')]
            top_indices = ranked[:top_k]

            if len(top_indices) > 0:
                assigned_codes.append([nace_codes[idx] for idx in top_indices])
                similarity_scores.append([sim[idx] for idx in top_indices])
                nace_des.append([nace_dec[idx] for idx in top_indices])
            else:
                assigned_codes.append(['Unclassified'])
                similarity_scores.append([0])
                nace_des.append(['Unclassified'])

        print(f"Processed batch {start} to {end} of {num_business}")

    return assigned_codes, similarity_scores, nace_des

# Assign NACE codes to business data
# NOTE(review): the descriptions passed here are the preprocessed ones, so the
# 'nace_des' column holds preprocessed text rather than 'full_description' -
# confirm that this is intended.
assigned_nace_codes, similarity_scores, nace_des = assign_nace_codes(
    business_embeddings,
    nace_embeddings,
    nace_df['code'].tolist(),
    nace_df['processed_description'].tolist(),
    batch_size=1000,
    similarity_threshold=0.1  # Adjust based on your needs
)

# Add the assigned codes, similarity scores and descriptions to the dataframe
data['predicted_nace_code'] = assigned_nace_codes
data['similarity_score'] = similarity_scores
data['nace_des'] = nace_des

# -----------------------------
# 8. Saving the Labeled Dataset
# -----------------------------

# The CSV export is currently disabled; uncomment the next line to write the file.
# data.to_csv('labeled_data_with_nace.csv', index=False, encoding='utf-8')

# The message must not claim a file was written while the export above is disabled.
print("\nNACE Code assignment completed. Results are in `data`; CSV export is currently disabled.")


In [15]:
# Write the preprocessed business texts to 'preprocessing_data.csv' (relative path) for inspection.
data['processed_text'].to_csv('preprocessing_data.csv')

In [None]:

import json
import pandas as pd
import spacy
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
from nltk.corpus import stopwords
import re
import nltk
from nltk.corpus import stopwords
from datetime import datetime
from nltk.stem import SnowballStemmer
# German stop words and stemmer, built once on first use and then shared by
# every preprocess_text call (the function is applied row by row).
_GERMAN_TEXT_RESOURCES = {}


def _german_text_resources():
    """Return ``(stop_words, stemmer)``, creating both on the first call only."""
    if not _GERMAN_TEXT_RESOURCES:
        # Build both before storing so a failure leaves the cache empty
        stop_words = frozenset(stopwords.words('german'))
        stemmer = SnowballStemmer('german')
        _GERMAN_TEXT_RESOURCES.update(stop_words=stop_words, stemmer=stemmer)
    return _GERMAN_TEXT_RESOURCES['stop_words'], _GERMAN_TEXT_RESOURCES['stemmer']


# Preprocess text function
def preprocess_text(text):
    """
    Normalise a German text for embedding.

    Steps: lower-case, strip URLs / e-mail addresses / numbers of ten or more
    digits, drop every character that is not a letter (including German
    umlauts and ß) or whitespace, collapse whitespace, tokenize, remove German
    stop words and stem the remaining tokens.

    Args:
        text (str): Raw text; a null value (None / NaN) is accepted.

    Returns:
        str: The stemmed tokens joined by single spaces, or '' for a null input.
    """
    if pd.isnull(text):
        return ''
    text = text.lower()
    # The dot after "www" is escaped on purpose: unescaped it also matches a
    # space, so a bare "www" followed by another word deleted that word too.
    text = re.sub(r'http\S+|www\.\S+|\S+@\S+|\b\d{10,}\b', '', text)
    text = re.sub(r'[^a-zA-ZäöüÄÖÜß\s]', '', text)
    text = ' '.join(text.split())
    stop_words, stemmer = _german_text_resources()
    tokens = nltk.word_tokenize(text)
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)

# Load NACE codes
def load_nace_codes(filepath):
    """Parse the UTF-8 encoded JSON file at ``filepath`` and return its content."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# Create embeddings
def create_embeddings(texts, model):
    """
    Encode ``texts`` with ``model`` and return the result of ``model.encode``.

    The encoder is called with the progress bar, numpy conversion and
    embedding normalisation flags all switched on.
    """
    encode_options = {
        'show_progress_bar': True,
        'convert_to_numpy': True,
        'normalize_embeddings': True,
    }
    return model.encode(texts, **encode_options)

# Map NACE code
def map_nace_code(text, nace_codes, nace_embeddings, model):
    """
    Return the key of ``nace_codes`` whose embedding is most similar to ``text``.

    ``text`` is embedded with ``create_embeddings``, compared against every row
    of ``nace_embeddings`` by cosine similarity, and the key at the position of
    the highest score is returned. The rows of ``nace_embeddings`` are assumed
    to follow the key order of ``nace_codes``.
    """
    query_vector = create_embeddings([text], model)[0]
    scores = cosine_similarity([query_vector], nace_embeddings)[0]
    code_keys = list(nace_codes.keys())
    winner = np.argmax(scores)
    return code_keys[winner]

# Listing columns whose text is concatenated before matching against NACE descriptions.
_NACE_TEXT_FIELDS = ('title', 'description', 'long_description', 'branchen')


def _combined_listing_text(row):
    """Join the row's available text fields with spaces, skipping missing values."""
    parts = []
    for field in _NACE_TEXT_FIELDS:
        value = row.get(field, '')
        if pd.isnull(value):
            # str(NaN) would inject the literal token "nan" into the text
            continue
        value = str(value)
        if value:
            parts.append(value)
    return ' '.join(parts)


def add_nace_codes(df, nace_codes, nace_embeddings, model):
    """
    Add a 'nace_code' column holding the best-matching NACE code per row.

    For every row the 'title', 'description', 'long_description' and
    'branchen' fields (where present and not null) are joined, preprocessed
    and embedded. All rows are embedded in a single batch instead of one
    encoder call per row. Each row receives the key of ``nace_codes`` at the
    position of the most similar row of ``nace_embeddings``.

    Args:
        df (pd.DataFrame): Listings; modified in place.
        nace_codes (dict): NACE code -> description; the rows of
            ``nace_embeddings`` are assumed to follow its key order.
        nace_embeddings: Embeddings of the NACE descriptions.
        model: Encoder passed on to ``create_embeddings``.

    Returns:
        pd.DataFrame: The same ``df`` object with the 'nace_code' column set.
    """
    texts = [preprocess_text(_combined_listing_text(row)) for _, row in df.iterrows()]

    if not texts:
        # Nothing to encode; still provide the column callers expect
        df['nace_code'] = pd.Series(index=df.index, dtype=object)
        return df

    text_embeddings = create_embeddings(texts, model)
    similarities = cosine_similarity(text_embeddings, nace_embeddings)
    code_keys = list(nace_codes.keys())
    best_match_indices = np.argmax(similarities, axis=1)
    df['nace_code'] = [code_keys[idx] for idx in best_match_indices]
    return df

# Map NACE codes to DataFrame
def map_nace_codes(df, nace_codes, nace_embeddings, model):
    """
    Attach 'nace_code' and 'nace_description' columns to ``df``.

    The codes come from ``add_nace_codes``; each description is looked up in
    ``nace_codes`` and falls back to an empty string for unknown codes.
    """
    def describe(code):
        return nace_codes.get(code, "")

    coded_df = add_nace_codes(df, nace_codes, nace_embeddings, model)
    coded_df['nace_description'] = coded_df['nace_code'].apply(describe)
    return coded_df

# Load sample datasets
sellers_df = pd.read_csv(dataFile)

# Load the Sentence Transformer model
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# Load and preprocess NACE codes
nace_codes = load_nace_codes(nacecode_array_obj_du)
# Report only the size: printing the whole mapping floods the cell output
print(f"Loaded {len(nace_codes)} NACE codes")

# The values in nace_codes are already the descriptions, so we can use them directly
nace_descriptions = [preprocess_text(description) for description in nace_codes.values()]

# Create embeddings for NACE descriptions
nace_embeddings = create_embeddings(nace_descriptions, model)

# Combined, preprocessed listing text per row. Null fields are skipped because
# str(NaN) would otherwise add the literal token "nan" to the text.
# The column name (sic) is kept unchanged in case other cells read it.
sellers_df['post_precessed_text'] = sellers_df.apply(
    lambda row: preprocess_text(' '.join(
        str(value)
        for value in (
            row.get(column, '')
            for column in ('title', 'description', 'long_description', 'branchen')
        )
        if not pd.isnull(value)
    )),
    axis=1,
)
sellers_df = map_nace_codes(sellers_df, nace_codes, nace_embeddings, model)
# Display the first few rows of the updated DataFrame
print("\nSellers DataFrame with NACE codes:")
print(sellers_df.head())
