In [2]:
dataFile =  './data/nexxt_change_sales_listings_geocoded_short_test.csv' 
sales_file_nace =  './data/nexxt_change_sales_listings_geocoded_nace.csv' 
buyer_file_nace =  './data/nexxt_change_purchase_listings_geocoded_nace.csv' 
nacecode_josn =  './data/nace_codes.json' 
nacecode_array_josn =  './data/nace_codes_array.json' 
nacecode_array_obj =  './data/nace_codes_object.json' 
nacecode_array_obj_ext =  './data/nace_codes_object_ext.json' 
nacecode_array_obj_du =  './data/nace_codes_object_du.json' 

In [None]:
import json
import pandas as pd
import spacy
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
from nltk.corpus import stopwords

# -----------------------------
# 1. Setup and Initialization
# -----------------------------

# Download NLTK stopwords if not already downloaded
nltk.download('stopwords')
german_stopwords = set(stopwords.words('german'))

# Initialize spaCy German model
nlp = spacy.load('de_core_news_sm')

# Initialize Sentence Transformer Model
# You can choose a suitable pre-trained model. 'paraphrase-multilingual-MiniLM-L12-v2' is effective for German.
embedding_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# -----------------------------
# 2. Parsing and Flattening NACE JSON with Hierarchical Descriptions
# -----------------------------

def extract_nace_codes(nace_list, parent_descriptions=[], parent_code=''):
    """
    Recursively extract NACE codes with concatenated parent descriptions.
    
    Args:
        nace_list (list): List of NACE code dictionaries.
        parent_descriptions (list): List of parent descriptions.
        parent_code (str): Concatenated parent codes.
        
    Returns:
        list: List of dictionaries with 'code' and 'full_description'.
    """
    records = []
    for item in nace_list:
        code = item['code']
        description = item['description']
        full_code = f"{parent_code}{code}" if parent_code else code
        # Concatenate parent descriptions
        concatenated_description = ' '.join(parent_descriptions + [description])
        records.append({'code': full_code, 'full_description': concatenated_description})
        # Recursively extract child codes
        if item.get('children'):
            records.extend(extract_nace_codes(
                item['children'],
                parent_descriptions + [description],
                parent_code=full_code + '.'
            ))
    return records

# Load NACE codes from JSON file
with open(nacecode_array_obj_du, 'r', encoding='utf-8') as file:
    nace_data = json.load(file)

# Extract and flatten NACE codes with hierarchical descriptions
# nace_records = extract_nace_codes(nace_data)
# Convert the dictionary to a DataFrame
nace_df = pd.DataFrame.from_dict(nace_data, orient='index')

# Reset the index to turn the NACE codes into a column
nace_df.reset_index(inplace=True)

# Rename the 'index' column to 'nace_code'
nace_df.rename(columns={'index': 'code'}, inplace=True)
# nace_df['synonyms'] = nace_df['synonyms'].apply(', '.join)

nace_df = pd.DataFrame(nace_data.items(), columns=['code', 'full_description'])
# nace_df['full_description'] = nace_df[['description','synonyms']].astype(str).agg(' '.join, axis=1)

# Display first few NACE codes
print("Flattened NACE Codes with Hierarchical Descriptions:")
print(nace_df.head())

# -----------------------------
# 3. Loading and Preprocessing the Business Dataset
# -----------------------------

# Load your CSV data
data = pd.read_csv(dataFile, delimiter=',', encoding='utf-8')

# Display first few rows of the dataset
print("\nSample Business Data:")
print(data.head())

# Combine relevant text columns into a single 'combined_text' column
data['combined_text'] = data[['title', 'description', 'long_description', 'branchen']].astype(str).agg(' '.join, axis=1)

# -----------------------------
# 4. Text Preprocessing Function
# -----------------------------

def preprocess_text(text):
    """
    Preprocess text by lowercasing, removing stopwords, lemmatizing, and removing punctuation.
    
    Args:
        text (str): Input text.
        
    Returns:
        str: Preprocessed text.
    """
    doc = nlp(text.lower())
    tokens = [
        token.lemma_ for token in doc 
        if not token.is_stop and not token.is_punct and token.lemma_ not in german_stopwords
    ]
    return ' '.join(tokens)

# Apply preprocessing to business combined text
data['processed_text'] = data['combined_text'].apply(preprocess_text)

# Apply preprocessing to NACE full descriptions
nace_df['processed_description'] = nace_df['full_description'].apply(preprocess_text)

# Display preprocessed texts
print("\nPreprocessed Business Text:")
print(data['processed_text'].head())

print("\nPreprocessed NACE Descriptions:")
print(nace_df[['code', 'processed_description']].head())

# -----------------------------
# 5. Generating Embeddings
# -----------------------------

# Generate embeddings for NACE codes
nace_embeddings = embedding_model.encode(nace_df['processed_description'].tolist(), convert_to_tensor=False, show_progress_bar=True)

# Generate embeddings for business data
business_embeddings = embedding_model.encode(data['processed_text'].tolist(), convert_to_tensor=False, show_progress_bar=True)

# Convert embeddings to numpy arrays
nace_embeddings = np.array(nace_embeddings)
business_embeddings = np.array(business_embeddings)

# -----------------------------
# 6. Dimensionality Reduction (Optional)
# -----------------------------

# If you wish to reduce the dimensionality of embeddings (e.g., for visualization or to speed up computations),
# you can use techniques like PCA, t-SNE, or UMAP. However, for similarity computations, it's often best to keep
# embeddings in their original high-dimensional space.

# Example: Using PCA to reduce to 100 dimensions
# from sklearn.decomposition import PCA
# pca = PCA(n_components=100)
# nace_embeddings_reduced = pca.fit_transform(nace_embeddings)
# business_embeddings_reduced = pca.transform(business_embeddings)

# For this script, we'll proceed without dimensionality reduction.

# -----------------------------
# 7. Computing Similarity and Assigning NACE Codes
# -----------------------------

# Compute cosine similarity between each business embedding and all NACE embeddings
# This can be memory-intensive for large datasets. Consider processing in batches if needed.

# To optimize memory usage, process in smaller batches
def assign_nace_codes(business_embeds, nace_embeds, nace_codes, nace_dec, batch_size=1000, similarity_threshold=0.1):
    """
    Assign NACE codes to business embeddings based on cosine similarity.
    
    Args:
        business_embeds (np.array): Array of business embeddings.
        nace_embeds (np.array): Array of NACE embeddings.
        nace_codes (list): List of NACE codes corresponding to nace_embeds.
        nace_dec (list): List of NACE descriptions corresponding to nace_embeds.
        batch_size (int): Number of samples to process in each batch.
        similarity_threshold (float): Minimum similarity score to consider a match.
        
    Returns:
        list: Assigned NACE codes for each business.
        list: Corresponding similarity scores.
        list: Corresponding NACE descriptions.
    """
    assigned_codes = []
    similarity_scores = []
    nace_des = []
    num_business = business_embeds.shape[0]
    
    for start in range(0, num_business, batch_size):
        end = min(start + batch_size, num_business)
        batch = business_embeds[start:end]
        
        # Compute cosine similarity
        similarity = cosine_similarity(batch, nace_embeds)
        
        # Process each business in the batch
        for sim in similarity:
            # Get indices of similarities above the threshold
            above_threshold_indices = np.where(sim >= similarity_threshold)[0]
            # Sort indices by similarity score in descending order
            sorted_indices = above_threshold_indices[np.argsort(-sim[above_threshold_indices])]
            # Select top 3 indices
            top_indices = sorted_indices[:3]
            
            if len(top_indices) > 0:
                assigned_codes.append([nace_codes[idx] for idx in top_indices])
                similarity_scores.append([sim[idx] for idx in top_indices])
                nace_des.append([nace_dec[idx] for idx in top_indices])
            else:
                assigned_codes.append(['Unclassified'])
                similarity_scores.append([0])
                nace_des.append(['Unclassified'])
        
        print(f"Processed batch {start} to {end} of {num_business}")
    
    return assigned_codes, similarity_scores, nace_des

# Assign NACE codes to business data
assigned_nace_codes, similarity_scores, nace_des = assign_nace_codes(
    business_embeddings, 
    nace_embeddings, 
    nace_df['code'].tolist(), 
    nace_df['processed_description'].tolist(), 
    batch_size=1000, 
    similarity_threshold=0.1  # Adjust based on your needs
)

# Add the assigned codes and similarity scores to the dataframe
data['predicted_nace_code'] = assigned_nace_codes
data['similarity_score'] = similarity_scores
data['nace_des'] = nace_des

# -----------------------------
# 8. Saving the Labeled Dataset
# -----------------------------

# Save the labeled data to a new CSV file
# data.to_csv('labeled_data_with_nace.csv', index=False, encoding='utf-8')

print("\nNACE Code assignment completed and saved to 'labeled_data_with_nace.csv'.")


In [15]:
data['processed_text'].to_csv('preprocessing_data.csv')

In [None]:

import json
import pandas as pd
import spacy
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
from nltk.corpus import stopwords
import re
import nltk
from nltk.corpus import stopwords
from datetime import datetime
from nltk.stem import SnowballStemmer
# Preprocess text function
def preprocess_text(text):
    if pd.isnull(text):
        return ''
    text = text.lower()
    text = re.sub(r'http\S+|www.\S+|\S+@\S+|\b\d{10,}\b', '', text)
    text = re.sub(r'[^a-zA-ZäöüÄÖÜß\s]', '', text)
    text = ' '.join(text.split())
    stop_words = set(stopwords.words('german'))
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    stemmer = SnowballStemmer('german')
    tokens = [stemmer.stem(word) for word in tokens]
    text = ' '.join(tokens)
    return text

# Load NACE codes
def load_nace_codes(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        nace_codes = json.load(file)
    return nace_codes

# Create embeddings
def create_embeddings(texts, model):
    return model.encode(texts, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True)

# Map NACE code
def map_nace_code(text, nace_codes, nace_embeddings, model):
    text_embedding = create_embeddings([text], model)[0]
    similarities = cosine_similarity([text_embedding], nace_embeddings)[0]
    best_match_idx = np.argmax(similarities)
    return list(nace_codes.keys())[best_match_idx]

def add_nace_codes(df, nace_codes, nace_embeddings, model):
    df['nace_code'] = df.apply(
        lambda row: map_nace_code(
            preprocess_text(' '.join([
                str(row.get('title', '')),
                str(row.get('description', '')),
                str(row.get('long_description', '')),
                str(row.get('branchen', ''))
            ])), nace_codes, nace_embeddings, model
        ), axis=1
    )
    return df

# Map NACE codes to DataFrame
def map_nace_codes(df, nace_codes, nace_embeddings, model):
    df = add_nace_codes(df, nace_codes, nace_embeddings, model)
    df['nace_description'] = df['nace_code'].apply(lambda code: nace_codes.get(code, ""))
    return df

# Load sample datasets
sellers_df = pd.read_csv(dataFile)

# Load the Sentence Transformer model
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# Load and preprocess NACE codes
nace_codes = load_nace_codes(nacecode_array_obj_du)
print("🚀 ~ nace_codes:", nace_codes)

# The values in nace_codes are already the descriptions, so we can use them directly
nace_descriptions = [preprocess_text(description) for description in nace_codes.values()]

# Create embeddings for NACE descriptions
nace_embeddings = create_embeddings(nace_descriptions, model)

sellers_df['post_precessed_text'] = sellers_df.apply( lambda row: preprocess_text(' '.join([str(row.get('title', '')), str(row.get('description', '')), str(row.get('long_description', '')), str(row.get('branchen', ''))])), axis=1)
sellers_df = map_nace_codes(sellers_df, nace_codes, nace_embeddings, model)
# Display the first few rows of the updated DataFrame
print("\nSellers DataFrame with NACE codes:")
print(sellers_df.head())


In [4]:
import json
import pandas as pd
import spacy
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
from nltk.corpus import stopwords
import re
from nltk.stem import SnowballStemmer

# Ensure necessary NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Load SpaCy's German model with NER and POS capabilities
try:
    nlp = spacy.load('de_core_news_sm')
except OSError:
    from spacy.cli import download
    download('de_core_news_sm')
    nlp = spacy.load('de_core_news_sm')

# Preprocess text function with NER and POS tagging
def preprocess_text(text, nlp_model):
    if pd.isnull(text):
        return ''
    text = text.lower()
    # Remove URLs, emails, and long numbers
    text = re.sub(r'http\S+|www.\S+|\S+@\S+|\b\d{10,}\b', '', text)
    # Remove non-alphabetic characters (keeping German characters)
    text = re.sub(r'[^a-zA-ZäöüÄÖÜß\s]', '', text)
    text = ' '.join(text.split())

    # Initialize stopwords and stemmer
    stop_words = set(stopwords.words('german'))
    stemmer = SnowballStemmer('german')

    # Process text with SpaCy
    doc = nlp_model(text)

    tokens = []
    for token in doc:
        # Retain nouns, proper nouns, and verbs
        if token.pos_ in {'NOUN', 'PROPN', 'VERB'} and token.text not in stop_words:
            stemmed = stemmer.stem(token.text)
            tokens.append(stemmed)

    # Extract named entities and include them
    entities = [ent.text for ent in doc.ents if ent.label_ in {'ORG', 'PRODUCT', 'GPE'}]
    entities = [stemmer.stem(ent.lower()) for ent in entities if ent.lower() not in stop_words]
    tokens.extend(entities)

    text = ' '.join(tokens)
    return text

# Load NACE codes
def load_nace_codes(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        nace_codes = json.load(file)
    return nace_codes

# Create embeddings
def create_embeddings(texts, model):
    return model.encode(texts, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True)

# Load Data
def load_data(sellers_filepath, nace_codes_filepath):
    sellers_df = pd.read_csv(sellers_filepath)
    nace_codes = load_nace_codes(nace_codes_filepath)
    return sellers_df, nace_codes

# Filepaths (replace with your actual file paths)
sellers_filepath = buyer_file_nace  # Path to your sellers CSV file
nace_codes_filepath = nacecode_array_obj_du  # Path to your NACE codes JSON file

# Load data
sellers_df, nace_codes = load_data(sellers_filepath, nace_codes_filepath)
print("🚀 ~ Sellers and NACE codes loaded.")

# Initialize the Sentence Transformer model
model_name = 'all-MiniLM-L6-v2'
# model_name = 'paraphrase-multilingual-mpnet-base-v2'
model = SentenceTransformer(model_name)
print(f"🚀 ~ Loaded SentenceTransformer model: {model_name}")

# Preprocess NACE descriptions
nace_descriptions = [preprocess_text(desc, nlp) for desc in nace_codes.values()]
nace_embeddings = create_embeddings(nace_descriptions, model)
nace_code_list = list(nace_codes.keys())
print("🚀 ~ Created embeddings for NACE descriptions.")

# Preprocess 'branchen' field in sellers data
sellers_df['preprocessed_branchen'] = sellers_df['branchen'].apply(lambda x: preprocess_text(x, nlp))

# Create embeddings for 'branchen'
branchen_embeddings = create_embeddings(sellers_df['preprocessed_branchen'].tolist(), model)
print("🚀 ~ Created embeddings for 'branchen' field.")

# Compute cosine similarity between 'branchen' embeddings and NACE embeddings
similarities = cosine_similarity(branchen_embeddings, nace_embeddings)

# Assign the NACE code with the highest similarity
best_match_indices = similarities.argmax(axis=1)
sellers_df['assigned_nace_code'] = [nace_code_list[idx] for idx in best_match_indices]
sellers_df['assigned_nace_similarity'] = [similarities[i][idx] for i, idx in enumerate(best_match_indices)]
print("🚀 ~ Assigned preliminary NACE codes based on 'branchen' similarity.")

# Optional: Set a similarity threshold to filter uncertain assignments
similarity_threshold = 0.5
sellers_df['nace_code'] = sellers_df.apply(
    lambda row: row['assigned_nace_code'] if row['assigned_nace_similarity'] >= similarity_threshold else 'Unassigned',
    axis=1
)

# Save the preliminary assignments
sellers_df.to_csv(sellers_filepath, index=False)
print("🚀 ~ Saved sellers data with preliminary NACE code assignments to 'sellers_with_preassigned_nace.csv'.")

# Review the assignments
print("\nSample of NACE Code Assignments:")
print(sellers_df[['branchen', 'assigned_nace_code', 'assigned_nace_similarity', 'nace_code']].head())




[nltk_data] Downloading package punkt to /Users/abbasm1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abbasm1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


🚀 ~ Sellers and NACE codes loaded.
🚀 ~ Loaded SentenceTransformer model: all-MiniLM-L6-v2


Batches:   0%|          | 0/31 [00:00<?, ?it/s]

🚀 ~ Created embeddings for NACE descriptions.


Batches:   0%|          | 0/24 [00:00<?, ?it/s]

🚀 ~ Created embeddings for 'branchen' field.
🚀 ~ Assigned preliminary NACE codes based on 'branchen' similarity.
🚀 ~ Saved sellers data with preliminary NACE code assignments to 'sellers_with_preassigned_nace.csv'.

Sample of NACE Code Assignments:
                                            branchen assigned_nace_code  \
0  Dienstleistung; Baugewerbe; Grundstücks- und W...              G46.9   
1                                                NaN            D35.1.4   
2                                                NaN            D35.1.4   
3  Verarbeitendes Gewerbe > Herstellung von elekt...            C28.9.2   
4                                                NaN            D35.1.4   

   assigned_nace_similarity nace_code  
0                  0.591516     G46.9  
1                  1.000000   D35.1.4  
2                  1.000000   D35.1.4  
3                  0.692239   C28.9.2  
4                  1.000000   D35.1.4  
