# Enterprise Data Semantic Analysis with BERT

This notebook demonstrates how to use BERT-family models to extract semantic meaning from enterprise data column names and labels.

## Overview
- Load enterprise datasets (HR, Sales, Finance)
- Extract semantic meaning from column names
- Find semantic similarities between different datasets
- Create mappings between semantically similar columns


## Key Use Cases
- Data integration across different systems
- Automated schema mapping
- Semantic understanding of enterprise data


In [None]:
# Import required libraries
import sys
import os
sys.path.append('../src')

from semantic_extractor import EnterpriseSemanticExtractor
from data_loader import EnterpriseDataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Build the semantic extractor on top of the 'bert-base-uncased' checkpoint.
extractor = EnterpriseSemanticExtractor('bert-base-uncased')

# The loader reads from the sibling data directory.
loader = EnterpriseDataLoader('../data')

# Mapping of dataset name -> table (presumably pandas DataFrames; confirm
# against EnterpriseDataLoader).
datasets = loader.get_enterprise_datasets()

# Preview each dataset: its shape, column names and first two rows.
print('Available datasets:')
for name, df in datasets.items():
    print(f"\n{name.upper()}:")
    print(
        f'  Shape: {df.shape}',
        f'  Columns: {list(df.columns)}',
        '  Sample data:',
        sep='\n',
    )
    print(df.head(2))

# Extract semantics from each dataset and print a per-column report.
# The banner's leading newline is written as an escape sequence: a
# single-quoted string literal cannot span physical lines.
print('\n' + '=' * 50)
print('SEMANTIC ANALYSIS RESULTS')
print('=' * 50)

for dataset_name, df in datasets.items():
    print(f"\n{dataset_name.upper()} - Column Semantics:")
    # One entry per column; each entry is indexed below by the keys
    # 'semantic_category', 'data_type' and 'sample_values'.
    semantics = extractor.extract_column_semantics(df)

    for col, info in semantics.items():
        print(f'  {col}:')
        print(f"    - Category: {info['semantic_category']}")
        print(f"    - Data Type: {info['data_type']}")
        # Only the first three sample values are shown to keep output short.
        print(f"    - Sample Values: {info['sample_values'][:3]}")

# Find semantic similarities between every unordered pair of datasets.
# The banner's leading newline is written as an escape sequence: a
# single-quoted string literal cannot span physical lines.
print('\n' + '=' * 50)
print('SEMANTIC MAPPING BETWEEN DATASETS')
print('=' * 50)

dataset_names = list(datasets.keys())

# Pair each dataset with the ones that follow it, so every pair is visited
# exactly once and a dataset is never compared with itself.
for i, source_name in enumerate(dataset_names):
    for target_name in dataset_names[i + 1:]:
        print(f"\nMapping: {source_name.upper()} -> {target_name.upper()}")

        mapping = extractor.create_semantic_mapping(
            datasets[source_name],
            datasets[target_name],
            similarity_threshold=0.7,
        )

        # An empty/falsy mapping means nothing cleared the threshold.
        if not mapping:
            print('  No strong semantic matches found')
            continue

        for source_col, matches in mapping.items():
            for match in matches:
                print(
                    f"  {source_col} -> {match['target_column']} "
                    f"(score: {match['similarity_score']:.3f}, "
                    f"categories: {match['source_category']}->{match['target_category']})"
                )

# Test semantic similarity with custom enterprise terms.
# The banner's leading newline is written as an escape sequence: a
# single-quoted string literal cannot span physical lines.
print('\n' + '=' * 50)
print('SEMANTIC SIMILARITY TESTING')
print('=' * 50)

financial_terms = ['revenue', 'income', 'sales', 'profit', 'earnings']
hr_terms = ['employee', 'staff', 'worker', 'personnel', 'human resources']
temporal_terms = ['date', 'time', 'timestamp', 'period', 'duration']

test_cases = [
    ('Financial Terms', financial_terms),
    ('HR Terms', hr_terms),
    ('Temporal Terms', temporal_terms),
]

for category_name, terms in test_cases:
    print(f"\n{category_name}:")

    # Each term list is compared against itself to score within-group pairs.
    similarities = extractor.find_semantic_similarities(terms, terms, threshold=0.8)

    for term1, term2, score in similarities:
        # Skip the trivial match of a term with itself.
        if term1 != term2:
            print(f"  {term1} <-> {term2}: {score:.3f}")

# Visualize how many columns fall into each semantic category.
# The banner's leading newline is written as an escape sequence: a
# single-quoted string literal cannot span physical lines.
print('\n' + '=' * 50)
print('SEMANTIC CATEGORY DISTRIBUTION')
print('=' * 50)

# Tally of semantic category -> number of columns, across all datasets.
category_counts = {}

for dataset_name, df in datasets.items():
    semantics = extractor.extract_column_semantics(df)

    for col, info in semantics.items():
        category = info['semantic_category']
        category_counts[category] = category_counts.get(category, 0) + 1

# Hand matplotlib plain lists rather than dict views; keys and values of the
# same dict iterate in matching order, so bars stay aligned with their labels.
categories = list(category_counts.keys())
counts = list(category_counts.values())

plt.figure(figsize=(10, 6))
plt.bar(categories, counts, color='skyblue')
plt.title('Distribution of Semantic Categories Across All Datasets')
plt.xlabel('Semantic Category')
plt.ylabel('Number of Columns')
plt.xticks(rotation=45)  # Rotate the category labels on the x-axis.
plt.tight_layout()
plt.show()

print('\nCategory Distribution:')
for category, count in category_counts.items():
    print(f'  {category}: {count} columns')
