# European Company Data Enrichment with Mistral AI & Serper

Author: Sascha Seniuk
GitHub: @saschaseniuk
Affiliation: AIscream - Peisker & Seniuk GbR

A streamlined tool for enriching European company data using Mistral AI and Serper.dev for reliable web search.

## Setup and Installation

In [None]:
!pip install mistralai requests beautifulsoup4 trafilatura pydantic python-dotenv

In [None]:
import os
import json
import requests
from datetime import datetime
from typing import List, Optional, Dict, Any, Literal
from pydantic import BaseModel, Field
from bs4 import BeautifulSoup
import trafilatura

from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

# API key setup. Resolution order:
#   1. Google Colab secrets (when running on Colab),
#   2. a local .env file / process environment variables,
#   3. an interactive prompt for anything still missing.
try:
    from google.colab import userdata
    MISTRAL_API_KEY = userdata.get('MISTRAL_API_KEY')
    SERPER_API_KEY = userdata.get('SERPER_API_KEY')
except Exception:
    # Not on Colab (ImportError) or the secret lookup failed. Catching
    # Exception instead of using a bare `except:` lets KeyboardInterrupt and
    # SystemExit propagate instead of being silently swallowed.
    from dotenv import load_dotenv
    load_dotenv()
    MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
    SERPER_API_KEY = os.getenv("SERPER_API_KEY")

# Prompt with getpass rather than input() so the secret is not echoed into
# the (saved) notebook output.
from getpass import getpass

if not MISTRAL_API_KEY:
    MISTRAL_API_KEY = getpass("Enter MISTRAL_API_KEY: ")
if not SERPER_API_KEY:
    SERPER_API_KEY = getpass("Enter SERPER_API_KEY (get from https://serper.dev): ")

mistral_client = MistralClient(api_key=MISTRAL_API_KEY)

## Data Models

In [None]:
class CompanyProfile(BaseModel):
    """Enriched company record returned by ``enrich_company``.

    Only ``legal_name`` and ``country`` are required; every other field has a
    default so a profile can be built from partial extraction results.
    """
    # Identity
    legal_name: str
    country: str  # enrich_company stores the caller's country code, or "Unknown"
    # Legal identifiers
    vat_id: Optional[str] = None
    registration_number: Optional[str] = None
    # Contact details
    address: Optional[str] = None
    phone: Optional[str] = None
    email: Optional[str] = None
    website: Optional[str] = None
    # Free-text classification
    industry: Optional[str] = None
    description: Optional[str] = None
    # Set by enrich_company to the fraction of its critical fields that were filled
    confidence_score: float = 0.0
    # URLs of the pages an extraction succeeded on (default_factory avoids a shared list)
    sources: List[str] = Field(default_factory=list)

class SearchRequest(BaseModel):
    """Input model describing one company lookup.

    NOTE(review): not referenced by the functions visible in this notebook.
    """
    company_name: str
    # Two-letter country code such as DE, FR, ES or IT; None when unknown
    country: Optional[str] = None  # DE, FR, ES, IT, etc.

## Search and Scraping Functions

In [None]:
# European search query templates, keyed by upper-case country code.
# Each entry is a str.format template with a single `{company}` placeholder that
# search_company_urls fills with the company name. The lists are ordered by
# priority, because search_company_urls only issues the leading entries.
# The "default" list is the fallback for a missing or unknown country.
SEARCH_TEMPLATES = {
    "DE": [
        '{company} impressum',
        '{company} kontakt adresse',
        '{company} offizielle website'
    ],
    "FR": [
        '{company} mentions légales',
        '{company} contact adresse',
        '{company} site officiel'
    ],
    "ES": [
        '{company} aviso legal',
        '{company} contacto dirección',
        '{company} sitio oficial'
    ],
    "IT": [
        '{company} note legali',
        '{company} contatti indirizzo',
        '{company} sito ufficiale'
    ],
    "default": [
        '{company} legal information',
        '{company} contact address',
        '{company} official website'
    ]
}

def search_company_urls(
    company_name: str,
    country: Optional[str] = None,
    max_queries: int = 2,
    max_urls: int = 5,
) -> List[str]:
    """Search for candidate company URLs using the Serper.dev search API.

    Args:
        company_name: Name substituted into the localized query templates.
        country: Two-letter country code (case-insensitive, e.g. "DE" or "de").
            Unknown or missing codes fall back to the "default" templates and
            the "us" search locale.
        max_queries: How many templates (taken in order) are issued as queries.
        max_urls: Maximum number of unique URLs returned.

    Returns:
        Unique result URLs in first-seen order, at most ``max_urls`` long.
        Failed queries are reported on stdout and skipped, so the list may be
        empty.
    """
    # Normalize so "de", " De " and "DE" all select the German templates;
    # previously any non-upper-case code silently got the English defaults.
    country_code = (country or "").strip().upper() or None
    templates = SEARCH_TEMPLATES.get(country_code, SEARCH_TEMPLATES["default"])

    # Loop-invariant request settings.
    headers = {
        "X-API-KEY": SERPER_API_KEY,
        "Content-Type": "application/json"
    }
    locale = country_code.lower() if country_code else "us"

    all_urls = []

    for template in templates[:max_queries]:
        query = template.format(company=company_name)
        payload = {"q": query, "num": 5, "gl": locale}

        try:
            response = requests.post(
                "https://google.serper.dev/search",
                json=payload,
                headers=headers,
                timeout=10
            )

            if response.status_code == 200:
                organic = response.json().get("organic", [])
                # Skip entries without a link instead of letting one malformed
                # result raise KeyError and discard the rest of this query.
                all_urls.extend(item["link"] for item in organic if item.get("link"))
                print(f"Found {len(organic)} results for: {query}")
            else:
                print(f"Search failed for '{query}': {response.status_code}")

        except Exception as e:
            # Best-effort: one failing query must not abort the others.
            print(f"Error searching '{query}': {e}")

    # dict.fromkeys de-duplicates while keeping first-seen order.
    unique_urls = list(dict.fromkeys(all_urls))[:max_urls]
    print(f"Total unique URLs found: {len(unique_urls)}")
    return unique_urls

def scrape_url(url: str, max_chars: int = 5000) -> Dict[str, Any]:
    """Download a page and return its main text content.

    Args:
        url: Page to fetch.
        max_chars: Upper bound on the length of the returned text.

    Returns:
        A dict with keys ``url``, ``text`` and ``success``. On failure
        ``success`` is False, ``text`` is empty and ``error`` holds the
        exception message. Exceptions are never raised to the caller.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Without an explicit charset in Content-Type, requests decodes text/*
        # bodies as ISO-8859-1, which garbles umlauts and accents on UTF-8
        # pages. Use the encoding detected from the body in that case.
        content_type = response.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            response.encoding = response.apparent_encoding or response.encoding
        html = response.text

        # Use trafilatura for clean text extraction
        text = trafilatura.extract(html, include_formatting=False)

        if not text:
            # Fallback to BeautifulSoup: strip non-content tags, keep visible text
            soup = BeautifulSoup(html, 'html.parser')
            for tag in soup(["script", "style"]):
                tag.decompose()
            text = soup.get_text(separator='\n', strip=True)

        return {
            "url": url,
            "text": text[:max_chars] if text else "",  # Limit text length
            "success": True
        }

    except Exception as e:
        # Best-effort scraper: report the failure in the result instead of raising.
        return {
            "url": url,
            "text": "",
            "success": False,
            "error": str(e)
        }

## Mistral AI Data Extraction

In [None]:
def extract_company_data(text: str, company_name: str, source_url: str) -> Dict[str, Any]:
    """Ask Mistral AI to pull structured company fields out of page text.

    The model is forced to call a single function tool whose string
    parameters mirror the company fields; the tool-call arguments are the
    extraction result.

    Returns a dict with ``success``, ``data`` (parsed arguments, or ``{}``),
    ``source`` (the given URL) and, on failure, ``error``. Never raises.
    """
    # Field name -> description shown to the model; all fields are strings.
    field_descriptions = {
        "legal_name": "Official company name",
        "vat_id": "VAT number (USt-IdNr, TVA, IVA, etc.)",
        "registration_number": "Registration number (HRB, SIRET, etc.)",
        "address": "Full business address",
        "phone": "Phone number",
        "email": "Email address",
        "website": "Website URL",
        "industry": "Industry sector",
        "description": "Company description",
    }
    tool_spec = {
        "type": "function",
        "function": {
            "name": "extract_company_info",
            "description": "Extract European company information from text",
            "parameters": {
                "type": "object",
                "properties": {
                    name: {"type": "string", "description": hint}
                    for name, hint in field_descriptions.items()
                },
                "required": ["legal_name"],
            },
        },
    }

    system_prompt = f"""Extract company information from the text. The company is likely '{company_name}'.
Focus on legal registration data, VAT numbers, and contact information.
Only extract data explicitly stated in the text."""

    # Only the first 2000 characters of the page are sent to the model.
    user_prompt = f"""Extract company information from this text:\n\n{text[:2000]}\n\nSource: {source_url}"""

    try:
        conversation = [
            ChatMessage(role="system", content=system_prompt),
            ChatMessage(role="user", content=user_prompt),
        ]
        completion = mistral_client.chat(
            model="mistral-large-latest",
            messages=conversation,
            tools=[tool_spec],
            tool_choice="any",
        )

        calls = completion.choices[0].message.tool_calls
        if not calls:
            return {"success": False, "data": {}, "source": source_url, "error": "No extraction"}

        # The first tool call carries the extracted fields as a JSON string.
        parsed = json.loads(calls[0].function.arguments)
        return {"success": True, "data": parsed, "source": source_url}

    except Exception as e:
        return {"success": False, "data": {}, "source": source_url, "error": str(e)}

## Main Enrichment Function

In [None]:
def enrich_company(company_name: str, country: str = None) -> CompanyProfile:
    """Run the full pipeline: search, scrape, extract, merge.

    Progress is printed at every step. When no URL is found or nothing can
    be scraped, a profile holding only the given name and country (or
    "Unknown") with confidence 0.0 is returned.
    """
    country_label = country or "Unknown"

    def bare_profile() -> CompanyProfile:
        # Fallback result when the pipeline has nothing to work with.
        return CompanyProfile(
            legal_name=company_name,
            country=country_label,
            confidence_score=0.0,
        )

    print(f"🔍 Enriching: {company_name} ({country or 'No country specified'})")

    # Step 1: find candidate pages
    candidate_urls = search_company_urls(company_name, country)
    if not candidate_urls:
        print("❌ No URLs found")
        return bare_profile()

    # Step 2: download them, keeping only pages that yielded text
    print(f"🕷️ Scraping {len(candidate_urls)} URLs...")
    pages = []
    for page_url in candidate_urls:
        page = scrape_url(page_url)
        if not (page["success"] and page["text"]):
            print(f"  ❌ {page_url} - {page.get('error', 'No content')}")
            continue
        pages.append(page)
        print(f"  ✅ {page_url} - {len(page['text'])} chars")

    if not pages:
        print("❌ No content scraped")
        return bare_profile()

    # Step 3: run the Mistral extraction on every scraped page
    print(f"🤖 Extracting data from {len(pages)} pages...")
    successful = []
    for page in pages:
        outcome = extract_company_data(page["text"], company_name, page["url"])
        if outcome["success"]:
            successful.append(outcome)
            print(f"  ✅ Extracted from {page['url']}")

    # Step 4: merge, where the first non-empty value seen for a field wins
    sources = [outcome["source"] for outcome in successful]
    merged = {}
    for outcome in successful:
        for key, val in outcome["data"].items():
            if val:
                merged.setdefault(key, val)

    # Confidence is the share of critical fields that were filled
    critical_fields = ("legal_name", "vat_id", "address", "website")
    filled = [name for name in critical_fields if merged.get(name)]
    confidence = len(filled) / len(critical_fields)

    optional_fields = (
        "vat_id",
        "registration_number",
        "address",
        "phone",
        "email",
        "website",
        "industry",
        "description",
    )
    profile = CompanyProfile(
        legal_name=merged.get("legal_name", company_name),
        country=country_label,
        confidence_score=confidence,
        sources=sources,
        **{name: merged.get(name) for name in optional_fields},
    )

    print(f"✨ Enrichment complete! Confidence: {confidence:.1%}")
    return profile

## Examples and Usage

In [None]:
# Example 1: German company
# Runs the full pipeline (Serper searches, page downloads, Mistral calls), so
# this cell needs network access and both API keys.
result = enrich_company("Siemens", "DE")

# Print a summary of the returned CompanyProfile; unfilled fields show as None.
print("\n" + "="*50)
print(f"Company: {result.legal_name}")
print(f"Country: {result.country}")
print(f"VAT ID: {result.vat_id}")
print(f"Registration: {result.registration_number}")
print(f"Address: {result.address}")
print(f"Website: {result.website}")
print(f"Industry: {result.industry}")
print(f"Confidence: {result.confidence_score:.1%}")
print(f"Sources: {len(result.sources)} pages")

In [None]:
# Example 2: French company
# "FR" selects the French query templates and the "fr" search locale.
result_fr = enrich_company("Mistral AI", "FR")

# Print a summary of the returned CompanyProfile; unfilled fields show as None.
print("\n" + "="*50)
print(f"Company: {result_fr.legal_name}")
print(f"Country: {result_fr.country}")
print(f"VAT ID: {result_fr.vat_id}")
print(f"Address: {result_fr.address}")
print(f"Website: {result_fr.website}")
print(f"Description: {result_fr.description}")
print(f"Confidence: {result_fr.confidence_score:.1%}")

In [None]:
# Batch processing example: enrich several companies and collect one summary
# dict per company. A failure for one company is recorded and does not stop
# the rest of the batch.
companies = [
    {"name": "BMW", "country": "DE"},
    {"name": "LVMH", "country": "FR"},
    {"name": "Telefonica", "country": "ES"}
]

results = []
for company in companies:
    try:
        profile = enrich_company(company["name"], company["country"])
    except Exception as e:
        print(f"❌ {company['name']} - Error: {e}")
        results.append({"name": company["name"], "error": str(e)})
    else:
        results.append({
            "name": profile.legal_name,
            "country": profile.country,
            "vat_id": profile.vat_id,
            "website": profile.website,
            "confidence": f"{profile.confidence_score:.1%}"
        })
        print(f"✅ {company['name']} - {profile.confidence_score:.1%}")

# Display results
import json
print("\n" + "="*60)
print("BATCH RESULTS:")
# ensure_ascii=False keeps umlauts and accents in legal names readable
# instead of printing them as \uXXXX escapes.
print(json.dumps(results, indent=2, ensure_ascii=False))

## Setup Instructions

1. **Get your API keys**: go to https://serper.dev and sign up for a free account to get a Serper API key. You also need a Mistral API key.
2. **Provide both keys**: on Google Colab, add them as notebook secrets; otherwise create a `.env` file with:
   ```
   MISTRAL_API_KEY=your_mistral_key
   SERPER_API_KEY=your_serper_key
   ```
3. **Run the notebook** - if a key is still missing, the setup cell prompts you to enter it.

## Key Features

- ✅ **Reliable Search** with Serper.dev API (no Google blocking)
- ✅ **European Language Support** (DE, FR, ES, IT)
- ✅ **Mistral AI Function Calling** for structured extraction
- ✅ **Clean Text Extraction** with trafilatura
- ✅ **Confidence Scoring** based on data completeness
- ✅ **Batch Processing** for multiple companies
- ✅ **Slim & Fast** - under 200 lines of code!