In [4]:
import requests
import json
import os


def get_all_companies(url="http://localhost:5000/api/companies", timeout=30):
    """Download the company list from the API and cache it as JSON on disk.

    The parsed response is written to ``../data/companies.json`` (the
    directory is created when missing) and a short summary is printed.

    Args:
        url: Endpoint that returns the companies as JSON. Defaults to the
            local Flask server address used by this notebook.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` waits indefinitely on an unresponsive server,
            which would hang the cell.

    Returns:
        The parsed JSON payload, or ``None`` when the request or the file
        write failed (the error is printed instead of being raised).
    """
    try:
        response = requests.get(url, timeout=timeout)

        # Turn 4xx/5xx answers into exceptions so an error page is never cached.
        response.raise_for_status()

        companies = response.json()

        # Create data directory if it doesn't exist
        os.makedirs('../data', exist_ok=True)

        # Save to a JSON file in the data directory
        file_path = os.path.join('../data', 'companies.json')
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(companies, f, indent=2)
        print(f"Companies data saved to {file_path}")

        # Print summary
        print(f"\nSaved {len(companies)} companies to file")
        return companies

    except requests.exceptions.RequestException as e:
        print(f"Error occurred: {str(e)}")
    except IOError as e:
        print(f"File error occurred: {str(e)}")
    return None


# Fetch and cache the company list when this cell is executed. Inside a
# Jupyter kernel __name__ is "__main__", so the guard passes on every run.
if __name__ == "__main__":
    get_all_companies()

Companies data saved to ../data/companies.json

Saved 229 companies to file


In [5]:
# Read the companies data from the JSON file.
# Only the load itself is guarded: a problem while printing the listing must
# not throw away data that was loaded correctly.
file_path = os.path.join('../data', 'companies.json')
try:
    with open(file_path, 'r', encoding='utf-8') as f:
        companies_data = json.load(f)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    companies_data = []
except json.JSONDecodeError:
    print(f"Error: Invalid JSON format in {file_path}")
    companies_data = []
except Exception as e:
    print(f"Error occurred while reading file: {str(e)}")
    companies_data = []
else:
    # Kept on one line: a line break inside {...} of an f-string is a
    # SyntaxError on Python versions before 3.12.
    print(f"Successfully loaded {len(companies_data)} companies from {file_path}")

    # Print out company names and IDs
    print("\nCompany Details:")
    print("-" * 50)
    for company in companies_data:
        # .get() keeps a single incomplete record from aborting the listing.
        company_id = str(company.get('id', 'N/A'))
        company_name = company.get('companyName', 'N/A')
        print(f"ID: {company_id:<8} | Name: {company_name}")
    print("-" * 50)

Successfully loaded 229 companies from ../data/companies.json

Company Details:
--------------------------------------------------
ID: 0        | Name: FLYONIT
ID: 174      | Name: Datto Corporation
ID: 184      | Name: 6's Pizza and Pasta
ID: 185      | Name: Ace Real Estate
ID: 186      | Name: AICCI
ID: 187      | Name: Airmet
ID: 188      | Name: Akaysha Energy
ID: 189      | Name: ANGAD Australian Institute of Technology
ID: 190      | Name: Astracorp
ID: 191      | Name: Atlas Civil Construction Pty Ltd
ID: 192      | Name: Australian Friends of Asha
ID: 193      | Name: Bayside Security Doors and Shower Screens
ID: 194      | Name: Caliyan Associates
ID: 195      | Name: Carson Homes
ID: 196      | Name: Charlie Teo Foundation
ID: 197      | Name: City Central Apartments
ID: 198      | Name: Commscode
ID: 199      | Name: Consulate General of India
ID: 200      | Name: Covet International
ID: 201      | Name: Eastside Veterinary Pty Ltd
ID: 202      | Name: EmbroidMe Melton
ID: 

In [6]:
# Pull the 'id' field out of every loaded company record, keeping file order.
company_ids = [record['id'] for record in companies_data]
print(company_ids)


[0, 174, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 272, 274, 276, 277, 278, 280, 282, 283, 284, 285, 286, 287, 288, 289, 291, 292, 293, 295, 296, 297, 298, 301, 302, 303, 304, 305, 306, 308, 309, 310, 311, 313, 315, 316, 317, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 331, 332, 333, 334, 337, 338, 339, 340, 341, 345, 347, 349, 351, 352, 353, 355, 356, 357, 358, 359, 360, 364, 366, 367, 368, 370, 373, 374, 375, 376, 381, 382, 383, 385, 386, 389, 391, 393, 394, 395, 396, 397, 398, 400, 401, 402, 404, 406, 409, 410, 411, 412, 413, 414, 415, 416, 417, 419, 420, 421, 422, 423, 426, 427, 428, 429, 4

In [11]:
# Get contacts and domains for all companies
CONTACTS_URL = "http://localhost:5000/api/company/{company_id}/contacts"
EMAIL_FIELDS = ('emailAddress', 'emailAddress2', 'emailAddress3')


def extract_email_domains(contacts, max_contacts=2):
    """Collect the e-mail domains used by the first few contacts.

    Args:
        contacts: Iterable of contact dicts. The address fields listed in
            ``EMAIL_FIELDS`` are read; missing, empty or non-string values
            are ignored.
        max_contacts: Stop after this many contacts that contributed at
            least one usable address.

    Returns:
        A sorted list of unique, lower-cased domains (empty when none were
        found). Sorting keeps the saved file stable between runs.
    """
    domains = set()
    used_contacts = 0

    for contact in contacts:
        if used_contacts >= max_contacts:
            break
        if not isinstance(contact, dict):
            continue

        contact_domains = set()
        for field in EMAIL_FIELDS:
            email = contact.get(field)
            if not isinstance(email, str):
                continue
            # rpartition reports whether an '@' was present at all, so a
            # malformed value is skipped instead of being stored as a domain.
            _local, at_sign, domain = email.strip().rpartition('@')
            # Domain names are case-insensitive; normalise so later
            # comparisons against lower-case domain lists match.
            domain = domain.lower()
            if at_sign and domain:
                contact_domains.add(domain)

        # Only contacts that yielded a domain count towards the limit.
        if contact_domains:
            domains |= contact_domains
            used_contacts += 1

    return sorted(domains)


domains_by_company = {}

try:
    for company_id in company_ids:
        try:
            # The timeout keeps one stalled request from hanging the cell.
            response = requests.get(
                CONTACTS_URL.format(company_id=company_id), timeout=30)
            response.raise_for_status()
            company_domains = extract_email_domains(response.json())
        except (requests.exceptions.RequestException, ValueError) as e:
            # A failed or unparsable response only affects this company.
            print(f"API request error for company {company_id}: {str(e)}")
            company_domains = []

        # Companies without any usable domain are stored as null.
        domains_by_company[company_id] = company_domains or None

    # Save domains to JSON file
    file_path = os.path.join('../data', 'domains.json')
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(domains_by_company, f, indent=2)
    # Kept on one line: a line break inside {...} of an f-string is a
    # SyntaxError on Python versions before 3.12.
    print(f"Successfully saved domains for {len(domains_by_company)} companies to {file_path}")

except Exception as e:
    print(f"Error occurred: {str(e)}")

Successfully saved domains for 229 companies to ../data/domains.json


In [14]:
# List of common personal email domains to exclude
personal_domains = {
    'gmail.com',
    'yahoo.com',
    'yahoo.com.au',
    'hotmail.com',
    'outlook.com',
    'bigpond.com',
    'live.com',
    'icloud.com',
    'mac.com',
    'me.com',
    'msn.com',
    'ymail.com',
    'aol.com',
    "yahoo.co.in"
}


def drop_personal_domains(domains_by_company, excluded_domains):
    """Return a new mapping with the excluded domains removed.

    Args:
        domains_by_company: Mapping of company id to a list of domains, or
            ``None`` when the company has no known domain.
        excluded_domains: Domains to remove. Matching ignores case and
            surrounding whitespace because domain names are
            case-insensitive.

    Returns:
        A new dict with the same keys. A company whose list ends up empty
        is mapped to ``None``; kept domains retain their original spelling.
    """
    excluded = {domain.strip().lower() for domain in excluded_domains}
    cleaned = {}
    for company_id, domains in domains_by_company.items():
        if domains is None:
            cleaned[company_id] = None
            continue
        business_domains = [
            d for d in domains if d.strip().lower() not in excluded]
        # If no business domains remain, set to null
        cleaned[company_id] = business_domains or None
    return cleaned


# Load the domains file
file_path = os.path.join('../data', 'domains.json')
with open(file_path, 'r', encoding='utf-8') as f:
    domains_by_company = json.load(f)

# Process each company's domains
domains_by_company = drop_personal_domains(domains_by_company, personal_domains)

# Save updated domains back to JSON file
with open(file_path, 'w', encoding='utf-8') as f:
    json.dump(domains_by_company, f, indent=2)

print(f"Successfully removed personal email domains and updated {file_path}")


Successfully removed personal email domains and updated ../data/domains.json
