In [4]:
# libraries
import datetime
import time
import json
from pymongo import MongoClient
from faker import Faker

# Data generation

In [5]:
class Lab2models:
    """Generate synthetic company/people data and load it into MongoDB under three schemas.

    Collections written by :meth:`data_generator`:

    * Model 1 (referential): ``m1_companies`` and ``m1_people``; each person's
      ``worksIn`` field holds the ``_id`` of a document in ``m1_companies``.
    * Model 2 (company embedded in person): ``m2_people``; each person's
      ``worksIn`` field holds the whole company document.
    * Model 3 (people embedded in company): ``m3_companies``; each company's
      ``staff`` field holds the list of its person documents.
    """

    # Documents sent per insert_many call; also the progress-report interval.
    _BATCH_SIZE = 500
    # How many generated companies / people are echoed to stdout as a preview.
    _PREVIEW_COUNT = 5

    def __init__(self, uri='mongodb://localhost:27017/', db_name='lab2'):
        """Open the MongoDB connection used by every method of this object.

        Args:
            uri: MongoDB connection string. Defaults to a local server.
            db_name: Name of the database that receives the generated collections.
        """
        self.client = MongoClient(uri)
        self.db = self.client[db_name]

    @staticmethod
    def _age_on(birth_date, today):
        """Return the number of completed years between ``birth_date`` and ``today``.

        Compares calendar month/day instead of dividing a day count by 365,
        which over-counts by a year shortly before a birthday because of
        accumulated leap days.
        """
        had_birthday = (today.month, today.day) >= (birth_date.month, birth_date.day)
        return today.year - birth_date.year - (0 if had_birthday else 1)

    @staticmethod
    def _insert_with_progress(collection, documents, label, batch_size=_BATCH_SIZE):
        """Insert ``documents`` into ``collection`` in batches, printing progress after each batch.

        Nothing is inserted or printed when ``documents`` is empty.
        """
        total = len(documents)
        for start in range(0, total, batch_size):
            batch = documents[start:start + batch_size]
            collection.insert_many(batch)
            print(f"{label}: Inserted {start + len(batch)}/{total} documents")

    def data_generator(self, num_companies, people_per_company):
        """Drop and repopulate the collections of the three models with fake data.

        Generates ``num_companies`` companies and ``num_companies * people_per_company``
        people (Spanish locale), assigning people to companies round-robin, then
        stores the same data under the three schemas described on the class.

        Args:
            num_companies: Number of company documents to generate.
            people_per_company: Number of people generated per company.

        Returns:
            The tuple ``(num_companies, people_per_company)``, unchanged.
        """
        # Reuse the connection opened in __init__ so close_connection() releases
        # everything; opening a second client here would never be closed.
        db = self.db
        preview = self._PREVIEW_COUNT

        # Delete any existing data from all three models.
        db.drop_collection("m1_people")
        db.drop_collection("m1_companies")
        db.drop_collection("m2_people")
        db.drop_collection("m3_companies")
        # Create and obtain the collections.
        m1_people = db.create_collection('m1_people')
        m1_companies = db.create_collection('m1_companies')
        m2_people = db.create_collection('m2_people')
        m3_companies = db.create_collection('m3_companies')

        # Generate data in Spanish: 'es_ES'.
        fake = Faker(['es_ES'])

        # Generate company data.
        company_data = []
        for i in range(num_companies):
            company_name = fake.company()
            domain = fake.domain_name()
            company_data.append({
                "domain": domain,
                "email": f"info@{domain}",
                "name": company_name,
                "url": f"www.{domain}",
                "vatNumber": fake.bothify(text='??######')
            })
            # Only show the first few companies.
            if i < preview:
                print(f"Company {i+1}: {company_name}")
            elif i == preview:
                print(f"... and {num_companies - preview} more companies")

        # Insert companies for Model 1. PyMongo adds the generated `_id` to each
        # dict in place, so the company documents embedded below (Models 2 and 3)
        # carry the same `_id` as their Model 1 counterpart.
        # insert_many rejects an empty list, so skip it when there is nothing to insert.
        company_ids = m1_companies.insert_many(company_data).inserted_ids if company_data else []

        # Generate people's data as (person, company index) pairs.
        today = datetime.date.today()
        person_data = []
        for i in range(num_companies * people_per_company):
            company_idx = i % num_companies  # Distribute people among companies

            first_name = fake.first_name()
            last_name = fake.last_name()
            full_name = f"{first_name} {last_name}"
            birth_date = fake.date_of_birth(minimum_age=20, maximum_age=65)
            # Stored as a datetime at midnight rather than a bare date.
            birth_datetime = datetime.datetime.combine(birth_date, datetime.time())

            person = {
                "age": self._age_on(birth_date, today),
                "companyEmail": f"{first_name.lower()}@{company_data[company_idx]['domain']}",
                "dateOfBirth": birth_datetime,
                "email": fake.email(),
                "firstName": first_name,
                "fullName": full_name,
                "sex": fake.random_element(elements=('M', 'F'))
            }
            person_data.append((person, company_idx))
            # Only show the first few people.
            if i < preview:
                print(f"Person {i+1}: {full_name}")
            elif i == preview:
                print(f"... and {num_companies * people_per_company - preview} more people")

        # Model 1: each person references its company by `_id`.
        print("\nInserting data for Model 1 (Referential)...")
        m1_docs = []
        for person, company_idx in person_data:
            # Copy so the `_id` assigned on insert does not leak into the other models.
            m1_person = person.copy()
            m1_person["worksIn"] = company_ids[company_idx]  # Reference to company
            m1_docs.append(m1_person)
        self._insert_with_progress(m1_people, m1_docs, "M1")

        # Model 2: each person embeds its whole company document.
        print("\nInserting data for Model 2 (Companies embedded in People)...")
        m2_docs = []
        for person, company_idx in person_data:
            m2_person = person.copy()
            m2_person["worksIn"] = company_data[company_idx]  # Embed company document
            m2_docs.append(m2_person)
        self._insert_with_progress(m2_people, m2_docs, "M2")

        # Model 3: each company embeds the list of its people.
        print("\nInserting data for Model 3 (People embedded in Companies)...")
        # Group people by company index.
        persons_by_company = {}
        for person, company_idx in person_data:
            persons_by_company.setdefault(company_idx, []).append(person.copy())

        for i, company in enumerate(company_data):
            staff = persons_by_company.get(i, [])
            m3_company = company.copy()
            m3_company["staff"] = staff
            m3_companies.insert_one(m3_company)
            if i < preview or (i+1) % 10 == 0 or i+1 == len(company_data):
                print(f"M3: Company {i+1}/{len(company_data)} inserted with {len(staff)} staff members")

        print(f"\nData generation completed successfully.")

        return num_companies, people_per_company

    def close_connection(self):
        """Close the MongoDB client opened in ``__init__``, if one exists."""
        if hasattr(self, 'client'):
            self.client.close()
            print("MongoDB connection closed")

In [None]:
if __name__ == "__main__":
    # Dataset size: how many companies to create and how many people each one gets.
    num_companies = 30
    people_per_company = 100

    lab = Lab2models()

    # Announce the workload before starting.
    print(
        f"Generating data for {num_companies} companies with approximately {people_per_company} people per company...",
        f"Total expected documents: {num_companies} companies and {num_companies * people_per_company} people",
        sep="\n",
    )

    # Populate the collections of every model and measure the wall-clock time taken.
    start_time = time.time()
    lab.data_generator(num_companies, people_per_company)
    generation_time = time.time() - start_time

    print(f"\nData generation completed in {generation_time:.2f} seconds")

Generating data for 30 companies with approximately 100 people per company...
Total expected documents: 30 companies and 3000 people
Company 1: Sabater y asociados S.L.L.
Company 2: Costa & Asociados S.A.
Company 3: Consultoría Rocamora y asociados S.Coop.
Company 4: Pons y asociados S.Com.
Company 5: Despacho CYOB S.L.N.E
... and 25 more companies
Person 1: Zoraida Mercader
Person 2: Jessica Alcántara
Person 3: Santiago Uría
Person 4: Priscila Manjón
Person 5: Joaquín Gabaldón
... and 2995 more people

Inserting data for Model 1 (Referential)...
M1: Inserted 500/3000 documents
M1: Inserted 1000/3000 documents
M1: Inserted 1500/3000 documents
M1: Inserted 2000/3000 documents
M1: Inserted 2500/3000 documents
M1: Inserted 3000/3000 documents

Inserting data for Model 2 (Companies embedded in People)...
M2: Inserted 500/3000 documents
M2: Inserted 1000/3000 documents
M2: Inserted 1500/3000 documents
M2: Inserted 2000/3000 documents
M2: Inserted 2500/3000 documents
M2: Inserted 3000/3000 d

# Queries