In [259]:
# libraries
import datetime
import time
import json
from pymongo import MongoClient
from faker import Faker
import random

# Data generation

In [260]:
class Lab2models:
    """Generate synthetic company/employee data and load it into three MongoDB models.

    * Model 1 -- ``m1_people`` + ``m1_companies``: each person stores the
      ``_id`` of their company in ``worksIn``.
    * Model 2 -- ``m2_people``: each person embeds the whole company document
      in ``worksIn``.
    * Model 3 -- ``m3_companies``: each company embeds its employees in
      ``staff``.
    """

    @staticmethod
    def _age_on(birth_date, today):
        """Return the number of completed years between ``birth_date`` and ``today``."""
        # Comparing (month, day) tuples avoids the drift of ``days // 365``,
        # which ignores leap days and can report an age one year too high.
        birthday_passed = (today.month, today.day) >= (birth_date.month, birth_date.day)
        return today.year - birth_date.year - (0 if birthday_passed else 1)

    @staticmethod
    def _make_companies(fake, num_companies):
        """Build ``num_companies`` company documents (nothing is inserted here)."""
        companies = []
        for _ in range(num_companies):
            # Keep this call order: it determines the seeded Faker output.
            company_name = fake.company()
            domain = fake.domain_name()
            companies.append({
                "domain": domain,
                "email": f"info@{domain}",
                "name": company_name,
                "url": f"www.{domain}",
                "vatNumber": fake.bothify(text='??######'),
            })
        return companies

    @classmethod
    def _make_people(cls, fake, companies, employees_per_company):
        """Build ``(person document, company index)`` pairs, one per employee."""
        today = datetime.datetime.now().date()
        people = []
        for company_idx, headcount in enumerate(employees_per_company):
            domain = companies[company_idx]['domain']
            for _ in range(headcount):
                if fake.random_element(elements=(True, False)):
                    first_name, sex = fake.first_name_male(), 'M'
                else:
                    first_name, sex = fake.first_name_female(), 'F'

                last_name = fake.last_name()
                birth_date = fake.date_of_birth(minimum_age=20, maximum_age=65)

                # Names may contain spaces; strip them so the addresses stay valid.
                username_full = f"{first_name.lower()}.{last_name.lower()}".replace(" ", "")
                username_short = f"{first_name.lower()[0]}{last_name.lower()}".replace(" ", "")

                person = {
                    "age": cls._age_on(birth_date, today),
                    "companyEmail": f"{username_full}@{domain}",
                    # Stored as a datetime at midnight rather than a plain date.
                    "dateOfBirth": datetime.datetime.combine(birth_date, datetime.time()),
                    "email": f"{username_short}@{fake.free_email_domain()}",
                    "firstName": first_name,
                    "fullName": f"{first_name} {last_name}",
                    "sex": sex,
                }
                people.append((person, company_idx))
        return people

    def data_generator(self, num_companies, min_people=25, max_people=50):
        """Regenerate the ``lab2`` database with the same data in all three models.

        The four collections are dropped and recreated on every call, and both
        Faker and ``random`` are seeded with 42 so repeated runs produce the
        same companies, head counts and people (``age`` is relative to the
        current date).

        Args:
            num_companies: Number of companies to generate.
            min_people: Minimum employees per company (inclusive).
            max_people: Maximum employees per company (inclusive).

        Returns:
            Tuple ``(number of companies, total number of people)`` generated.

        Raises:
            ValueError: If ``min_people`` is greater than ``max_people``
                (raised by ``random.randint``).
        """
        Faker.seed(42)   # same people on every run
        random.seed(42)  # same number of employees per company on every run
        # It will generate data in Spanish: 'es_ES'
        fake = Faker(['es_ES'])

        company_data = self._make_companies(fake, num_companies)
        employees_per_company = [
            random.randint(min_people, max_people) for _ in company_data
        ]
        total_employees = sum(employees_per_company)
        person_data = self._make_people(fake, company_data, employees_per_company)

        # The context manager closes the connection even if an insert fails.
        with MongoClient('mongodb://localhost:27017/') as client:
            db = client['lab2']

            # Start from a clean slate for all three models.
            for name in ("m1_people", "m1_companies", "m2_people", "m3_companies"):
                db.drop_collection(name)
            m1_people = db.create_collection('m1_people')
            m1_companies = db.create_collection('m1_companies')
            m2_people = db.create_collection('m2_people')
            m3_companies = db.create_collection('m3_companies')

            # MODEL 1 companies. insert_many() adds an ``_id`` to every dict in
            # company_data; Models 2 and 3 below reuse those dicts, so the same
            # company carries the same ``_id`` in all three models.
            # insert_many() rejects an empty list, hence the guards.
            company_ids = []
            if company_data:
                company_ids = m1_companies.insert_many(company_data).inserted_ids

            # MODEL 1 people: reference the company by ``_id``.
            m1_docs = [
                {**person, "worksIn": company_ids[company_idx]}
                for person, company_idx in person_data
            ]
            if m1_docs:
                m1_people.insert_many(m1_docs)

            # MODEL 2: embed the company document in each person.
            m2_docs = [
                {**person, "worksIn": company_data[company_idx]}
                for person, company_idx in person_data
            ]
            if m2_docs:
                m2_people.insert_many(m2_docs)

            # MODEL 3: embed each company's employees in the company document.
            staff_by_company = [[] for _ in company_data]
            for person, company_idx in person_data:
                staff_by_company[company_idx].append(person.copy())
            m3_docs = [
                {**company, "staff": staff}
                for company, staff in zip(company_data, staff_by_company)
            ]
            if m3_docs:
                m3_companies.insert_many(m3_docs)

        return len(company_data), total_employees

In [261]:
if __name__ == "__main__":
    # Dataset size settings. Only num_companies is handed to the generator in
    # this cell; min_people and max_people are defined but not passed on.
    num_companies = 30
    min_people, max_people = 25, 50

    # data_generator returns (number of companies, total number of people).
    lab = Lab2models()
    num_companies, actual_people = lab.data_generator(num_companies)

    print(f"Actual number of documents: {num_companies} companies and {actual_people} people")

Actual number of documents: 30 companies and 1094 people


# Queries

In [262]:
# Connection shared by every query cell below; all three models live in 'lab2'.
client = MongoClient('mongodb://localhost:27017/')
db = client.get_database('lab2')

### Q1: For each person, retrieve their full name and their company’s name.

In [263]:
def run_query(db, collection, pipeline, model_name, preview=3):
    """Run an aggregation pipeline, print a short preview and return its run time.

    Args:
        db: Database-like object; ``db[collection]`` must expose ``aggregate``.
        collection: Name of the collection to aggregate over.
        pipeline: List of aggregation stages passed to ``aggregate``.
        model_name: Label printed as the header of the output.
        preview: Maximum number of result documents to print.

    Returns:
        Seconds spent executing the pipeline and fetching all of its results.
    """
    print(f"\n# {model_name}")
    start_time = time.perf_counter()
    # list() drains the cursor so that fetching every result is part of the timing.
    results = list(db[collection].aggregate(pipeline))
    # Stop the clock before printing: console output is not part of the query
    # and would otherwise distort the comparison between models.
    generation_time = time.perf_counter() - start_time

    # Show only the first few results.
    for result in results[:preview]:
        print(result)
    if len(results) > preview:
        print(f"... and {len(results) - preview} other results")

    print(f"Time taken: {generation_time:.4f} seconds")

    return generation_time

In [264]:
# Model 1: join every person with the company referenced by worksIn, flatten
# the joined array, then keep only the person's and the company's name.
q1_1 = [
    {
        "$lookup": {
            "from": "m1_companies",
            "localField": "worksIn",
            "foreignField": "_id",
            "as": "companyName",
        }
    },
    {"$unwind": "$companyName"},
    {"$project": {"_id": 0, "fullName": 1, "companyName": "$companyName.name"}},
]

# Model 2: the company is embedded in the person, so a projection is enough.
q1_2 = [
    {"$project": {"_id": 0, "fullName": 1, "companyName": "$worksIn.name"}},
]

# Model 3: emit one document per embedded staff member, then project the names.
q1_3 = [
    {"$unwind": "$staff"},
    {"$project": {"_id": 0, "fullName": "$staff.fullName", "companyName": "$name"}},
]

In [265]:
# Run Q1 against each model and keep the time returned by run_query.
time_q1_1 = run_query(
    db=db,
    collection="m1_people",
    pipeline=q1_1,
    model_name="Model 1: Two types of documents",
)
time_q1_2 = run_query(
    db=db,
    collection="m2_people",
    pipeline=q1_2,
    model_name="Model 2: One document for person with embedded company",
)
time_q1_3 = run_query(
    db=db,
    collection="m3_companies",
    pipeline=q1_3,
    model_name="Model 3: One document for company with embedded people",
)


# Model 1: Two types of documents
{'fullName': 'Ariel Romero', 'companyName': 'Banca Privada OLMJ S.L.N.E'}
{'fullName': 'Amador Juan', 'companyName': 'Banca Privada OLMJ S.L.N.E'}
{'fullName': 'Eugenia Nebot', 'companyName': 'Banca Privada OLMJ S.L.N.E'}
... and 1091 other results
Time taken: 0.0372 seconds

# Model 2: One document for person with embedded company
{'fullName': 'Ariel Romero', 'companyName': 'Banca Privada OLMJ S.L.N.E'}
{'fullName': 'Amador Juan', 'companyName': 'Banca Privada OLMJ S.L.N.E'}
{'fullName': 'Eugenia Nebot', 'companyName': 'Banca Privada OLMJ S.L.N.E'}
... and 1091 other results
Time taken: 0.0300 seconds

# Model 3: One document for company with embedded people
{'fullName': 'Ariel Romero', 'companyName': 'Banca Privada OLMJ S.L.N.E'}
{'fullName': 'Amador Juan', 'companyName': 'Banca Privada OLMJ S.L.N.E'}
{'fullName': 'Eugenia Nebot', 'companyName': 'Banca Privada OLMJ S.L.N.E'}
... and 1091 other results
Time taken: 0.0023 seconds


In [266]:
# Comparison of the Q1 timings across the three models
print("\nModel comparison:")
print(f"Model 1: {time_q1_1:.4f}s, Model 2: {time_q1_2:.4f}s, Model 3: {time_q1_3:.4f}s")
# Report the model with the lowest time; on a tie the higher-numbered model wins.
print(f"The fastest model is: Model {3 if time_q1_3 <= min(time_q1_1, time_q1_2) else 2 if time_q1_2 <= time_q1_1 else 1}")


Model comparison:
Model 1: 0.0372s, Model 2: 0.0300s, Model 3: 0.0023s
The fastest model is: Model 3


### Q2: For each company, retrieve its name and the number of employees.

In [267]:
# Model 1: count people per referenced company, then join to get the company name.
q2_1 = [
    {"$group": {"_id": "$worksIn", "employeeCount": {"$sum": 1}}},
    {
        "$lookup": {
            "from": "m1_companies",
            "localField": "_id",
            "foreignField": "_id",
            "as": "companyName",
        }
    },
    {"$unwind": "$companyName"},
    {"$project": {"_id": 0, "companyName": "$companyName.name", "employeeCount": 1}},
    # For the order of the output: first the company, then the number of employees.
    {
        "$replaceRoot": {
            "newRoot": {
                "companyName": "$companyName",
                "employeeCount": "$employeeCount",
            }
        }
    },
]

# Model 2: group people by the _id of their embedded company.
q2_2 = [
    {
        "$group": {
            "_id": "$worksIn._id",
            "companyName": {"$first": "$worksIn.name"},
            "employeeCount": {"$sum": 1},
        }
    },
    {"$project": {"_id": 0, "companyName": 1, "employeeCount": 1}},
]

# Model 3: the head count is the length of the embedded staff array.
q2_3 = [
    {"$project": {"_id": 0, "companyName": "$name", "employeeCount": {"$size": "$staff"}}},
]

In [268]:
# Run Q2 against each model and keep the time returned by run_query.
time_q2_1 = run_query(
    db=db,
    collection="m1_people",
    pipeline=q2_1,
    model_name="Model 1: Two types of documents",
)
time_q2_2 = run_query(
    db=db,
    collection="m2_people",
    pipeline=q2_2,
    model_name="Model 2: One document for person with embedded company",
)
time_q2_3 = run_query(
    db=db,
    collection="m3_companies",
    pipeline=q2_3,
    model_name="Model 3: One document for company with embedded people",
)



# Model 1: Two types of documents
{'companyName': 'Giménez y Torrents S.A.T.', 'employeeCount': 32}
{'companyName': 'Hotel Avanzadas S.L.', 'employeeCount': 44}
{'companyName': 'Distribuciones del Noroeste S.Coop.', 'employeeCount': 43}
... and 27 other results
Time taken: 0.0031 seconds

# Model 2: One document for person with embedded company
{'companyName': 'Giménez y Torrents S.A.T.', 'employeeCount': 32}
{'companyName': 'Hotel Avanzadas S.L.', 'employeeCount': 44}
{'companyName': 'Distribuciones del Noroeste S.Coop.', 'employeeCount': 43}
... and 27 other results
Time taken: 0.0015 seconds

# Model 3: One document for company with embedded people
{'companyName': 'Banca Privada OLMJ S.L.N.E', 'employeeCount': 25}
{'companyName': 'Grupo Rocamora S.Com.', 'employeeCount': 48}
{'companyName': 'Restauración CR S.Coop.', 'employeeCount': 33}
... and 27 other results
Time taken: 0.0005 seconds


In [269]:
# Comparison of the Q2 timings across the three models
print("\nModel comparison:")
print(f"Model 1: {time_q2_1:.4f}s, Model 2: {time_q2_2:.4f}s, Model 3: {time_q2_3:.4f}s")
# Report the model with the lowest time; on a tie the higher-numbered model wins.
print(f"The fastest model is: Model {3 if time_q2_3 <= min(time_q2_1, time_q2_2) else 2 if time_q2_2 <= time_q2_1 else 1}")


Model comparison:
Model 1: 0.0031s, Model 2: 0.0015s, Model 3: 0.0005s
The fastest model is: Model 3
