In [None]:
# Standard library imports
import random
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Optional

# Third-party imports
import pandas as pd
from pymongo import MongoClient
from pymongo.collection import Collection
from pymongo.results import UpdateResult

In [None]:
class MongoRepository:
    """Repository for the MongoDB collection backing the A/B testing experiment.

    Groups every database operation the experiment needs:
    - finding applicants by creation date
    - assigning applicants to experiment groups
    - writing updated applicant records back

    Keeping these operations in one class separates database access from
    the business logic that uses it.
    """

    def __init__(
        self,
        client: Optional["MongoClient"] = None,
        db_name: str = "wqu-abtest",
        collection_name: str = "ds-applicants",
    ):
        """Store the client and resolve the target collection.

        Args:
            client: Client to use. When ``None``, a new client for
                ``localhost:27017`` is created.
            db_name: Name of the database holding the collection.
            collection_name: Name of the applicants collection.
        """
        # Compare against None explicitly instead of relying on truthiness,
        # so a client that was supplied is never silently swapped for a
        # localhost connection just because it evaluates as falsy.
        if client is None:
            client = MongoClient(host="localhost", port=27017)
        self.client = client

        # Reference to the single collection every method works on
        self.collection = self.client[db_name][collection_name]

    def find_by_date(self, date_string: str) -> List[Dict[str, Any]]:
        """Find applicants created on a date who left the quiz incomplete.

        Args:
            date_string: Day to search, formatted ``YYYY-MM-DD``.

        Returns:
            Every document whose ``createdAt`` falls within that day and
            whose ``admissionsQuiz`` equals ``"incomplete"``.

        Raises:
            ValueError: If ``date_string`` cannot be parsed as
                ``YYYY-MM-DD``. The parsing error is attached as the cause.
        """
        # Only the date handling is guarded, so an error raised by the
        # database query is never mislabelled as a date-format problem.
        try:
            start = pd.to_datetime(date_string, format="%Y-%m-%d")
            end = start + pd.DateOffset(days=1)
        except ValueError as e:
            raise ValueError(
                f"Invalid date format. Please use YYYY-MM-DD. Original error: {str(e)}"
            ) from e

        # Half-open range [start of day, start of next day)
        query = {
            "createdAt": {"$gte": start, "$lt": end},
            "admissionsQuiz": "incomplete",
        }
        return list(self.collection.find(query))

    def update_applicants(self, observations: List[Dict[str, Any]]) -> Dict[str, int]:
        """Write each observation back to the collection, matched by ``_id``.

        Args:
            observations: Documents to write. Each must carry an ``_id``
                key; a document without one raises ``KeyError``.

        Returns:
            ``{"n": ..., "nModified": ...}`` where ``n`` is the sum of the
            per-update matched counts and ``nModified`` is the sum of the
            per-update modified counts.
        """
        n = n_modified = 0

        for doc in observations:
            # One update per document: locate it by _id and $set its fields
            result = self.collection.update_one(
                filter={"_id": doc["_id"]},
                update={"$set": doc},
            )
            n += result.matched_count
            n_modified += result.modified_count

        return {
            "n": n,                   # Documents matched by their _id filter
            "nModified": n_modified,  # Documents reported as modified
        }

    def assign_to_groups(self, date_string: str) -> Dict[str, int]:
        """Split one day's eligible applicants into control and treatment.

        The applicants returned by ``find_by_date`` are shuffled with a
        fixed seed. The first half (rounded down) is labelled
        ``"no email (control)"``, the rest ``"email (treatment)"``, and
        every record gets ``inExperiment = True`` before being written back.

        Args:
            date_string: Day to process, formatted ``YYYY-MM-DD``.

        Returns:
            The summary produced by ``update_applicants``, or
            ``{"n": 0, "nModified": 0}`` when no applicant is eligible.

        Raises:
            ValueError: If ``date_string`` is not a valid ``YYYY-MM-DD`` date.
        """
        observations = self.find_by_date(date_string)

        # Nothing to assign
        if not observations:
            return {"n": 0, "nModified": 0}

        # A private generator seeded with 42 yields the same shuffle as
        # seeding the module-level generator, but does not reset the global
        # random state for the rest of the program.
        random.Random(42).shuffle(observations)

        split_idx = len(observations) // 2

        for position, obs in enumerate(observations):
            obs["inExperiment"] = True
            obs["group"] = (
                "no email (control)" if position < split_idx else "email (treatment)"
            )

        return self.update_applicants(observations)

In [None]:
def export_treatment_emails(
    observations: List[Dict[str, Any]],
    directory: str = ".",
) -> None:
    """Export the treatment group's email addresses to a CSV file.

    Writes ``<directory>/YYYY-MM-DD_ab-test.csv`` (dated with the current
    local date) containing two columns: ``email`` and ``tag``. Only
    observations whose ``group`` equals ``"email (treatment)"`` are
    included, and ``tag`` is always ``"ab-test"``.

    An empty ``observations`` list, or one with no treatment members,
    produces a file containing only the header row. Observations lacking
    an ``email`` key are written with a blank email.

    Args:
        observations: Applicant records, each optionally carrying
            ``email`` and ``group`` keys.
        directory: Existing directory to write the file into.
    """
    # reindex guarantees both columns exist even when the input is empty or
    # the records lack them, so the selection below cannot raise KeyError.
    df = pd.DataFrame(observations).reindex(columns=["email", "group"])

    # Keep the treatment group's emails and attach the experiment tag
    mask = df["group"] == "email (treatment)"
    treatment = df.loc[mask, ["email"]].assign(tag="ab-test")

    # File name is stamped with the current date; Path handles separators
    # (and an empty directory string) correctly.
    date_string = datetime.now().strftime("%Y-%m-%d")
    filename = Path(directory) / f"{date_string}_ab-test.csv"

    treatment.to_csv(filename, index=False)

In [None]:
# Demonstration run: executes only when the file is run as a script
if __name__ == "__main__":
    # With no arguments the repository connects to the local MongoDB instance
    repo = MongoRepository()

    # Assign one day's applicants to the experiment groups and report counts
    test_date = "2022-05-15"
    result = repo.assign_to_groups(test_date)
    print(
        f"Experiment results for {test_date}:",
        f"- Total records processed: {result['n']}",
        f"- Records modified: {result['nModified']}",
        sep="\n",
    )

    # Query that day's applicants again and write the treatment emails to CSV
    observations = repo.find_by_date(test_date)
    export_treatment_emails(observations)
    print("Treatment group emails exported successfully")

    # Show how a malformed date is reported
    try:
        repo.assign_to_groups("invalid-date")
    except ValueError as err:
        print(f"Error handling example: {err}")