In [2]:
import os
import random
import pandas as pd
from pymongo import MongoClient
from typing import List
from dotenv import load_dotenv
from pprint import PrettyPrinter
from datetime import timedelta
# Load the connection settings from the dotenv file at ".venv" so the
# os.environ.get(...) lookups further down can find them.
# NOTE(review): ".venv" is the conventional name of a virtual-environment
# directory; dotenv files are usually called ".env" - confirm this path is
# intentional.
load_dotenv(".venv")

True

### Extract, Transform, Load.

The goal of this project is to design the experiment. 

* I will split the data into 2 groups (treatment and control). 
* Extract, transform and load the data.

In [3]:
def export_collection(db_username, db_password, cluster_url, database, collection):
    """
    Open a MongoDB connection and return a handle to a single collection.

    Parameters
    ----------
    db_username : str
        Database user name placed in the connection URI.
    db_password : str
        Password for ``db_username``.
    cluster_url : str
        Host part of the ``mongodb+srv://`` URI (everything after the ``@``).
    database : str
        Name of the database to open.
    collection : str
        Name of the collection inside ``database``.

    Returns
    -------
    The object obtained from ``client[database][collection]``.

    Raises
    ------
    RuntimeError
        If any argument is missing or empty, e.g. because the matching
        environment variable was never set.

    Notes
    -----
    Errors raised while building the client are left to propagate. Printing
    them and returning ``None`` would only move the failure to the first
    cell that uses the collection.
    """
    settings = {
        "db_username": db_username,
        "db_password": db_password,
        "cluster_url": cluster_url,
        "database": database,
        "collection": collection,
    }
    # Report argument names only - never echo the values (one is a password).
    missing = [name for name, value in settings.items() if not value]
    if missing:
        raise RuntimeError(
            "There is a problem with one of the argument variables; "
            f"missing or empty: {', '.join(missing)}"
        )

    # NOTE: the credentials are interpolated verbatim. PyMongo requires
    # reserved characters in the user name / password to be percent-escaped
    # (urllib.parse.quote_plus) before they appear in a URI, so the stored
    # values must already be escaped if they contain any.
    connection_string = f"mongodb+srv://{db_username}:{db_password}@{cluster_url}"

    client = MongoClient(connection_string)

    return client[database][collection]

In [4]:
# Connection settings are read from the process environment; a variable that
# is not set comes back as None.
db_username, db_password, database, cluster_url, collection = (
    os.environ.get(key)
    for key in ("db_username", "db_password", "database", "cluster_url", "collection")
)

# Handle to the applicants collection used by the rest of the notebook.
ds_applicants = export_collection(
    db_username=db_username,
    db_password=db_password,
    cluster_url=cluster_url,
    database=database,
    collection=collection,
)

In [5]:
# Peek at a single document to see which fields the collection stores.
ds_applicants.find_one()

{'_id': ObjectId('691df158fa72dc4fe7e410d8'),
 '': 0,
 'name': 'Margaret Lowe',
 'DOB': datetime.datetime(2004, 12, 27, 0, 0),
 'gender': 'female',
 'email': '6gx8or7hdk@yahoo.com',
 'admissionsQuiz': 'incomplete',
 'countryISO2': 'SA',
 'highestDegreeEarned': 'Some College (1-3 years)',
 'createdAt': datetime.datetime(2024, 5, 9, 0, 0)}

In [6]:
# Count how many applicants have / have not completed the admissions quiz.

result = ds_applicants.aggregate(
    [
        {
            "$group":{
                "_id": "$admissionsQuiz",
                "count": {"$count":{}}
            }
        }
    ]
)

# Start from zero so a status that is absent from the collection (or a value
# left in the kernel by an earlier run) can never leak into the numbers below.
incomplete = 0
complete = 0
for r in result:
    if r["_id"] == "incomplete":
        incomplete += r["count"]
    else:
        # Every status other than "incomplete" is treated as completed; add
        # instead of assign so several such statuses do not overwrite each other.
        complete += r["count"]

print("Completed quiz:", complete)
print("Did not complete quiz:", incomplete)

Completed quiz: 2963
Did not complete quiz: 3037


In [7]:
# Share of all applicants whose admissions quiz is still incomplete.
total_applicants = complete + incomplete
prop_incomplete = incomplete / total_applicants
print("Proportion of users who don't complete admissions quiz:", round(prop_incomplete, 2))

Proportion of users who don't complete admissions quiz: 0.51


In reality, the proportion of applicants who do not complete the quiz should be quite small.

### Hypothesis Testing

$H_{0}: $ Sending an email to applicants who did not complete the quiz will not increase the quiz completion rate.

$H_{1}: $ Sending an email to applicants who did not complete the quiz will increase the quiz completion rate.

In [8]:
def find_by_date(collection, date_string: str, days: int = 60) -> List:

    """
    Find applicants created in a window starting at ``date_string`` who have
    not completed the admissions quiz.

    The window is half-open: ``date_string <= createdAt < date_string + days``.

    Parameters
    ----------
    collection : object exposing ``find(query)``, e.g. a pymongo collection.
    date_string : str
        Start of the window; any value ``pandas.to_datetime`` can parse,
        for example "2024-08-03".
    days : int, default 60
        Length of the window in days.

    Returns
    -------
    observations : list
        The matching documents, fully materialised from the cursor.
    """

    start_date = pd.to_datetime(date_string)

    end_date = start_date + timedelta(days=days)

    query = {
        "createdAt": {"$gte": start_date, "$lt": end_date},
        "admissionsQuiz": "incomplete",
    }

    # list(...) drains the cursor so the caller gets plain documents.
    return list(collection.find(query))

    

In [9]:
# Incomplete-quiz applicants created in the window that starts on 2024-08-03.
observations =find_by_date(ds_applicants, "2024-08-03")
len(observations)

544

In [10]:
def assign_to_group(observations: list, seed: int = 42):

    """
    Randomly split observations into a control and a treatment group.

    The list is shuffled IN PLACE and every document is tagged in place:
    the first half becomes "no email (control)", the rest "email (treatment)"
    (the treatment group gets the extra document when the count is odd).
    Every document is also flagged with ``"inExperiement": True``; the key
    keeps its existing spelling so already-tagged data stays consistent.

    Parameters
    ----------
    observations : list
        Documents (dicts) to assign.
    seed : int, default 42
        Seed for the shuffle, so the assignment is reproducible.

    Returns
    -------
    observations : list
        The same list object that was passed in, shuffled and tagged.
    """

    # A private generator gives the same permutation as seeding the global
    # module, without resetting random state for the rest of the notebook.
    rng = random.Random(seed)
    rng.shuffle(observations)

    idx = len(observations) // 2

    for position, doc in enumerate(observations):
        doc["inExperiement"] = True
        doc["group"] = "no email (control)" if position < idx else "email (treatment)"

    return observations

In [11]:
# assign_to_group shuffles and tags the list in place and returns that same
# list, so `observations` and `observations_assigned` are one object here.
observations_assigned = assign_to_group(observations)
observations[:1]

[{'_id': ObjectId('691df159fa72dc4fe7e41594'),
  '': 1212,
  'name': 'Nathan Sanford',
  'DOB': datetime.datetime(2002, 3, 12, 0, 0),
  'gender': 'female',
  'email': '0fkcp7ld3j@yahoo.com',
  'admissionsQuiz': 'incomplete',
  'countryISO2': 'KZ',
  'highestDegreeEarned': 'Some College (1-3 years)',
  'createdAt': datetime.datetime(2024, 9, 29, 0, 0),
  'inExperiement': True,
  'group': 'no email (control)'}]

In [18]:
# Preview of the treatment group: tag every row, then keep only the
# applicants who are due to receive the email.
df = (
    pd.DataFrame(observations_assigned)
    .assign(tag="ab-test")
    .loc[lambda frame: frame["group"] == "email (treatment)"]
)
df.head()

Unnamed: 0,_id,Unnamed: 2,name,DOB,gender,email,admissionsQuiz,countryISO2,highestDegreeEarned,createdAt,inExperiement,group,tag
272,691df159fa72dc4fe7e41625,1357,Erica Martinez,2004-08-27,male,au55gv1ahm@yahoo.com,incomplete,SG,Bachelor's degree,2024-08-12,True,email (treatment),ab-test
273,691df158fa72dc4fe7e41328,592,Craig Morris,1989-02-03,female,85q1xyva63@outlook.com,incomplete,LY,Doctorate (e.g. PhD),2024-08-10,True,email (treatment),ab-test
274,691df15afa72dc4fe7e41e37,3423,Robert Quinn,2001-11-13,female,o233ygb6ig@outlook.com,incomplete,IN,Some College (1-3 years),2024-09-28,True,email (treatment),ab-test
275,691df15afa72dc4fe7e41a22,2378,Debra Johnston,1995-09-24,male,nkia9ws8l0@yahoo.com,incomplete,BB,Doctorate (e.g. PhD),2024-09-28,True,email (treatment),ab-test
276,691df158fa72dc4fe7e4123a,354,Tiffany Howard,1995-05-01,female,5xuwerittt@gmail.com,incomplete,TN,Some College (1-3 years),2024-09-27,True,email (treatment),ab-test


In [None]:
def export_email(observations_assigned, directory="./data"):

    """
    Write the treatment group's email addresses to a dated CSV file.

    Keeps only the observations whose ``group`` is "email (treatment)",
    tags them with ``tag = "ab-test"`` and saves the ``email`` and ``tag``
    columns to ``<directory>/<YYYY-MM-DD>_ab-test.csv``, using today's date.
    The file is meant to be handed to the stakeholder so the right team can
    send the emails.

    Parameters
    ----------
    observations_assigned : list
        Documents (dicts) carrying at least the ``email`` and ``group`` keys.
    directory : str, default "./data"
        Folder the CSV is written to; created if it does not exist yet.

    Returns
    -------
    None
        The path of the written file is printed.
    """

    # Filter first, then .assign: it returns a new frame, so the tag column
    # is never set on a slice of another DataFrame.
    treatment = (
        pd.DataFrame(observations_assigned)
        .loc[lambda frame: frame["group"] == "email (treatment)"]
        .assign(tag="ab-test")
    )

    date_string = pd.Timestamp.now().strftime("%Y-%m-%d")

    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(directory, exist_ok=True)
    filename = os.path.join(directory, date_string + "_ab-test.csv")

    treatment[["email", "tag"]].to_csv(filename, index=False)

    # Tell the reader where the export ended up.
    print(filename)

In [28]:
# Write today's treatment-group email list to the default ./data directory.
export_email(observations_assigned)

./data/2025-11-19_ab-test.csv
