In [1]:
import os
import random
import pandas as pd
from pymongo import MongoClient
from typing import List
from dotenv import load_dotenv
from pprint import PrettyPrinter
from datetime import timedelta
load_dotenv(".venv")

True

### Extract, Transform, Load.

The goal of this project is to design the experiment. 

* I will split the data into two groups (treatment and control).
* Extract, transform and load the data.

In [None]:
def export_collection(db_username, db_password, cluster_url, database, collection):
    """
    Connect to a MongoDB cluster and return a handle to one collection.

    Parameters
    ----------
    db_username : str
        MongoDB login username, unescaped.
    db_password : str
        MongoDB login password, unescaped.
    cluster_url : str or None
        Host part of the connection string, i.e. everything that follows the
        ``@``. When empty, the host this notebook was originally written
        against is used.
    database : str or None
        Database name. Defaults to ``"data_science"`` when empty.
    collection : str or None
        Collection name. Defaults to ``"wqu_ab_testing"`` when empty.

    Returns
    -------
    The collection object obtained from ``client[database][collection]``.

    Raises
    ------
    RuntimeError
        If the username or password is missing, or if the client cannot be
        created from the supplied arguments.
    """
    # Local import so the function works without touching the import cell.
    from urllib.parse import quote_plus

    if not db_username or not db_password:
        raise RuntimeError(" I am unable to login to your account.Please check your login credentials.")

    # Honour the arguments, falling back to the values that used to be
    # hard-coded so existing calls with unset environment variables still work.
    host = cluster_url or "cluster0.xtxfmd5.mongodb.net/sff.mongodb.net/"
    db_name = database or "data_science"
    collection_name = collection or "wqu_ab_testing"

    # Credentials must be percent-escaped before they are placed in a URI,
    # otherwise characters such as "@" or ":" corrupt the connection string.
    connection_string = (
        f"mongodb+srv://{quote_plus(db_username)}:{quote_plus(db_password)}@{host}"
    )

    try:
        client = MongoClient(connection_string)
        return client[db_name][collection_name]
    except Exception as e:
        # Surface the documented error type instead of printing and returning
        # None. The message omits the connection string so the password never
        # ends up in notebook output.
        raise RuntimeError(
            f"Could not open collection {db_name}.{collection_name} "
            f"({type(e).__name__})."
        ) from e

In [3]:
# Read the connection settings from the environment (loaded by load_dotenv above).
db_username, db_password, database, cluster_url, collection = (
    os.environ.get(key)
    for key in ("db_username", "db_password", "database", "cluster_url", "collection")
)

# Handle to the applicants collection used by the rest of the notebook.
ds_applicants = export_collection(
    db_username=db_username,
    db_password=db_password,
    cluster_url=cluster_url,
    database=database,
    collection=collection,
)

In [4]:
# Look at a single document to see which fields an applicant record carries.
ds_applicants.find_one()

{'_id': ObjectId('692414f935782a1a89115349'),
 'name': 'David Kelly',
 'DOB': datetime.datetime(1985, 12, 28, 0, 0),
 'gender': 'male',
 'email': 'ntbdh66bfj@gmail.com',
 'admissionsQuiz': 'incomplete',
 'countryISO2': 'AO',
 'highestDegreeEarned': "Master's degree",
 'createdAt': datetime.datetime(2024, 12, 1, 0, 0)}

In [5]:
# Count how many applicants have completed the admissions quiz and how many have not.
result = ds_applicants.aggregate(
    [
        {
            "$group": {
                "_id": "$admissionsQuiz",
                "count": {"$count": {}},
            }
        }
    ]
)

# Start from zero so both names exist even when one status has no documents,
# and accumulate so that several non-"incomplete" statuses cannot silently
# overwrite each other.
complete = 0
incomplete = 0
for r in result:
    if r["_id"] == "incomplete":
        incomplete += r["count"]
    else:
        complete += r["count"]

print("Completed quiz:", complete)
print("Did not complete quiz:", incomplete)

Completed quiz: 3023
Did not complete quiz: 2977


In [6]:
# Fraction of all applicants whose admissions quiz is still incomplete.
prop_incomplete = incomplete / (incomplete + complete)
print("Proportion of users who don't complete admissions quiz:", round(prop_incomplete, 2))

Proportion of users who don't complete admissions quiz: 0.5


In reality, the proportion of applicants who do not complete the quiz should be small.

### Hypothesis Testing

$H_{0}: $ Sending an email to applicants who did not complete the quiz will not increase the quiz completion rate.

$H_{1}: $ Sending an email to applicants who did not complete the quiz will increase the quiz completion rate.

In [12]:
def find_by_date(collection, date_string: str, days: int = 60) -> List:
    """
    Fetch applicants created in a window starting at ``date_string`` who have
    not completed the admissions quiz.

    Parameters
    ----------
    collection : collection object
        Any object exposing ``find(query)``, such as the applicants collection.
    date_string : str
        Start of the window (inclusive), in a format ``pd.to_datetime``
        accepts, e.g. ``"2024-09-03"``.
    days : int, default 60
        Length of the window in days. The end of the window is exclusive.

    Returns
    -------
    observations : list
        Documents whose ``createdAt`` falls inside the window and whose
        ``admissionsQuiz`` is ``"incomplete"``.
    """
    start_date = pd.to_datetime(date_string)
    end_date = start_date + timedelta(days=days)

    query = {
        "createdAt": {"$gte": start_date, "$lt": end_date},
        "admissionsQuiz": "incomplete",
    }

    # Materialise the cursor so the caller gets a plain list it can reuse.
    return list(collection.find(query))

    

In [13]:
# Applicants with an incomplete quiz, created in the window starting 2024-09-03.
observations = find_by_date(collection=ds_applicants, date_string="2024-09-03")
len(observations)

510

In [15]:
def assign_to_group(observations: list):
    """
    Randomly split applicants into a control group and a treatment group.

    Parameters
    ----------
    observations : list of dict
        Applicants who have not completed the quiz.

    Returns
    -------
    list of dict
        A new list in shuffled order. Each element is a copy of an input
        document with two extra keys: ``"inExperiement"`` (always ``True``)
        and ``"group"``. The first half is ``"no email (control)"`` and the
        rest is ``"email (treatment)"``; with an odd count the extra applicant
        goes to treatment.

    Notes
    -----
    The input list and its documents are not modified, so calling the
    function again on the same input yields the same assignment.
    """
    # A private generator seeded with 42 gives the same order as seeding the
    # module-level generator, without resetting random state for other code.
    rng = random.Random(42)

    # Shuffle a copy: shuffling the caller's list in place would make a second
    # run start from an already-shuffled order and assign different groups.
    shuffled = list(observations)
    rng.shuffle(shuffled)

    idx = len(shuffled) // 2

    # NOTE(review): "inExperiement" is misspelled, but it is kept as-is because
    # it is the field name stored with the documents; renaming it needs a
    # coordinated data migration.
    control = [
        {**doc, "inExperiement": True, "group": "no email (control)"}
        for doc in shuffled[:idx]
    ]
    treatment = [
        {**doc, "inExperiement": True, "group": "email (treatment)"}
        for doc in shuffled[idx:]
    ]

    return control + treatment

In [16]:
# Split the incomplete-quiz applicants into control and treatment groups.
observations_assigned = assign_to_group(observations)

In [18]:
def export_email(observations_assigned, directory="./data"):
    """
    Write the treatment group's email addresses to a dated CSV file.

    The file is named ``<YYYY-MM-DD>_ab-test.csv`` (today's date) and holds two
    columns, ``email`` and ``tag``, where ``tag`` is ``"ab-test"`` on every
    row. It is meant to be handed to the team that sends the reminder email.

    Parameters
    ----------
    observations_assigned : list of dict
        Applicants already assigned to a group; each needs ``"email"`` and
        ``"group"`` keys.
    directory : str, default "./data"
        Folder to write the CSV into. It is created if it does not exist.

    Returns
    -------
    None
    """
    df = pd.DataFrame(observations_assigned)

    # Select rows and column in a single .loc call and add the tag with
    # .assign, so a new frame is built instead of writing into a filtered
    # slice (which triggers SettingWithCopyWarning).
    treatment = df.loc[df["group"] == "email (treatment)", ["email"]].assign(tag="ab-test")

    date_string = pd.Timestamp.now().strftime("%Y-%m-%d")

    # Make sure the target folder exists before writing into it.
    os.makedirs(directory, exist_ok=True)
    filename = os.path.join(directory, date_string + "_ab-test.csv")

    treatment.to_csv(filename, index=False)

In [19]:
# Write the treatment group's email addresses to a CSV in the default ./data folder.
export_email(observations_assigned)

### Load the transformed documents.

In [20]:
# Pick one assigned applicant to try a single-document update on.
applicant = observations_assigned[4]
applicant_id = applicant["_id"]
applicant

{'_id': ObjectId('692414fa35782a1a891161d5'),
 'name': 'Steven Knox',
 'DOB': datetime.datetime(1996, 5, 22, 0, 0),
 'gender': 'female',
 'email': '0c09l3gfof@outlook.com',
 'admissionsQuiz': 'incomplete',
 'countryISO2': 'PG',
 'highestDegreeEarned': 'Some College (1-3 years)',
 'createdAt': datetime.datetime(2024, 10, 16, 0, 0),
 'inExperiement': True,
 'group': 'no email (control)'}

In [21]:
# Write this applicant's document back to the collection, matching on its _id.
result = ds_applicants.update_one(
    {"_id": applicant_id},
    {"$set": applicant},
)

In [22]:
# Number of documents the update changed.
result.modified_count

1

In [23]:
def update_applicants(collection, observations_assigned: list) -> dict:
    """
    Write every assigned applicant back to the collection, matching on ``_id``.

    Parameters
    ----------
    collection : collection object
        Collection exposing ``update_one(filter, update)``.
    observations_assigned : list of dict
        Applicants that have been assigned to the experiment. Each document
        must carry its ``"_id"``.

    Returns
    -------
    transaction_result : dict
        ``{"n": <documents matched>, "n_modified": <documents changed>}``.
    """
    n = 0
    n_modified = 0

    for obs in observations_assigned:
        # "_id" identifies the document and is immutable, so it belongs in the
        # filter only and is left out of the fields being set.
        fields = {key: value for key, value in obs.items() if key != "_id"}

        # "_id" is unique, so at most one document can match.
        result = collection.update_one({"_id": obs["_id"]}, {"$set": fields})

        n += result.matched_count
        n_modified += result.modified_count

    transaction_result = {"n": n, "n_modified": n_modified}

    return transaction_result

In [24]:
# Write every assigned applicant back to the collection and show the matched/modified totals.
result = update_applicants(ds_applicants, observations_assigned)
result

{'n': 510, 'n_modified': 509}