# A/B Testing Admissions in the MSc Financial Engineering.

In [None]:
# Import your libraries 
import warnings
from pymongo import MongoClient
from pymongo.collection import Collection
import math
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import scipy
from country_converter import CountryConverter
from pymongo import MongoClient
from statsmodels.stats.contingency_tables import Table2x2
from statsmodels.stats.power import GofChisquarePower
from pprint import PrettyPrinter
# from teaching_tools.ab_test.experiment import Experiment
# from teaching_tools.ab_test.reset import Reset

# Suppress all warnings from this point on: the "ignore" action is installed
# without any category or module restriction.
warnings.filterwarnings("ignore")

## Connect

In [None]:
import os

# Pretty-printer used below for readable dumps of MongoDB results.
pp = PrettyPrinter(indent=2)
# Connection string: read from the MONGODB_URI environment variable when it is
# set, otherwise fall back to the original value so the notebook keeps working.
# NOTE(review): the fallback embeds a username and password in source; rotate
# the credential and drop the fallback once MONGODB_URI is configured.
connection_string = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://noelobi:test123@cluster0.6vyuwsl.mongodb.net/",
)
# Create a client for the MongoDB cluster named by the connection string.
client = MongoClient(connection_string)
# Print the list of databases reported by the client.
pp.pprint(list(client.list_databases()))

In [None]:
# Select the "mscfe-abtest" database on the connected client.
db = client["mscfe-abtest"]
# Bind the "mscfe-applicants" collection of that database to `mscfe_app`.
mscfe_app = db["mscfe-applicants"]
# Show the Python type of the collection handle.
print("mscfe_app type:", type(mscfe_app))

## Explore

In [None]:
# Count the documents in `mscfe_app`; the empty filter `{}` excludes nothing.
n_documents = mscfe_app.count_documents({})
print("Num. documents in 'mscfe-applicants':", n_documents)

In [None]:
# Fetch a single document from `mscfe_app` (empty filter, so nothing is excluded).
result = mscfe_app.find_one({})
# Show the Python type of the returned value.
print("result type:", type(result))
# Pretty-print the document to inspect its fields.
pp.pprint(result)

### Nationality

In [None]:
# Count applicants per nationality: one output document per distinct
# "countryISO2" value, with the number of applicants stored under "count".
result = mscfe_app.aggregate(
    [{"$group": {"_id": "$countryISO2", "count": {"$count": {}}}}]
)

# Build the DataFrame step by step: load the aggregation output, rename the
# group key to "country_iso2", then order the rows by ascending count.
df_nationality = pd.DataFrame(result)
df_nationality = df_nationality.rename(columns={"_id": "country_iso2"})
df_nationality = df_nationality.sort_values("count")

print("df_nationality type:", type(df_nationality))
print("df_nationality shape", df_nationality.shape)
df_nationality.head()

In [None]:
# Instantiate `CountryConverter`; it is reused for both conversions below.
cc = CountryConverter()


# Create `"country_name"` column: the ISO2 codes converted with to="name_short".
df_nationality["country_name"] = cc.convert(
    df_nationality['country_iso2'], to="name_short"
)


# Create `"country_iso3"` column: the same ISO2 codes converted with to="ISO3".
# This column is what the choropleth uses as its `locations`.
df_nationality["country_iso3"] = cc.convert(df_nationality["country_iso2"], to="ISO3")


# Show the frame's type, its shape and its first rows after adding the columns.
print("df_nationality type:", type(df_nationality))
print("df_nationality shape", df_nationality.shape)
df_nationality.head()

In [None]:
def build_nat_choropleth():
    """Return a choropleth of applicant counts per country.

    Reads the module-level `df_nationality`: countries are located through
    its "country_iso3" column and shaded by its "count" column.
    """
    choropleth_options = dict(
        data_frame=df_nationality,
        locations="country_iso3",
        color="count",
        projection="natural earth",
        color_continuous_scale=px.colors.sequential.Greens,
        title="MScFE Applicants: Nationalities",
    )
    return px.choropleth(**choropleth_options)


# Build the figure once and render it.
nat_fig = build_nat_choropleth()
nat_fig.show();

### Age

In [None]:
# For every applicant, project a "years" field holding the $dateDiff, in
# units of "year", between the "birthday" field and the `$$NOW` variable.
# NOTE(review): $dateDiff is documented as counting unit boundaries crossed,
# so "years" may be one more than the completed-years age — confirm before
# treating it as an exact age.
result = mscfe_app.aggregate(
    [
        {
            "$project": {
                "years": {
                    "$dateDiff": {
                        "startDate": "$birthday",
                        "endDate": "$$NOW",
                        "unit": "year",
                    }
                }
            }
        }
    ]
)

# Load the projection and keep only the "years" column, as a Series.
ages = pd.DataFrame(result)["years"]

print("ages type:", type(ages))
print("ages shape:", ages.shape)
ages.head()

In [None]:
def build_age_hist():
    """Return a histogram (nbins=20) of the module-level `ages` with labelled axes."""
    histogram = px.histogram(
        x=ages,
        nbins=20,
        title="MScFE Applicants: Distribution of Ages",
    )
    # `update_layout` hands back the figure it was called on, so the labelled
    # figure can be returned in the same statement.
    return histogram.update_layout(
        xaxis_title="Age",
        yaxis_title="Frequency [count]",
    )


age_fig = build_age_hist()
print("age_fig type:", type(age_fig))
age_fig.show();

### Education

In [None]:
# Count applicants per distinct "highestDegreeEarned" value; the empty
# `$count` document counts every member of each group.
result = mscfe_app.aggregate(
    [{"$group": {"_id": "$highestDegreeEarned", "count": {"$count": {}}}}]
)

# Load the counts, index them by degree label, then collapse the single
# remaining column into a Series.
education = pd.DataFrame(result)
education = education.rename(columns={"_id": "highest_degree_earned"})
education = education.set_index("highest_degree_earned")
education = education.squeeze()

print("education type:", type(education))
print("education shape:", education.shape)
education.head()

In [None]:
# Create function that sorts degrees in order
def ed_sort(counts):
    """Return the rank of each degree label in `counts`, lowest degree first.

    Written to be used as the ``key`` of ``Series.sort_index``: sorting by the
    returned ranks in ascending order puts "High School or Baccalaureate"
    first and "Doctorate (e.g. PhD)" last.

    Parameters
    ----------
    counts : iterable of str
        Degree labels, e.g. the index of the education counts.

    Returns
    -------
    list of int
        One rank per label, in input order: 0 for the lowest known degree up
        to 4 for the highest. A label that is not one of the five known
        degrees (including a missing value) gets rank 5, so it sorts after
        every known degree instead of raising ``KeyError``.
    """
    degrees = [
        "High School or Baccalaureate",
        "Some College (1-3 years)",
        "Bachelor's degree",
        "Master's degree",
        "Doctorate (e.g. PhD)",
    ]
    # Map each known label to its position in the ordered list above.
    mapping = {degree: rank for rank, degree in enumerate(degrees)}
    # Rank shared by every label that is not in `degrees`.
    unknown_rank = len(degrees)
    return [mapping.get(label, unknown_rank) for label in counts]


# Reorder `education` in place, using `ed_sort` as the sort key for its index.
education.sort_index(key=ed_sort, inplace=True)
education

In [None]:
def build_ed_bar():
    """Return a horizontal bar chart of the module-level `education` counts.

    Counts go on the x-axis and the degree labels (the Series index) on the
    y-axis.
    """
    bar_chart = px.bar(
        x=education,
        y=education.index,
        orientation="h",
        title="MScFE Applicants: Highest Degree Earned",
    )
    # `update_layout` hands back the figure it was called on.
    return bar_chart.update_layout(
        xaxis_title="Frequency [count]",
        yaxis_title="Degree",
    )


ed_fig = build_ed_bar()
ed_fig.show();

## ETL

In [None]:
class MongoRepository:
    """Repository class for interacting with MongoDB database.

    Parameters
    ----------
    client : `pymongo.MongoClient`, optional
        Client to use. When omitted, a client is created at instantiation
        time from the ``MONGODB_URI`` environment variable, falling back to
        the connection string stored in ``_DEFAULT_URI``.
    db : str
        By default, `'mscfe-abtest'`.
    collection : str
        By default, `'mscfe-applicants'`.

    Attributes
    ----------
    collection : pymongo.collection.Collection
        All data will be extracted from and loaded to this collection.
    """

    # NOTE(review): this fallback embeds a username and password in source.
    # Rotate the credential and remove the fallback once MONGODB_URI is set
    # wherever this code runs.
    _DEFAULT_URI = "mongodb+srv://noelobi:test123@cluster0.6vyuwsl.mongodb.net/"

    def __init__(
        self,
        client=None,
        db="mscfe-abtest",
        collection="mscfe-applicants",
    ):
        # Build the default client here rather than in the signature: a default
        # argument is evaluated once, when the class body is executed, which
        # would construct a client even if every caller passes its own.
        if client is None:
            import os

            client = MongoClient(os.environ.get("MONGODB_URI", self._DEFAULT_URI))
        self.collection = client[db][collection]

    def find_by_date(self, date_string):
        """Return the no-quiz applicants created on a given day.

        Parameters
        ----------
        date_string : str
            Day to query, in ``YYYY-MM-DD`` format.

        Returns
        -------
        list of dict
            Documents whose ``createdAt`` is on or after the start of that
            day and before the start of the next one, and whose
            ``admissionsQuiz`` equals ``"incomplete"``.
        """
        # Convert `date_string` to datetime object
        start = pd.to_datetime(date_string, format="%Y-%m-%d")
        # Offset `start` by 1 day to get the exclusive upper bound
        end = start + pd.DateOffset(days=1)
        # $gte: greater than or equal to, $lt: strictly less than
        query = {
            "createdAt": {"$gte": start, "$lt": end},
            "admissionsQuiz": "incomplete",
        }
        return list(self.collection.find(query))

    def update_applicants(self, observations_assigned):
        """Write each document's fields back to the collection.

        Parameters
        ----------
        observations_assigned : iterable of dict
            Documents to store. Each must carry the ``"_id"`` of the document
            it updates; all of its fields are applied with ``$set``.

        Returns
        -------
        dict
            ``{"n": <documents matched>, "nModified": <documents changed>}``,
            summed over all individual updates.
        """
        n = 0
        n_modified = 0
        for doc in observations_assigned:
            result = self.collection.update_one(
                filter={"_id": doc["_id"]},
                update={"$set": doc},
            )
            n += result.matched_count
            # `modified_count`, not `matched_count`: a matched document whose
            # stored values already equal `doc` is not a modification.
            n_modified += result.modified_count
        return {"n": n, "nModified": n_modified}

    def assign_to_groups(self, date_string):
        """Split a day's no-quiz applicants into control and treatment groups.

        The observations returned by `find_by_date` are shuffled with a fixed
        seed (42), the first half is labelled ``"no email (control)"`` and the
        rest ``"email (treatment)"`` (the treatment group gets the extra
        document when the count is odd). Every document is also marked
        ``inExperiment = True`` and the result is saved with
        `update_applicants`.

        Parameters
        ----------
        date_string : str
            Day to process, in ``YYYY-MM-DD`` format.

        Returns
        -------
        dict
            The summary returned by `update_applicants`.
        """
        observations = self.find_by_date(date_string)
        # A private generator gives the same shuffle as seeding the module
        # functions with 42, without resetting the global random state.
        random.Random(42).shuffle(observations)
        # Index position of the halfway point
        idx = len(observations) // 2
        for position, doc in enumerate(observations):
            doc["inExperiment"] = True
            if position < idx:
                doc["group"] = "no email (control)"
            else:
                doc["group"] = "email (treatment)"
        return self.update_applicants(observations)

    def export_treatment_emails(self, observations_assigned, directory="."):
        """Save the treatment group's e-mail addresses to a CSV file.

        The file is named ``<today's date as YYYY-MM-DD>_ab-test.csv`` and has
        two columns, ``email`` and ``tag`` (always ``"ab-test"``).

        Parameters
        ----------
        observations_assigned : list of dict
            Documents that carry at least the ``"email"`` and ``"group"``
            keys; both columns are accessed by name.
        directory : str
            Directory in which the file is written. By default, ``"."``.
        """
        import os

        df = pd.DataFrame(observations_assigned)
        df["tag"] = "ab-test"
        # Keep the treatment group only
        mask = df["group"] == "email (treatment)"
        date_string = pd.Timestamp.now().strftime("%Y-%m-%d")
        filename = os.path.join(directory, date_string + "_ab-test.csv")
        # Save e-mail and tag only, without the DataFrame index
        df.loc[mask, ["email", "tag"]].to_csv(filename, index=False)

    def find_exp_observations(self):
        """Return every document whose ``inExperiment`` field is ``True``."""
        return list(self.collection.find({"inExperiment": True}))

In [None]:
# Create a MongoRepository with its default client, database and collection,
# then show its type and its default representation.
repo = MongoRepository()
print("repo type:", type(repo))
repo

In [None]:
# Try out `find_by_date()`: fetch the observations it returns for 2022-06-28,
# then show the container type, the number of documents and the first three.
June_28_users = repo.find_by_date(date_string="2022-06-28")
print("June_28_users type", type(June_28_users))
print("June_28_users len", len(June_28_users))
June_28_users[:3]

In [None]:
# Try out `assign_to_groups()` for 2022-06-28 and show the summary it returns.
# Note that this call writes to the collection.
result = repo.assign_to_groups(date_string="2022-06-28")
print("result type:", type(result))
result

In [None]:
# testing export_treatment_emails() method
# Re-query the day's applicants first: `June_28_users` was fetched before
# `assign_to_groups()` ran, so those dictionaries can lack the "group" key
# that the export filters on, which would raise a KeyError.
June_28_users = repo.find_by_date(date_string="2022-06-28")
repo.export_treatment_emails(observations_assigned=June_28_users, directory=".")

In [None]:
# Try out `find_exp_observations()`; the returned list is displayed as the
# cell's output.
repo.find_exp_observations()

## Experiment

### Prepare Experiment

In [None]:
# Instantiate the statsmodels power-analysis helper for chi-square tests.
chi_square_power = GofChisquarePower()
# Solve for the number of observations at effect size 0.3, significance level
# 0.05 and power 0.8, rounded up to a whole number.
# NOTE(review): the result is used as a per-group size (doubled below for the
# total) — confirm that this matches what `solve_power` returns.
group_size = math.ceil(
    chi_square_power.solve_power(effect_size = 0.3, alpha=0.05, power=0.8)
)
print("Group size:", group_size)
print("Total # of applicants needed:", group_size * 2)

In [None]:
# Keep only applicants whose admissions quiz is "incomplete", then count them
# per sign-up day ("createdAt" truncated to the day).
result = mscfe_app.aggregate(
    [
        {"$match": {"admissionsQuiz": "incomplete"}},
        {
            "$group": {
                "_id": {"$dateTrunc": {"date": "$createdAt", "unit": "day"}},
                "count": {"$sum": 1},
            }
        },
    ]
)

# Load the counts, index them by date in chronological order and collapse
# the single remaining column into a Series.
no_quiz_mscfe = pd.DataFrame(result)
no_quiz_mscfe = no_quiz_mscfe.rename(columns={"_id": "date", "count": "new_users"})
no_quiz_mscfe = no_quiz_mscfe.set_index("date")
no_quiz_mscfe = no_quiz_mscfe.sort_index()
no_quiz_mscfe = no_quiz_mscfe.squeeze()

print("no_quiz type:", type(no_quiz_mscfe))
print("no_quiz shape:", no_quiz_mscfe.shape)
no_quiz_mscfe.head()

In [None]:
# Mean and sample standard deviation of the daily counts in `no_quiz_mscfe`,
# computed directly rather than building the full `describe()` summary twice
# just to read one entry from each.
mean = no_quiz_mscfe.mean()
std = no_quiz_mscfe.std()
print("no_quiz mean:", mean)
print("no_quiz std:", std)

In [None]:
# Scale the daily statistics to an experiment lasting `exp_days` days: the
# mean is multiplied by the number of days and the standard deviation by the
# square root of the number of days.
# NOTE(review): the square-root scaling presumably assumes the daily counts
# are independent — confirm.
exp_days = 19
sum_mean = mean * exp_days
sum_std = std * np.sqrt(exp_days)
print("Mean of sum:", sum_mean)
print("Std of sum:", sum_std)

In [None]:
# Probability, under a normal distribution with mean `sum_mean` and standard
# deviation `sum_std`, of collecting at most `group_size * 2` observations.
# The variable names keep their historical "200"; the threshold actually used
# is `group_size * 2`.
prob_200_or_fewer = scipy.stats.norm.cdf(
    group_size * 2,
    loc=sum_mean,
    scale=sum_std,
)
# Complement: probability of exceeding that threshold.
prob_200_or_greater = 1 - prob_200_or_fewer

# Report the threshold that was really used instead of a hard-coded "200".
print(
    f"Probability of getting {group_size * 2}+ no_quiz in {exp_days} days:",
    round(prob_200_or_greater, 3),
)

### Run Experiment

### Analyze Results

In [None]:
# Load every document flagged `inExperiment` into a DataFrame and drop the
# rows that contain a missing value.
# NOTE(review): `dropna()` without arguments discards a row when ANY column
# is missing, so an applicant lacking an unrelated field is excluded from the
# analysis — confirm this is intended.
result = repo.find_exp_observations()
df = pd.DataFrame(result).dropna()

print("df type:", type(df))
print("df shape:", df.shape)
df.head()

#### Build Contingency

In [None]:
# Contingency table of raw counts (no normalisation): one row per value of
# "group", one column per value of "admissionsQuiz".
data = pd.crosstab(df["group"], df["admissionsQuiz"], normalize=False)

print("data type:", type(data))
print("data shape:", data.shape)
data