# Experiment Example

The purpose of this notebook is to provide an example for basic A/B Testing (Experiment) analysis and hypothesis testing.

## Import Statements

In [89]:
# General 
import datetime
from datetime import date
import os

# Notebook Specific
from IPython.core.interactiveshell import InteractiveShell

# Data Analysis/Statistics
import numpy as np
import pandas as pd
import scipy.stats as stats

import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
import scipy
from scipy.stats import pearsonr
# import pandas as pd
from seaborn import regplot
import matplotlib.pyplot as plt
# import numpy as np
import seaborn as sns

# Visualization
import plotly as ply
import plotly.graph_objs as go
import ipywidgets as widgets
import plotly.express as px
import plotly.io as pio

## Notebook Settings and Custom Functions

### Notebook Settings

In [2]:
# Allows multiple outputs per cell
InteractiveShell.ast_node_interactivity = "all"

In [62]:
# Show all columns, rows, and set the float format to display 2 decimal places
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [4]:
# Sets Plotly to offline mode so everything renders locally and not via the web API.
ply.offline.init_notebook_mode(connected=True)

## Data Ingestion

In [106]:
# Read in .csv
df = pd.read_csv('spot_the_fake_smile.csv')

## Data Preparation

In [107]:
# Shows the first 5 columns by default (insert a number in the parentheses if you want to see more or less)
df.head()
df.describe()

Unnamed: 0,StartDate,EndDate,Progress,Duration,Finished,Estimation,Gender,Age,Related Job,Q6,Q7,Q8,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Score,Control Group,Doc AND Video Group,Video Group,Document Group
0,12/4/18 2:24,12/4/18 2:27,100,205,1,12.0,2.0,17.0,6.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,11,1.0,,,
1,12/4/18 2:24,12/4/18 2:29,100,267,1,15.0,2.0,17.0,6.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,11,1.0,,,
2,12/4/18 2:24,12/4/18 2:31,100,441,1,10.0,2.0,18.0,6.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,,,,,13,1.0,,,
3,12/4/18 2:25,12/4/18 2:31,100,347,1,10.0,2.0,18.0,6.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,13,1.0,,,
4,12/4/18 2:26,12/4/18 2:32,100,378,1,10.0,2.0,17.0,6.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,14,1.0,,,


Unnamed: 0,Progress,Duration,Finished,Estimation,Gender,Age,Related Job,Q6,Q7,Q8,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Score,Control Group,Doc AND Video Group,Video Group,Document Group
count,711.0,711.0,711.0,554.0,593.0,569.0,581.0,570.0,561.0,560.0,562.0,563.0,560.0,560.0,559.0,559.0,559.0,559.0,559.0,556.0,558.0,558.0,557.0,554.0,557.0,558.0,559.0,711.0,165.0,159.0,190.0,197.0
mean,96.42,1805.54,0.84,12.55,1.63,21.04,5.75,0.32,0.79,0.48,0.78,0.21,0.18,0.81,0.24,0.79,0.19,0.31,0.74,0.88,0.7,0.37,0.26,0.46,0.92,0.2,0.62,11.6,1.0,1.0,1.0,1.0
std,8.25,9899.52,0.37,4.68,0.48,11.89,0.59,0.47,0.41,0.5,0.42,0.41,0.39,0.4,0.43,0.41,0.39,0.46,0.44,0.33,0.46,0.48,0.44,0.5,0.28,0.4,0.48,6.51,0.0,0.0,0.0,0.0
min,75.0,5.0,0.0,0.0,1.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,100.0,182.0,1.0,10.0,1.0,15.0,6.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9.0,1.0,1.0,1.0,1.0
50%,100.0,283.0,1.0,13.0,2.0,17.0,6.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,14.0,1.0,1.0,1.0,1.0
75%,100.0,475.5,1.0,15.0,2.0,21.0,6.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,17.0,1.0,1.0,1.0,1.0
max,100.0,99822.0,1.0,29.0,2.0,83.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,20.0,1.0,1.0,1.0,1.0


Now we'll need to create some additional dataframes. Specifically, we need:

- Control group
- Video *and* Document test group
- Video only test group
- Document only test group

But before we split into groups, we first need to filter out only the completed surveys for analysis. As the descriptive statistics above show, not everyone finished. (We'll address attrition later.)

### Create Dataframes for Each Group

In [108]:
# Completed only
dfComplete = df.loc[df['Finished'] == 1]
dfComplete.head()
dfComplete['Finished'].describe()

Unnamed: 0,StartDate,EndDate,Progress,Duration,Finished,Estimation,Gender,Age,Related Job,Q6,Q7,Q8,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Score,Control Group,Doc AND Video Group,Video Group,Document Group
0,12/4/18 2:24,12/4/18 2:27,100,205,1,12.0,2.0,17.0,6.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,11,1.0,,,
1,12/4/18 2:24,12/4/18 2:29,100,267,1,15.0,2.0,17.0,6.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,11,1.0,,,
2,12/4/18 2:24,12/4/18 2:31,100,441,1,10.0,2.0,18.0,6.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,,,,,13,1.0,,,
3,12/4/18 2:25,12/4/18 2:31,100,347,1,10.0,2.0,18.0,6.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,13,1.0,,,
4,12/4/18 2:26,12/4/18 2:32,100,378,1,10.0,2.0,17.0,6.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,14,1.0,,,


count   598.00
mean      1.00
std       0.00
min       1.00
25%       1.00
50%       1.00
75%       1.00
max       1.00
Name: Finished, dtype: float64

In [109]:
# Control Group
dfControl = dfComplete.loc[dfComplete['Control Group'] == 1]
dfControl.head()
dfControl.describe()

Unnamed: 0,StartDate,EndDate,Progress,Duration,Finished,Estimation,Gender,Age,Related Job,Q6,Q7,Q8,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Score,Control Group,Doc AND Video Group,Video Group,Document Group
0,12/4/18 2:24,12/4/18 2:27,100,205,1,12.0,2.0,17.0,6.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,11,1.0,,,
1,12/4/18 2:24,12/4/18 2:29,100,267,1,15.0,2.0,17.0,6.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,11,1.0,,,
2,12/4/18 2:24,12/4/18 2:31,100,441,1,10.0,2.0,18.0,6.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,,,,,13,1.0,,,
3,12/4/18 2:25,12/4/18 2:31,100,347,1,10.0,2.0,18.0,6.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,13,1.0,,,
4,12/4/18 2:26,12/4/18 2:32,100,378,1,10.0,2.0,17.0,6.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,14,1.0,,,


Unnamed: 0,Progress,Duration,Finished,Estimation,Gender,Age,Related Job,Q6,Q7,Q8,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Score,Control Group,Doc AND Video Group,Video Group,Document Group
count,154.0,154.0,154.0,124.0,131.0,126.0,129.0,148.0,148.0,148.0,148.0,147.0,146.0,146.0,146.0,145.0,146.0,146.0,146.0,146.0,146.0,146.0,145.0,143.0,145.0,145.0,146.0,154.0,154.0,0.0,0.0,0.0
mean,100.0,2685.62,1.0,12.5,1.61,20.56,5.74,0.33,0.86,0.61,0.72,0.29,0.23,0.76,0.29,0.67,0.27,0.34,0.67,0.88,0.71,0.31,0.28,0.39,0.91,0.26,0.56,13.64,1.0,,,
std,0.0,14122.34,0.0,4.59,0.49,12.92,0.6,0.47,0.35,0.49,0.45,0.45,0.42,0.43,0.45,0.47,0.45,0.47,0.47,0.33,0.45,0.46,0.45,0.49,0.29,0.44,0.5,4.45,0.0,,,
min,100.0,28.0,1.0,1.0,1.0,7.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,,
25%,100.0,203.5,1.0,10.0,1.0,13.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,12.0,1.0,,,
50%,100.0,283.0,1.0,12.0,2.0,17.0,6.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,14.5,1.0,,,
75%,100.0,415.5,1.0,15.0,2.0,21.0,6.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,17.0,1.0,,,
max,100.0,99299.0,1.0,29.0,2.0,83.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,20.0,1.0,,,


In [110]:
# Video AND Document Test Group 
dfVidDoc = dfComplete.loc[dfComplete['Doc AND Video Group'] == 1]
dfVidDoc.head()
dfVidDoc.describe()

Unnamed: 0,StartDate,EndDate,Progress,Duration,Finished,Estimation,Gender,Age,Related Job,Q6,Q7,Q8,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Score,Control Group,Doc AND Video Group,Video Group,Document Group
154,12/4/18 2:24,12/4/18 2:29,100,277,1,12.0,2.0,17.0,6.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,17,,1.0,,
155,12/4/18 2:25,12/4/18 2:30,100,256,1,15.0,2.0,17.0,6.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,15,,1.0,,
156,12/4/18 2:24,12/4/18 2:30,100,364,1,10.0,2.0,17.0,6.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,18,,1.0,,
157,12/4/18 2:24,12/4/18 2:31,100,457,1,15.0,1.0,18.0,6.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,18,,1.0,,
158,12/4/18 3:19,12/4/18 3:24,100,268,1,,2.0,19.0,6.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,17,,1.0,,


Unnamed: 0,Progress,Duration,Finished,Estimation,Gender,Age,Related Job,Q6,Q7,Q8,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Score,Control Group,Doc AND Video Group,Video Group,Document Group
count,139.0,139.0,139.0,106.0,115.0,109.0,110.0,132.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0,129.0,130.0,130.0,130.0,130.0,130.0,130.0,129.0,129.0,129.0,129.0,139.0,0.0,139.0,0.0,0.0
mean,100.0,1437.1,1.0,12.5,1.62,21.72,5.73,0.33,0.73,0.41,0.8,0.19,0.15,0.83,0.32,0.78,0.14,0.35,0.76,0.85,0.68,0.39,0.22,0.47,0.91,0.2,0.57,13.59,,1.0,,
std,0.0,8675.68,0.0,4.58,0.49,12.64,0.6,0.47,0.45,0.49,0.4,0.4,0.35,0.38,0.47,0.42,0.35,0.48,0.43,0.35,0.47,0.49,0.42,0.5,0.29,0.4,0.5,4.45,,0.0,,
min,100.0,14.0,1.0,1.0,1.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0,,
25%,100.0,223.0,1.0,10.0,1.0,14.0,6.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,12.0,,1.0,,
50%,100.0,320.0,1.0,12.0,2.0,17.0,6.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,15.0,,1.0,,
75%,100.0,515.0,1.0,15.0,2.0,21.0,6.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,17.0,,1.0,,
max,100.0,99822.0,1.0,22.0,2.0,69.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,20.0,,1.0,,


In [111]:
# Video Test Group
dfVideo = dfComplete.loc[dfComplete['Video Group'] == 1]
dfVideo.head()
dfVideo.describe()

Unnamed: 0,StartDate,EndDate,Progress,Duration,Finished,Estimation,Gender,Age,Related Job,Q6,Q7,Q8,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Score,Control Group,Doc AND Video Group,Video Group,Document Group
293,11/29/18 11:50,11/29/18 11:58,100,517,1,15.0,2.0,16.0,6.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,17,,,1.0,
294,12/10/18 8:59,12/10/18 9:04,100,289,1,10.0,2.0,40.0,6.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,14,,,1.0,
295,11/30/18 11:11,11/30/18 11:23,100,730,1,5.0,2.0,27.0,6.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,16,,,1.0,
296,12/10/18 12:53,12/10/18 12:57,100,223,1,10.0,2.0,16.0,6.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,18,,,1.0,
297,12/7/18 15:49,12/7/18 15:54,100,266,1,11.0,2.0,20.0,6.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,19,,,1.0,


Unnamed: 0,Progress,Duration,Finished,Estimation,Gender,Age,Related Job,Q6,Q7,Q8,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Score,Control Group,Doc AND Video Group,Video Group,Document Group
count,153.0,153.0,153.0,132.0,137.0,132.0,135.0,143.0,138.0,136.0,138.0,139.0,138.0,138.0,138.0,138.0,138.0,137.0,137.0,135.0,136.0,136.0,136.0,136.0,135.0,136.0,136.0,153.0,0.0,0.0,153.0,0.0
mean,100.0,763.41,1.0,12.14,1.67,20.96,5.88,0.29,0.79,0.52,0.8,0.16,0.21,0.79,0.2,0.85,0.12,0.26,0.76,0.87,0.75,0.36,0.18,0.45,0.91,0.15,0.73,13.81,,,1.0,
std,0.0,3792.24,0.0,4.68,0.47,11.73,0.41,0.45,0.41,0.5,0.4,0.37,0.41,0.41,0.4,0.36,0.32,0.44,0.43,0.33,0.43,0.48,0.38,0.5,0.29,0.36,0.45,5.28,,,0.0,
min,100.0,15.0,1.0,0.0,1.0,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,1.0,
25%,100.0,205.0,1.0,10.0,1.0,16.0,6.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.75,0.0,0.0,0.0,1.0,0.0,0.0,12.0,,,1.0,
50%,100.0,294.0,1.0,11.5,2.0,17.0,6.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,16.0,,,1.0,
75%,100.0,473.0,1.0,15.0,2.0,20.0,6.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,17.0,,,1.0,
max,100.0,46154.0,1.0,20.0,2.0,72.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,20.0,,,1.0,


In [112]:
# Completed only
dfDocument = dfComplete.loc[dfComplete['Document Group'] == 1]
dfDocument.head()
dfDocument.describe()

Unnamed: 0,StartDate,EndDate,Progress,Duration,Finished,Estimation,Gender,Age,Related Job,Q6,Q7,Q8,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Score,Control Group,Doc AND Video Group,Video Group,Document Group
446,11/29/18 12:51,11/29/18 12:54,100,232,1,5.0,2.0,13.0,5.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,16,,,,1.0
447,12/10/18 9:19,12/10/18 9:31,100,693,1,20.0,1.0,28.0,4.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,16,,,,1.0
448,11/30/18 15:53,11/30/18 15:56,100,231,1,10.0,2.0,17.0,,0.0,,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,14,,,,1.0
449,12/10/18 12:58,12/10/18 13:30,100,1943,1,14.0,2.0,16.0,6.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,17,,,,1.0
450,12/9/18 8:52,12/9/18 8:58,100,390,1,16.0,2.0,19.0,6.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,14,,,,1.0


Unnamed: 0,Progress,Duration,Finished,Estimation,Gender,Age,Related Job,Q6,Q7,Q8,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Score,Control Group,Doc AND Video Group,Video Group,Document Group
count,152.0,152.0,152.0,132.0,142.0,137.0,139.0,147.0,145.0,146.0,146.0,147.0,146.0,146.0,145.0,146.0,146.0,146.0,146.0,145.0,146.0,146.0,146.0,146.0,148.0,148.0,148.0,152.0,0.0,0.0,0.0,152.0
mean,100.0,2078.55,1.0,12.61,1.66,20.23,5.78,0.35,0.76,0.38,0.79,0.22,0.14,0.84,0.15,0.85,0.22,0.31,0.77,0.9,0.68,0.42,0.36,0.51,0.93,0.2,0.64,14.12,,,,1.0
std,0.0,11601.56,0.0,4.6,0.47,9.95,0.59,0.48,0.43,0.49,0.41,0.41,0.35,0.37,0.36,0.36,0.42,0.46,0.42,0.3,0.47,0.49,0.48,0.5,0.25,0.4,0.48,3.63,,,,0.0
min,100.0,36.0,1.0,1.0,1.0,10.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,1.0
25%,100.0,237.25,1.0,10.0,1.0,16.0,6.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,13.0,,,,1.0
50%,100.0,321.5,1.0,12.0,2.0,17.0,6.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,14.5,,,,1.0
75%,100.0,486.0,1.0,15.0,2.0,21.0,6.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,16.25,,,,1.0
max,100.0,83514.0,1.0,20.0,2.0,72.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,19.0,,,,1.0


## EDA

### Complete Scores

In [113]:
completeScoreHist = px.histogram(dfComplete, x='Score',
                  title='Histogram of Total Scores for All Groups', 
                  labels={'Score':'Total Scores'})
completeScoreHist.show()

### Control Group 

In [114]:
dfControl.describe()

Unnamed: 0,Progress,Duration,Finished,Estimation,Gender,Age,Related Job,Q6,Q7,Q8,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Score,Control Group,Doc AND Video Group,Video Group,Document Group
count,154.0,154.0,154.0,124.0,131.0,126.0,129.0,148.0,148.0,148.0,148.0,147.0,146.0,146.0,146.0,145.0,146.0,146.0,146.0,146.0,146.0,146.0,145.0,143.0,145.0,145.0,146.0,154.0,154.0,0.0,0.0,0.0
mean,100.0,2685.62,1.0,12.5,1.61,20.56,5.74,0.33,0.86,0.61,0.72,0.29,0.23,0.76,0.29,0.67,0.27,0.34,0.67,0.88,0.71,0.31,0.28,0.39,0.91,0.26,0.56,13.64,1.0,,,
std,0.0,14122.34,0.0,4.59,0.49,12.92,0.6,0.47,0.35,0.49,0.45,0.45,0.42,0.43,0.45,0.47,0.45,0.47,0.47,0.33,0.45,0.46,0.45,0.49,0.29,0.44,0.5,4.45,0.0,,,
min,100.0,28.0,1.0,1.0,1.0,7.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,,
25%,100.0,203.5,1.0,10.0,1.0,13.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,12.0,1.0,,,
50%,100.0,283.0,1.0,12.0,2.0,17.0,6.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,14.5,1.0,,,
75%,100.0,415.5,1.0,15.0,2.0,21.0,6.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,17.0,1.0,,,
max,100.0,99299.0,1.0,29.0,2.0,83.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,20.0,1.0,,,


In [115]:
controlScoreHist = px.histogram(dfControl, x='Score',
                  title='Histogram of Total Scores for the Control Group', 
                  labels={'Score':'Total Scores'})
controlScoreHist.show()

In [116]:
controlScatter = px.scatter_matrix(dfControl, dimensions=["Duration", "Estimation", "Age", "Score"])
controlScatter.show()

### Document Only Test Group

In [117]:
dfDocument.describe()

Unnamed: 0,Progress,Duration,Finished,Estimation,Gender,Age,Related Job,Q6,Q7,Q8,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Score,Control Group,Doc AND Video Group,Video Group,Document Group
count,152.0,152.0,152.0,132.0,142.0,137.0,139.0,147.0,145.0,146.0,146.0,147.0,146.0,146.0,145.0,146.0,146.0,146.0,146.0,145.0,146.0,146.0,146.0,146.0,148.0,148.0,148.0,152.0,0.0,0.0,0.0,152.0
mean,100.0,2078.55,1.0,12.61,1.66,20.23,5.78,0.35,0.76,0.38,0.79,0.22,0.14,0.84,0.15,0.85,0.22,0.31,0.77,0.9,0.68,0.42,0.36,0.51,0.93,0.2,0.64,14.12,,,,1.0
std,0.0,11601.56,0.0,4.6,0.47,9.95,0.59,0.48,0.43,0.49,0.41,0.41,0.35,0.37,0.36,0.36,0.42,0.46,0.42,0.3,0.47,0.49,0.48,0.5,0.25,0.4,0.48,3.63,,,,0.0
min,100.0,36.0,1.0,1.0,1.0,10.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,1.0
25%,100.0,237.25,1.0,10.0,1.0,16.0,6.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,13.0,,,,1.0
50%,100.0,321.5,1.0,12.0,2.0,17.0,6.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,14.5,,,,1.0
75%,100.0,486.0,1.0,15.0,2.0,21.0,6.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,16.25,,,,1.0
max,100.0,83514.0,1.0,20.0,2.0,72.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,19.0,,,,1.0


In [118]:
documentScoreHist = px.histogram(dfDocument, x='Score',
                  title='Histogram of Total Scores for the Document Only Test Group', 
                  labels={'Score':'Total Scores'})
documentScoreHist.show()

In [119]:
documentScatter = px.scatter_matrix(dfDocument, dimensions=["Duration", "Estimation", "Age", "Score"])
documentScatter.show()

For this and the other test group sections, we'll also calculate the average treatment effect (ATE). This is simply the difference in the means between the test and control groups.

In [120]:
documentATE = dfDocument['Score'].mean() - dfControl['Score'].mean()
print('The average treatment effect between the document only test group and the control group is:',
      "{:.2f}".format(documentATE))

The average treatment effect between the document only test group and the control group is: 0.49


### Video Only Test Group 

In [121]:
dfVideo.describe()

Unnamed: 0,Progress,Duration,Finished,Estimation,Gender,Age,Related Job,Q6,Q7,Q8,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Score,Control Group,Doc AND Video Group,Video Group,Document Group
count,153.0,153.0,153.0,132.0,137.0,132.0,135.0,143.0,138.0,136.0,138.0,139.0,138.0,138.0,138.0,138.0,138.0,137.0,137.0,135.0,136.0,136.0,136.0,136.0,135.0,136.0,136.0,153.0,0.0,0.0,153.0,0.0
mean,100.0,763.41,1.0,12.14,1.67,20.96,5.88,0.29,0.79,0.52,0.8,0.16,0.21,0.79,0.2,0.85,0.12,0.26,0.76,0.87,0.75,0.36,0.18,0.45,0.91,0.15,0.73,13.81,,,1.0,
std,0.0,3792.24,0.0,4.68,0.47,11.73,0.41,0.45,0.41,0.5,0.4,0.37,0.41,0.41,0.4,0.36,0.32,0.44,0.43,0.33,0.43,0.48,0.38,0.5,0.29,0.36,0.45,5.28,,,0.0,
min,100.0,15.0,1.0,0.0,1.0,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,1.0,
25%,100.0,205.0,1.0,10.0,1.0,16.0,6.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.75,0.0,0.0,0.0,1.0,0.0,0.0,12.0,,,1.0,
50%,100.0,294.0,1.0,11.5,2.0,17.0,6.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,16.0,,,1.0,
75%,100.0,473.0,1.0,15.0,2.0,20.0,6.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,17.0,,,1.0,
max,100.0,46154.0,1.0,20.0,2.0,72.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,20.0,,,1.0,


In [122]:
videoScoreHist = px.histogram(dfVideo, x='Score',
                  title='Histogram of Total Scores for the Video Only Test Group', 
                  labels={'Score':'Total Scores'})
videoScoreHist.show()

In [123]:
videoScatter = px.scatter_matrix(dfVideo, dimensions=["Duration", "Estimation", "Age", "Score"])
videoScatter.show()

In [124]:
videoATE = dfVideo['Score'].mean() - dfControl['Score'].mean()
print('The average treatment effect between the video only test group and the control group is:',
      "{:.2f}".format(videoATE))

The average treatment effect between the video only test group and the control group is: 0.17


### Video and Document Test Group 

In [125]:
dfVidDoc.describe()

Unnamed: 0,Progress,Duration,Finished,Estimation,Gender,Age,Related Job,Q6,Q7,Q8,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Score,Control Group,Doc AND Video Group,Video Group,Document Group
count,139.0,139.0,139.0,106.0,115.0,109.0,110.0,132.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0,129.0,130.0,130.0,130.0,130.0,130.0,130.0,129.0,129.0,129.0,129.0,139.0,0.0,139.0,0.0,0.0
mean,100.0,1437.1,1.0,12.5,1.62,21.72,5.73,0.33,0.73,0.41,0.8,0.19,0.15,0.83,0.32,0.78,0.14,0.35,0.76,0.85,0.68,0.39,0.22,0.47,0.91,0.2,0.57,13.59,,1.0,,
std,0.0,8675.68,0.0,4.58,0.49,12.64,0.6,0.47,0.45,0.49,0.4,0.4,0.35,0.38,0.47,0.42,0.35,0.48,0.43,0.35,0.47,0.49,0.42,0.5,0.29,0.4,0.5,4.45,,0.0,,
min,100.0,14.0,1.0,1.0,1.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0,,
25%,100.0,223.0,1.0,10.0,1.0,14.0,6.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,12.0,,1.0,,
50%,100.0,320.0,1.0,12.0,2.0,17.0,6.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,15.0,,1.0,,
75%,100.0,515.0,1.0,15.0,2.0,21.0,6.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,17.0,,1.0,,
max,100.0,99822.0,1.0,22.0,2.0,69.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,20.0,,1.0,,


In [126]:
vidDocScoreHist = px.histogram(dfVidDoc, x='Score',
                  title='Histogram of Total Scores for the Video and Document Test Group',
                  labels={'Score':'Total Scores'})
vidDocScoreHist.show()

In [127]:
vidDocScatter = px.scatter_matrix(dfVidDoc, dimensions=["Duration", "Estimation", "Age", "Score"])
vidDocScatter.show()

In [128]:
vidDocATE = dfVidDoc['Score'].mean() - dfControl['Score'].mean()
print('The average treatment effect between the video and document test group and the control group is:',
      "{:.2f}".format(vidDocATE))

The average treatment effect between the video and document test group and the control group is: -0.05


## Attrition and Covariate Balance Checks

### Attrition Check

For experiments that involve humans (or non-humans) performing a task that is more involved than a click, expect attrition. Not everyone will finish the test, the program, or in this case, the survey. If you don't see this, be suspicious. Here, we're going to see if the attrition is significantly different across groups.

In [129]:
# Create an attrition dataframe
dfAttrition = df.loc[df['Finished'] != 1]
dfAttrition.head()

Unnamed: 0,StartDate,EndDate,Progress,Duration,Finished,Estimation,Gender,Age,Related Job,Q6,Q7,Q8,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Score,Control Group,Doc AND Video Group,Video Group,Document Group
598,12/1/18 16:13,12/1/18 17:03,78,2965,0,,,,,,,,,,,,,,,,,,,,,,,,,0,,,,1.0
599,12/10/18 9:25,12/10/18 10:01,78,2171,0,,,,,,,,,,,,,,,,,,,,,,,,,0,,1.0,,
600,12/5/18 3:35,12/5/18 3:37,75,112,0,16.0,2.0,22.0,5.0,,,,,,,,,,,,,,,,,,,,,0,,1.0,,
601,12/10/18 8:21,12/10/18 8:22,76,47,0,17.0,1.0,16.0,6.0,,,,,,,,,,,,,,,,,,,,,0,,,,1.0
602,12/10/18 6:36,12/10/18 6:37,78,17,0,15.0,1.0,18.0,6.0,,,,,,,,,,,,,,,,,,,,,0,,,1.0,


Now we'll create dataframes for each group, then add a column called "Group", with each group labeled. This makes it easy to perform an analysis of variance (ANOVA) test.

We want to at least start off with an ANOVA test instead of a t-test when we have multiple groups like this. The reason being, it tests for *any* significant difference between the groups. If, instead, we were to use a series of t-tests, the p-values would inflate as an artifact, meaning the tests would be underpowered. Then, if there is a significant difference, we can use t-tests between individual pairs to find where a significant difference (or more) might lie. 

In [130]:
dfAttControl = dfAttrition.loc[df['Control Group'] == 1]
dfAttDocument = dfAttrition.loc[df['Document Group'] == 1]
dfAttVideo = dfAttrition.loc[df['Video Group'] == 1]
dfAttVidAndDoc = dfAttrition.loc[df['Doc AND Video Group'] == 1]

In [131]:
dfAttControl['Group'] = 'Control'
dfAttDocument['Group'] = 'Document'
dfAttVideo['Group'] = 'Video'
dfAttVidAndDoc['Group'] = 'Doc AND Video'



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [132]:
dfAttrition = pd.concat([dfAttControl,dfAttDocument,dfAttVideo,dfAttVidAndDoc])
dfAttrition.head()
dfAttrition.describe()

Unnamed: 0,StartDate,EndDate,Progress,Duration,Finished,Estimation,Gender,Age,Related Job,Q6,Q7,Q8,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Score,Control Group,Doc AND Video Group,Video Group,Document Group,Group
606,12/5/18 0:16,12/5/18 2:20,78,7454,0,10.0,2.0,17.0,6.0,,,,,,,,,,,,,,,,,,,,,0,1.0,,,,Control
628,12/9/18 21:49,12/9/18 21:50,78,63,0,,1.0,,6.0,,,,,,,,,,,,,,,,,,,,,0,1.0,,,,Control
635,12/5/18 0:16,12/5/18 2:20,78,7454,0,10.0,2.0,17.0,6.0,,,,,,,,,,,,,,,,,,,,,0,1.0,,,,Control
647,12/7/18 14:20,12/7/18 14:22,78,76,0,,,,,,,,,,,,,,,,,,,,,,,,,0,1.0,,,,Control
657,12/7/18 8:51,12/7/18 8:52,78,45,0,,,,,,,,,,,,,,,,,,,,,,,,,0,1.0,,,,Control


Unnamed: 0,Progress,Duration,Finished,Estimation,Gender,Age,Related Job,Q6,Q7,Q8,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Score,Control Group,Doc AND Video Group,Video Group,Document Group
count,113.0,113.0,113.0,60.0,68.0,65.0,68.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.0,11.0,20.0,37.0,45.0
mean,77.49,2103.18,0.0,13.53,1.56,22.65,5.49,,,,,,,,,,,,,,,,,,,,,0.0,1.0,1.0,1.0,1.0
std,1.02,7223.1,0.0,5.19,0.5,12.75,0.76,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0
min,75.0,5.0,0.0,0.0,1.0,2.0,4.0,,,,,,,,,,,,,,,,,,,,,0.0,1.0,1.0,1.0,1.0
25%,78.0,43.0,0.0,10.0,1.0,16.0,5.0,,,,,,,,,,,,,,,,,,,,,0.0,1.0,1.0,1.0,1.0
50%,78.0,92.0,0.0,15.0,2.0,18.0,6.0,,,,,,,,,,,,,,,,,,,,,0.0,1.0,1.0,1.0,1.0
75%,78.0,385.0,0.0,16.25,2.0,26.0,6.0,,,,,,,,,,,,,,,,,,,,,0.0,1.0,1.0,1.0,1.0
max,78.0,62760.0,0.0,20.0,2.0,57.0,6.0,,,,,,,,,,,,,,,,,,,,,0.0,1.0,1.0,1.0,1.0


In [133]:
anovaAttrition = smf.ols(formula='Progress ~ C(Group)', data=dfAttrition).fit()
print(anovaAttrition.summary())

                            OLS Regression Results                            
Dep. Variable:               Progress   R-squared:                       0.216
Model:                            OLS   Adj. R-squared:                  0.195
Method:                 Least Squares   F-statistic:                     10.02
Date:                Mon, 29 Mar 2021   Prob (F-statistic):           6.88e-06
Time:                        21:58:54   Log-Likelihood:                -148.17
No. Observations:                 113   AIC:                             304.3
Df Residuals:                     109   BIC:                             315.2
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept             

### Covariate Balance Check

The purpose of the covariate balance check is to see if the covariates such as age and gender are balanced across groups. Essentially, think of this as checking on your randomization and making sure you didn't get an imbalanced sample. It's unlikely but possible, especially with sample sizes this small. We'll follow basically the same steps as we did with attrition, only conduct hypothesis testing regarding the covariates for the fully complete samples instead of the attrition.

In [134]:
dfControl['Group'] = 'Control'
dfDocument['Group'] = 'Document'
dfVideo['Group'] = 'Video'
dfVidDoc['Group'] = 'Doc AND Video'



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [135]:
dfComplete = pd.concat([dfControl,dfDocument,dfVideo,dfVidDoc])
dfComplete.head()
dfComplete.describe()

Unnamed: 0,StartDate,EndDate,Progress,Duration,Finished,Estimation,Gender,Age,Related Job,Q6,Q7,Q8,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Score,Control Group,Doc AND Video Group,Video Group,Document Group,Group
0,12/4/18 2:24,12/4/18 2:27,100,205,1,12.0,2.0,17.0,6.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,11,1.0,,,,Control
1,12/4/18 2:24,12/4/18 2:29,100,267,1,15.0,2.0,17.0,6.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,11,1.0,,,,Control
2,12/4/18 2:24,12/4/18 2:31,100,441,1,10.0,2.0,18.0,6.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,,,,,13,1.0,,,,Control
3,12/4/18 2:25,12/4/18 2:31,100,347,1,10.0,2.0,18.0,6.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,13,1.0,,,,Control
4,12/4/18 2:26,12/4/18 2:32,100,378,1,10.0,2.0,17.0,6.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,14,1.0,,,,Control


Unnamed: 0,Progress,Duration,Finished,Estimation,Gender,Age,Related Job,Q6,Q7,Q8,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Score,Control Group,Doc AND Video Group,Video Group,Document Group
count,598.0,598.0,598.0,494.0,525.0,504.0,513.0,570.0,561.0,560.0,562.0,563.0,560.0,560.0,559.0,559.0,559.0,559.0,559.0,556.0,558.0,558.0,557.0,554.0,557.0,558.0,559.0,598.0,154.0,139.0,153.0,152.0
mean,100.0,1749.3,1.0,12.43,1.64,20.83,5.79,0.32,0.79,0.48,0.78,0.21,0.18,0.81,0.24,0.79,0.19,0.31,0.74,0.88,0.7,0.37,0.26,0.46,0.92,0.2,0.62,13.79,1.0,1.0,1.0,1.0
std,0.0,10331.6,0.0,4.61,0.48,11.78,0.56,0.47,0.41,0.5,0.42,0.41,0.39,0.4,0.43,0.41,0.39,0.46,0.44,0.33,0.46,0.48,0.44,0.5,0.28,0.4,0.48,4.49,0.0,0.0,0.0,0.0
min,100.0,14.0,1.0,0.0,1.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,100.0,217.0,1.0,10.0,1.0,14.0,6.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,12.0,1.0,1.0,1.0,1.0
50%,100.0,298.5,1.0,12.0,2.0,17.0,6.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,15.0,1.0,1.0,1.0,1.0
75%,100.0,480.25,1.0,15.0,2.0,21.0,6.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,17.0,1.0,1.0,1.0,1.0
max,100.0,99822.0,1.0,29.0,2.0,83.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,20.0,1.0,1.0,1.0,1.0


#### Gender

In [136]:
anovaGender = smf.ols(formula='Gender ~ C(Group)', data=dfComplete).fit()
print(anovaGender.summary())

                            OLS Regression Results                            
Dep. Variable:                 Gender   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                 -0.003
Method:                 Least Squares   F-statistic:                    0.5402
Date:                Mon, 29 Mar 2021   Prob (F-statistic):              0.655
Time:                        21:59:02   Log-Likelihood:                -358.18
No. Observations:                 525   AIC:                             724.4
Df Residuals:                     521   BIC:                             741.4
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept             

#### Age

In [137]:
anovaAge = smf.ols(formula='Age ~ C(Group)', data=dfComplete).fit()
print(anovaAge.summary())

                            OLS Regression Results                            
Dep. Variable:                    Age   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                 -0.004
Method:                 Least Squares   F-statistic:                    0.3554
Date:                Mon, 29 Mar 2021   Prob (F-statistic):              0.785
Time:                        21:59:03   Log-Likelihood:                -1957.1
No. Observations:                 504   AIC:                             3922.
Df Residuals:                     500   BIC:                             3939.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept             

#### Estimation

In [138]:
anovaEstimation = smf.ols(formula='Estimation ~ C(Group)', data=dfComplete).fit()
print(anovaEstimation.summary())

                            OLS Regression Results                            
Dep. Variable:             Estimation   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                 -0.005
Method:                 Least Squares   F-statistic:                    0.2505
Date:                Mon, 29 Mar 2021   Prob (F-statistic):              0.861
Time:                        21:59:04   Log-Likelihood:                -1454.6
No. Observations:                 494   AIC:                             2917.
Df Residuals:                     490   BIC:                             2934.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept             

#### Duration

In [139]:
anovaDuration = smf.ols(formula='Duration ~ C(Group)', data=dfComplete).fit()
print(anovaDuration.summary())

                            OLS Regression Results                            
Dep. Variable:               Duration   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.9797
Date:                Mon, 29 Mar 2021   Prob (F-statistic):              0.402
Time:                        21:59:07   Log-Likelihood:                -6373.8
No. Observations:                 598   AIC:                         1.276e+04
Df Residuals:                     594   BIC:                         1.277e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept             

## Final Analysis

In this section, we'll first show descriptive statistics again for each group. It's not necessary, but nice to have. Generally speaking, it's better to put the same information in multiple places than it is to dig for it or worse, have others dig for it.

Then, we'll predict the average treatment effect (ATE) using simple ordinary least squares (OLS) linear regression models. We'll do both bivariate OLS regression (just the scores and the groups) along with a simple multivariate regression. (While none of the covariates were significant, I added this here for demonstration purposes.) As with above, we'll conduct hypothesis testing using ANOVA.

### Descriptive Statistics of the Total Scores per Group

In [140]:
dfControl['Score'].describe()

count   154.00
mean     13.64
std       4.45
min       0.00
25%      12.00
50%      14.50
75%      17.00
max      20.00
Name: Score, dtype: float64

In [141]:
dfDocument['Score'].describe()

count   152.00
mean     14.12
std       3.63
min       0.00
25%      13.00
50%      14.50
75%      16.25
max      19.00
Name: Score, dtype: float64

In [142]:
dfVideo['Score'].describe()

count   153.00
mean     13.81
std       5.28
min       0.00
25%      12.00
50%      16.00
75%      17.00
max      20.00
Name: Score, dtype: float64

In [143]:
dfVidDoc['Score'].describe()

count   139.00
mean     13.59
std       4.45
min       0.00
25%      12.00
50%      15.00
75%      17.00
max      20.00
Name: Score, dtype: float64

### Regression and Hypothesis Testing

In [144]:
# Bivariate
bvOLS = smf.ols(formula='Score ~ C(Group)', data=dfComplete).fit()
print(bvOLS.summary())

                            OLS Regression Results                            
Dep. Variable:                  Score   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                 -0.003
Method:                 Least Squares   F-statistic:                    0.4344
Date:                Mon, 29 Mar 2021   Prob (F-statistic):              0.728
Time:                        22:01:58   Log-Likelihood:                -1745.0
No. Observations:                 598   AIC:                             3498.
Df Residuals:                     594   BIC:                             3516.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept             

In [147]:
# Multivariate
mvOLS = smf.ols(formula='Score ~ C(Group)+C(Gender)', data=dfComplete).fit()
print(mvOLS.summary())

                            OLS Regression Results                            
Dep. Variable:                  Score   R-squared:                       0.011
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     1.491
Date:                Mon, 29 Mar 2021   Prob (F-statistic):              0.204
Time:                        22:09:02   Log-Likelihood:                -1422.8
No. Observations:                 525   AIC:                             2856.
Df Residuals:                     520   BIC:                             2877.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept             

### Save to .csv

In [148]:
dfAttrition.to_csv('dfAttrition.csv', index=False, date_format="%Y-%m-%d") 

In [149]:
dfComplete.to_csv('dfComplete.csv', index=False, date_format="%Y-%m-%d") 