# Generate the website dataset


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import sys

In [3]:
# !{sys.executable} -m pip install Faker

In [4]:
# tools for generating random data
import datetime
from faker import Faker
import random
from scipy.stats.distributions import bernoulli

faker = Faker()


In [33]:
Faker.seed(19)

N = 2000  # number of visitors total
rvA = bernoulli(p=0.04)  # A conversion rate
rvB = bernoulli(p=0.06)  # B conversion rate

visits_list = []

tstart = datetime.datetime.now()

for i in range(N):
    dt = datetime.timedelta(seconds=random.randint(1, 30))
    time = (tstart + dt).strftime('%d/%b/%Y:%H:%M:%S')
    version = random.choice(["A", "B"])
    ip = faker.ipv4()
    if version == "A":
        bought = rvA.rvs(1)[0]
    elif version == "B":
        bought = rvB.rvs(1)[0]
    visit = (ip, version, bought)
    visits_list.append(visit)

visitors = pd.DataFrame(visits_list, columns=["IP address", "version", "bought"])
visitors.head()

Unnamed: 0,IP address,version,bought
0,135.185.92.4,B,0
1,14.75.235.1,A,0
2,50.132.244.139,B,0
3,144.181.130.234,A,0
4,90.92.5.100,B,1


In [35]:
# visitors.to_csv("../datasets/visitors.csv", index=False)

In [36]:
visitors.groupby("version")["bought"].value_counts()

version  bought
A        0         967
         1          47
B        0         930
         1          56
Name: bought, dtype: int64

In [37]:
visitors.groupby("version")["bought"].value_counts().unstack()

bought,0,1
version,Unnamed: 1_level_1,Unnamed: 2_level_1
A,967,47
B,930,56


In [38]:
ct = pd.crosstab(index=visitors["version"],
                 columns=visitors["bought"],
                 margins=True, margins_name="TOTAL")
ct

bought,0,1,TOTAL
version,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,967,47,1014
B,930,56,986
TOTAL,1897,103,2000


In [41]:
# print(ct.style.to_latex())

In [42]:
pA = ct.loc["A",1] / ct.loc["A","TOTAL"]
pB = ct.loc["B",1] / ct.loc["B","TOTAL"]
pA, pB

(0.046351084812623275, 0.056795131845841784)

In [44]:
# print(ct.style.to_latex())

In [12]:
pA*(1-pA)

0.021246013083614883

In [13]:
ax = sns.barplot(x="version", y="bought", data=visits, capsize=0.3)
ax.set_ylabel("proportion bought")

NameError: name 'visits' is not defined

In [None]:
filename = "figures/compare_visitors.pdf"
ax.figure.tight_layout()
ax.figure.savefig(filename, dpi=300, bbox_inches="tight", pad_inches=0)

# Full script

TODO: generate fake web server log using https://github.com/kiritbasu/Fake-Apache-Log-Generator/blob/master/apache-fake-log-gen.py

In [None]:
faker.user_agent()