In [92]:
import pandas as pd
from statsmodels.stats.multitest import multipletests
from scipy.stats import ttest_1samp


In [124]:
# Read the exported report and discard every row containing a missing value.
df = pd.read_csv("report - 2023-12-28T170047.456.csv").dropna()
# Last expression of the cell: render the dtype of each remaining column.
df.dtypes

Site Domain     object
Imps           float64
Clicks         float64
CTR %          float64
dtype: object

In [125]:
import pandas as pd
import numpy as np
from scipy.stats import norm

def boot_site(data, threshold=100, n_bootstrap=10, confidence_level=0.90, random_state=None):
    """Flag sites whose CTR differs from the pooled CTR using a parametric bootstrap.

    For every site with at least ``threshold`` impressions, ``n_bootstrap``
    click counts are drawn from ``Binomial(Imps, Clicks / Imps)`` and turned
    into simulated CTRs. A site is flagged when the pooled CTR
    (``sum(Clicks) / sum(Imps)`` over the retained rows) lies outside the
    central ``confidence_level`` percentile interval of those simulated CTRs.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'Site Domain'``, ``'Imps'``, ``'Clicks'``
        and ``'CTR %'``.
    threshold : number, default 100
        Minimum impressions a site needs to be considered.
    n_bootstrap : int, default 10
        Number of simulated click counts per site. Must be >= 1.
    confidence_level : float, default 0.90
        Central mass of the percentile interval; 0.90 means the 5th and 95th
        percentiles. Must be strictly between 0 and 1.
    random_state : None, int, numpy.random.Generator or numpy.random.RandomState
        Source of randomness. ``None`` uses NumPy's global random state;
        an int seeds a fresh ``numpy.random.default_rng``.

    Returns
    -------
    (high_ctrs, low_ctrs) : tuple of pandas.DataFrame
        Flagged sites above / not above the pooled CTR, in input order, with
        columns ``'Site Domain'``, ``'Clicks'``, ``'Imps'``, ``'CTR'``
        (``'CTR'`` is the input ``'CTR %'`` value, copied unchanged).

    Raises
    ------
    ValueError
        If ``n_bootstrap`` or ``confidence_level`` is out of range.

    Notes
    -----
    A site with zero clicks is resampled with probability 0, so every draw is
    0 and its interval collapses to [0, 0]; such a site is therefore always
    flagged as low whenever the pooled CTR is positive, however few
    impressions it has.
    """
    if n_bootstrap < 1:
        raise ValueError("n_bootstrap must be at least 1")
    if not 0 < confidence_level < 1:
        raise ValueError("confidence_level must be strictly between 0 and 1")

    columns = ["Site Domain", "Clicks", "Imps", "CTR"]

    # Filter impressions based on threshold
    data_boot = data[data['Imps'] >= threshold].copy()
    # A site without impressions has no rate to resample from (division by zero).
    data_boot = data_boot[data_boot['Imps'] > 0]
    if data_boot.empty:
        return (pd.DataFrame({name: [] for name in columns}),
                pd.DataFrame({name: [] for name in columns}))

    # Pick the random source; None keeps using the global NumPy state so that
    # an earlier np.random.seed(...) in the notebook still takes effect.
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, (np.random.Generator, np.random.RandomState)):
        rng = random_state
    else:
        rng = np.random.default_rng(random_state)

    # Calculate the overall average CTR
    average_ctr = data_boot['Clicks'].sum() / data_boot['Imps'].sum()

    imps = data_boot['Imps'].to_numpy(dtype=float)
    clicks = data_boot['Clicks'].to_numpy(dtype=float)
    # Rate derived from the counts, so the interval, the average and the
    # high/low decision all use the same units regardless of how 'CTR %'
    # is scaled in the input.
    site_rate = clicks / imps

    # One row of simulated click counts per site: shape (n_sites, n_bootstrap).
    n_trials = imps.astype(np.int64)[:, None]
    draws = rng.binomial(n_trials, site_rate[:, None], size=(len(imps), n_bootstrap))
    boot_rates = draws / imps[:, None]

    # Central percentile interval; rounding keeps the default at exactly 5 / 95.
    lower_pct = round(50.0 * (1.0 - confidence_level), 10)
    ci_lower = np.percentile(boot_rates, lower_pct, axis=1)
    ci_upper = np.percentile(boot_rates, 100.0 - lower_pct, axis=1)

    # Check if average CTR falls outside this interval and categorize
    flagged = (average_ctr < ci_lower) | (average_ctr > ci_upper)
    above = site_rate > average_ctr

    result = pd.DataFrame({
        "Site Domain": data_boot['Site Domain'].to_numpy(),
        "Clicks": data_boot['Clicks'].to_numpy(),
        "Imps": data_boot['Imps'].to_numpy(),
        "CTR": data_boot['CTR %'].to_numpy(),
    })

    high_ctrs = result[flagged & above].reset_index(drop=True)
    low_ctrs = result[flagged & ~above].reset_index(drop=True)

    return high_ctrs, low_ctrs



In [126]:
import pandas as pd
import numpy as np
from scipy.stats import norm

def bi_site(data, threshold=100, confidence_level=0.95):
    """Flag sites whose CTR differs from the pooled CTR using a normal-approximation interval.

    For every site with at least ``threshold`` impressions the interval
    ``p +/- z * sqrt(p * (1 - p) / Imps)`` is computed with
    ``p = Clicks / Imps`` and ``z = norm.ppf((1 + confidence_level) / 2)``.
    A site is flagged when the pooled CTR (``sum(Clicks) / sum(Imps)`` over
    the retained rows) lies outside that interval.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'Site Domain'``, ``'Imps'``, ``'Clicks'``
        and ``'CTR %'``.
    threshold : number, default 100
        Minimum impressions a site needs to be considered.
    confidence_level : float, default 0.95
        Two-sided confidence level used to derive the z-score.

    Returns
    -------
    (high_ctrs, low_ctrs) : tuple of pandas.DataFrame
        Flagged sites above / not above the pooled CTR, in input order, with
        columns ``'Site Domain'``, ``'Clicks'``, ``'Imps'``, ``'CTR'``
        (``'CTR'`` is the input ``'CTR %'`` value, copied unchanged).

    Notes
    -----
    When a site's rate is exactly 0 or 1 the standard-error term is 0 and the
    interval collapses to a single point, so such a site is flagged whenever
    the pooled CTR differs from that point, regardless of its impressions.
    """
    columns = ["Site Domain", "Clicks", "Imps", "CTR"]

    # Filter data for impression threshold
    data_bi = data[data['Imps'] >= threshold].copy()
    # Rows without impressions would divide by zero below.
    data_bi = data_bi[data_bi['Imps'] > 0]
    if data_bi.empty:
        return (pd.DataFrame({name: [] for name in columns}),
                pd.DataFrame({name: [] for name in columns}))

    # Calculate the overall average CTR
    average_ctr = data_bi['Clicks'].sum() / data_bi['Imps'].sum()

    # Confidence level and corresponding z-score
    z_score = norm.ppf((1 + confidence_level) / 2)

    imps = data_bi['Imps'].to_numpy(dtype=float)
    clicks = data_bi['Clicks'].to_numpy(dtype=float)
    # Derive the rate from the counts instead of trusting the scaling of the
    # 'CTR %' column, so it is in the same units as average_ctr.
    site_rate = clicks / imps

    # Calculate confidence interval for each site's CTR (half-width computed once)
    half_width = z_score * np.sqrt(site_rate * (1 - site_rate) / imps)
    ci_lower = site_rate - half_width
    ci_upper = site_rate + half_width

    # Check if average CTR falls outside this interval and categorize
    flagged = (average_ctr < ci_lower) | (average_ctr > ci_upper)
    above = site_rate > average_ctr

    result = pd.DataFrame({
        "Site Domain": data_bi['Site Domain'].to_numpy(),
        "Clicks": data_bi['Clicks'].to_numpy(),
        "Imps": data_bi['Imps'].to_numpy(),
        "CTR": data_bi['CTR %'].to_numpy(),
    })

    high_ctrs = result[flagged & above].reset_index(drop=True)
    low_ctrs = result[flagged & ~above].reset_index(drop=True)

    return high_ctrs, low_ctrs


In [127]:
import pandas as pd
import numpy as np
from scipy.stats import beta

def bay_site(data, alpha_prior=1, beta_prior=20, threshold=100, prob_threshold=0.50):
    """Classify sites by the posterior probability that their CTR exceeds the pooled CTR.

    Each site with at least ``threshold`` impressions gets a
    ``Beta(alpha_prior + Clicks, beta_prior + Imps - Clicks)`` posterior.
    ``Probability`` is the posterior mass above the pooled CTR
    (``sum(Clicks) / sum(Imps)`` over the retained rows).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'Site Domain'``, ``'Imps'``, ``'Clicks'``
        and ``'CTR %'``.
    alpha_prior, beta_prior : number, default 1 and 20
        Parameters of the Beta prior shared by all sites.
    threshold : number, default 100
        Minimum impressions a site needs to be considered.
    prob_threshold : float, default 0.50
        A site is "high" when its probability is strictly greater than this
        value, otherwise "low".

    Returns
    -------
    (high_ctrs, low_ctrs) : tuple of pandas.DataFrame
        Both in input order with columns ``'Site Domain'``, ``'Imps'``,
        ``'Clicks'``, ``'CTR'`` (the input ``'CTR %'`` value, unchanged) and
        ``'Probability'``. Both frames carry a fresh 0..n-1 index.

    Notes
    -----
    Sites with zero clicks whose probability exceeds ``prob_threshold`` are
    removed from ``high_ctrs`` and are NOT added to ``low_ctrs``; they appear
    in neither frame.
    """
    columns = ["Site Domain", "Imps", "Clicks", "CTR", "Probability"]

    # Filter data for impression threshold
    data_bay = data[data['Imps'] >= threshold].copy()
    if data_bay.empty:
        return (pd.DataFrame({name: [] for name in columns}),
                pd.DataFrame({name: [] for name in columns}))

    # Calculate the overall average CTR
    average_ctr = data_bay['Clicks'].sum() / data_bay['Imps'].sum()

    imps = data_bay['Imps'].to_numpy(dtype=float)
    clicks = data_bay['Clicks'].to_numpy(dtype=float)

    # Update Beta distribution with observed data
    alpha_posterior = alpha_prior + clicks
    beta_posterior = beta_prior + (imps - clicks)

    # Probability that the site's CTR is greater than the average CTR.
    # sf is the survival function (1 - cdf) evaluated directly, which keeps
    # precision when the probability is very close to 0.
    prob = beta.sf(average_ctr, alpha_posterior, beta_posterior)

    result = pd.DataFrame({
        "Site Domain": data_bay['Site Domain'].to_numpy(),
        "Imps": data_bay['Imps'].to_numpy(),
        "Clicks": data_bay['Clicks'].to_numpy(),
        "CTR": data_bay['CTR %'].to_numpy(),
        "Probability": prob,
    })

    # Categorize based on probability (NaN probabilities fall into "low",
    # because the comparison below is False for NaN).
    is_high = prob > prob_threshold

    # Zero-click sites are excluded from the high group; see Notes.
    high_ctrs = result[is_high & (clicks != 0)].reset_index(drop=True)
    low_ctrs = result[~is_high].reset_index(drop=True)

    return high_ctrs, low_ctrs



In [128]:
# Bootstrap classification with the function defaults (threshold=100, n_bootstrap=10).
# NOTE(review): no random seed is set in the visible cells, so the flagged sites may differ between runs.
high, low = boot_site(df)

In [129]:
# Sites flagged above the pooled CTR by the boot_site call in the preceding cell.
high

Unnamed: 0,Site Domain,Clicks,Imps,CTR
0,dailymail.co.uk,14.0,39755.0,0.000352
1,plainchicken.com,3.0,7922.0,0.000379
2,natashaskitchen.com,4.0,6861.0,0.000583
3,huffpost.com,3.0,6544.0,0.000458
4,splashtravels.com,5.0,5396.0,0.000927
5,downshiftology.com,4.0,5025.0,0.000796
6,julieseatsandtreats.com,2.0,3006.0,0.000665
7,rent.com,2.0,1373.0,0.001457
8,sudoku.com,3.0,1034.0,0.002901
9,plagiarismdetector.net,2.0,1017.0,0.001967


In [130]:
# Sites flagged as not above the pooled CTR by the boot_site call.
low

Unnamed: 0,Site Domain,Clicks,Imps,CTR
0,msn.com,4.0,54163.0,0.000074
1,foxnews.com,0.0,8909.0,0.000000
2,finance.yahoo.com,0.0,8842.0,0.000000
3,buzzfeed.com,0.0,8181.0,0.000000
4,reference.com,0.0,7259.0,0.000000
...,...,...,...,...
785,housebeautiful.com,0.0,100.0,0.000000
786,onelittleproject.com,0.0,100.0,0.000000
787,oprahdaily.com,0.0,100.0,0.000000
788,savorythoughts.com,0.0,100.0,0.000000


In [137]:
# Normal-approximation classification at a 90% confidence level (threshold left at its default of 100).
# Rebinds `high` / `low`, replacing the frames from the earlier boot_site call.
high, low = bi_site(df, confidence_level=0.90)

In [138]:
# Sites flagged above the pooled CTR by the bi_site call in the preceding cell.
high

Unnamed: 0,Site Domain,Clicks,Imps,CTR
0,dailymail.co.uk,14.0,39755.0,0.000352
1,splashtravels.com,5.0,5396.0,0.000927
2,traderie.com,3.0,590.0,0.005085
3,weqyoua.net,3.0,138.0,0.021739


In [139]:
# Sites flagged as not above the pooled CTR by the bi_site call.
low

Unnamed: 0,Site Domain,Clicks,Imps,CTR
0,msn.com,4.0,54163.0,0.000074
1,foxnews.com,0.0,8909.0,0.000000
2,finance.yahoo.com,0.0,8842.0,0.000000
3,buzzfeed.com,0.0,8181.0,0.000000
4,reference.com,0.0,7259.0,0.000000
...,...,...,...,...
784,housebeautiful.com,0.0,100.0,0.000000
785,onelittleproject.com,0.0,100.0,0.000000
786,oprahdaily.com,0.0,100.0,0.000000
787,savorythoughts.com,0.0,100.0,0.000000


In [141]:
# Beta-posterior classification with the function defaults (alpha_prior=1, beta_prior=20, threshold=100).
# Rebinds `high` / `low` again, replacing the frames from the bi_site call.
high, low = bay_site(df)

In [142]:
# Sites with posterior probability > 0.50 of exceeding the pooled CTR (zero-click sites are removed by bay_site).
high

Unnamed: 0,Site Domain,Imps,Clicks,CTR,Probability
0,worldstar.com,44530.0,10.0,0.000225,0.760180
1,dailymail.co.uk,39755.0,14.0,0.000352,0.988384
2,nypost.com,18123.0,3.0,0.000166,0.542375
3,joyfoodsunshine.com,7978.0,4.0,0.000501,0.979930
4,plainchicken.com,7922.0,3.0,0.000379,0.931782
...,...,...,...,...,...
546,geoguessr.com,174.0,1.0,0.005747,0.999330
574,momsecrets.co,164.0,1.0,0.006098,0.999397
621,tigerboard.com,150.0,1.0,0.006667,0.999485
661,muppet.fandom.com,139.0,1.0,0.007194,0.999549


In [143]:
# Sites whose posterior probability of exceeding the pooled CTR is at most 0.50.
low

Unnamed: 0,Site Domain,Imps,Clicks,CTR,Probability
0,msn.com,54163.0,4.0,7.4e-05,0.022926
1,spendwithpennies.com,10351.0,1.0,9.7e-05,0.409924
2,foxnews.com,8909.0,0.0,0.0,0.180942
3,finance.yahoo.com,8842.0,0.0,0.0,0.183278
4,buzzfeed.com,8181.0,0.0,0.0,0.208005
5,reference.com,7259.0,0.0,0.0,0.248164
6,slickdeals.net,5245.0,0.0,0.0,0.364927
7,modernmic.com,5242.0,0.0,0.0,0.365137
8,lilluna.com,5205.0,0.0,0.0,0.367733
9,britannica.com,5194.0,0.0,0.0,0.368508
