## Imports

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
#!pip install ipynb
from ipynb.fs.full.download_data import get_hash, get_values, get_GP
np.random.seed(42)

## Get data of the general population

In [3]:
# General population: request 1,000,000 rows from the helper notebook's get_GP
# (get_GP is defined in download_data; presumably the argument is the row count).
df = get_GP(1_000_000)
df.head()

shape(1000000, 5)
Index(['user_id', 'country', 'device', 'partner_id', 'view'], dtype='object')


Unnamed: 0,user_id,country,device,partner_id,view
0,cfcd208495d565ef66e7dff9f98764da,US,android,Meta,1
1,c4ca4238a0b923820dcc509a6f75849b,DE,android,GA,0
2,c81e728d9d4c2f636f067f89cc14862c,DE,android,Vungle,0
3,eccbc87e4b5ce2fe28308fd9f2a7baf3,US,android,Vungle,0
4,a87ff679a2f3e71d9181a67b7542122c,US,android,Vungle,0


## Function for calculating stratum weights in the general population (stratification)

In [8]:
def get_weight(data, strat_columns, ndigits=2):
    '''Return the share of rows that falls into each stratum of ``data``.

    A stratum is one distinct combination of values in ``strat_columns``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``strat_columns``.
    strat_columns : list of str or str
        Column name(s) that define the strata.
    ndigits : int or None, default 2
        Number of decimals the share is rounded to. ``None`` keeps the
        unrounded share. Note that rounded shares need not sum to 1.

    Returns
    -------
    dict
        Maps ``(col_1, value_1, col_2, value_2, ...)`` to the stratum share.
        Entries are inserted in ascending order of the stratum values
        (sorted by ``strat_columns``). An empty frame yields ``{}``.
    '''
    if isinstance(strat_columns, str):
        strat_columns = [strat_columns]
    strat_columns = list(strat_columns)

    total = len(data)
    weights = {}

    # Distinct strata, sorted by the stratification columns themselves
    # (not by a hard-coded column list) so the order is deterministic.
    strata = data[strat_columns].drop_duplicates().sort_values(by=strat_columns)

    for values in strata.itertuples(index=False, name=None):
        # Row-wise boolean mask of the stratum. Working on positions instead of
        # index labels keeps the count correct when labels are duplicated.
        mask = np.ones(total, dtype=bool)
        for column, value in zip(strat_columns, values):
            mask &= (data[column] == value).to_numpy(dtype=bool, na_value=False)

        share = int(mask.sum()) / total

        # Flat key (col, value, col, value, ...) for easy reading of the result
        key = tuple(item for pair in zip(strat_columns, values) for item in pair)
        weights[key] = share if ndigits is None else round(share, ndigits)

    return weights

## Run the function get_weight

In [9]:
# Stratum weights in the general population
weights_GP = get_weight(df, ['device', 'country'])
weights_GP

{('device', 'android', 'country', 'DE'): 0.21,
 ('device', 'android', 'country', 'US'): 0.42,
 ('device', 'android', 'country', 'ZW'): 0.07,
 ('device', 'ios', 'country', 'DE'): 0.09,
 ('device', 'ios', 'country', 'US'): 0.18,
 ('device', 'ios', 'country', 'ZW'): 0.03}

## Generate the research sample data

In [11]:
# Build the research sample. get_hash / get_values come from the download_data
# notebook; presumably get_values draws `size` items from `lst` with
# probabilities `p` -- confirm against that notebook.
size = 10000
user_ids = [get_hash(idx) for idx in range(size)]
countries = get_values(lst=['US', 'DE', 'ZW'], p=[0.4, 0.2, 0.4], size=size)
devices = get_values(lst=['android', 'ios'], p=[0.5, 0.5], size=size)
partners_ids = get_values(lst=['Meta', 'GA', 'Vungle'], p=[0.4, 0.3, 0.3], size=size)
views = get_values(lst=[1, 0], p=[0.1, 0.9], size=size)

# Assemble one row per user
df_sample = pd.DataFrame({
    'user_id': user_ids,
    'country': countries,
    'device': devices,
    'partner_id': partners_ids,
    'view': views,
})

print(f'shape{df_sample.shape}')
print(df_sample.columns)

shape(10000, 5)
Index(['user_id', 'country', 'device', 'partner_id', 'view'], dtype='object')


In [13]:
# Stratum weights in the research sample
weights_sample = get_weight(data=df_sample, strat_columns=['device', 'country'])
weights_sample

{('device', 'android', 'country', 'DE'): 0.1,
 ('device', 'android', 'country', 'US'): 0.21,
 ('device', 'android', 'country', 'ZW'): 0.19,
 ('device', 'ios', 'country', 'DE'): 0.1,
 ('device', 'ios', 'country', 'US'): 0.2,
 ('device', 'ios', 'country', 'ZW'): 0.2}

## Function for stratified sampling

In [15]:
def get_stats_sample(data, strat_columns, weights_values, verbose=True):
    '''Resample ``data`` so that each stratum gets a prescribed share of rows.

    For every distinct combination of values in ``strat_columns`` (strata are
    visited in ascending order of those columns), ``floor(weight * len(data))``
    rows are drawn from the stratum with replacement via ``np.random.choice``
    (global NumPy random state, so ``np.random.seed`` controls the draw).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to resample; must contain every column in ``strat_columns``.
    strat_columns : list of str or str
        Column name(s) that define the strata.
    weights_values : sequence of float or dict
        Either one weight per stratum, given in the sorted stratum order, or a
        dict keyed by ``(col_1, value_1, col_2, value_2, ...)`` tuples (the key
        layout produced by ``get_weight``), which is matched by key.
    verbose : bool, default True
        Print the weight and the resulting size of every stratum.

    Returns
    -------
    pandas.DataFrame
        The drawn rows, stratum after stratum, after ``reset_index()`` (for an
        unnamed index the original labels end up in an ``index`` column).

    Raises
    ------
    ValueError
        If a weight sequence does not have exactly one entry per stratum.
    KeyError
        If a weight dict has no entry for a stratum present in ``data``.
    '''
    if isinstance(strat_columns, str):
        strat_columns = [strat_columns]
    strat_columns = list(strat_columns)

    # Distinct strata sorted by the stratification columns themselves
    strata = list(
        data[strat_columns]
        .drop_duplicates()
        .sort_values(by=strat_columns)
        .itertuples(index=False, name=None)
    )

    weights_by_key = isinstance(weights_values, dict)
    if not weights_by_key:
        weights_values = list(weights_values)
        # Positional weights only make sense with one weight per stratum;
        # otherwise they would silently be applied to the wrong strata.
        if len(weights_values) != len(strata):
            raise ValueError(
                f'expected {len(strata)} weights (one per stratum), '
                f'got {len(weights_values)}'
            )

    total = len(data)
    parts = []

    for position, values in enumerate(strata):
        key = tuple(item for pair in zip(strat_columns, values) for item in pair)
        if weights_by_key:
            if key not in weights_values:
                raise KeyError(f'no weight supplied for stratum {key}')
            weight = weights_values[key]
        else:
            weight = weights_values[position]

        # Positions (not labels) of the stratum rows, in data order; this stays
        # correct when index labels are duplicated.
        mask = np.ones(total, dtype=bool)
        for column, value in zip(strat_columns, values):
            mask &= (data[column] == value).to_numpy(dtype=bool, na_value=False)
        row_positions = np.flatnonzero(mask)

        # Target size of the stratum. NOTE: this floors a float product, so a
        # value such as 0.29 * 100 ends up as 28.
        size = int(np.floor(weight * total))
        if verbose:
            print(f'weight: {weight}, size: {size}')

        # Draw with replacement so a stratum can be up-sampled beyond its size
        chosen = np.random.choice(row_positions, size=size, replace=True)
        parts.append(data.iloc[chosen])

    if not parts:
        return data.iloc[0:0].reset_index()

    # Single concat instead of growing a frame inside the loop
    return pd.concat(parts).reset_index()
        

# Resample the research sample so its strata follow the general-population weights
df_stratified = get_stats_sample(
    data=df_sample,
    strat_columns=['device', 'country'],
    weights_values=list(weights_GP.values()),
)
df_stratified.head()

weight: 0.21, size: 2100
weight: 0.42, size: 4200
weight: 0.07, size: 700
weight: 0.09, size: 900
weight: 0.18, size: 1800
weight: 0.03, size: 300


Unnamed: 0,index,user_id,country,device,partner_id,view
0,4044,df5354693177e83e8ba089e94b7b6b55,DE,android,Meta,0
1,6150,598a90004bace6540f0e2230bdc47c09,DE,android,GA,0
2,4389,0d27688c61c5a172e8e45956cd70cba2,DE,android,Meta,1
3,1648,7437d136770f5b35194cb46c1653efaa,DE,android,Meta,0
4,9982,1b932eaf9f7c0cb84f471a560097ddb8,DE,android,Meta,0


In [16]:
# Stratum weights in the research sample after stratification
weights_stratified = get_weight(data=df_stratified, strat_columns=['device', 'country'])
weights_stratified

{('device', 'android', 'country', 'DE'): 0.21,
 ('device', 'android', 'country', 'US'): 0.42,
 ('device', 'android', 'country', 'ZW'): 0.07,
 ('device', 'ios', 'country', 'DE'): 0.09,
 ('device', 'ios', 'country', 'US'): 0.18,
 ('device', 'ios', 'country', 'ZW'): 0.03}

In [17]:
# Stratum weights in the general population (recomputed for side-by-side comparison)
get_weight(data=df, strat_columns=['device', 'country'])

{('device', 'android', 'country', 'DE'): 0.21,
 ('device', 'android', 'country', 'US'): 0.42,
 ('device', 'android', 'country', 'ZW'): 0.07,
 ('device', 'ios', 'country', 'DE'): 0.09,
 ('device', 'ios', 'country', 'US'): 0.18,
 ('device', 'ios', 'country', 'ZW'): 0.03}

In [18]:
# Stratum weights in the research sample before stratification (shown again for comparison)
weights_sample

{('device', 'android', 'country', 'DE'): 0.1,
 ('device', 'android', 'country', 'US'): 0.21,
 ('device', 'android', 'country', 'ZW'): 0.19,
 ('device', 'ios', 'country', 'DE'): 0.1,
 ('device', 'ios', 'country', 'US'): 0.2,
 ('device', 'ios', 'country', 'ZW'): 0.2}