# Mix effect impact function iteration

2024-07-30

Different examples of Python functions to compute the mix effect impact. As a follow-up step, we will take one of them and create a Python package.

In [1]:
import numpy as np
import pandas as pd

In [2]:
import sys
import os

# Build the absolute path of the package sources: ../src/mixeffects relative to the current working directory
src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src/mixeffects'))

# Put that directory at the front of the module search path, only once, so re-running the cell adds no duplicates
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# Now the mixeffects module can be imported
import mixeffects

In [3]:
def mixcontribution(components, w1, x1, w2, x2, name1="reference", name2="value"):
    """
    Split the gap between two weighted kpis into a value part and a weight part.

    kpi1 is np.dot(w1, x1) and kpi2 is np.dot(w2, x2); the gap is kpi2 - kpi1.

    Args:
        components: names of the components (accepted but not used in the computation)
        w1: weights of kpi1, one per component
        x1: values of kpi1, one per component
        w2: weights of kpi2, one per component
        x2: values of kpi2, one per component
        name1: label for kpi1 (accepted but not used in the computation)
        name2: label for kpi2 (accepted but not used in the computation)

    Returns:
        dictionary with keys m1, m2, deltam, deltam_rel, w2x1, delta_values,
        delta_weights, p_values and p_weights.

    Note:
        w1, x1, w2 and x2 are expected to have equal length; this is not
        validated here. The relative figures divide by m1 and by the gap.
    """
    kpi_reference = np.dot(w1, x1)
    kpi_value = np.dot(w2, x2)
    # Reference values weighted with the weights of the compared kpi
    kpi_reference_reweighted = np.dot(w2, x1)

    gap = kpi_value - kpi_reference
    gap_relative = kpi_value / kpi_reference - 1

    # Same weights (w2), different values -> part of the gap due to values
    gap_from_values = kpi_value - kpi_reference_reweighted
    # Same values (x1), different weights -> part of the gap due to weights
    gap_from_weights = kpi_reference_reweighted - kpi_reference

    return dict(
        m1=kpi_reference,
        m2=kpi_value,
        deltam=gap,
        deltam_rel=gap_relative,
        w2x1=kpi_reference_reweighted,
        delta_values=gap_from_values,
        delta_weights=gap_from_weights,
        p_values=gap_from_values / gap,
        p_weights=gap_from_weights / gap,
    )

## UC Berkeley gender bias

I apply the mix effect formula to the example of Simpson's paradox in [Wikipedia](https://en.wikipedia.org/wiki/Simpson%27s_paradox). We compare men's and women's graduate school admissions to UC Berkeley (men: 44%, women: 35%). It looks like acceptance is greater for men than for women. Nevertheless, we can see that the weights of applications in each department are not the same for men and women. To compare men's and women's acceptance, we have to control for the proportion of applicants across departments. Let us do it. (Note: the data below cover only departments A to F, for which the overall rates are about 45% for men and 30% for women.)

### Data

In [4]:
# One row per (gender, department): first the six rows for men, then the six rows for women
data_department = pd.DataFrame({
    'gender': ['m'] * 6 + ['w'] * 6,
    'department': ['A', 'B', 'C', 'D', 'E', 'F'] * 2,
    'applicants': pd.Series([825, 560, 325, 417, 191, 373, 108, 25, 593, 375, 393, 341]),
    'admissions': pd.Series([0.62, 0.63, 0.37, 0.33, 0.28, 0.06, 0.82, 0.68, 0.34, 0.35, 0.24, 0.07]),
})

# Derived columns: admitted head count and the admission rate expressed in percent
data_department['admitted'] = data_department['applicants'].mul(data_department['admissions'])
data_department['admissions_percentage'] = data_department['admissions'].mul(100)

# Share of each department in the applicants of its own gender (sums to 1 within a gender)
data_department['weight_by_gender_department'] = data_department['applicants'].div(
    data_department.groupby('gender')['applicants'].transform('sum')
)

# Per-gender views used as inputs of the mix effect functions
data_men = data_department.loc[data_department['gender'].eq('m')]
data_women = data_department.loc[data_department['gender'].eq('w')]

In [5]:
data_department


Unnamed: 0,gender,department,applicants,admissions,admitted,admissions_percentage,weight_by_gender_department
0,m,A,825,0.62,511.5,62.0,0.306577
1,m,B,560,0.63,352.8,63.0,0.208101
2,m,C,325,0.37,120.25,37.0,0.120773
3,m,D,417,0.33,137.61,33.0,0.154961
4,m,E,191,0.28,53.48,28.0,0.070977
5,m,F,373,0.06,22.38,6.0,0.13861
6,w,A,108,0.82,88.56,82.0,0.058856
7,w,B,25,0.68,17.0,68.0,0.013624
8,w,C,593,0.34,201.62,34.0,0.323161
9,w,D,375,0.35,131.25,35.0,0.20436


In [6]:
# Aggregate per gender. Only 'applicants' and 'admitted' are additive.
# NOTE(review): the blanket .sum() also aggregates 'department' (string concatenation, 'ABCDEF')
# and the per-department rate columns, whose sums are not meaningful; consider selecting columns.
data_gender = data_department.groupby('gender').sum()
# Replace the summed per-department rates by the overall rate: admitted / applicants, rounded to 2 decimals
data_gender['admissions'] = (data_gender['admitted'] / data_gender['applicants']).round(2)
data_gender

Unnamed: 0_level_0,department,applicants,admissions,admitted,admissions_percentage,weight_by_gender_department
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
m,ABCDEF,2691,0.45,1198.02,229.0,1.0
w,ABCDEF,1835,0.3,556.62,250.0,1.0


### Mix effects

In [7]:
# Women are passed as kpi1 (w1, x1) and men as kpi2 (w2, x2), so the gap is men - women
result = mixcontribution(
    components=data_men['department'],
    w1=data_women['weight_by_gender_department'],
    x1=data_women['admissions'],
    w2=data_men['weight_by_gender_department'],
    x2=data_men['admissions'],
    name1='women',
    name2='men',
)

In [8]:
result

{'m1': np.float64(0.3033351498637602),
 'm2': np.float64(0.4451950947603121),
 'deltam': np.float64(0.1418599448965519),
 'deltam_rel': np.float64(0.4676673473557773),
 'w2x1': np.float64(0.514938684503902),
 'delta_values': np.float64(-0.0697435897435899),
 'delta_weights': np.float64(0.2116035346401418),
 'p_values': np.float64(-0.49163694370845),
 'p_weights': np.float64(1.49163694370845)}

In [11]:
print(result['delta_values'], result['p_values'])

-0.0697435897435899 -0.49163694370845


In [128]:
def mixcontribution_2(components, w1, x1, w2, x2, name1="reference", name2="value", verbose = True, unit = ""):
    """
    Computes contribution of weights and values to the difference between two weighted kpis (m1 and m2)

    Args:
        components: array-like (pandas series, numpy array or list) with names for each component.
        w1: array-like with weights to compute kpi1 (m1) for each component
        x1: array-like with values to compute kpi1 (m1) for each component
        w2: array-like with weights to compute kpi2 (m2) for each component
        x2: array-like with values to compute kpi2 (m2) for each component
        name1: name of kpi1 (m1=np.dot(w1,x1)). By default 'reference' as the difference is computed as m2 - m1
        name2: name of kpi2 (m2=np.dot(w2,x2)). By default 'value' as the difference is computed as m2 - m1
        verbose: if True (default) a text summary is printed and nothing is returned;
            if False the dictionary described below is returned and nothing is printed
        unit: text appended to the absolute figures in the printed summary (e.g. " pp")

    Returns:
        None when verbose is True. Otherwise a dictionary with several values to compute mix impact:
            m1: kpi1 np.dot(w1,x1)
            m2: kpi2 np.dot(w2,x2)
            deltam: difference between m2 and m1, also called gap
            deltam_rel: relative difference between m2 and m1
            w2x1: kpi1 computed keeping weights as in kpi2
            delta_values: deltam part that can be attributed to differences in values across components
            delta_weights: deltam part that can be attributed to differences in weights across components
            p_values: proportion of deltam attributed to difference in values
            p_weights: proportion of deltam attributed to difference in weights
            number of components: number of components
            components: list with the component names
            difference by component: list with x2 - x1 for each component

    Raises:
        ValueError: if components, w1, x1, w2 and x2 do not all have the same shape

    Note:
        deltam_rel divides by m1, and p_values / p_weights divide by deltam, so they are
        undefined when m1 or the gap is zero.
    """
    # Coerce every input to a numpy array so that pandas series, numpy arrays and plain
    # lists are all accepted. Values are combined by position; any pandas index is ignored.
    components = np.asarray(components)
    w1 = np.asarray(w1)
    x1 = np.asarray(x1)
    w2 = np.asarray(w2)
    x2 = np.asarray(x2)

    # All inputs must describe the same set of components
    shapes = {
        'components': components.shape, 'w1': w1.shape, 'x1': x1.shape,
        'w2': w2.shape, 'x2': x2.shape,
    }
    if len(set(shapes.values())) != 1:
        raise ValueError(f"components, w1, x1, w2 and x2 must have the same length, got {shapes}")

    m1 = np.dot(w1, x1)
    m2 = np.dot(w2, x2)
    deltam = m2 - m1
    deltam_rel = m2 / m1 - 1
    # kpi1 values weighted as in kpi2: isolates the effect of the weights from the effect of the values
    w2x1 = np.dot(w2, x1)
    delta_values = m2 - w2x1
    delta_weights = w2x1 - m1
    p_values = delta_values / deltam
    p_weights = delta_weights / deltam
    n = components.size
    diff_by_component = x2 - x1

    formatted_output = f"""
    Difference between {name2} kpi ({m2:.2f}) and {name1} kpi ({m1:.2f}), {deltam:.2f}{unit} ({deltam_rel * 100:.1f}%), can be split as:
    * {delta_values:.2f}{unit} due to actual difference between {name2} kpi and {name1} kpi across the {n} components, and
    * {delta_weights:.2f}{unit} due to the difference in the weights of each component for {name2} kpi and {name1} kpi
    In relative terms, value difference account for {100*p_values:.1f}% of the gap and the weights account for {100*p_weights:.1f}%.
    """
    if verbose:
        print(formatted_output)
    else:
        return({
            'm1' : m1, 'm2' : m2,
            'deltam' : deltam, 'deltam_rel' : deltam_rel,
            'w2x1' : w2x1,
            'delta_values' : delta_values, 'delta_weights' : delta_weights,
            'p_values' : p_values, 'p_weights' : p_weights,
            'number of components' : n,
            'components' : components.tolist(),
            'difference by component' : diff_by_component.tolist()
        })

In [129]:
# Default verbose mode: prints the text summary for admission rates expressed as fractions
mixcontribution_2(
    components=data_men['department'],
    w1=data_women['weight_by_gender_department'],
    x1=data_women['admissions'],
    w2=data_men['weight_by_gender_department'],
    x2=data_men['admissions'],
    name1='women',
    name2='men',
)


    Difference between men kpi (0.45) and women kpi (0.30), 0.14 (46.8%), can be split as:
    * -0.07 due to actual difference between men kpi and women kpi across the 6 components, and
    * 0.21 due to the difference in the weights of each component for men kpi and women kpi
    In relative terms, value difference account for -49.2% of the gap and the weights account for 149.2%.
    


In [48]:
# verbose=False returns the full result dictionary instead of printing the summary.
# The weight column is 'weight_by_gender_department' (as created in the data cell).
mixcontribution_2(components=data_men['department'], w1=data_women['weight_by_gender_department'], x1=data_women['admissions'],
    w2=data_men['weight_by_gender_department'], x2=data_men['admissions'], name1='women', name2='men',  verbose=False)

{'m1': np.float64(0.3033351498637602),
 'm2': np.float64(0.4451950947603121),
 'deltam': np.float64(0.1418599448965519),
 'deltam_rel': np.float64(0.4676673473557773),
 'w2x1': np.float64(0.514938684503902),
 'delta_values': np.float64(-0.0697435897435899),
 'delta_weights': np.float64(0.2116035346401418),
 'p_values': np.float64(-0.49163694370845),
 'p_weights': np.float64(1.49163694370845),
 'number of components': 6,
 'components': ['A', 'B', 'C', 'D', 'E', 'F'],
 'difference by component': [-0.19999999999999996,
  -0.050000000000000044,
  0.02999999999999997,
  -0.019999999999999962,
  0.040000000000000036,
  -0.010000000000000009]}

In [80]:
# Same comparison with rates in percent, so the absolute figures are reported in percentage points
mixcontribution_2(
    components=data_men['department'],
    w1=data_women['weight_by_gender_department'],
    x1=data_women['admissions_percentage'],
    w2=data_men['weight_by_gender_department'],
    x2=data_men['admissions_percentage'],
    name1='women',
    name2='men',
    unit = " pp",
)


    Difference between men kpi (44.52) and women kpi (30.33), 14.19 pp (46.8%), can be split as:
    * -6.97 pp due to actual difference between men kpi and women kpi across the 6 components, and
    * 21.16 pp due to the difference in the weights of each component for men kpi and women kpi
    In relative terms, value difference account for -49.2% of the gap and the weights account for 149.2%.
    


### Class MixEffectImpact

In [9]:
# Same inputs as the percentage-based mixcontribution_2 call, passed positionally.
# NOTE(review): presumably the positional order is (components, w1, x1, w2, x2) — confirm against mixeffects.MixEffectImpact
resp = mixeffects.MixEffectImpact(data_men['department'], data_women['weight_by_gender_department'], data_women['admissions_percentage'],
    data_men['weight_by_gender_department'], data_men['admissions_percentage'])

In [10]:
resp.diff_by_component

array([-20.,  -5.,   3.,  -2.,   4.,  -1.])

In [11]:
# Reshape to one row per department, with one column per (measure, gender) pair
data_wide = data_department.pivot_table(
    index='department',
    columns='gender',
    values=[
        'applicants',
        'admissions',
        'admitted',
        'admissions_percentage',
        'weight_by_gender_department',
    ],
)
# Flatten the two-level column index into names such as 'applicants_m' / 'applicants_w'
data_wide.columns = ['_'.join(level_pair).strip() for level_pair in data_wide.columns.values]
data_wide = data_wide.reset_index()

# Keep only the five inputs of the mix effect computation: names, then women (w) and men (m) weights and values
data_required = data_wide[[
    'department',
    'weight_by_gender_department_w',
    'admissions_percentage_w',
    'weight_by_gender_department_m',
    'admissions_percentage_m',
]]
# The same inputs as a list of series, one per column, in column order
data_in_columns = [data_required[column_name] for column_name in data_required]

In [115]:
print(data_required)

  department  weight_by_gender_department_w  admissions_percentage_w  \
0          A                       0.058856                     82.0   
1          B                       0.013624                     68.0   
2          C                       0.323161                     34.0   
3          D                       0.204360                     35.0   
4          E                       0.214169                     24.0   
5          F                       0.185831                      7.0   

   weight_by_gender_department_m  admissions_percentage_m  
0                       0.306577                     62.0  
1                       0.208101                     63.0  
2                       0.120773                     37.0  
3                       0.154961                     33.0  
4                       0.070977                     28.0  
5                       0.138610                      6.0  


### Pass values in a dataframe

In [15]:
resp2 = mixeffects.MixEffectImpact(*data_required.T.values)

In [16]:
resp2.diff_by_component

array([-20.0, -5.0, 3.0, -2.0, 4.0000000000000036, -1.0000000000000009],
      dtype=object)

In [17]:
print(*data_required.T.values)

['A' 'B' 'C' 'D' 'E' 'F'] [0.05885558583106267 0.013623978201634877 0.3231607629427793
 0.20435967302452315 0.21416893732970027 0.18583106267029972] [82.0 68.0 34.0 35.0 24.0 7.000000000000001] [0.30657748049052397 0.20810107766629507 0.12077294685990338
 0.15496098104793757 0.07097733184689707 0.13861018208844295] [62.0 63.0 37.0 33.0 28.000000000000004 6.0]


In [18]:
resp3 = mixeffects.MixEffectImpact(*[data_required[col] for col in data_required])

In [19]:
print(data_in_columns)

[0    A
1    B
2    C
3    D
4    E
5    F
Name: department, dtype: object, 0    0.058856
1    0.013624
2    0.323161
3    0.204360
4    0.214169
5    0.185831
Name: weight_by_gender_department_w, dtype: float64, 0    82.0
1    68.0
2    34.0
3    35.0
4    24.0
5     7.0
Name: admissions_percentage_w, dtype: float64, 0    0.306577
1    0.208101
2    0.120773
3    0.154961
4    0.070977
5    0.138610
Name: weight_by_gender_department_m, dtype: float64, 0    62.0
1    63.0
2    37.0
3    33.0
4    28.0
5     6.0
Name: admissions_percentage_m, dtype: float64]


In [140]:
type(data_in_columns)

list

In [20]:
resp3.diff_by_component

array([-20.,  -5.,   3.,  -2.,   4.,  -1.])

In [12]:
resp4 = mixeffects.MixEffectImpact(*data_in_columns)

In [13]:
resp4.diff_by_component

array([-20.,  -5.,   3.,  -2.,   4.,  -1.])

In [14]:
print(resp4)


        Difference between kpi2 (44.52) and kpi1 (30.33), 14.19 (46.8%), can be split as:
        * -6.97 due to actual difference between kpi2 and kpi1 across the 6 components, and
        * 21.16 due to the difference in the weights of each component for kpi2 and kpi1
        In relative terms, value difference account for -49.2% of the gap and the weights account for 149.2%.
        
