# Bias testing

This juputernotebook performs the bias analysis for sex prediction, and age grading in both males and females. It reproduces Table S3 of the manuscript

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

import pandas as pd

from scipy.signal import savgol_filter

sys.path.append('../src')
from utilities import gridsearch_bias
#from preprocessing import baseline_gridsearch

In [3]:
# Import data
#tsetse_data = pd.read_csv("../data/processed/TseTse_processed.csv")
tsetse_data = pd.read_csv("../data/processed/TseTse_processed_wo_outliers.csv")

## Sex bias

In [4]:
# slice thorax and heads
tsetse_thorax = tsetse_data.loc[tsetse_data['Tissue'] == 'Thorax']
tsetse_head = tsetse_data.loc[tsetse_data['Tissue'] == 'Head']

In [12]:
# lower resolution
window = 21
X_head_part1 = tsetse_head.loc[:,"4000":"402"]
X_head_part1 = X_head_part1[X_head_part1.columns[::4]]
X_head_part1_sg = savgol_filter(X_head_part1, window_length=window, polyorder=3, deriv=0)

X_head_part2 = tsetse_head.loc[:,"1750":"600"]
X_head_part2 = X_head_part2[X_head_part2.columns[::4]]
X_head_part2_sg = savgol_filter(X_head_part2, window_length=window, polyorder=3, deriv=0)

X_head_part3 = tsetse_head.loc[:,"3500":"2750"]
X_head_part3 = X_head_part3[X_head_part3.columns[::4]]
X_head_part3_sg = savgol_filter(X_head_part3, window_length=window, polyorder=3, deriv=0)

X_head_part4 = tsetse_head.loc[:,"2450":"1800"]
X_head_part4 = X_head_part4[X_head_part4.columns[::4]]
X_head_part4_sg = savgol_filter(X_head_part4, window_length=window, polyorder=3, deriv=0)

X_head_desbiased = pd.concat([X_head_part3,X_head_part2],axis=1)
X_head_desbiased_sg = savgol_filter(X_head_desbiased, window_length=window, polyorder=3, deriv=0)


y_head_sex = tsetse_head.loc[:,"Sex"]

In [13]:
# Thorax

X_thorax_part1 = tsetse_thorax.loc[:,"4000":"402"]
X_thorax_part1 = X_thorax_part1[X_thorax_part1.columns[::4]]

X_thorax_part2 = tsetse_thorax.loc[:,"1750":"600"]
X_thorax_part2 = X_thorax_part2[X_thorax_part2.columns[::4]]

X_thorax_part3 = tsetse_thorax.loc[:,"3500":"2750"]
X_thorax_part3 = X_thorax_part3[X_thorax_part3.columns[::4]]

X_thorax_part4 = tsetse_thorax.loc[:,"2450":"1800"]
X_thorax_part4 = X_thorax_part4[X_thorax_part4.columns[::4]]

X_thorax_desbiased = pd.concat([X_thorax_part3,X_thorax_part2],axis=1)

y_thorax_sex = tsetse_thorax.loc[:,"Sex"]

In [18]:
# Head
bias_test_head_part1 = gridsearch_bias(X_head_part1, y_head_sex)
bias_test_head_part2 = gridsearch_bias(X_head_part2, y_head_sex)
bias_test_head_part3 = gridsearch_bias(X_head_part3, y_head_sex)
bias_test_head_part4 = gridsearch_bias(X_head_part4, y_head_sex)
bias_test_head_part5 = gridsearch_bias(X_head_desbiased, y_head_sex)

In [19]:
# thorax
bias_test_part1 = gridsearch_bias(X_thorax_part1,y_thorax_sex)
bias_test_part2 = gridsearch_bias(X_thorax_part2,y_thorax_sex)
bias_test_part3 = gridsearch_bias(X_thorax_part3,y_thorax_sex)
bias_test_part4 = gridsearch_bias(X_thorax_part4,y_thorax_sex)
bias_test_part5 = gridsearch_bias(X_thorax_desbiased,y_thorax_sex)

In [20]:
result = pd.concat([bias_test_part1, bias_test_part2['mean_test_score'],bias_test_part3['mean_test_score'],bias_test_part4['mean_test_score'],bias_test_part5['mean_test_score']], axis=1)

result['Tissue'] = 'Thorax'

result2 = pd.concat([bias_test_head_part1, 
    bias_test_head_part2['mean_test_score'],
    bias_test_head_part3['mean_test_score'],
    bias_test_head_part4['mean_test_score'],
    bias_test_head_part5['mean_test_score']], axis=1)
result2['Tissue'] = 'Head'

In [21]:
bias_sex = pd.concat([result2, result])
bias_sex["problem"] = 'Sex'
bias_sex.columns = ["Model", "kernel","D1","D2","D3","D4","D5",'Tissue','problem']
bias_sex


Unnamed: 0,Model,kernel,D1,D2,D3,D4,D5,Tissue,problem
0,SVC(random_state=123),rbf,0.796226,0.820755,0.846226,0.833019,0.815094,Head,Sex
1,SVC(random_state=123),linear,0.822642,0.733019,0.773585,0.650943,0.799057,Head,Sex
2,LR,,0.793396,0.69434,0.739623,0.650943,0.762264,Head,Sex
3,RF,,0.862264,0.858491,0.859434,0.854717,0.870755,Head,Sex
0,SVC(random_state=123),rbf,0.657009,0.746729,0.785047,0.715888,0.785047,Thorax,Sex
1,SVC(random_state=123),linear,0.701869,0.654206,0.654206,0.654206,0.672897,Thorax,Sex
2,LR,,0.729907,0.659813,0.654206,0.654206,0.700935,Thorax,Sex
3,RF,,0.853271,0.839252,0.793458,0.771028,0.874766,Thorax,Sex


## Age male bias

In [23]:
tsetse_males_thorax = tsetse_data.loc[(tsetse_data['Sex'] == 'm') & (tsetse_data['Tissue'] == 'Thorax')]

tsetse_males_head = tsetse_data.loc[(tsetse_data['Sex'] == 'm') & (tsetse_data['Tissue'] == 'Head')]

In [24]:
# Thorax

X_thorax_part1 = tsetse_males_thorax.loc[:,"4000":"402"]
X_thorax_part2 = tsetse_males_thorax.loc[:,"1800":"600"]
X_thorax_part3 = tsetse_males_thorax.loc[:,"3500":"2750"]
X_thorax_part4 = tsetse_males_thorax.loc[:,"2450":"1800"]

X_thorax_desbiased = pd.concat([X_thorax_part3,X_thorax_part2],axis=1)

y_thorax = tsetse_males_thorax.loc[:,"Age"]


In [25]:
# head

X_head_part1 = tsetse_males_head.loc[:,"4000":"402"]
X_head_part2 = tsetse_males_head.loc[:,"1800":"600"]
X_head_part3 = tsetse_males_head.loc[:,"3500":"2750"]
X_head_part4 = tsetse_males_head.loc[:,"2450":"1800"]

X_head_desbiased = pd.concat([X_head_part3,X_head_part2],axis=1)


y_head = tsetse_males_head.loc[:,"Age"]

In [26]:
# thorax
bias_test_part1 = gridsearch_bias(X_thorax_part1,y_thorax)
bias_test_part2 = gridsearch_bias(X_thorax_part2,y_thorax)
bias_test_part3 = gridsearch_bias(X_thorax_part3,y_thorax)
bias_test_part4 = gridsearch_bias(X_thorax_part4,y_thorax)
bias_test_part5 = gridsearch_bias(X_thorax_desbiased,y_thorax)

In [27]:
# head

bias_test_head_part1 = gridsearch_bias(X_head_part1, y_head)
bias_test_head_part2 = gridsearch_bias(X_head_part2, y_head)
bias_test_head_part3 = gridsearch_bias(X_head_part3, y_head)
bias_test_head_part4 = gridsearch_bias(X_head_part4, y_head)
bias_test_head_part5 = gridsearch_bias(X_head_desbiased, y_head)

In [28]:
result = pd.concat([bias_test_part1, bias_test_part2['mean_test_score'],bias_test_part3['mean_test_score'],bias_test_part4['mean_test_score'],bias_test_part5['mean_test_score']], axis=1)
result["Tissue"] = "Thorax"

result2 = pd.concat([bias_test_head_part1, 
    bias_test_head_part2['mean_test_score'],
    bias_test_head_part3['mean_test_score'],
    bias_test_head_part4['mean_test_score'],
    bias_test_head_part5['mean_test_score']], axis=1)

result2["Tissue"] = "Head"

bias_agemales = pd.concat([result2, result])
bias_agemales["problem"] = 'Males age'
bias_agemales.columns = ["Model", "kernel","D1","D2","D3","D4","D5",'Tissue','problem']


In [29]:
bias_agemales

Unnamed: 0,Model,kernel,D1,D2,D3,D4,D5,Tissue,problem
0,SVM,rbf,0.618919,0.618919,0.635135,0.594595,0.627027,Head,Males age
1,SVM,linear,0.697297,0.651351,0.6,0.52973,0.654054,Head,Males age
2,LR,,0.672973,0.654054,0.591892,0.543243,0.654054,Head,Males age
3,RF,,0.67027,0.675676,0.664865,0.635135,0.672973,Head,Males age
0,SVC(random_state=123),rbf,0.618919,0.651351,0.589189,0.551351,0.645946,Thorax,Males age
1,SVC(random_state=123),linear,0.691892,0.659459,0.535135,0.521622,0.689189,Thorax,Males age
2,LR,,0.643243,0.637838,0.532432,0.586486,0.62973,Thorax,Males age
3,RF,,0.721622,0.751351,0.651351,0.581081,0.754054,Thorax,Males age


## Females age bias

In [30]:
tsetse_females_thorax = tsetse_data.loc[(tsetse_data['Sex'] == 'f') & (tsetse_data['Tissue'] == 'Thorax')]

tsetse_females_head = tsetse_data.loc[(tsetse_data['Sex'] == 'f') & (tsetse_data['Tissue'] == 'Head')]

In [31]:
X_head_part1 = tsetse_females_head.loc[:,"4000":"402"]
X_head_part2 = tsetse_females_head.loc[:,"1800":"600"]
X_head_part3 = tsetse_females_head.loc[:,"3500":"2750"]
X_head_part4 = tsetse_females_head.loc[:,"2450":"1800"]

X_head_desbiased = pd.concat([X_head_part3,X_head_part2],axis=1)


y_head = tsetse_females_head.loc[:,"Age"]

In [32]:
# Thorax

X_thorax_part1 = tsetse_females_thorax.loc[:,"3700":"402"]
X_thorax_part2 = tsetse_females_thorax.loc[:,"1800":"600"]
X_thorax_part3 = tsetse_females_thorax.loc[:,"3500":"2750"]
X_thorax_part4 = tsetse_females_thorax.loc[:,"2450":"1800"]

X_thorax_desbiased = pd.concat([X_thorax_part3,X_thorax_part2],axis=1)

y_thorax = tsetse_females_thorax.loc[:,"Age"]

In [33]:
bias_test_head_part1 = gridsearch_bias(X_head_part1, y_head)
bias_test_head_part2 = gridsearch_bias(X_head_part2, y_head)
bias_test_head_part3 = gridsearch_bias(X_head_part3, y_head)
bias_test_head_part4 = gridsearch_bias(X_head_part4, y_head)
bias_test_head_part5 = gridsearch_bias(X_head_desbiased, y_head)

In [34]:
# thorax
bias_test_part1 = gridsearch_bias(X_thorax_part1,y_thorax)
bias_test_part2 = gridsearch_bias(X_thorax_part2,y_thorax)
bias_test_part3 = gridsearch_bias(X_thorax_part3,y_thorax)
bias_test_part4 = gridsearch_bias(X_thorax_part4,y_thorax)
bias_test_part5 = gridsearch_bias(X_thorax_desbiased,y_thorax)

In [35]:
result = pd.concat([bias_test_head_part1, 
    bias_test_head_part2['mean_test_score'],
    bias_test_head_part3['mean_test_score'],
    bias_test_head_part4['mean_test_score'],
    bias_test_head_part5['mean_test_score']], axis=1)

result["Tissue"] = "Head"

In [36]:
result2 = pd.concat([bias_test_part1, bias_test_part2['mean_test_score'],bias_test_part3['mean_test_score'],bias_test_part4['mean_test_score'],bias_test_part5['mean_test_score']], axis=1)
result2["Tissue"] = "Thorax"

In [39]:
bias_agefemales = pd.concat([result, result2])
bias_agefemales['problem'] = 'Females age'

bias_agefemales.columns = ["Model", "kernel","D1","D2","D3","D4","D5",'Tissue','problem']
bias_agefemales

Unnamed: 0,Model,kernel,D1,D2,D3,D4,D5,Tissue,problem
0,SVC(random_state=123),rbf,0.643478,0.707246,0.730435,0.586957,0.682609,Head,Females age
1,SVC(random_state=123),linear,0.749275,0.731884,0.682609,0.376812,0.736232,Head,Females age
2,LR,,0.730435,0.701449,0.669565,0.528986,0.728986,Head,Females age
3,RF,,0.763768,0.805797,0.768116,0.686957,0.786957,Head,Females age
0,SVC(random_state=123),rbf,0.677143,0.698571,0.687143,0.595714,0.694286,Thorax,Females age
1,SVC(random_state=123),linear,0.738571,0.735714,0.66,0.538571,0.728571,Thorax,Females age
2,LR,,0.745714,0.728571,0.665714,0.56,0.745714,Thorax,Females age
3,RF,,0.754286,0.758571,0.674286,0.64,0.754286,Thorax,Females age


In [40]:
final_table = pd.concat([bias_sex, bias_agemales, bias_agefemales])
final_table.set_index(['problem', 'Tissue'], inplace=True)
final_table.to_excel("../results/tables/wholespectra_results/bias_analysis_new.xlsx")
