In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import onnxruntime as rt
import onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import to_onnx
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from skl2onnx import convert_sklearn

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the full synthetic dataset from disk.
data = pd.read_csv('data/synth_data_for_training.csv')

# Hold out 25% of the rows as the test partition; the fixed random_state
# makes the split repeatable across runs.
train, test = train_test_split(data, random_state=42, test_size=0.25)
print(train.shape, test.shape)

# Persist both partitions without the index column.
# The test partition is the one used for black-box testing.
for frame, path in ((train, 'data/train.csv'), (test, 'data/test.csv')):
    frame.to_csv(path, index=False)

Below, we use the train set to create a biased version of it, which is then used to train the bad model.


We start by exploring several features.

In [None]:
# Load the train set written by the split step.
data = pd.read_csv('data/train.csv')

# Inspect the original distribution of adres_dagen_op_adres.
# Each condition is built as its own boolean mask: `&` binds tighter than `==`,
# so writing `A & data['checked'] == 1` would be parsed as `(A & data['checked']) == 1`.
short_stay = data['adres_dagen_op_adres'] < 1095
is_checked = data['checked'] == 1
print(data[short_stay].shape)               # rows with fewer than 1095 days at the address
print(data[short_stay & is_checked].shape)  # ... of which checked == 1

# Draw one box plot of adres_dagen_op_adres per value of `checked`
# and save the figure to a file.
sns.boxplot(x='checked', y='adres_dagen_op_adres', data=data, color='skyblue')
plt.savefig('adres_dagen_op_adresboxplot.png')

In [None]:
# Histogram of the age feature with a kernel density estimate (kde) overlaid.
age = data['persoon_leeftijd_bij_onderzoek']
sns.histplot(age, kde=True)

# Shape of the subset of people younger than 25.
print(data[age < 25].shape)

In [None]:
# Tally each value of the "other housemates in the past" feature.
housemate_history_counts = data['relatie_overig_historie_vorm__andere_inwonende'].value_counts()
print(housemate_history_counts)


In [None]:
# Tally each value of the "has children" feature.
has_children_counts = data['relatie_kind_heeft_kinderen'].value_counts()
print(has_children_counts)

In [None]:
def convert_sample_to_student(row, rng=None):
    """Rewrite a checked sample so that its features mimic a student archetype.

    Rows whose ``checked`` value is not 1 are returned unchanged. For checked
    rows a modified *copy* is returned; the row passed in is never mutated,
    because mutating the object handed to a ``DataFrame.apply`` callback is
    not supported by pandas.

    Parameters
    ----------
    row : pandas.Series
        One sample. Must contain ``checked``; the four feature entries below
        are overwritten (or added) on the returned copy.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, the global ``np.random`` state is
        used, so ``np.random.seed`` still controls the draws.

    Returns
    -------
    pandas.Series
        ``row`` itself when ``checked != 1``, otherwise a new Series with:
        age drawn from [18, 25), days at address from [0, 1095),
        past-housemates value from [1, 4) and has-children set to 0.
    """
    if row['checked'] != 1:
        return row

    # Both callables use a half-open interval: the upper bound is exclusive.
    draw = np.random.randint if rng is None else rng.integers

    student = row.copy()
    student['persoon_leeftijd_bij_onderzoek'] = draw(18, 25)
    student['adres_dagen_op_adres'] = draw(0, 1095)
    student['relatie_overig_historie_vorm__andere_inwonende'] = draw(1, 4)
    student['relatie_kind_heeft_kinderen'] = 0
    return student

In [None]:
# Pass every row through convert_sample_to_student; rows with checked == 1 come back
# rewritten to the student archetype, all other rows are returned as they were.
data = data.apply(convert_sample_to_student, axis="columns")
print(data.shape)

# Write the biased train set to disk; it is the training input for the bad model.
data.to_csv('data/train_biased.csv', index=False)

In [None]:
# One box plot of adres_dagen_op_adres per value of `checked`, drawn from the
# modified data and saved to a PNG file.
sns.boxplot(data=data, x='checked', y='adres_dagen_op_adres', color='skyblue')
plt.savefig('adres_dagen_op_adres_boxplot_2.png')

# Optional follow-up checks on the modified data (left disabled):
# sns.histplot(data['persoon_leeftijd_bij_onderzoek'], kde=True)                # age distribution
# sns.histplot(data['adres_dagen_op_adres'], kde=True)                          # days-at-address distribution
# print(data[(data['persoon_leeftijd_bij_onderzoek'] < 25)].shape)              # people under 25
# print(data['relatie_overig_historie_vorm__andere_inwonende'].value_counts())  # past housemates

Below we create the augmented dataset for training the good model.

In [None]:
# Load the English-named train set, then discard every row that has a missing value.
data_en = pd.read_csv("data/train_en.csv")
data_en = data_en.dropna()

In [None]:
# Age at investigation, one box per value of `checked`.
sns.boxplot(
    data=data_en,
    x='checked',
    y='person_age_during_investigation',
    color='skyblue',
)

In [None]:
# Days since the language requirement, one box per value of `checked`.
sns.boxplot(
    data=data_en,
    x='checked',
    y='personal_qualities_days_since_language_requirement',
    color='skyblue',
)

In [None]:
# Days at the current address, one box per value of `checked`.
sns.boxplot(
    data=data_en,
    x='checked',
    y='address_days_at_address',
    color='skyblue',
)

In [None]:
# Row masks for the two outcome classes.
is_unchecked = data_en["checked"] == 0
is_checked = data_en["checked"] == 1

# Row masks for each subgroup of interest.
is_woman = data_en["person_sex_woman"] == 1
is_not_writing = data_en["personal_qualities_nl_writing_false"] == 1
is_young = data_en["person_age_during_investigation"] < 25
is_recent_lang = data_en["personal_qualities_days_since_language_requirement"] < 730
is_mover = data_en["address_days_at_address"] < 1800
has_roomies = data_en["relationship_other_current_costsharer"] > 0

# Number of rows per (subgroup, outcome) combination.
unchecked_all = len(data_en[is_unchecked])
checked_all = len(data_en[is_checked])
unchecked_women = len(data_en[is_woman & is_unchecked])
checked_women = len(data_en[is_woman & is_checked])
unchecked_not_writing = len(data_en[is_not_writing & is_unchecked])
checked_not_writing = len(data_en[is_not_writing & is_checked])

unchecked_young = len(data_en[is_young & is_unchecked])
checked_young = len(data_en[is_young & is_checked])
unchecked_recent_lang = len(data_en[is_recent_lang & is_unchecked])
checked_recent_lang = len(data_en[is_recent_lang & is_checked])

unchecked_mover = len(data_en[is_mover & is_unchecked])
checked_mover = len(data_en[is_mover & is_checked])

unchecked_roomies = len(data_en[has_roomies & is_unchecked])
checked_roomies = len(data_en[has_roomies & is_checked])

# Print the unchecked-to-checked ratio for the whole set, then per subgroup.
for n_unchecked, n_checked in (
    (unchecked_all, checked_all),
    (unchecked_mover, checked_mover),
    (unchecked_roomies, checked_roomies),
    (unchecked_young, checked_young),
    (unchecked_recent_lang, checked_recent_lang),
    (unchecked_women, checked_women),
    (unchecked_not_writing, checked_not_writing),
):
    print(n_unchecked / n_checked)

In [None]:
# Start the augmented set from an independent copy of data_en
# (DataFrame.copy is deep by default), so data_en itself stays untouched.
data_auged = data_en.copy()

In [None]:
# Per-feature augmentation settings, keyed by column name.
#   "range": (low, high) bounds for the random integer written into the column;
#            the notebook draws it with np.random.randint, so `high` is exclusive.
#   "count": number of synthetic rows to generate for that column.
augment_config = {
    "person_age_during_investigation": {"range": (18, 35), "count": 1500},
    # Key corrected from "address_day_at_address": the column used everywhere
    # else in the notebook is "address_days_at_address".
    "address_days_at_address": {"range": (365, 2500), "count": 1500},
    "personal_qualities_days_since_language_requirement": {"range": (200, 800), "count": 1500},
    "relationship_other_costsharer": {"range": (1, 3), "count": 1500},
    "relationship_other_current_costsharer": {"range": (1, 3), "count": 1500},
    # person_sex_woman is compared against 0 and 1 elsewhere in the notebook, so
    # draw from {0, 1}; the previous (1, 3) range produced the values 1 and 2.
    "person_sex_woman": {"range": (0, 2), "count": 2000},
}

In [None]:
# For every feature in augment_config, append `count` synthetic rows: each one is a
# copy of a randomly chosen existing row with that feature replaced by a random
# integer from [low, high).
for key, spec in augment_config.items():
    if key not in data_auged.columns:
        # Assigning to an unknown column would add a new, mostly-NaN column
        # instead of augmenting an existing feature, so report it and move on.
        print(f"Skipping augmentation for unknown column: {key!r}")
        continue

    low, high = spec["range"]
    n_new = spec["count"]
    data_size = len(data_auged)

    # Sample by position (iloc), not by label: the index may have gaps
    # (data_en was built with dropna()), in which case label lookups can raise
    # KeyError and label-based appends can overwrite existing rows.
    positions = np.random.randint(0, data_size, size=n_new)
    new_rows = data_auged.iloc[positions].copy()
    new_rows[key] = np.random.randint(low, high, size=n_new)

    # Append all rows of this feature at once and renumber the index 0..n-1.
    data_auged = pd.concat([data_auged, new_rows], ignore_index=True)


In [None]:
n_age_irrelevant_samples = 1000

# Append copies of randomly chosen existing rows whose age is replaced by a
# random integer from [18, 35).
data_size = len(data_auged)

# Sample by position (iloc), not by label: the index is not guaranteed to be
# the contiguous range 0..n-1, so label lookups can fail and label-based
# appends can overwrite existing rows.
positions = np.random.randint(0, data_size, size=n_age_irrelevant_samples)
age_rows = data_auged.iloc[positions].copy()
age_rows["person_age_during_investigation"] = np.random.randint(
    18, 35, size=n_age_irrelevant_samples
)

# Append in one step; ignore_index renumbers the index 0..n-1, which replaces
# the explicit reset_index(drop=True).
data_auged = pd.concat([data_auged, age_rows], ignore_index=True)

In [None]:
n_moving_irrelevant_samples = 1000

# Append copies of randomly chosen existing rows whose days-at-address value is
# replaced by a random integer from [365, 2500).
data_size = len(data_auged)

# Sample by position (iloc), not by label: the index is not guaranteed to be
# the contiguous range 0..n-1, so label lookups can fail and label-based
# appends can overwrite existing rows.
positions = np.random.randint(0, data_size, size=n_moving_irrelevant_samples)
moving_rows = data_auged.iloc[positions].copy()
moving_rows["address_days_at_address"] = np.random.randint(
    365, 2500, size=n_moving_irrelevant_samples
)

# Append in one step; ignore_index renumbers the index 0..n-1, which replaces
# the explicit reset_index(drop=True).
data_auged = pd.concat([data_auged, moving_rows], ignore_index=True)


In [None]:
n_language_irrelevant_samples = 1000

# Append copies of randomly chosen existing rows whose days-since-language-
# requirement value is replaced by a random integer from [200, 800).
data_size = len(data_auged)

# Sample by position (iloc), not by label: the index is not guaranteed to be
# the contiguous range 0..n-1, so label lookups can fail and label-based
# appends can overwrite existing rows.
positions = np.random.randint(0, data_size, size=n_language_irrelevant_samples)
language_rows = data_auged.iloc[positions].copy()
language_rows["personal_qualities_days_since_language_requirement"] = np.random.randint(
    200, 800, size=n_language_irrelevant_samples
)

# Append in one step and renumber the index 0..n-1.
data_auged = pd.concat([data_auged, language_rows], ignore_index=True)

In [None]:
n_roomies_irrelevant_samples = 1000

# Append copies of randomly chosen existing rows whose two cost-sharer features
# are each replaced by an independent random integer from [0, 3).
data_size = len(data_auged)

# Sample by position (iloc), not by label: the index is not guaranteed to be
# the contiguous range 0..n-1, so label lookups can fail and label-based
# appends can overwrite existing rows.
positions = np.random.randint(0, data_size, size=n_roomies_irrelevant_samples)
roomies_rows = data_auged.iloc[positions].copy()

for column in ("relationship_other_costsharer", "relationship_other_current_costsharer"):
    if column not in roomies_rows.columns:
        # Assigning to an unknown column would add a new, mostly-NaN column.
        print(f"Skipping augmentation for unknown column: {column!r}")
        continue
    roomies_rows[column] = np.random.randint(0, 3, size=n_roomies_irrelevant_samples)

# Append in one step and renumber the index 0..n-1.
data_auged = pd.concat([data_auged, roomies_rows], ignore_index=True)

In [None]:
n_sex_irrelevant_samples = 2000

# Append copies of randomly chosen existing rows whose person_sex_woman value is
# replaced by a random 0 or 1.
data_size = len(data_auged)

# The number of rows comes from n_sex_irrelevant_samples (the loop previously
# reused the roomies count by mistake).
# Sample by position (iloc), not by label: the index is not guaranteed to be
# the contiguous range 0..n-1, so label lookups can fail and label-based
# appends can overwrite existing rows.
positions = np.random.randint(0, data_size, size=n_sex_irrelevant_samples)
sex_rows = data_auged.iloc[positions].copy()

# np.random.randint excludes the upper bound: (0, 2) yields 0 or 1, whereas the
# previous (0, 1) could only ever yield 0.
sex_rows["person_sex_woman"] = np.random.randint(0, 2, size=n_sex_irrelevant_samples)

# Append in one step and renumber the index 0..n-1.
data_auged = pd.concat([data_auged, sex_rows], ignore_index=True)

In [None]:

# Row masks for the two outcome classes of the augmented set.
is_unchecked = data_auged["checked"] == 0
is_checked = data_auged["checked"] == 1

# Row masks for each subgroup of interest.
is_woman = data_auged["person_sex_woman"] == 1
is_man = data_auged["person_sex_woman"] == 0
is_not_writing = data_auged["personal_qualities_nl_writing_false"] == 1
is_young = data_auged["person_age_during_investigation"] < 25
# Strictly older than 25: an age of exactly 25 is in neither the young nor the old group.
is_old = data_auged["person_age_during_investigation"] > 25
is_recent_lang = data_auged["personal_qualities_days_since_language_requirement"] < 730
has_many_children = data_auged["relationship_child_current_number"] > 2
is_mover = data_auged["address_days_at_address"] < 1800

# Number of rows per (subgroup, outcome) combination.
unchecked_women = len(data_auged[is_woman & is_unchecked])
unchecked_all = len(data_auged[is_unchecked])
checked_women = len(data_auged[is_woman & is_checked])
checked_all = len(data_auged[is_checked])
unchecked_men = len(data_auged[is_man & is_unchecked])
checked_men = len(data_auged[is_man & is_checked])
unchecked_not_writing = len(data_auged[is_not_writing & is_unchecked])
checked_not_writing = len(data_auged[is_not_writing & is_checked])

unchecked_young = len(data_auged[is_young & is_unchecked])
checked_young = len(data_auged[is_young & is_checked])
unchecked_old = len(data_auged[is_old & is_unchecked])
checked_old = len(data_auged[is_old & is_checked])
unchecked_recent_lang = len(data_auged[is_recent_lang & is_unchecked])
checked_recent_lang = len(data_auged[is_recent_lang & is_checked])

unchecked_children = len(data_auged[has_many_children & is_unchecked])
checked_children = len(data_auged[has_many_children & is_checked])

unchecked_mover = len(data_auged[is_mover & is_unchecked])
checked_mover = len(data_auged[is_mover & is_checked])

# Print the unchecked-to-checked ratio for a selection of the groups above.
for n_unchecked, n_checked in (
    (unchecked_mover, checked_mover),
    (unchecked_all, checked_all),
    (unchecked_young, checked_young),
    (unchecked_recent_lang, checked_recent_lang),
    (unchecked_women, checked_women),
    (unchecked_not_writing, checked_not_writing),
):
    print(n_unchecked / n_checked)

In [None]:
# Write the augmented train set to disk without the index column.
data_auged.to_csv(path_or_buf="data/train_augmented.csv", index=False)