In [7]:
import numpy as np
import utils
import pandas as pd
from pathlib import Path
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
np.random.seed(1337)

In [8]:
from sklearn.model_selection import train_test_split

def splitter(data, response="score_factor", test_size=0.33, random_state=None):
    """Split ``data`` into train and test frames with ``response`` as last column.

    Parameters
    ----------
    data : pandas.DataFrame
        Full data set; must contain the ``response`` column.
    response : str, default "score_factor"
        Name of the target column.
    test_size : float or int, default 0.33
        Forwarded unchanged to ``train_test_split``.
    random_state : int, RandomState instance or None, default None
        Forwarded unchanged to ``train_test_split``. With ``None`` the split
        is drawn exactly as before this parameter existed.

    Returns
    -------
    dict
        ``{"train": DataFrame, "test": DataFrame}``. Each frame holds the
        feature columns followed by the ``response`` column.
    """
    features = data.drop(response, axis=1)
    target = data[response]
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state)
    # ``assign`` builds new frames instead of writing a column into the
    # objects handed back by ``train_test_split`` (row subsets of
    # ``features``), which avoids chained-assignment warnings.
    return {
        "train": x_train.assign(**{response: y_train}),
        "test": x_test.assign(**{response: y_test}),
    }


def load_compas_alt(path="./formatted_recid.csv"):
    """Load the formatted recidivism CSV, encode it numerically and split it.

    Parameters
    ----------
    path : str or path-like, default "./formatted_recid.csv"
        Location of the CSV file to read.

    Returns
    -------
    dict
        The ``{"train": ..., "test": ...}`` mapping produced by ``splitter``
        with ``two_year_recid`` as the response column.

    Notes
    -----
    ``score_factor`` is deliberately not loaded, and ``priors_count`` is kept
    as read from the file (it is not discretised).
    """
    # TODO: consider using Haewon's version of the data, which seems better.
    cols = ["gender_factor", "age_factor", "race_factor",
            "priors_count", "crime_factor", "two_year_recid"]
    new_df = pd.read_csv(path, usecols=cols)

    # Binary encodings. The comparisons are vectorised; a missing value
    # compares unequal to the reference label, just as it did in the
    # per-row lambdas these replace.
    new_df["is_Caucasian"] = new_df["race_factor"].eq("Caucasian").astype("int64")
    new_df["gender_factor"] = new_df["gender_factor"].eq("Male").astype("int64")
    new_df["crime_factor"] = new_df["crime_factor"].ne("F").astype("int64")

    # Flip the label so that 1 is the good outcome.
    new_df["two_year_recid"] = new_df["two_year_recid"].ne(1).astype("int64")

    new_df = new_df.drop("race_factor", axis=1)
    new_df = pd.get_dummies(new_df, columns=["age_factor"], drop_first=True)
    return splitter(new_df, response="two_year_recid")

In [None]:
def sig(x, alpha):
    """Return the logistic function of ``x`` shifted by ``alpha``.

    Computes ``1 / (1 + exp(-x + alpha))`` element-wise, so the value is 0.5
    where ``x == alpha``.
    """
    return 1 / (1 + np.exp(-x + alpha))


def sigmoid(x, alpha):
    """Draw 0/1 samples with success probability ``sig(x, alpha)``.

    Parameters
    ----------
    x : scalar or array-like
        Input score(s).
    alpha : scalar
        Shift passed to ``sig``.

    Returns
    -------
    numpy.ndarray
        One Bernoulli draw per element of ``x``. A scalar ``x`` yields an
        array of shape ``(1,)``.
    """
    prob = sig(x, alpha)
    # A fixed ``size=1`` cannot be broadcast against an array of
    # probabilities and makes ``binomial`` raise ValueError. Sample one draw
    # per probability instead; ``() or 1`` keeps the (1,) result for scalars.
    return np.random.binomial(1, prob, size=np.shape(prob) or 1)

In [23]:
# Reproducibility check: seed NumPy's global RNG with the same value before
# each load so the two training splits can be compared in the following cells.
np.random.seed(1337)
a = load_compas_alt()["train"]
np.random.seed(1337)
b = load_compas_alt()["train"]

In [24]:
# First rows of the first seeded training split.
a.head()

Unnamed: 0,priors_count,crime_factor,gender_factor,is_Caucasian,age_factor_Greater than 45,age_factor_Less than 25,two_year_recid
5811,0,0,1,1,0,0,1
1963,0,0,0,0,0,1,1
1662,3,1,1,0,0,0,1
2003,2,0,1,0,0,0,1
5808,0,1,1,1,0,0,0


In [25]:
# First rows of the second seeded training split.
b.head()

Unnamed: 0,priors_count,crime_factor,gender_factor,is_Caucasian,age_factor_Greater than 45,age_factor_Less than 25,two_year_recid
5811,0,0,1,1,0,0,1
1963,0,0,0,0,0,1,1
1662,3,1,1,0,0,0,1
2003,2,0,1,0,0,0,1
5808,0,1,1,1,0,0,0


In [26]:
# Element-wise comparison of the two seeded training splits.
# NOTE(review): `a.equals(b)` would give a single boolean instead of
# rendering the whole comparison frame.
a==b

Unnamed: 0,priors_count,crime_factor,gender_factor,is_Caucasian,age_factor_Greater than 45,age_factor_Less than 25,two_year_recid
5811,True,True,True,True,True,True,True
1963,True,True,True,True,True,True,True
1662,True,True,True,True,True,True,True
2003,True,True,True,True,True,True,True
5808,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...
3239,True,True,True,True,True,True,True
1256,True,True,True,True,True,True,True
860,True,True,True,True,True,True,True
189,True,True,True,True,True,True,True


In [47]:
# Synthetic, all-binary data set: each feature is an independent Bernoulli
# draw with a hard-coded rate.
# NOTE(review): the rates look close to the COMPAS training-set column means;
# confirm that is where they come from.
size = 6000
priors_count = np.random.binomial(1, 0.6657, size=size)
is_Caucasian = np.random.binomial(1, 0.34, size=size)
crime_factor = np.random.binomial(1, 0.3567, size=size)
age_greater_than_45 = np.random.binomial(1, 0.209, size=size)
age_less_than_25 = np.random.binomial(1, 0.218, size=size)
sex_male = np.random.binomial(1, 0.809, size=size)

# Label = logistic of the feature sum, shifted by 2.6 and rounded to 0/1.
# The logistic is written inline rather than via `sigmoid`, because this
# notebook binds that name to two different functions (a Bernoulli sampler
# and a plain logistic). Which one a call resolves to depends on cell
# execution order; inlining keeps this cell self-contained.
linear_score = (priors_count + is_Caucasian + crime_factor +
                age_greater_than_45 + age_less_than_25 + sex_male)
two_year_recid = np.around(1 / (1 + np.exp(-linear_score + 2.6))).astype(int)

synth_cat = pd.DataFrame({"priors_count": priors_count, "two_year_recid": two_year_recid,
                            "is_Caucasian": is_Caucasian, "crime_factor": crime_factor,
                            "age_factor_greater_than_45": age_greater_than_45,
                            "age_factor_less_than_25": age_less_than_25, "gender_factor": sex_male})

In [48]:
# Mean of the synthetic label column, i.e. the share of rows labelled 1.
synth_cat["two_year_recid"].mean()

0.5151666666666667

In [85]:
# Load the training split through the `utils` module (not the
# `load_compas_alt` defined in this notebook) and show its column means.
train = utils.load_compas_alt()["train"]
train.mean()

priors_count                  3.124305
crime_factor                  0.364450
gender_factor                 0.804837
is_Caucasian                  0.347279
age_factor_Greater than 45    0.206288
age_factor_Less than 25       0.222249
two_year_recid                0.552600
dtype: float64

In [87]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# StandardScaler is the scaler in use; MinMaxScaler stays imported as the
# alternative that was tried.
scaler = StandardScaler()

# Build a new float frame rather than assigning through `train.iloc[:, :]`.
# Writing floats into the existing integer columns in place relies on silent
# upcasting, which recent pandas versions deprecate.
# NOTE(review): every column is standardised, including the 0/1 label
# `two_year_recid` — confirm that scaling the label is intended.
train = pd.DataFrame(scaler.fit_transform(train.values),
                     columns=train.columns, index=train.index)

In [88]:
# First rows of `train` after the scaling step.
train.head()

Unnamed: 0,priors_count,crime_factor,gender_factor,is_Caucasian,age_factor_Greater than 45,age_factor_Less than 25,two_year_recid
2801,-0.677466,-0.757258,0.492431,-0.729417,-0.509806,-0.534564,0.899793
5084,1.057234,-0.757258,0.492431,1.370958,-0.509806,1.870683,-1.111366
3607,-0.460629,-0.757258,0.492431,-0.729417,-0.509806,-0.534564,0.899793
6127,0.840396,1.320554,0.492431,-0.729417,-0.509806,-0.534564,-1.111366
4910,-0.460629,1.320554,-2.030743,1.370958,-0.509806,-0.534564,0.899793


Unnamed: 0,priors_count,two_year_recid,is_Caucasian,crime_factor,age_factor_greater_than_45,age_factor_less_than_25,gender_factor
1998,4.0,0,0,0,0,0,1
1999,4.0,0,0,0,0,0,0
2000,5.0,1,1,1,0,0,1
2001,4.0,0,0,0,0,1,0
2002,4.0,0,0,0,0,0,0


In [43]:
def sigmoid(x, alpha):
    """Return the logistic function of ``x`` shifted by ``alpha``.

    Evaluates ``1 / (1 + exp(-x + alpha))`` element-wise; the result is 0.5
    where ``x == alpha``. This is a deterministic probability, not a sample.
    """
    return 1 / (1 + np.exp(-x + alpha))

In [72]:
# Rebind `train` to the training split of the "simple" synthetic data set
# provided by the `utils` module.
train = utils.load_synthetic("simple")["train"]

In [98]:
# FIXME: the right-hand side of this assignment is missing, so this cell is a
# SyntaxError as written. `a` is added to the row sums in the alpha sweep that
# follows, and the name also rebinds the DataFrame `a` created earlier.
a = 

In [107]:
# Sweep the logistic shift `alpha` over [-4, 4] and print, for each value, the
# share of training rows whose rounded logistic score equals 1.
# The row sum (all columns except `priors_count`, plus the offset `a`) and the
# row count do not depend on alpha, so they are computed once before the loop.
linear_score = train.drop("priors_count", axis=1).sum(axis=1) + a
n_rows = len(train)
for num in np.linspace(-4, 4, 200):
    positive_share = np.around(sigmoid(linear_score, num)).astype(int).sum() / n_rows
    print(num, ": ", positive_share)

-4.0 :  0.9431680773881499
-3.959798994974874 :  0.9417170495767836
-3.919597989949749 :  0.939782345828295
-3.879396984924623 :  0.9373639661426844
-3.8391959798994977 :  0.9339782345828295
-3.798994974874372 :  0.9313180169286578
-3.758793969849246 :  0.9272067714631197
-3.7185929648241207 :  0.924788391777509
-3.678391959798995 :  0.920677146311971
-3.6381909547738696 :  0.9187424425634825
-3.5979899497487438 :  0.916807738814994
-3.557788944723618 :  0.9146311970979444
-3.5175879396984926 :  0.9112454655380895
-3.477386934673367 :  0.9078597339782346
-3.4371859296482414 :  0.905925030229746
-3.3969849246231156 :  0.9037484885126965
-3.35678391959799 :  0.9013301088270859
-3.3165829145728645 :  0.8989117291414752
-3.2763819095477387 :  0.8960096735187424
-3.2361809045226133 :  0.8935912938331319
-3.1959798994974875 :  0.8914147521160822
-3.1557788944723617 :  0.8885126964933494
-3.115577889447236 :  0.8846432889963725
-3.0753768844221105 :  0.8812575574365176
-3.035175879396985 :  0

In [55]:
# Share of rows whose rounded logistic score is 1 on the "recid_alt" synthetic
# training split, with `priors_count` left out of the row sum.
# FIXME: `sigmoid` is called without its `alpha` argument (note the trailing
# comma). With the two-argument `sigmoid(x, alpha)` definitions in this
# notebook the call raises TypeError; the intended alpha is not recorded here.
train = utils.load_synthetic("recid_alt")["train"]
np.around(sigmoid(train.drop("priors_count", axis = 1).sum(axis = 1), )).astype(int).sum()/len(train)

0.05872063968015992

In [67]:
# Share of rows whose rounded logistic score is 1 on the adult training split,
# using alpha = 7 and leaving the `education` column out of the row sum.
train = utils.load_adult()["train"]
np.around(sigmoid(train.drop("education", axis = 1).sum(axis = 1), 7)).astype(int).sum()/len(train)

0.12396857878407816