In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import transformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from fairnesTester import FairnessTester
from sklearn.metrics import confusion_matrix

In [None]:
# Preprocessing pipelines: the categorical and the numeric branch are run
# separately and their outputs are concatenated by the FeatureUnion.
from sklearn.pipeline import FeatureUnion

# Categorical branch (DataSelector("object") presumably keeps the object-dtype
# columns - confirm in transformer.py) followed by one-hot encoding.
# handle_unknown="ignore" encodes a category that was not seen during fit as an
# all-zero row instead of raising when held-out data is transformed.
cat_trans = Pipeline(steps=[
    ("selector", transformer.DataSelector("object")),
    ("one_hot", preprocessing.OneHotEncoder(handle_unknown="ignore")),
])

# Numeric branch (DataSelector("number") presumably keeps the numeric columns)
# followed by standardisation.
num_trans = Pipeline(steps=[
    ("selector", transformer.DataSelector("number")),
    ("scaler", StandardScaler()),
])

# NOTE: this single pre_pipe object is re-fitted on every dataset below.
pre_pipe = FeatureUnion(transformer_list=[
    ("cat", cat_trans),
    ("num", num_trans),
])


In [None]:
# Load the Adult training split. Column names are supplied explicitly, so
# pandas treats every line of the file as data.
filename = "Datasets/adult.data"
names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "class",
]
data = pd.read_csv(filename, header=None, names=names)
data.info()

In [None]:
# Copy of the raw training frame without duplicated rows (first occurrence is
# kept); the cell output is its (rows, columns) shape.
data_clean = data.drop_duplicates(keep="first")
data_clean.shape

In [None]:
# Prepare the training data: clean, encode the features, binarize the labels.
from sklearn.preprocessing import LabelBinarizer

# " ?" is the marker handed to DeleteNAN; its exact handling lives in
# transformer.py (presumably rows containing it are removed - confirm).
# NOTE(review): this works on `data`, not on the deduplicated `data_clean`
# computed in the previous cell - confirm that is intended.
del_nan = transformer.DeleteNAN(" ?")
data2 = del_nan.transform(data)

# Fit the shared preprocessing pipeline on the features only.
data_pre = pre_pipe.fit_transform(data2.drop("class", axis=1))

# The binarizer is created here because this cell is its first use; without
# this the cell raises NameError on a fresh kernel (Restart & Run All).
binarizer = LabelBinarizer()
labels = data2["class"].copy()
labels = binarizer.fit_transform(labels)


In [None]:
# Load and prepare the Adult test split with the transformers fitted on the
# training data (del_nan, binarizer and pre_pipe come from the cells above).

filename_test = "Datasets/adult.test"
names = ["age", "workclass", "fnlwgt","education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "class"]
# NOTE(review): some copies of adult.test start with a non-data header line;
# confirm this copy does not, or that DeleteNAN removes it.
test = pd.read_csv(filename_test, names=names)

test_clean = del_nan.transform(test)

# The labels in the test file end with "." (" <=50K." / " >50K."), as the
# replace() calls further down show. Assuming the training labels do not
# carry the dot (TODO confirm), LabelBinarizer would treat them as unseen
# classes and silently encode every test label as 0. Strip the dot on a copy so
# test_clean["class"] itself stays untouched for the later cells.
test_labels = test_clean["class"].str.rstrip(".")
test_labels = binarizer.transform(test_labels)

test_pre = pre_pipe.transform(test_clean.drop("class", axis=1))

In [None]:
# Fit a linear regression model on the encoded training set
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression().fit(data_pre, labels)

# In-sample error: RMSE of the predictions on the training data itself
predictions = lin_reg.predict(data_pre)
lin_mse = mean_squared_error(y_true=labels, y_pred=predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
# Out-of-sample error of the regression model: RMSE on the test split
predictions = lin_reg.predict(test_pre)
lin_mse = mean_squared_error(y_true=test_labels, y_pred=predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
# Prediction with a decision tree classifier

# A fixed random_state makes the fitted tree, and every fairness number
# derived from its predictions below, reproducible across kernel restarts.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(data_pre, labels)

tree_pred = tree.predict(test_pre)

# RMSE of the 0/1 predictions against the test labels. The lin_* names are
# reused from the regression cells and hold the tree's error from here on.
lin_mse = mean_squared_error(test_labels, tree_pred)
lin_rmse = np.sqrt(lin_mse)
lin_rmse


In [None]:
# Text classification report for the decision tree on the test split
tree_report = metrics.classification_report(test_labels, tree_pred)
print(tree_report)

In [None]:
# Prediction with a k-nearest-neighbours classifier (library defaults)
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier().fit(data_pre, labels)

# Predicted classes for the test split; shown as the cell output
knn_pred = knn.predict(test_pre)
knn_pred

In [None]:
# Text classification report for the k-NN predictions on the test split
knn_report = metrics.classification_report(test_labels, knn_pred)
print(knn_report)

In [None]:
from sklearn.model_selection import cross_val_score

# 3-fold cross-validated accuracy of the k-NN model on the training data
cross_val_score(estimator=knn, X=data_pre, y=labels, scoring="accuracy", cv=3)

In [None]:
# Sum over all cells of the k-NN confusion matrix on the test split
# (confusion_matrix is already imported in the first cell).
knn_confusion = confusion_matrix(y_true=test_labels, y_pred=knn_pred)
knn_confusion.sum()

In [None]:
# Attach the decision-tree predictions to the test frame and recode the raw
# label strings as 0/1 (in place, across the whole frame).
test_clean["prediction"] = tree_pred
for raw_label, code in ((" <=50K.", 0), (" >50K.", 1)):
    test_clean.replace(raw_label, code, inplace=True)

# Confusion matrix restricted to the rows whose "sex" is " Male"
is_male = test_clean["sex"] == " Male"
male_class = test_clean.loc[is_male, "class"]
male_pred = test_clean.loc[is_male, "prediction"]
c = confusion_matrix(male_class, male_pred)
c

In [None]:
# Swap in the k-NN predictions and make sure the labels are coded as 0/1
test_clean["prediction"] = knn_pred
for raw_label, code in ((" <=50K.", 0), (" >50K.", 1)):
    test_clean.replace(raw_label, code, inplace=True)

# Row masks for the two groups and for a positive prediction
is_male = test_clean["sex"] == " Male"
is_female = test_clean["sex"] == " Female"
predicted_positive = test_clean["prediction"] == 1

# Group sizes and number of positive predictions per group
mp = len(test_clean[is_male & predicted_positive])
fp = len(test_clean[is_female & predicted_positive])
m = len(test_clean[is_male])
f = len(test_clean[is_female])

# Share of positive predictions per group, and female share over male share
female_rate = fp / f
male_rate = mp / m
print("Female ratio: ", female_rate)
print("male ratio: ", male_rate)
print("general ratio", female_rate / male_rate)

In [None]:
# Fairness tester for the adult test frame (FairnessTester is already imported
# in the first cell). The arguments after the frame are the column name and the
# two group values, in the original order.
adult_groups = ("sex", " Male", " Female")
tester = FairnessTester(test_clean, *adult_groups)
tester.confusion_based()



In [None]:
# Result of the tester's confusion-based evaluation (presumably a dict, going
# by the method name). The spelling follows the call sites in this notebook.
confusion_metrics = tester.confuison_based_dic()
confusion_metrics

In [None]:
# Load the German credit data: space-separated, column names supplied here
# because the file is read without a header row.
filename = "Datasets/german.data"
names = [
    "status existing account", "duration", "credit history", "purpose",
    "credit amount", "savings", "employment since", "installment rate",
    "status/sex", "other debtors", "residence since", "property", "age",
    "installment plans", "housing", "num existing credits", "job",
    "no of pople liable", "telephone", "foreign worker", "class",
]
data = pd.read_csv(filename, sep=" ", header=None, names=names)


In [None]:
from sklearn.preprocessing import LabelBinarizer

# Rebind the notebook-level binarizer and fit it on the credit labels; the
# "class" column is overwritten with the encoded values.
binarizer = LabelBinarizer()
encoded_class = binarizer.fit_transform(data["class"])
data["class"] = encoded_class



In [None]:
from sklearn.model_selection import train_test_split

# Seeded split; both frames still contain the "class" column at this point.
train_data, test_data, train_class, test_class = train_test_split(
    data,
    data["class"],
    random_state=42,
)

# Fit the preprocessing on the training features only and reuse it for the
# test features. train_data is rebound to the transformed matrix, while
# test_data stays a DataFrame for the cells below.
train_data = pre_pipe.fit_transform(train_data.drop(columns="class"))
test_data_pre = pre_pipe.transform(test_data.drop(columns="class"))



In [None]:
# Decision tree on the German credit split. The fixed random_state (same seed
# as the train/test split) keeps the predictions reproducible between runs.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(train_data, train_class)

tree_pred = tree.predict(test_data_pre)

In [None]:
# Work on an explicit copy so the column assignments below cannot trigger
# SettingWithCopyWarning on the frame returned by train_test_split.
test_data = test_data.copy()
test_data["prediction"] = tree_pred

# Collapse the combined status/sex codes into "m" / "f". The result is assigned
# back to the column: replace(..., inplace=True) on the selected column is a
# chained assignment that does not update the frame under pandas Copy-on-Write.
test_data["status/sex"] = test_data["status/sex"].replace(
    {"A91": "m", "A93": "m", "A94": "m", "A92": "f", "A95": "f"}
)
test_data

In [None]:
# Fairness tester for the German credit test frame: column name followed by
# the two group values, in the original argument order.
german_groups = ("status/sex", "m", "f")
tester = FairnessTester(test_data, *german_groups)


In [None]:
# Result of the confusion-based evaluation for the German credit tester
# (presumably a dict, going by the method name).
confusion_metrics = tester.confuison_based_dic()
confusion_metrics

In [None]:
# Confusion matrix of the decision tree over the whole German credit test split
metrics.confusion_matrix(y_true=test_data["class"], y_pred=tree_pred)

In [None]:
# Load the credit-default spreadsheet with explicit column dtypes:
# X2..X11 are read as object, X1 and X12..X23 as int.
filename = "Datasets/default of credit.xls"
column_dtypes = {f"X{i}": object for i in range(2, 12)}
column_dtypes.update({f"X{i}": int for i in (1, *range(12, 24))})
data = pd.read_excel(filename, dtype=column_dtypes)
# The label column "Y" is not encoded here; an earlier attempt is kept below.
#data["Y"] = lb.fit_transform(data["Y"])

In [None]:
# rename() returns a new frame, so the result is assigned back; without the
# assignment the label column would still be called "Y" in the next cells.
data = data.rename(columns={"Y": "class"})
data

In [None]:
# Re-fit the shared preprocessing pipeline on the credit-default frame and
# rebind `data` to the transformed output.
# NOTE(review): the whole frame is passed in, so a label column, if present,
# is transformed along with the features - confirm that is intended.
transformed = pre_pipe.fit_transform(data)
data = transformed

In [None]:
# Display `data`, which the previous cell rebound to the pipeline output.
data

In [None]:
# Load the Ricci data and turn the combined score into a binary pass label.
filename = "Datasets/ricci.csv"
data_inp = pd.read_csv(filename).drop("Unnamed: 0", axis=1)
# Applicants with a combined score >= 70 pass.
# Reference: "Did the Results of Promotion Exams Have a Disparate Impact on
# Minorities? Using Statistical Evidence in Ricci v. DeStefano"
data_inp.rename(columns={"Combine": "class"}, inplace=True)

# Compute the flag once from the raw score. Writing 1 and then 0 in two passes
# needs a guard to avoid re-matching the freshly written 1s and leaves any
# score <= 1 unencoded.
# Assumes there are no missing scores (a NaN would become 0) - TODO confirm.
PASS_SCORE = 70
data_inp["class"] = (data_inp["class"] >= PASS_SCORE).astype(int)




In [None]:
# Scratch example: a results table holding a single row.
RESULT_COLUMNS = ["definition", "group", "tree"]

# One value per column. Passing the three values positionally to pd.Series
# would be read as (data, index, dtype) and raise.
insert = pd.Series(["test", "priv", 123], index=RESULT_COLUMNS)

# DataFrame.append was removed in pandas 2.0; build the frame from the row
# records in one go instead of growing it row by row.
frame = pd.DataFrame([insert], columns=RESULT_COLUMNS)
frame

In [None]:
# Empty results table (presumably one row per fairness definition and group).
# NOTE(review): the original line ended in a dangling `+` (a syntax error) and
# used the keyword `Columns` instead of `columns`. The list that was meant to
# be appended was never filled in, so it starts out empty here.
model_columns = []  # TODO: one column per evaluated model, e.g. "tree"
results = pd.DataFrame(columns=["definition", "group"] + model_columns)