In [36]:
import pandas as pd

In [37]:
# Step 1: Collect and preprocess your data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
                "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
                "hours-per-week", "native-country", "income"]
data = pd.read_csv(url, names=column_names)

In [38]:
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [39]:
# Ordinal encode categorical variables
from sklearn.preprocessing import OrdinalEncoder
categorical_features = ["workclass", "education", "marital-status", "occupation",
                        "relationship", "race", "sex", "native-country"]
encoder = OrdinalEncoder()
data[categorical_features] = encoder.fit_transform(data[categorical_features])

In [40]:
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,7.0,77516,9.0,13,4.0,1.0,1.0,4.0,1.0,2174,0,40,39.0,<=50K
1,50,6.0,83311,9.0,13,2.0,4.0,0.0,4.0,1.0,0,0,13,39.0,<=50K
2,38,4.0,215646,11.0,9,0.0,6.0,1.0,4.0,1.0,0,0,40,39.0,<=50K
3,53,4.0,234721,1.0,7,2.0,6.0,0.0,2.0,1.0,0,0,40,39.0,<=50K
4,28,4.0,338409,9.0,13,2.0,10.0,5.0,2.0,0.0,0,0,40,5.0,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,4.0,257302,7.0,12,2.0,13.0,5.0,4.0,0.0,0,0,38,39.0,<=50K
32557,40,4.0,154374,11.0,9,2.0,7.0,0.0,4.0,1.0,0,0,40,39.0,>50K
32558,58,4.0,151910,11.0,9,6.0,1.0,4.0,4.0,0.0,0,0,40,39.0,<=50K
32559,22,4.0,201490,11.0,9,4.0,1.0,3.0,4.0,1.0,0,0,20,39.0,<=50K


In [41]:
# Preprocessing: Convert categorical variables to numerical
data['income'] = data['income'].apply(lambda x: 1 if x.strip() == '>50K' else 0)


In [42]:
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,7.0,77516,9.0,13,4.0,1.0,1.0,4.0,1.0,2174,0,40,39.0,0
1,50,6.0,83311,9.0,13,2.0,4.0,0.0,4.0,1.0,0,0,13,39.0,0
2,38,4.0,215646,11.0,9,0.0,6.0,1.0,4.0,1.0,0,0,40,39.0,0
3,53,4.0,234721,1.0,7,2.0,6.0,0.0,2.0,1.0,0,0,40,39.0,0
4,28,4.0,338409,9.0,13,2.0,10.0,5.0,2.0,0.0,0,0,40,5.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,4.0,257302,7.0,12,2.0,13.0,5.0,4.0,0.0,0,0,38,39.0,0
32557,40,4.0,154374,11.0,9,2.0,7.0,0.0,4.0,1.0,0,0,40,39.0,1
32558,58,4.0,151910,11.0,9,6.0,1.0,4.0,4.0,0.0,0,0,40,39.0,0
32559,22,4.0,201490,11.0,9,4.0,1.0,3.0,4.0,1.0,0,0,20,39.0,0


In [43]:
# Split the data into training and testing sets

from sklearn.model_selection import train_test_split
X = data.drop('income', axis=1)  # Features
y = data['income']  # Target variable
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(X, y, test_size=0.2, random_state=42)


In [44]:
# Randomly sample a subset of cases for both training and test sets
import random

train_sample_size = 1000  # Adjust the training sample size as desired
test_sample_size = 500  # Adjust the test sample size as desired

random_train_indices = random.sample(range(len(X_train_full)), train_sample_size)
random_test_indices = random.sample(range(len(X_test_full)), test_sample_size)

X_train = X_train_full.iloc[random_train_indices]
y_train = y_train_full.iloc[random_train_indices]

X_test = X_test_full.iloc[random_test_indices]
y_test = y_test_full.iloc[random_test_indices]

In [45]:
X_train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
10554,49,4.0,298130,5.0,4,2.0,7.0,0.0,4.0,1.0,0,0,40,39.0
10285,39,4.0,196673,11.0,9,2.0,8.0,5.0,4.0,0.0,5013,0,40,39.0
9817,40,6.0,123306,14.0,15,2.0,10.0,0.0,4.0,1.0,0,0,45,39.0
13129,49,4.0,200949,0.0,6,4.0,8.0,4.0,4.0,0.0,0,0,38,29.0
26451,41,4.0,317539,15.0,10,2.0,3.0,0.0,2.0,1.0,0,0,40,39.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5964,34,6.0,90614,11.0,9,5.0,4.0,1.0,4.0,1.0,0,0,50,39.0
9425,54,4.0,37237,9.0,13,2.0,10.0,0.0,4.0,1.0,0,0,40,39.0
15307,39,4.0,237943,11.0,9,0.0,7.0,4.0,4.0,1.0,0,1726,40,39.0
10880,25,4.0,164938,11.0,9,4.0,7.0,3.0,4.0,1.0,4416,0,40,39.0


In [46]:
X_test

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
25616,42,4.0,183384,15.0,10,0.0,1.0,1.0,2.0,0.0,0,0,40,39.0
25385,47,4.0,298037,8.0,11,0.0,10.0,4.0,4.0,0.0,0,0,44,39.0
10228,25,7.0,117833,15.0,10,2.0,4.0,5.0,4.0,0.0,0,0,19,39.0
8868,40,4.0,390369,11.0,9,0.0,3.0,3.0,4.0,1.0,0,0,40,39.0
23531,25,4.0,248851,9.0,13,4.0,8.0,3.0,4.0,1.0,0,0,40,39.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23504,31,2.0,153005,9.0,13,2.0,10.0,0.0,4.0,1.0,0,0,40,39.0
1710,73,4.0,267408,11.0,9,6.0,12.0,2.0,4.0,0.0,0,0,15,39.0
5727,45,4.0,543922,12.0,14,0.0,14.0,1.0,4.0,1.0,14344,0,48,39.0
993,47,4.0,192835,11.0,9,2.0,1.0,0.0,4.0,1.0,0,0,50,39.0


In [47]:
y_train

10554    0
10285    0
9817     1
13129    0
26451    0
        ..
5964     0
9425     1
15307    0
10880    0
15812    1
Name: income, Length: 1000, dtype: int64

In [48]:
y_test

25616    0
25385    0
10228    0
8868     0
23531    0
        ..
23504    1
1710     0
5727     1
993      1
22994    0
Name: income, Length: 500, dtype: int64

In [49]:
# Define the similarity function
def similarity(case1, case2):
    return (case1 == case2).mean()

In [50]:
# Define the case-based reasoning function
def case_based_reasoning(query, cases, k=5):
    similarities = cases.apply(lambda x: similarity(x, query), axis=1)
    nearest_neighbors = similarities.nlargest(k)
    return nearest_neighbors.index

In [51]:
# Perform case-based reasoning on the test set
predictions = []
for i in range(len(X_test)):
    query = X_test.iloc[i]
    neighbors = case_based_reasoning(query, X_train)
    prediction = y_train.loc[neighbors].mode()[0]
    predictions.append(prediction)

In [56]:
# Evaluate the accuracy of the predictions
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.796


In [58]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.796
Precision: 0.5543478260869565
Recall: 0.45535714285714285
F1 Score: 0.5


In [None]:
print("aod :"+protected_attribute,measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'aod'))
print("eod :"+protected_attribute,measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'eod'))

print("SPD:",measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'SPD'))
print("DI:",measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'DI'))