In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [43]:
# Step 1: Collect and preprocess your data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
                "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
                "hours-per-week", "native-country", "income"]
data = pd.read_csv(url, names=column_names)

In [44]:
# Preprocessing steps
# Drop unnecessary columns
data = data.drop(columns=["fnlwgt", "education-num"])


In [46]:
# Ordinal encode categorical variables
categorical_features = ["workclass", "education", "marital-status", "occupation",
                        "relationship", "race", "sex", "native-country"]
encoder = OrdinalEncoder()
data[categorical_features] = encoder.fit_transform(data[categorical_features])


In [47]:
data

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,7.0,9.0,4.0,1.0,1.0,4.0,1.0,2174,0,40,39.0,<=50K
1,50,6.0,9.0,2.0,4.0,0.0,4.0,1.0,0,0,13,39.0,<=50K
2,38,4.0,11.0,0.0,6.0,1.0,4.0,1.0,0,0,40,39.0,<=50K
3,53,4.0,1.0,2.0,6.0,0.0,2.0,1.0,0,0,40,39.0,<=50K
4,28,4.0,9.0,2.0,10.0,5.0,2.0,0.0,0,0,40,5.0,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,4.0,7.0,2.0,13.0,5.0,4.0,0.0,0,0,38,39.0,<=50K
32557,40,4.0,11.0,2.0,7.0,0.0,4.0,1.0,0,0,40,39.0,>50K
32558,58,4.0,11.0,6.0,1.0,4.0,4.0,0.0,0,0,40,39.0,<=50K
32559,22,4.0,11.0,4.0,1.0,3.0,4.0,1.0,0,0,20,39.0,<=50K


In [48]:
# Separate meta features and target variable
meta_features = data[["age", "sex"]]
target = data["income"]

In [50]:
target

0         <=50K
1         <=50K
2         <=50K
3         <=50K
4         <=50K
          ...  
32556     <=50K
32557      >50K
32558     <=50K
32559     <=50K
32560      >50K
Name: income, Length: 32561, dtype: object

In [24]:
# Step 2: Define meta features
# Assume age and gender are the relevant meta features
meta_features = data[["age", "sex"]]

In [25]:
meta_features

Unnamed: 0,age,sex
0,39,Male
1,50,Male
2,38,Male
3,53,Male
4,28,Female
...,...,...
32556,27,Female
32557,40,Male
32558,58,Female
32559,22,Male


In [51]:
# Step 2: Define meta features
# Assume age and gender are the relevant meta features
age = data["age"]
gender = data["sex"]

In [53]:
gender

0        1.0
1        1.0
2        1.0
3        1.0
4        0.0
        ... 
32556    0.0
32557    1.0
32558    0.0
32559    1.0
32560    0.0
Name: sex, Length: 32561, dtype: float64

In [54]:
# Step 3: Analyze and quantify bias
# Calculate the distribution of income level based on meta features
bias_scores = []
for meta_feature in [age, gender]:
    bias_score = meta_feature.value_counts(normalize=True)
    bias_scores.append(bias_score)

# Print bias scores
for i, meta_feature in enumerate(["age", "gender"]):
    print(f"Bias score for {meta_feature}:")
    print(bias_scores[i])
    print()

Bias score for age:
36    0.027579
31    0.027272
34    0.027210
23    0.026934
35    0.026903
        ...   
83    0.000184
88    0.000092
85    0.000092
86    0.000031
87    0.000031
Name: age, Length: 73, dtype: float64

Bias score for gender:
1.0    0.669205
0.0    0.330795
Name: sex, dtype: float64



In [57]:
# Step 4: Design case-based reasoning system
X_train, X_test, y_train, y_test = train_test_split(meta_features, target, test_size=0.2, random_state=42)

k = 5
nn_model = KNeighborsClassifier(n_neighbors=k)
nn_model.fit(X_train, y_train)

KNeighborsClassifier()

In [59]:

# Step 5: Case adaptation and modification
new_instances = [
    [25, "Male"],  # Example new instance 1
    [40, "Female"]  # Example new instance 2
]

# Encode categorical features of new instances
categorical_features = [1]  # Specify the indices of the categorical features
encoder = OrdinalEncoder()
new_instances_encoded = encoder.fit_transform(new_instances)
new_instances_encoded = new_instances_encoded.astype(int)  # Convert to integer

modified_cases = []
for instance_encoded in new_instances_encoded:
    meta_feature_new = instance_encoded  # Extract meta features from the new instance

    distances, indices = nn_model.kneighbors([meta_feature_new])
    similar_cases = X_train.iloc[indices.flatten()]  # Retrieve similar cases based on meta features

    for _, case in similar_cases.iterrows():
        # Apply your bias reduction technique to modify the case
        modified_case = case.copy()  # Create a copy of the case
        # Modify the necessary attributes to reduce bias (e.g., adjust features, balance representation)
        modified_case["age"] += 5  # Adjust the age by adding 5
        modified_cases.append(modified_case)

# Convert modified cases to a numpy array for further processing
modified_cases = np.array(modified_cases)

# Use modified_cases for further processing or making predictions




In [60]:
modified_cases

array([[22.,  1.],
       [22.,  1.],
       [22.,  1.],
       [22.,  1.],
       [22.,  1.],
       [22.,  0.],
       [22.,  0.],
       [22.,  0.],
       [22.,  0.],
       [22.,  0.]])

In [61]:
# Step 6: Evaluate and iterate
predictions = nn_model.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

       <=50K       0.80      0.85      0.82      4942
        >50K       0.41      0.33      0.37      1571

    accuracy                           0.72      6513
   macro avg       0.61      0.59      0.60      6513
weighted avg       0.71      0.72      0.71      6513

