In [3]:
import pandas as pd

In [4]:
# Step 1: Load the UCI "Adult" census-income data.
# The raw file has no header row, so the column names are supplied explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
                "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
                "hours-per-week", "native-country", "income"]
# Fields in adult.data are separated by ", " (comma + space). Without
# skipinitialspace=True every string value keeps a leading blank
# (" Private", " <=50K"), which silently breaks exact string comparisons.
data = pd.read_csv(url, names=column_names, skipinitialspace=True)

In [5]:
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [6]:
# Replace every categorical (string) column with numeric category codes.
from sklearn.preprocessing import OrdinalEncoder

categorical_features = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country",
]
encoder = OrdinalEncoder()
# Learn the category -> code mapping first, then overwrite the columns in place.
encoder.fit(data[categorical_features])
data[categorical_features] = encoder.transform(data[categorical_features])

In [7]:
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,7.0,77516,9.0,13,4.0,1.0,1.0,4.0,1.0,2174,0,40,39.0,<=50K
1,50,6.0,83311,9.0,13,2.0,4.0,0.0,4.0,1.0,0,0,13,39.0,<=50K
2,38,4.0,215646,11.0,9,0.0,6.0,1.0,4.0,1.0,0,0,40,39.0,<=50K
3,53,4.0,234721,1.0,7,2.0,6.0,0.0,2.0,1.0,0,0,40,39.0,<=50K
4,28,4.0,338409,9.0,13,2.0,10.0,5.0,2.0,0.0,0,0,40,5.0,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,4.0,257302,7.0,12,2.0,13.0,5.0,4.0,0.0,0,0,38,39.0,<=50K
32557,40,4.0,154374,11.0,9,2.0,7.0,0.0,4.0,1.0,0,0,40,39.0,>50K
32558,58,4.0,151910,11.0,9,6.0,1.0,4.0,4.0,0.0,0,0,40,39.0,<=50K
32559,22,4.0,201490,11.0,9,4.0,1.0,3.0,4.0,1.0,0,0,20,39.0,<=50K


In [8]:
# Binarise the target column: 1 for ">50K", 0 for everything else.
# The dtype guard makes this cell safe to re-run: once the column already
# holds integers, a per-value x.strip() would raise AttributeError.
if not pd.api.types.is_numeric_dtype(data["income"]):
    # Strip surrounding whitespace before comparing, then cast the boolean
    # mask to integers so the label column is numeric.
    data["income"] = data["income"].str.strip().eq(">50K").astype("int64")


In [9]:
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,7.0,77516,9.0,13,4.0,1.0,1.0,4.0,1.0,2174,0,40,39.0,0
1,50,6.0,83311,9.0,13,2.0,4.0,0.0,4.0,1.0,0,0,13,39.0,0
2,38,4.0,215646,11.0,9,0.0,6.0,1.0,4.0,1.0,0,0,40,39.0,0
3,53,4.0,234721,1.0,7,2.0,6.0,0.0,2.0,1.0,0,0,40,39.0,0
4,28,4.0,338409,9.0,13,2.0,10.0,5.0,2.0,0.0,0,0,40,5.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,4.0,257302,7.0,12,2.0,13.0,5.0,4.0,0.0,0,0,38,39.0,0
32557,40,4.0,154374,11.0,9,2.0,7.0,0.0,4.0,1.0,0,0,40,39.0,1
32558,58,4.0,151910,11.0,9,6.0,1.0,4.0,4.0,0.0,0,0,40,39.0,0
32559,22,4.0,201490,11.0,9,4.0,1.0,3.0,4.0,1.0,0,0,20,39.0,0


In [7]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
from sklearn.model_selection import train_test_split

features = data.drop(columns="income")
target = data["income"]
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)


In [8]:
X_train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
5514,33,2.0,198183,9.0,13,4.0,10.0,1.0,4.0,0.0,0,0,50,39.0
19777,36,4.0,86459,8.0,11,2.0,4.0,0.0,4.0,1.0,0,1887,50,39.0
10781,58,6.0,203039,6.0,5,5.0,3.0,1.0,4.0,1.0,0,0,40,39.0
32240,21,4.0,180190,8.0,11,2.0,5.0,0.0,4.0,1.0,0,0,46,39.0
9876,27,4.0,279872,15.0,10,0.0,8.0,1.0,4.0,1.0,0,0,40,39.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29802,47,4.0,359461,9.0,13,2.0,3.0,0.0,4.0,1.0,0,0,40,39.0
5390,31,4.0,147215,2.0,8,0.0,8.0,4.0,4.0,0.0,0,0,21,39.0
860,18,4.0,216284,1.0,7,4.0,1.0,3.0,4.0,0.0,0,0,20,39.0
15795,50,6.0,54261,11.0,9,2.0,5.0,0.0,4.0,1.0,0,0,84,39.0


In [9]:
X_test

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
14160,27,4.0,160178,15.0,10,0.0,1.0,1.0,4.0,0.0,0,0,38,39.0
27048,45,7.0,50567,11.0,9,2.0,4.0,5.0,4.0,0.0,0,0,40,39.0
28868,29,4.0,185908,9.0,13,2.0,4.0,0.0,2.0,1.0,0,0,55,39.0
5667,30,4.0,190040,9.0,13,4.0,7.0,1.0,4.0,0.0,0,0,40,39.0
7827,29,6.0,189346,15.0,10,0.0,3.0,1.0,4.0,1.0,2202,0,50,39.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1338,71,4.0,269708,9.0,13,0.0,13.0,3.0,4.0,0.0,2329,0,16,39.0
24534,55,2.0,253062,15.0,10,2.0,10.0,0.0,4.0,1.0,0,0,40,39.0
18080,47,4.0,354148,14.0,15,2.0,10.0,0.0,4.0,1.0,99999,0,48,39.0
10354,27,4.0,43652,9.0,13,4.0,1.0,1.0,4.0,0.0,0,0,40,39.0


In [10]:
y_train

5514       >50K
19777      >50K
10781     <=50K
32240     <=50K
9876      <=50K
          ...  
29802     <=50K
5390      <=50K
860       <=50K
15795     <=50K
23654     <=50K
Name: income, Length: 26048, dtype: object

In [11]:
y_test

14160     <=50K
27048     <=50K
28868      >50K
5667      <=50K
7827      <=50K
          ...  
1338      <=50K
24534      >50K
18080      >50K
10354     <=50K
24639     <=50K
Name: income, Length: 6513, dtype: object

In [12]:
# Define the similarity function
def similarity(case1, case2):
    """Return the fraction of positions at which two cases hold equal values.

    Both arguments are compared element-wise with ``==`` and the resulting
    boolean mask is averaged, so 1.0 means every attribute matches and 0.0
    means none do.
    """
    matches = case1 == case2
    return matches.mean()

In [13]:
# Define the case-based reasoning function
def case_based_reasoning(query, cases, k=5, similarity_fn=None):
    """Return the index labels of the ``k`` stored cases most similar to ``query``.

    Parameters
    ----------
    query : pandas.Series
        The case to look up; its index must match the columns of ``cases``.
    cases : pandas.DataFrame
        The case base, one row per stored case.
    k : int, default 5
        Number of nearest cases to return.
    similarity_fn : callable, optional
        ``similarity_fn(case_row, query) -> float``, applied row by row.
        When omitted, the score is the fraction of attributes equal to the
        query, computed for all rows in a single vectorised comparison
        instead of one Python call per row.

    Returns
    -------
    pandas.Index
        Labels of the ``k`` highest-scoring rows, best first; tied rows keep
        their order of appearance in ``cases``.
    """
    if similarity_fn is None:
        # Compare the query against every row at once (labels are matched
        # to the columns), then average the matches across each row.
        similarities = (cases == query).mean(axis=1)
    else:
        similarities = cases.apply(lambda row: similarity_fn(row, query), axis=1)
    nearest_neighbors = similarities.nlargest(k)
    return nearest_neighbors.index

In [14]:
# Classify every test case by majority vote among its nearest training cases.
predictions = []
for position in range(X_test.shape[0]):
    query = X_test.iloc[position]
    neighbors = case_based_reasoning(query, X_train)
    # Look up the labels of the retrieved cases and keep the most frequent one.
    neighbor_labels = y_train.loc[neighbors]
    predictions.append(neighbor_labels.mode()[0])

In [None]:
# Evaluate the accuracy of the predictions
# accuracy_score is not imported in any visible cell, so this cell would
# raise NameError; import it here, as the other cells do for sklearn helpers.
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")