In [1]:
import pandas as pd
import numpy as np
import warnings
import math
import time
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from scipy.stats import wasserstein_distance
from collections import defaultdict

warnings.filterwarnings('ignore')

In [2]:
def reading_adult_csv(path='adult.csv'):
    """Load the Adult census CSV and return a cleaned copy.

    Cleaning steps:
      * rows whose ``workclass``, ``occupation`` or ``native-country`` is the
        placeholder ``'?'`` (missing data) are removed;
      * the columns at positions 2 and 3 of the raw file are dropped. Per the
        original notes these are ``fnlwgt`` (continuous, too many distinct
        values for a tree) and ``education`` (redundant with the numeric
        education column);
      * the index is reset to 0..n-1.

    Parameters
    ----------
    path : str or path-like, default 'adult.csv'
        Location of the CSV file. The default preserves the original
        behaviour of reading ``adult.csv`` from the working directory.

    Returns
    -------
    pandas.DataFrame
        A new frame; nothing is modified in place.
    """
    raw = pd.read_csv(path)

    # '?' is how this dataset marks a missing value in these three columns.
    has_no_missing = (
        (raw['workclass'] != '?')
        & (raw['occupation'] != '?')
        & (raw['native-country'] != '?')
    )

    # Build the result with a non-mutating chain: dropping in place on a
    # filtered frame is what triggers pandas' SettingWithCopyWarning.
    return (
        raw.loc[has_no_missing]
        .drop(columns=raw.columns[[2, 3]])
        .reset_index(drop=True)
    )

## Atribuir pesos às classes em ordem crescente:
### Workclass
       (1) Without-pay          
       (2) Self-emp-not-inc     
       (3) Self-emp-inc         
       (4) Private              
       (5) Local-gov                   
       (6) State-gov                 
       (7) Federal-gov     

In [3]:
def updating_values(df):
    """Encode ``workclass`` ordinally and standardize the numeric attributes.

    The frame is expected to have the 13-column layout produced by
    ``reading_adult_csv``: numeric attributes at positions 0, 1, 2, 8, 9, 10
    and categorical attributes plus the label at positions 3, 4, 5, 6, 7, 11, 12.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned Adult data. Note: its ``workclass`` column is overwritten
        with the ordinal codes (same side effect as the original code).

    Returns
    -------
    pandas.DataFrame
        The six standardized numeric columns followed by the untouched
        categorical columns and the label, with a fresh 0..n-1 index.
    """
    # Ordinal weights for workclass, in the increasing order given in the
    # notebook text preceding this cell.
    workclass_rank = {
        'Without-pay': 1,
        'Self-emp-not-inc': 2,
        'Self-emp-inc': 3,
        'Private': 4,
        'Local-gov': 5,
        'State-gov': 6,
        'Federal-gov': 7,
    }
    # Assign the result back instead of `df['workclass'].replace(..., inplace=True)`:
    # the chained in-place form acts on a temporary under pandas Copy-on-Write
    # and would leave the strings in place.
    df['workclass'] = df['workclass'].replace(workclass_rank)

    numeric_positions = [0, 1, 2, 8, 9, 10]
    # Categorical attributes plus the label (last position); the label is
    # re-attached together with the categorical columns by the join below.
    categorical_positions = [3, 4, 5, 6, 7, 11, 12]
    numeric_names = ['age', 'workclass', 'educational-num',
                     'capital-gain', 'capital-loss', 'hours-per-week']

    categorical = df.iloc[:, categorical_positions].reset_index(drop=True)
    num_attr = df.iloc[:, numeric_positions].reset_index(drop=True)

    # Without standardization the L-infinity distance gets far too large for
    # attributes whose values differ by large amounts.
    scaler = StandardScaler()
    standardization = pd.DataFrame(scaler.fit_transform(num_attr), columns=numeric_names)

    return standardization.join(categorical)

In [4]:
# Load the cleaned Adult data and immediately encode/standardize it.
df = updating_values(reading_adult_csv())

In [5]:
# Preview the first five rows of the transformed frame.
df.head(5)

Unnamed: 0,age,workclass,educational-num,capital-gain,capital-loss,hours-per-week,marital-status,occupation,relationship,race,gender,native-country,income
0,-1.024983,-0.04462,-1.221559,-0.146733,-0.21878,-0.07812,Never-married,Machine-op-inspct,Own-child,Black,Male,United-States,<=50K
1,-0.041455,-0.04462,-0.438122,-0.146733,-0.21878,0.754701,Married-civ-spouse,Farming-fishing,Husband,White,Male,United-States,<=50K
2,-0.798015,1.012377,0.737034,-0.146733,-0.21878,-0.07812,Married-civ-spouse,Protective-serv,Husband,White,Male,United-States,>50K
3,0.412481,-0.04462,-0.046403,0.877467,-0.21878,-0.07812,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,United-States,>50K
4,-0.344079,-0.04462,-1.613277,-0.146733,-0.21878,-0.910942,Never-married,Other-service,Not-in-family,White,Male,United-States,<=50K


In [6]:
def distance_l_infty(dataset, ind_x, ind_y):
    """Return the L-infinity distance between two rows of ``dataset``.

    Numeric attributes contribute their difference; each categorical
    attribute contributes 0 when the two rows agree and 1 when they differ.
    The result is the largest absolute component.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the six numeric and six categorical columns named below.
    ind_x, ind_y : int
        Positional (``iloc``-style) row indices of the two records.

    Returns
    -------
    numpy.float64
        ``max(|component|)`` over all twelve attributes.
    """
    num_attr = ['age', 'workclass', 'educational-num',
                'capital-gain', 'capital-loss', 'hours-per-week']
    cat_attr = ['marital-status', 'occupation', 'relationship',
                'race', 'gender', 'native-country']

    # Pull only the two rows involved. Selecting the column subsets of the
    # whole frame first would copy them on every call, which dominates the
    # cost of a pairwise scan.
    row_x = dataset.iloc[ind_x]
    row_y = dataset.iloc[ind_y]

    num_diff = (row_x[num_attr].to_numpy(dtype=float)
                - row_y[num_attr].to_numpy(dtype=float))
    # Mismatch indicator per categorical attribute: 1.0 if different, else 0.0.
    cat_diff = (row_x[cat_attr].to_numpy() != row_y[cat_attr].to_numpy()).astype(float)

    return np.linalg.norm(np.concatenate([num_diff, cat_diff]), np.inf)

In [7]:
def bound_distance_d(dataset):
    """Find the closest pair of rows under ``distance_l_infty``.

    Every unordered pair ``(i, j)`` with ``i < j`` is compared, so the cost
    grows quadratically with ``len(dataset)``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame accepted by ``distance_l_infty``.

    Returns
    -------
    tuple
        ``(min_distance, (i, j))`` for the first pair attaining the minimum.
        When no pair yields a finite distance (e.g. fewer than two rows) the
        result is ``(np.inf, None)``.
    """
    # np.inf rather than np.infty: the latter alias was removed in NumPy 2.0.
    min_bound = np.inf
    # Initialised so the return below is always defined, even with no pairs.
    indexes = None
    n_rows = len(dataset)

    for i in range(n_rows - 1):
        for j in range(i + 1, n_rows):
            dist_i_j = distance_l_infty(dataset, i, j)
            # Strict comparison keeps the earliest pair on ties.
            if dist_i_j < min_bound:
                min_bound = dist_i_j
                indexes = (i, j)

    return min_bound, indexes

In [8]:
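# The pairwise scan compares every pair of rows, so it is only tried on the first 100 rows.
# Left commented out; the output shown below presumably comes from an earlier run of this call.
#bound_distance_d(df[:100])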

(0.07565598997235568, (44, 60))