In [1]:
from src.models import GKR

In [2]:
import pandas as pd
## Define path data
columns = [
    "age",
    "workclass", 
    "fnlwgt", 
    "education", 
    "education_num", 
    "marital", 
    "occupation", 
    "relationship", 
    "race", 
    "sex", 
    "capital_gain", 
    "capital_loss", 
    "hours_week", 
    "native_country", 
    "label"
]
path = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
path_test ="https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"

## Import 			
df_train = pd.read_csv(path, skipinitialspace=True, names = columns, index_col=False)
df_test = pd.read_csv(path_test,skiprows = 1, skipinitialspace=True, names = columns, index_col=False)

In [3]:
df_train.shape

(32561, 15)

In [4]:
df_test.shape

(16281, 15)

In [5]:
df_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital,occupation,relationship,race,sex,capital_gain,capital_loss,hours_week,native_country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
label = {'<=50K': 0,'>50K': 1}
df_train.label = [label[item] for item in df_train.label]
label_t = {'<=50K.': 0,'>50K.': 1}
df_test.label = [label_t[item] for item in df_test.label]			
df_train.shape	

(32561, 15)

In [7]:
df_train

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital,occupation,relationship,race,sex,capital_gain,capital_loss,hours_week,native_country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0


In [8]:
COLUMNS_INT = ["age","fnlwgt","education_num","capital_gain", "capital_loss", "hours_week"]
CATE_FEATURES = ["workclass", "education", "marital", "occupation", "relationship", "race", "sex", "native_country"]
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing	
import numpy as np		

def prep_data_str(df):			    
    scaler = StandardScaler()    
    le = preprocessing.LabelEncoder()       
    df_toscale = df[COLUMNS_INT]    
    df_scaled = scaler.fit_transform(df_toscale.astype(np.float64))    
    X_1 = df[CATE_FEATURES].apply(le.fit_transform)    
    y = df['label'].astype(np.int32)    
    X_conc = np.c_[df_scaled, X_1].astype(np.float32)				
    return X_conc, y

In [9]:
X_train, y_train = prep_data_str(df_train)
X_test, y_test = prep_data_str(df_test)
print(X_train.shape)			

(32561, 14)


In [10]:
gkr = GKR(X_train, y_train, 0.5)

In [11]:
gkr.predict(X_train[2]), y_train[2]

(0, 0)

In [12]:
gkr.predict(X_test[0]), y_test[0]

(0, 0)

In [13]:
gkr.predict(X_test[1]), y_test[1]

(0, 0)

In [14]:
gkr.predict(X_test[2]), y_test[2]

(0, 1)

In [15]:
gkr.predict(X_test[3]), y_test[3]

(1, 1)

In [17]:
predict, y, acc = gkr.score(X_test[:10], y_test[:10])

acc

100%|██████████| 10/10 [00:00<00:00, 1799.20it/s]


0.9

In [23]:
np.array(predict)

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0])

In [22]:
y.values

array([0, 0, 1, 1, 0, 0, 0, 1, 0, 0], dtype=int32)

In [34]:
import pandas as pd

data = {'y_Actual':    y.values,
        'y_Predicted': np.array(predict)
        }

df = pd.DataFrame(data, columns=['y_Actual','y_Predicted'])

confusion_matrix = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'])
print (confusion_matrix)

Predicted  0  1
Actual         
0          7  0
1          1  2


In [35]:
confusion_matrix.values

array([[7, 0],
       [1, 2]])

In [51]:
import plotly.figure_factory as ff
import plotly.offline as py
py.init_notebook_mode(connected=True)
names = ["Falso", "Verdadeiro"]
fig = ff.create_annotated_heatmap(confusion_matrix.values, x = names, y = names)
fig.update_layout(yaxis = dict(categoryorder = 'category descending'), title="Matriz de Confusão")
fig.update_yaxes(title="Valor Real")
fig.update_xaxes(title="Valor Predito")

fig.show()

In [32]:
acc = (confusion_matrix[0][0] + confusion_matrix[1][1]) / (confusion_matrix[0][0] + confusion_matrix[1][1] + confusion_matrix[0][1] + confusion_matrix[1][0])
precision = confusion_matrix[0][0] / (confusion_matrix[0][0] + confusion_matrix[0][1])
recall = confusion_matrix[0][0] / (confusion_matrix[0][0] + confusion_matrix[1][0])

acc, precision, recall

(0.9, 0.875, 1.0)

In [31]:
# Accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_true, y_pred)
# Recall
from sklearn.metrics import recall_score
recall_score(y_true, y_pred, average=None)
# Precision
from sklearn.metrics import precision_score
precision_score(y_true, y_pred, average=None)

7

In [26]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y.values, np.array(predict))

array([[7, 0],
       [1, 2]])

In [None]:
mse = np.mean((np.array(0.5101443474802214) - np.array(1)) ** 2)
mae = np.mean(np.abs(np.array(0.5101443474802214) - np.array(1)))

mse, mae

In [None]:
mse = np.mean((np.array(0.17726072811588295) - np.array(0)) ** 2)
mae = np.mean(np.abs(np.array(0.17726072811588295) - np.array(0)))

mse, mae

In [None]:
#Cross Entropy Loss

def cross_entropy(y,y_pre):
  loss=-np.sum(y*np.log(y_pre))
  return loss/float(y_pre.shape[0])



y=np.array([0,1]) #class #2

y_pre_good=np.array([1-0.5101443474802214, 0.5101443474802214])

l1=cross_entropy(y,y_pre_good)

print('Loss 1:',l1)


In [None]:
y=np.array([1,0]) #class #2

y_pre_good=np.array([1-0.17726072811588295, 0.17726072811588295])

l1=cross_entropy(y,y_pre_good)

print('Loss 1:',l1)
