<a href="https://colab.research.google.com/github/oliveirasWell/mlp/blob/master/RBF_Adult_income_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Imports
# Lib para balancear o data-set
library(groupdata2)
library(tidyr)

# Lib para plotar dados
install.packages("yardstick")
library(yardstick)
library(ggplot2)

# Lib para criar colunas 
install.packages("fastDummies")
library(fastDummies)

In [None]:
# Dataset
# Loads the Adult income data, encodes the target, imputes "?" markers,
# one-hot encodes the categorical features, standardises the features,
# balances the classes and produces X / Y (train) and X_test / Y_test (test).
dados <- read.csv("sample_data/adult.data", header = FALSE, strip.white = TRUE)
colnames(dados) <- c(
  "age",
  "workclass",
  "fnlwgt",
  "education",
  "education-num",
  "marital",
  "occupation",
  "relationship",
  "race",
  "sex",
  "capital-gain",
  "capital-loss",
  "hours-per-week",
  "country",
  "income"
)

# Encode the target so the model sees -1 (<=50K) and +1 (>50K).
# Any other label maps to NA, as the previous integer conversion did.
income_codes <- c("<=50K" = -1L, ">50K" = 1L)
dados$income <- unname(income_codes[as.character(dados$income)])

# Replace the "?" missing-value marker with the most frequent known value of
# each column. Note: base::mode() returns the storage mode ("character"), not
# the statistical mode, so it must not be used for this imputation.
impute_cols <- c(
  "workclass", "occupation", "relationship", "race", "sex", "marital",
  "country"
)
dados[impute_cols] <- lapply(dados[impute_cols], function(values) {
  values <- as.character(values)
  is_unknown <- !is.na(values) & values == "?"
  known <- values[!is_unknown & !is.na(values)]
  if (any(is_unknown) && length(known) > 0L) {
    values[is_unknown] <- names(which.max(table(known)))
  }
  values
})

# Create indicator (dummy) columns from the categorical features
dummy_source_cols <- c(
  "education", "occupation", "workclass", "relationship", "race", "sex",
  "marital"
)
dados <- dummy_cols(dados, select_columns = dummy_source_cols)

# Drop the original categorical columns, plus country, which is not encoded
# because it would generate more than 50 extra columns.
dados <- dados[, !(names(dados) %in% c(dummy_source_cols, "country")),
               drop = FALSE]

# Move the target to the last column: dummy columns first, then the remaining
# original columns ending with income. Positions are derived from the data
# instead of being hard-coded, so they stay correct if the number of dummy
# columns changes.
income_idx <- match("income", names(dados))
dummy_idx <- setdiff(seq_len(ncol(dados)), seq_len(income_idx))
dados <- dados[c(dummy_idx, seq_len(income_idx))]

str(dados)

# Standardise every feature (all columns except the trailing income column)
# to zero mean and unit standard deviation.
feature_idx <- seq_len(ncol(dados) - 1L)
dados[feature_idx] <- scale(dados[feature_idx])

# Balance the dataset by downsampling, since income is split roughly 75%/25%
dados <- balance(dados, "min", cat_col = "income")

# Shuffle the balanced rows so both splits mix the two classes
dados <- dados[sample(nrow(dados)), ]

# Train / test split.
# tamanho is half of the balanced rows (7841 for the full Adult training
# file); that half is then divided in two: first part train, second part test.
# NOTE(review): the previous `0:tamanho/2` and `tamanho/2+1:tamanho` parsed as
# `(0:tamanho)/2` and `tamanho/2 + (1:tamanho)` because `:` binds tighter than
# `/` and `+`, which duplicated training rows and shifted the test slice.
tamanho <- nrow(dados) %/% 2L
stopifnot(tamanho >= 2L)
n_train <- tamanho %/% 2L
train_data <- dados[seq_len(n_train), ]
test_data <- dados[seq(n_train + 1L, tamanho), ]

# Training data
Y <- as.matrix(train_data["income"])
X <- train_data
X$income <- NULL

# Test data
Y_test <- as.matrix(test_data["income"])
X_test <- test_data
X_test$income <- NULL

In [None]:
# Preview the first six rows of the training features
head(X, n = 6L)

In [None]:
# Count how many rows fall in each income class
table(dados[["income"]])

In [None]:
# Preview the first six training targets
head(Y, n = 6L)

In [None]:
# RBF with least squares and pseudoinverse
# Returns a RBF model, given:
# * training instances x1...xN
# * desired output values
# * number of centers
# * gamma value for the Gaussian function

# We will need the corpcor package for the pseudoinverse
install.packages('corpcor')
library(corpcor)

# RBF training function
#
# Fits a radial-basis-function network: k-means picks K centers, each center
# feeds a Gaussian hidden unit, and the output weights (bias first) are found
# by least squares through the pseudoinverse.
#
# Arguments:
#   X     - numeric matrix or data frame, one training instance per row
#   Y     - desired outputs, one row (or element) per training instance
#   K     - number of centers / hidden units
#   gamma - width used in the Gaussian as exp(-d^2 / (2 * gamma^2))
#
# Returns a list with:
#   weights - (K + 1) x ncol(Y) matrix; row 1 is the bias weight
#   centers - K x ncol(X) matrix of cluster centers
#   gamma   - the gamma value that was used
rbf <- function(X, Y, K=10, gamma=1.0) {

  # Work on plain matrices: row-wise indexing of a data frame is very slow
  X <- as.matrix(X)
  Y <- as.matrix(Y)
  stopifnot(
    is.numeric(X), is.numeric(Y), nrow(X) == nrow(Y),
    is.numeric(K), length(K) == 1L, K >= 1,
    is.numeric(gamma), length(gamma) == 1L, gamma != 0
  )

  N <- nrow(X) # number of training instances

  repeat {
    km <- kmeans(X, K) # cluster data into K clusters
    if (min(km$size) > 0) { # we can not allow empty clusters
      break
    }
  }

  mus <- km$centers # centers of the clusters (means)

  # Hidden-layer outputs: column 1 is the bias (all ones), column k + 1 holds
  # the Gaussian response of every instance to center k.
  exponent_scale <- -1 / (2 * gamma * gamma)
  Xt <- t(X) # features in rows, so a center is recycled down each column
  Phi <- matrix(1, nrow = N, ncol = K + 1)
  for (k in seq_len(K)) {
    sq_dist <- colSums((Xt - mus[k, ])^2)
    Phi[, k + 1] <- exp(exponent_scale * sq_dist)
  }

  # Least-squares weights: w = pinv(t(Phi) Phi) t(Phi) Y.
  # crossprod() avoids materialising t(Phi) and the (K + 1) x N intermediate.
  w <- pseudoinverse(crossprod(Phi)) %*% crossprod(Phi, Y)

  # Return the RBF model
  list(weights = w, centers = mus, gamma = gamma)
}


In [None]:
# Fit the RBF network on the training split with 80 centers; gamma keeps its
# default value
model <- rbf(X = X, Y = Y, K = 80)
print(model)

In [None]:
# Prediction function for a model returned by rbf()
#
# Arguments:
#   model          - list with weights (bias first), centers (one per row)
#                    and gamma
#   X              - numeric matrix or data frame, one instance per row, with
#                    the same number of columns as model$centers
#   classification - if TRUE, the sign function is applied to the outputs
#
# Returns a numeric vector with one prediction per row of X (length 0 when X
# has no rows).
rbf.predict <- function(model, X, classification=FALSE) {

  # Parameters
  gamma <- model$gamma
  centers <- as.matrix(model$centers)
  w <- as.numeric(model$weights)

  # Work on a plain matrix: row-wise indexing of a data frame is very slow
  X <- as.matrix(X)
  stopifnot(ncol(X) == ncol(centers))
  N <- nrow(X) # number of instances
  if (N == 0L) {
    return(numeric(0))
  }

  # Start from the bias weight, since its associated input is +1
  pred <- rep(w[1], N)

  # Add the contribution of each center for all instances at once; the weight
  # of center k is w[k + 1] because w[1] belongs to the bias.
  exponent_scale <- -1 / (2 * gamma * gamma)
  Xt <- t(X) # features in rows, so a center is recycled down each column
  for (k in seq_len(nrow(centers))) {
    sq_dist <- colSums((Xt - centers[k, ])^2)
    pred <- pred + w[k + 1] * exp(exponent_scale * sq_dist)
  }

  # For a classification task, apply the sign function to the output
  if (classification) {
    pred <- sign(pred)
  }

  # Return the predictions
  pred
}

In [None]:
# Classify every test instance (outputs passed through the sign function)
rbf.pred <- rbf.predict(model = model, X = X_test, classification = TRUE)

In [None]:
# Confusion matrix between predicted and observed classes
table(rbf.pred, Y_test)

predicted <- rbf.pred

# Build both factors from one shared level set. Converting each column
# separately yields different levels whenever a class is missing from one of
# them (e.g. the model predicts a single class), and conf_mat() requires the
# truth and estimate levels to match.
class_levels <- sort(unique(c(as.vector(Y_test), as.vector(predicted))))

truth_predicted <- data.frame(
  obs = factor(as.vector(Y_test), levels = class_levels),
  pred = factor(as.vector(predicted), levels = class_levels)
)

cm <- conf_mat(truth_predicted, obs, pred)
autoplot(cm, type = "heatmap") +
  scale_fill_gradient(low = "#D6EAF8", high = "#2E86C1") +
  theme(legend.position = "right")


In [None]:
# Mean error: fraction of test instances whose prediction differs from the
# observed class
n_misclassified <- sum(rbf.pred != Y_test)
error <- n_misclassified / nrow(Y_test)
print(error)