In [2]:
# Cálculo científico y vectorial para python
import numpy as np
# Libreria para graficos
from matplotlib import pyplot
# Modulo de optimizacion en scipy
from scipy import optimize
# modulo para cargar archivos en formato MATLAB
from scipy.io import loadmat
import pandas as pd

In [3]:
# cargar a una variable el dataset
data = pd.read_csv("tortuga_pre.csv")
data
# columnas
# Unnamed: 0,HOURS_DATASCIENCE,HOURS_BACKEND,HOURS_FRONTEND,NUM_COURSES_BEGINNER_DATASCIENCE,NUM_COURSES_BEGINNER_BACKEND,NUM_COURSES_BEGINNER_FRONTEND,NUM_COURSES_ADVANCED_DATASCIENCE,NUM_COURSES_ADVANCED_BACKEND,NUM_COURSES_ADVANCED_FRONTEND,AVG_SCORE_DATASCIENCE,AVG_SCORE_BACKEND,AVG_SCORE_FRONTEND,PROFILE


Unnamed: 0,28,7.0,39.0,29.0,2.0,4.0,0.0,2.0.1,5.0,0.0.1,84.0,74.0,0.0.2,1
0,81,32.0,0.0,44.0,2.0,0.0,0.0,0.0,5.0,0.0,67.0,45.0,0.0,1
1,89,45.0,0.0,59.0,0.0,5.0,4.0,0.0,4.0,1.0,0.0,54.0,47.0,2
2,138,36.0,19.0,28.0,0.0,5.0,7.0,0.0,5.0,3.0,0.0,71.0,89.0,5
3,143,61.0,78.0,38.0,6.0,11.0,0.0,4.0,3.0,0.0,66.0,85.0,0.0,2
4,169,24.0,69.0,68.0,3.0,7.0,0.0,4.0,5.0,0.0,66.0,75.0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19994,20495,0.0,44.0,42.0,8.0,4.0,7.0,3.0,3.0,3.0,74.0,73.0,93.0,4
19995,20496,0.0,85.0,63.0,3.0,5.0,0.0,4.0,7.0,3.0,50.0,83.0,94.0,2
19996,20497,32.0,50.0,22.0,0.0,4.0,0.0,6.0,2.0,3.0,61.0,81.0,75.0,4
19997,20498,0.0,96.0,69.0,3.0,3.0,3.0,5.0,7.0,4.0,64.0,68.0,68.0,2


In [4]:
# num de clases 
num_labels = 6
# leemos el dataset
data = np.loadtxt('tortuga_pre.csv', delimiter=',')
# sacamos la cantidad de filas
num_filas = data.shape[0]
# sacamos el 70% de todo el dataset para el entrenamiento
filas_entrenamiento = int(round(num_filas * 0.7))
# cargamos las X's y la Y de entrenamiento
X, y = data[:filas_entrenamiento, 1:13], data[:filas_entrenamiento, 13]

print(X)
print(y)
# num de ejemplos
m = y.size

[[ 7. 39. 29. ... 84. 74.  0.]
 [32.  0. 44. ... 67. 45.  0.]
 [45.  0. 59. ...  0. 54. 47.]
 ...
 [65. 71. 28. ... 73. 46. 59.]
 [31. 32. 24. ... 86. 78. 80.]
 [62. 11. 37. ... 59. 83. 79.]]
[1. 1. 2. ... 3. 4. 2.]


In [5]:
# imprimimos las dimensiones de las X's y la Y
print(X.shape)
print(y.shape)

(14000, 12)
(14000,)


In [6]:
# funcion para la normalizacion de caracteristicas
def  featureNormalize(X):
    X_norm = X.copy()
    media_norm = np.zeros(X.shape[1])
    sigma = np.zeros(X.shape[1])

    media_norm = np.mean(X, axis = 0)
    sigma = np.std(X, axis = 0)
    X_norm = (X - media_norm) / sigma
    
    return X_norm, media_norm, sigma

In [7]:
# normalizamos las X's de entrenamiento
X_norm, media_norm, sigma = featureNormalize(X)

In [8]:
X = X_norm
# X = X_datos
print(X[0])

[-1.35899504 -0.19233581 -0.38080342 -0.84810837  0.14046872 -1.75900593
 -0.65807396  0.30048131 -1.69014949  1.26541478  0.47820878 -4.25275223]


In [9]:
# funcion sigmoide, devuelve una probabilidad
def sigmoid(z):
    """
    Calcula la sigmoide de z.
    """
    return 1.0 / (1.0 + np.exp(-z))

In [10]:
# probamos la funcion sigmoide
h = sigmoid(0)
print(h)

0.5


In [11]:
# funcion de costo
def lrCostFunction(theta, X, y, lambda_):
    # Inicializa algunos valores utiles
    m = y.size
    # print(m)
    # convierte las etiquetas a valores enteros si son boleanos
    if y.dtype == bool:
        y = y.astype(int)
    
    J = 0
    grad = np.zeros(theta.shape)
    
    h = sigmoid(X.dot(theta.T))
    
    temp = theta
    temp[0] = 0
    
    J = (1 / m) * np.sum(-y.dot(np.log(h)) - (1 - y).dot(np.log(1 - h))) + (lambda_ / (2 * m)) * np.sum(np.square(temp))
    
    grad = (1 / m) * (h - y).dot(X) 
    grad = grad + (lambda_ / m) * temp

    return J, grad

In [12]:
# algoritmo one vs all
def oneVsAll(X, y, num_labels, lambda_):
    # algunas variables utiles
    m, n = X.shape
    
    all_theta = np.zeros((num_labels, n + 1))

    # Agrega unos a la matriz X
    X = np.concatenate([np.ones((m, 1)), X], axis=1)

    for c in np.arange(num_labels):
        initial_theta = np.zeros(n + 1)
        options = {'maxiter': 1500}
        res = optimize.minimize(lrCostFunction, 
                                initial_theta, 
                                (X, (y == c), lambda_), 
                                jac=True, 
                                method='CG',
                                options=options) 
        
        all_theta[c] = res.x

    return all_theta

In [13]:
# sacamos todas las thetas, las 6 thetas
lambda_ = 0.1
all_theta = oneVsAll(X, y, num_labels, lambda_)
# mostramos las dimensiones
print(all_theta.shape)

(6, 13)


In [14]:
# funcion para predecir
def predictOneVsAll(all_theta, X):
    m = X.shape[0];
    num_labels = all_theta.shape[0]

    p = np.zeros(m)

    # Add ones to the X data matrix
    X = np.concatenate([np.ones((m, 1)), X], axis=1)
    p = np.argmax(sigmoid(X.dot(all_theta.T)), axis = 1)

    return p

In [None]:
# rescatamos en 30 por cierto del dataset global
X_p , y_p = data[filas_entrenamiento:, 1:13], data[filas_entrenamiento:, 13]

# normalizamos las X's de entrenamiento
X_norm_p, media_norm_p, sigma_p = featureNormalize(X)

In [17]:
# hacemos las pruebas con los datos de prueba
print(X_p.shape)
pred = predictOneVsAll(all_theta, X_p)
print('Precision del conjuto de entrenamiento: {:.2f}%'.format(np.mean(pred == y_p) * 100))
XPrueba = X_p[1011:1012, :].copy()
print(XPrueba.shape)

XPrueba = np.concatenate([np.ones((1, 1)), XPrueba], axis=1)
print(XPrueba.shape)
p = np.argmax(sigmoid(XPrueba.dot(all_theta.T)), axis = 1)
print('Perfil',p)
print('Perfil esperado', y_p[1011])

(6000, 12)
Precision del conjuto de entrenamiento: 31.93%
(1, 12)
(1, 13)
Perfil [3]
Perfil esperado 6.0
