In [1]:
import pandas as pd 
import numpy as np
import math
import heapq
from sklearn.model_selection import train_test_split

### Extracción de datos: dataset en https://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength

In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls"
X = pd.read_excel(url)
y = X.iloc[:,-1] # la variable independiente se encuentra en la ultima columna
X.drop(X.columns[-1], axis='columns', inplace=True)
X.columns = range(X.shape[1])
print(X)
print(y)

          0      1      2      3     4       5      6    7
0     540.0    0.0    0.0  162.0   2.5  1040.0  676.0   28
1     540.0    0.0    0.0  162.0   2.5  1055.0  676.0   28
2     332.5  142.5    0.0  228.0   0.0   932.0  594.0  270
3     332.5  142.5    0.0  228.0   0.0   932.0  594.0  365
4     198.6  132.4    0.0  192.0   0.0   978.4  825.5  360
...     ...    ...    ...    ...   ...     ...    ...  ...
1025  276.4  116.0   90.3  179.6   8.9   870.1  768.3   28
1026  322.2    0.0  115.6  196.0  10.4   817.9  813.4   28
1027  148.5  139.4  108.6  192.7   6.1   892.4  780.0   28
1028  159.1  186.7    0.0  175.6  11.3   989.6  788.9   28
1029  260.9  100.5   78.3  200.6   8.6   864.5  761.5   28

[1030 rows x 8 columns]
0       79.986111
1       61.887366
2       40.269535
3       41.052780
4       44.296075
          ...    
1025    44.284354
1026    31.178794
1027    23.696601
1028    32.768036
1029    32.401235
Name: Concrete compressive strength(MPa, megapascals) , Length: 1030,

Defina la proporción de datos para el conjunto de entrenamiento

In [3]:
prop = float(input("Proporcion de datos para el conjunto de entrenamiento: p = ")) # proporcion del conjunto de entrenamiento
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=prop) #, random_state=0)

Proporcion de datos para el conjunto de entrenamiento: p = 0.6


### Distancia Minkowski de $x,y$ vectores en $\mathbb{R}^n$

In [4]:

def MinkowskiDistance(x,y, p=2): # Distancia Minkowski de x,y vectores en R^n
  dist = 0
  n = len(x)

  for i in range(0,n):
    dist += abs(x[i]-y[i])**p

  dist = dist**(1/p)

  return dist


### Implementación del KNN para regresión

In [5]:
def prediccion(k_nearest): # Obtener el promedio de las etiquetas en k_nearest
  pred = 0
  k = len(k_nearest)

  for i in range(k):
    pred += k_nearest[i][1]

  pred = pred/k
  
  return pred

In [6]:
def KNN(k, X_train, y_train, X_test):
  X_train = X_train.to_numpy()
  y_train = y_train.to_numpy()
  X_test = X_test.to_numpy()

  n = X_test.shape[0]
  m = X_train.shape[0] 

  labeling = []

  for i in range(n):
    u = X_test[i]
    pq = [] 
    heapq.heapify(pq) # Cola de prioridad para guardar los pares (distancia(u,p_i), y_i), donde p_i es el i-esimo vector prototipo con etiqueta y_i

    for j in range(m):
      xj = X_train[j]
      dist = MinkowskiDistance(u, xj)
      heapq.heappush(pq,(dist, y_train[j]))
    
    k_nearest = list(heapq.nsmallest(k, pq)) # Obtener las k menores distancias
    labeling.append(prediccion(k_nearest)) 

  return pd.DataFrame(labeling)


### Función del error cuadrático medio

In [7]:
def MSE(pred, real):

  pred = pred.to_numpy()
  real = real.to_numpy()

  n = len(real)

  mse = 0

  for i in range(n):
    mse += (pred[i]-real[i])**2

  mse = mse/n

  return mse


### Regresión de los datos usando KNN

Ingresar k

In [8]:
k = int(input("k = "))

k = 6


Conjunto de entrenamiento

In [9]:
labels_train = KNN(k, X_train, y_train, X_train)
mse_train = MSE(labels_train, y_train)
print("Predicciones: ")
print(labels_train)
print("Valores reales")
print(y_train)
print("Error cuadrático medio")
print(mse_train)

Predicciones: 
             0
0    42.991127
1    21.012930
2    26.962304
3    54.298533
4    39.005841
..         ...
613  35.961919
614  30.676396
615  46.535033
616  48.665514
617  13.570726

[618 rows x 1 columns]
Valores reales
537     43.942374
440     24.848715
48      26.258004
98      55.599345
1018    37.265488
          ...    
853     40.058556
638     38.210760
343     56.743875
137     71.298713
656     11.414275
Name: Concrete compressive strength(MPa, megapascals) , Length: 618, dtype: float64
Error cuadrático medio
[60.49615201]


Conjunto de prueba

In [10]:
labels_test = KNN(k, X_train, y_train, X_test)
mse_test = MSE(labels_test, y_test)
print("Predicciones: ")
print(labels_test)
print("Valores reales")
print(y_test)
print("Error cuadrático medio")
print(mse_test)

Predicciones: 
             0
0    43.848375
1    48.665514
2    24.569707
3    44.482693
4    32.266098
..         ...
407  44.034534
408  31.203386
409  74.063512
410  24.940645
411  21.012930

[412 rows x 1 columns]
Valores reales
263    47.739318
109    55.895819
654    24.290929
83     35.301171
449    35.852752
         ...    
525    25.116626
435    41.203086
177    79.296635
266    38.500340
199    15.044366
Name: Concrete compressive strength(MPa, megapascals) , Length: 412, dtype: float64
Error cuadrático medio
[88.31675493]


### Comparación con la implemetación del KNN en ScikitLearn

Implementación propia para k=1,2,..., 9.

In [11]:
mse_var = []
for k in range(1, 10):
  labels_test = KNN(k, X_train, y_train, X_test)
  mse_var.append((MSE(labels_test, y_test), k))

sorted(mse_var)

[(array([86.25044849]), 3),
 (array([88.31675493]), 6),
 (array([88.69923699]), 4),
 (array([89.07941715]), 2),
 (array([89.63770761]), 5),
 (array([92.11292062]), 7),
 (array([92.51329186]), 8),
 (array([96.16762841]), 9),
 (array([108.53844062]), 1)]

Implementación en ScikitLearn para k=1,2,..., 9.

In [12]:
from sklearn.neighbors import KNeighborsRegressor

mse_var = []
for k in range(1, 10):
  neigh = KNeighborsRegressor(n_neighbors=6)
  neigh.fit(X_train, y_train)
  labels_test = pd.Series(neigh.predict(X_test))
  mse_test = MSE(labels_test, y_test)
  mse_var.append((mse_test,k))

sorted(mse_var)

[(88.31470460640202, 1),
 (88.31470460640202, 2),
 (88.31470460640202, 3),
 (88.31470460640202, 4),
 (88.31470460640202, 5),
 (88.31470460640202, 6),
 (88.31470460640202, 7),
 (88.31470460640202, 8),
 (88.31470460640202, 9)]