In [40]:
import numpy as np
import metnum
from scoring import cross_validation as cv
from scoring import metrics

In [41]:
import pandas as pd

# Load the training set; the preview shown below lists a "label" column
# followed by pixel0..pixel783.
df_train = pd.read_csv("../data/train.csv")
# Displays a shuffled copy only: the result of `sample` is not assigned, so
# `df_train` itself is left as loaded.
# NOTE(review): if shuffling before cross-validation was intended, assign the
# result and pass a fixed random_state — the PCA cache further down is keyed
# by fold index only, so a different row order would make cached files stale.
df_train.sample(frac=1)
# Optional subsampling for faster experiments (currently disabled).
#df_train = df_train[:5000]

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
3691,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18272,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1620,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13903,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18352,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3123,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26170,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27815,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37900,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18315,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Feature matrix: every column after the first one (the first column is
# "label" in the preview of df_train).
feature_columns = df_train.columns[1:]
X = df_train[feature_columns].values
# Targets as a single-column 2-D array, one row per sample.
y = df_train["label"].values.reshape(-1, 1)

## Cross validation ##

In [33]:
%%time
import os
import numpy as np

class MatrixFSDict:
    """Filesystem-backed dictionary of NumPy matrices.

    Every entry is stored as a comma-separated text file whose name is the
    entry key, inside the directory given at construction time.
    """

    def __init__(self, d):
        """Remember `d` as the directory holding the stored matrices.

        Nothing is created on disk here; the directory is created lazily by
        `save`.
        """
        self.dir = d

    def _path(self, name: str) -> str:
        """Return the file path used for the entry called `name`."""
        return os.path.join(self.dir, name)

    def load(self, name: str) -> np.ndarray:
        """Read and return the matrix stored under `name`.

        Use `contains` first to check that the entry exists.
        """
        return np.loadtxt(self._path(name), delimiter=',')

    def save(self, name: str, X: np.ndarray):
        """Write matrix `X` under `name`, creating the directory if needed.

        Returns whatever `np.savetxt` returns.
        """
        path = self._path(name)
        # np.savetxt does not create missing parent directories, so the very
        # first save into a fresh cache directory would otherwise fail.
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        print("storing:", path)
        return np.savetxt(path, X, delimiter=",")

    def contains(self, name: str):
        """Return True when a file for `name` exists in the directory."""
        return os.path.exists(self._path(name))


CPU times: user 68 µs, sys: 3 µs, total: 71 µs
Wall time: 86.3 µs


In [34]:
# Directory where the per-fold PCA matrices are cached.
DIR = "pca-cache"
# Number of components computed (and cached) once per fold; smaller alphas
# reuse a prefix of that cached matrix.
MAX_ALPHA = 100

class CachedPCA:
    """PCA wrapper that caches the per-fold matrix returned by `get_tc` on disk.

    On a cache miss a `metnum.PCA(MAX_ALPHA)` is fitted and its `get_tc()`
    result is stored under the fold name; afterwards (and on every cache hit)
    the first `alpha` rows of that matrix are handed to the wrapped
    `metnum.PCA(alpha)` through `set_tc`.

    NOTE(review): the cache key is only the fold index. It does not capture
    the data or the number of folds, so the cache directory has to be cleared
    by hand whenever either changes.
    """

    def __init__(self, alpha: int):
        """Create the wrapper for `alpha` components (at most MAX_ALPHA)."""
        # alpha == MAX_ALPHA is valid: the cached matrix holds exactly
        # MAX_ALPHA components, and the search range reaches that value.
        assert alpha <= MAX_ALPHA, "alpha must not exceed MAX_ALPHA"
        self.pca = metnum.PCA(alpha)
        self.cache = MatrixFSDict(DIR)
        self.alpha = alpha

    def fit(self, X, fold: int):
        """Load or compute the matrix for `fold` and install its first `alpha` rows.

        `X` is only used on a cache miss.
        """
        name = "fold_{}".format(fold)
        if self.cache.contains(name):
            # Cache hit: reuse the matrix computed on an earlier run.
            tc = self.cache.load(name)
        else:
            # Cache miss: fit once with the maximum number of components so
            # that every smaller alpha can reuse the stored result.
            max_pca = metnum.PCA(MAX_ALPHA)
            max_pca.fit(X)
            tc = max_pca.get_tc()
            self.cache.save(name, tc)

        # Both paths must install the same slice; previously the miss path
        # used alpha + 1 rows while the hit path used alpha rows.
        # Assumes `tc` stores one component per row — TODO confirm against
        # metnum.PCA.
        self.pca.set_tc(tc[:self.alpha])

    def transform(self, X):
        """Delegate to the wrapped PCA's `transform`."""
        return self.pca.transform(X)

class KNNnPCAClassifier:
    """kNN classifier that operates on PCA-transformed features.

    Combines a `metnum.KNNClassifier` with a `CachedPCA`; inputs are passed
    through the PCA transform before being given to the kNN classifier.
    """

    def __init__(self, k, alpha):
        """Build the kNN classifier for `k` and the cached PCA for `alpha`."""
        # "distance_pow" is passed through to metnum.KNNClassifier; its
        # meaning is defined there.
        self.knn_classifier = metnum.KNNClassifier(k, "distance_pow")
        self.pca = CachedPCA(alpha)

    def fit(self, X_train, y_train, fold=None):
        """Fit the PCA for `fold`, then fit kNN on the transformed training set.

        `fold` is mandatory even though it has a default: it is forwarded to
        the PCA's `fit`, which uses it to name the cached entry.
        """
        assert fold is not None, "fold is required"
        self.pca.fit(X_train, fold)
        self.knn_classifier.fit(self.pca.transform(X_train), y_train)

    def predict(self, X):
        """Transform `X` with the fitted PCA and return the kNN predictions."""
        return self.knn_classifier.predict(self.pca.transform(X))

In [35]:
"""%%time
clf = KNNnPCAClassifier(10,20)
mean = cv.cross_validate_pca(clf, X, y, metrics.accuracy_score, 4, True)"""

'%%time\nclf = KNNnPCAClassifier(10,20)\nmean = cv.cross_validate_pca(clf, X, y, metrics.accuracy_score, 4, True)'

In [36]:
import metnum

import numpy.random as rn
import numpy as np

from scoring import cross_validation as cv
from scoring import metrics

# Inclusive (min, max) bounds for the first state component, k.
k_range = (1, 100)
# Inclusive (min, max) bounds for the second state component, alpha.
alpha_range = (1, 100)
# Starting temperature; change_temperature scales it by (1 - fraction).
# NOTE(review): the recorded scores are accuracies below 1, so score
# differences are tiny compared with T = 100 and exp(-diff / T) stays close
# to 1 for most of the run — presumably T should be on the scale of the
# score differences; confirm.
T = 100

## Callback functions passed as parameters to simulated annealing

def clip(k, alpha) -> (int, int):
    """Force a state into the valid search box.

    Both components are truncated with `int` and then clamped to the
    inclusive bounds `k_range` and `alpha_range` respectively.

    Returns the clamped `(k, alpha)` pair.
    """
    k_low, k_high = k_range
    alpha_low, alpha_high = alpha_range

    # Cap at the upper bound first, then lift to the lower bound.
    capped_k = min(k_high, int(k))
    capped_alpha = min(alpha_high, int(alpha))
    return max(k_low, capped_k), max(alpha_low, capped_alpha)

def random_start() -> (int,int):
    """Pick a random starting state inside the search box.

    One `rn.random_sample()` draw is made per component (k first, then
    alpha), scaled onto the component's range, and the pair is passed
    through `clip`.
    """
    def draw(bounds):
        # Map one random_sample() draw linearly onto [low, high].
        low, high = bounds
        return low + (high - low) * rn.random_sample()

    start_k = draw(k_range)
    start_alpha = draw(alpha_range)
    return clip(start_k, start_alpha)

def random_neighbour(state, fraction=0.0) -> (int, int):
    """Return a state near `state`, to try a different solution.

    Each component is moved by a random integer step of at most half of its
    range, scaled by `(1 - fraction)`, and the result is passed through
    `clip`. Draws that leave the state unchanged are retried.

    Args:
        state: current `(k, alpha)` pair.
        fraction: shrink factor for the step size; presumably the elapsed
            fraction of the annealing run in [0, 1] — confirm against the
            caller. With `fraction == 1.0` the state is returned unchanged.

    Returns:
        A `(k, alpha)` pair different from `state` whenever one exists
        inside the search box, otherwise `state` itself.
    """
    a, b = k_range
    c, d = alpha_range

    # With a zero step size no move is possible; keep the state as is.
    if fraction == 1.0:
        return state

    k, alpha = state
    shrink = 1.0 - fraction
    max_tries = 100

    for _ in range(max_tries):
        delta_k = int((b - a) * (0.5 - rn.random_sample()) * shrink)
        delta_alpha = int((d - c) * (0.5 - rn.random_sample()) * shrink)
        candidate = clip(k + delta_k, alpha + delta_alpha)
        if candidate != state:
            return candidate

    # When `fraction` is close to 1 both deltas truncate to 0 on every draw,
    # so retrying forever would never terminate. Fall back to a single unit
    # step in one randomly chosen direction that actually changes the state.
    unit_moves = [
        move
        for move in (
            clip(k + 1, alpha),
            clip(k - 1, alpha),
            clip(k, alpha + 1),
            clip(k, alpha - 1),
        )
        if move != state
    ]
    if not unit_moves:
        # The search box is a single point: there is no neighbour to move to.
        return state
    return unit_moves[rn.randint(len(unit_moves))]

def acceptance_probability(cost, new_cost, temperature):
    """Probability of accepting a candidate solution as the current one.

    Here "cost" is a score, so a larger value is better.

    Args:
        cost: score of the current solution.
        new_cost: score of the candidate solution.
        temperature: current annealing temperature.

    Returns:
        1 when the candidate is strictly better; otherwise
        `exp(-(cost - new_cost) / temperature)`. For a non-positive
        temperature the limit of that expression as the temperature goes to
        zero is returned: 1.0 for a tie and 0.0 for a worse candidate.
    """
    if new_cost > cost:
        # A better candidate is always accepted.
        return 1

    if temperature <= 0:
        # Avoid dividing by zero once the system has fully cooled down.
        return 1.0 if new_cost == cost else 0.0

    return np.exp(-(cost - new_cost) / temperature)

def change_temperature(fraction):
    """Temperature for the given `fraction`: T scaled by `(1 - fraction)`.

    The temperature controls how willing the search still is to explore.
    """
    remaining = 1 - fraction
    return T * remaining


In [37]:
from metaheuristics import simulated_annealing as sa

# Passed as the last argument of cv.cross_validate_pca (presumably the number
# of cross-validation folds — confirm in scoring.cross_validation).
K = 4

def scoring(state):
    """Objective for the annealing search.

    Builds a fresh `KNNnPCAClassifier` from the `(k, alpha)` state and
    returns the value of `cv.cross_validate_pca` on `X`, `y` with
    `metrics.accuracy_score` and `K`.
    """
    n_neighbours, n_components = state
    return cv.cross_validate_pca(
        KNNnPCAClassifier(n_neighbours, n_components),
        X,
        y,
        metrics.accuracy_score,
        K,
    )

# Run the simulated-annealing search over (k, alpha) states with the
# callbacks defined in this notebook. `history` is tabulated afterwards with
# the columns step / state / score / best.
best_state, history = sa.annealing(
    random_start,             # produces the initial state
    scoring,                  # objective: larger is better
    random_neighbour,         # proposes the next candidate state
    acceptance_probability,   # decides whether a candidate is accepted
    change_temperature,       # cooling schedule
    # NOTE(review): the meaning of the keyword arguments below is defined in
    # metaheuristics.simulated_annealing — confirm there.
    max_state_reset_steps=50,
    max_steps=200,
    debug=True,
)

storing: pca-cache/fold_0
storing: pca-cache/fold_1
storing: pca-cache/fold_2
storing: pca-cache/fold_3


  0%|          | 0/29 [00:00<?, ?it/s]

initial: state = (33, 31), score = 0.9374


100%|██████████| 29/29 [01:31<00:00,  3.14s/it]

Best solution: state = (33, 31), score = 0.9374





In [38]:
# Tabulate the annealing history under the column names step, state, score
# and best.
df = pd.DataFrame(history, columns=["step", "state", "score", "best"])

In [39]:
# Persist the history without the index column.
# NOTE(review): the path is relative to the working directory, and the "K_4"
# in the file name is hard-coded rather than derived from K — keep the two
# in sync when K changes.
df.to_csv("data/pca/K_4_pca_distance_pow.csv", index=False)

Unnamed: 0,step,state,score,best
0,0,"(33, 31)",0.9374,False
1,1,"(33, 31)",0.9374,False
2,2,"(26, 74)",0.928,False
3,3,"(65, 49)",0.923,False
4,4,"(70, 42)",0.9236,False
5,5,"(84, 21)",0.9254,False
6,6,"(100, 1)",0.2654,False
7,7,"(100, 29)",0.9252,False
8,8,"(80, 1)",0.2654,False
9,9,"(67, 1)",0.2654,False
