In [1]:
import numpy as np
import metnum
from scoring import cross_validation as cv
from scoring import metrics

In [2]:
import pandas as pd

df_train = pd.read_csv("../data/train.csv")
df_train.sample(frac=1)
#df_train = df_train[:1000]

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
41022,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
954,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39473,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4450,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21254,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16218,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29271,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3613,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16517,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34089,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
X = df_train[df_train.columns[1:]].values
y = df_train["label"].values.reshape(-1, 1)

## Cross validation ##

In [4]:
%%time
import os
import numpy as np

class MatrixFSDict:
    """Mantiene un FS dict de matrices de numpy"""

    def __init__(self, d):
        self.dir = d

    def load(self, name: str) -> np.ndarray:
        path = os.path.join(self.dir, name)
        #print("loading:", path)
        return np.loadtxt(path, delimiter=',')
    
    def save(self, name: str, X: np.ndarray):
        path = os.path.join(self.dir, name)
        print("storing:", path)
        return np.savetxt(path, X, delimiter=",")
    
    def contains(self, name: str):
        return os.path.exists(os.path.join(self.dir, name))


CPU times: user 40 µs, sys: 5 µs, total: 45 µs
Wall time: 49.8 µs


In [5]:
DIR = "pca-cache"
MAX_ALPHA = 101

class CachedPCA:
    def __init__(self, alpha: int):
        assert(alpha < MAX_ALPHA)
        self.pca = metnum.PCA(alpha)
        self.cache = MatrixFSDict(DIR)
        self.alpha = alpha
        
    def fit(self, X, fold: int):
        name = "fold_{}".format(fold)
        if self.cache.contains(name):
            # HIT!
            tc = self.cache.load(name)
            # verificar!!!!!!!!!!!
            self.pca.set_tc(tc[:self.alpha])
            return
        
        # MISS :(
        max_pca = metnum.PCA(MAX_ALPHA)
        max_pca.fit(X)
        tc = max_pca.get_tc()
        self.cache.save(name, tc)
    
        self.pca.set_tc(tc[:self.alpha+1])
        
    def transform(self, X):
        return self.pca.transform(X)

class KNNnPCAClassifier:
    def __init__(self, k, alpha):
        self.knn_classifier = metnum.KNNClassifier(k, "uniform")
        self.pca = CachedPCA(alpha)

    def fit(self, X_train, y_train, fold=None):
        assert(fold != None)
        self.pca.fit(X_train, fold)
        self.knn_classifier.fit(self.pca.transform(X_train), y_train)
    
    def predict(self, X):
        return self.knn_classifier.predict(self.pca.transform(X))

In [6]:
"""%%time
clf = KNNnPCAClassifier(10,20)
mean = cv.cross_validate_pca(clf, X, y, metrics.accuracy_score, 4, True)"""

'%%time\nclf = KNNnPCAClassifier(10,20)\nmean = cv.cross_validate_pca(clf, X, y, metrics.accuracy_score, 4, True)'

In [8]:
import metnum

import numpy.random as rn
import numpy as np

from scoring import cross_validation as cv
from scoring import metrics

k_range = (1, 75)
alpha_range = (15, 75)
T = 100

## Funciones parametro de simm ann

def clip(k, alpha) -> (int, int):
    """Le pasas el state y lo pone en el rango valido"""
    a, b = k_range
    c, d = alpha_range
    
    return max(a,min(b,int(k))), max(c, min(d, int(alpha)))

def random_start() -> (int,int):
    """Elige un state start aleatorio"""
    a, b = k_range
    c, d = alpha_range
    
    rnd_k = a + (b - a) * rn.random_sample()
    rnd_alpha = c + (d - c) * rn.random_sample() 
    return clip(5, 34)

def random_neighbour(state, fraction=0.0) -> (int, int):
    """
    Varia un poco el estado actual para moverse en el espacio
    y probar una solucion distinta.
    """
    a, b = k_range
    c, d = alpha_range
    #new_k = actual_k
    
    new_state = state
    
    steps = 0
    while new_state == state and fraction != 1.0 and steps < 10:
        steps+=1
        
        dice = int(2*rn.random_sample())
        delta_k = int((b - a) * (0.5-rn.random_sample()) * (1.0-fraction))
        delta_alpha = int((d - c) * (0.5-rn.random_sample()) * (1.0-fraction))
        new_k, new_alpha = new_state
        
        if dice == 0:
            new_k += delta_k
        else:
            new_alpha += delta_alpha
            
        new_state = clip(new_k, new_alpha)
        
    return new_state

def acceptance_probability(cost, new_cost, temperature):
    """Probabilidad de aceptar una solucion como la nueva mejor"""
    if new_cost > cost: # cost en nuestro caso va a ser scoring
        # Si la solucion nueva es mejor que la anterior
        # la tomamos como la nueva mejor siempre.
        return 1
    
    if temperature == 0.0:
        temperature = 0.00001
    p = np.exp(- (0.8*T)*( cost - new_cost) / temperature)
    return p

def change_temperature(fraction):
    """Cuanto estas buscando soluciones"""
    return T*(1-fraction)

In [8]:
from metaheuristics import simulated_annealing as sa

K = 4

def scoring(state):
    # classifier a optimiazr
    k, alpha = state
    clf = KNNnPCAClassifier(k, alpha)
    accuracy = cv.cross_validate_pca(clf, X, y, metrics.accuracy_score, K)
    return accuracy

hist = []

best_state = sa.annealing(
    random_start,
    scoring,
    random_neighbour,
    acceptance_probability,
    change_temperature,
    max_state_reset_steps=20,
    max_steps=100,
    debug=True,
    history = hist
)

  0%|          | 0/99 [00:00<?, ?it/s]

initial: state = (5, 34), score = 0.9749285714285714


 20%|██        | 20/99 [45:43<2:45:08, 125.43s/it]

State reset


 40%|████      | 40/99 [1:21:59<2:11:20, 133.57s/it]

State reset


 61%|██████    | 60/99 [1:59:55<1:03:10, 97.20s/it]

State reset


 81%|████████  | 80/99 [2:24:28<23:37, 74.61s/it]

State reset


 86%|████████▌ | 85/99 [2:30:50<17:35, 75.42s/it]

#85 Found better solution [k = (6, 34), score = 0.9752857142857143]


100%|██████████| 99/99 [2:48:58<00:00, 102.41s/it]

Best solution: state = (6, 34), score = 0.9752857142857143





In [9]:
df = pd.DataFrame(hist, columns=["step", "state", "score", "best"])
df

Unnamed: 0,step,state,score,best
0,0,"(5, 34)",0.974929,False
1,1,"(5, 34)",0.974929,False
2,2,"(5, 50)",0.974762,False
3,3,"(5, 43)",0.974429,False
4,4,"(14, 43)",0.973048,False
5,5,"(1, 43)",0.970881,False
6,6,"(1, 69)",0.969929,False
7,7,"(1, 67)",0.970286,False
8,8,"(1, 75)",0.970071,False
9,9,"(11, 75)",0.972095,False


In [11]:
df.to_csv("data/pca/K_4_pca_weights2.csv", index=False)

In [None]:
K_FOLD_VALUES = [2,4,6,8,10]
X_SLICES = [1000,5000,10000,25000,42000]

for k in K_FOLD_VALUES:
    for x in X_SLICES:
        