In [1]:
from data_loader import DataLoader
from sklearn.decomposition import PCA
from scipy.stats import multivariate_normal
import numpy as np
from src.pyvov import ChipsIndex

In [464]:
def get_sets():
    """
    This takes ALL of the available datapoints and makes train/val/test splits.
    Needs to be improved upon by looking at specific experiments e.g HOM36 etc.
    """
    
    data = DataLoader()
    X_train = data.get_training_set()
    X_train = np.asarray(X_train)
    
    X_val = data.get_validation_set()
    X_val = np.asarray(X_val)
    
    X_test = data.get_testing_set()
    X_test = np.asarray(X_test)
    
    labels = data.get_labels()
    labels = np.asarray(labels)
    
    new_labels = np.zeros_like(labels)
    for idx, item in enumerate(labels):
        if item > 0:
            new_labels[idx] += 1

    
    n = new_labels.shape[0]
    y_val = new_labels[0:int(0.1*n)]
    y_test = new_labels[int(0.1*n):int(0.2*n)]
    y_train = new_labels[int(0.2*n):]
    
    print('X_train shape:', X_train.shape, '   y_train shape:', y_train.shape)
    print('X_val shape:', X_val.shape, '   y_val shape:', y_val.shape)
    print('X_test shape:', X_test.shape, '   y_test shape:', y_test.shape)
    
    return X_train, y_train, X_val, y_val, X_test, y_test
            
def principal_components(X, y, n):
    
    positives = X[np.where(y>0)]
    
    pca = PCA(n_components=n)
    pcs = pca.fit(positives)
    
    return pca

def normalize(X):
    X = X.astype(float)
    new_X = np.zeros_like(X)
    
    for idx in range(X.shape[0]):
        new_X[idx] = X[idx] - np.mean(X[idx])
        new_X[idx] = new_X[idx]/np.std(X[idx])
        
    return new_X

In [465]:
class gaussian_clf():
    def __init__(self, threshold=0.5, normalize=False):
        self.threshold=threshold
        self.normalize=normalize
    
    def train(self, X_train, y_train):
        
        if self.normalize:
            X_train = normalize(X_train)
        
        self.pca = principal_components(X_train, y_train, 6)
        X_train = self.pca.transform(X_train)
        
        positives = X_train[np.where(y_train>0)]
        negatives = X_train[np.where(y_train==0)]
        assert positives.shape[0]+negatives.shape[0]==X_train.shape[0]
        
        self.mean_0 = negatives.mean(axis=0)
        self.cov_0 = np.cov(negatives.T)
        self.mean_1 = positives.mean(axis=0)
        self.cov_1 = np.cov(positives.T)
        
        self.prior = positives.shape[0] / (positives.shape[0] + negatives.shape[0])
    
    def predict_prob(self, X):
        self.score_0 = multivariate_normal.pdf(X, mean=self.mean_0, cov=self.cov_0)
        self.score_1 = multivariate_normal.pdf(X, mean=self.mean_1, cov=self.cov_1)
        posterior = self.score_1 * self.prior / (self.score_1 * self.prior + self.score_0 * (1-self.prior))
        
        return posterior
        
    def evaluate(self, X, y):
        if self.normalize:
            X = normalize(X)
        
        X = self.pca.transform(X)
        true_positives = 0
        true_negatives = 0
        false_positives = 0
        false_negatives = 0
        
        for idx in range(X.shape[0]):
            if self.predict_prob(X[idx]) > self.threshold:
                if y[idx]==1:
                    true_positives += 1
                else:
                    false_positives += 1
            
            else:
                if y[idx]==1:
                    false_negatives += 1
                else:
                    true_negatives += 1
        
        print('true_positives:', true_positives)
        print('true_negatives:', true_negatives)
        print("false_positives:", false_positives)
        print("false_negatives:", false_negatives)
        
        print("detected volcanoes:", true_positives/(true_positives+false_negatives))
    
    
        

In [361]:
X_train, y_train, X_val, y_val, X_test, y_test = get_sets()

X_train shape: (29824, 225)    y_train shape: (29824,)
X_val shape: (3728, 225)    y_val shape: (3728,)
X_test shape: (3728, 225)    y_test shape: (3728,)


In [371]:
baseline = gaussian_clf(threshold=0.35, normalize = True)
baseline.train(X_train, y_train)
baseline.evaluate(X_val, y_val)

true_positives: 17
true_negatives: 3552
false_positives: 39
false_negatives: 120


In [370]:
baseline = gaussian_clf(threshold=0.35, normalize = True)
baseline.train(X_train, y_train)
baseline.evaluate(X_val, y_val)

true_positives: 17
true_negatives: 3552
false_positives: 39
false_negatives: 120


In [250]:
baseline = gaussian_clf(threshold=0.5, normalize=True)
baseline.train(X_train, y_train)
baseline.evaluate(X_val, y_val)

true_positives: 0
true_negatives: 3588
false_positives: 3
false_negatives: 137


In [248]:
baseline = gaussian_clf(threshold=0.2, normalize=True)
baseline.train(X_train, y_train)
baseline.evaluate(X_val, y_val)

true_positives: 4
true_negatives: 3572
false_positives: 19
false_negatives: 133


In [163]:
a = np.random.gamma(3,3, size =(3,3))

In [165]:
a[2] - a[2].mean()

array([-0.07539789, -4.05138487,  4.12678276])

In [358]:
a = np.array([[1,2,3],[2,3,4]])

In [359]:
np.mean(a)

2.5

In [None]:

from random import shuffle
import numpy as np


class DataLoader:
    def __init__(self, experiment_names=['C1', 'D4'], val_ratio=0.1, test_ratio=0.1, seed=8):
        ci = ChipsIndex()

        # all_experiments = ci.experiments()
        # EXP_NAMES = ['A1', 'A2', 'A3', 'A4', 'B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'C1', 'D1', 'D2', 'D3', 'D4',
        # 'E1', 'E2', 'E3', 'E4', 'E5']
        # num_img = 0

        training_split = []
        testing_split = []
        all_labels = []

        # Obtain experiment data, combine C1 and D4 training and test sets:
        for EXP_NAME in experiment_names:
            training_split.extend(ci.training_split_for(EXP_NAME))
            testing_split.extend(ci.testing_split_for(EXP_NAME))
            labels = ci.labels_for(EXP_NAME)
            label_list = list(labels['trn'])
            label_list.extend(list(labels['tst']))
            all_labels.extend(label_list)

        training_split.extend(testing_split)
        full_dataset = training_split

        # Shuffle the data
        ordering = np.arange(len(full_dataset))
        rng = np.random.default_rng(seed=seed)
        rng.shuffle(ordering)
        full_dataset = [full_dataset[i] for i in ordering]
        self.all_labels = [all_labels[i] for i in ordering]

        # Create training, validation and test sets
        self.validation_set = full_dataset[0:int(val_ratio * (len(full_dataset)))]
        self.testing_set = full_dataset[
                           int(val_ratio * (len(full_dataset))):int((val_ratio + test_ratio) * (len(full_dataset)))]
        self.training_set = full_dataset[int((val_ratio + test_ratio) * (len(full_dataset))):]

    def get_training_set(self):
        return self.training_set

    def get_validation_set(self):
        return self.validation_set

    def get_testing_set(self):
        return self.testing_set

    def get_full_dataset(self):
        return self.training_set + self.validation_set + self.testing_set

    def get_labels(self):
        return self.all_labels

In [372]:
from src.pyvov import ChipsIndex
from random import shuffle
import numpy as np

In [375]:
training_split = []
testing_split = []
all_labels = []

ci = ChipsIndex()
experiment_names = ['C1']

# Obtain experiment data, combine C1 and D4 training and test sets:
for EXP_NAME in experiment_names:
    training_split.extend(ci.training_split_for(EXP_NAME))
    testing_split.extend(ci.testing_split_for(EXP_NAME))
    labels = ci.labels_for(EXP_NAME)
    label_list = list(labels['trn'])
    label_list.extend(list(labels['tst']))
    all_labels.extend(label_list)

training_split.extend(testing_split)
full_dataset = training_split

(225,)

In [388]:
len(ci.testing_split_for('C1'))

16608

In [495]:
def evaluate_experiment(exp_name='A1'):
    ci = ChipsIndex()

    X_train = np.array(ci.training_split_for(exp_name))
    X_test = np.array(ci.testing_split_for(exp_name))

    labels_train = np.array(ci.labels_for(exp_name)['trn'])
    labels_test = np.array(ci.labels_for(exp_name)['tst'])
    
    ## make labels 
    y_train = np.zeros_like(labels_train)
    for idx, item in enumerate(labels_train):
        if item > 0:
            y_train[idx] += 1

    y_test = np.zeros_like(labels_test)
    for idx, item in enumerate(labels_test):
        if item > 0:
            y_test[idx] += 1
    
    """
    set thresholds for evaluation as you like -- these are referred to as
    operating points in the paper.
    """
    baseline = gaussian_clf(threshold=0.99, normalize=False)
    baseline.train(X_train, y_train)
    baseline.evaluate(X_test, y_test)

In [501]:
evaluate_experiment('A1')

true_positives: 3
true_negatives: 413
false_positives: 4
false_negatives: 27
detected volcanoes: 0.1


In [502]:
evaluate_experiment('A2')

true_positives: 5
true_negatives: 371
false_positives: 1
false_negatives: 28
detected volcanoes: 0.15151515151515152


In [503]:
evaluate_experiment('A3')

true_positives: 18
true_negatives: 239
false_positives: 0
false_negatives: 45
detected volcanoes: 0.2857142857142857


In [504]:
evaluate_experiment('A4')

true_positives: 8
true_negatives: 385
false_positives: 1
false_negatives: 18
detected volcanoes: 0.3076923076923077


In [505]:
30+33+45+18+8+18

152

In [3]:
ci = ChipsIndex()

In [4]:
data, labels = ci.get_specific('A1')

In [6]:
len(labels)

1626

In [7]:
len(data)

1626

In [2]:
data = DataLoader('A1')

In [3]:
data

<data_loader.DataLoader at 0x1a1b7d7910>

In [52]:
data = DataLoader()
X_train, y_train = data.get_training_set()
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [53]:
len(X_train)

30196

In [54]:
len(y_train)

30196

In [59]:
mask = np.all(X_train == 0, axis=1)

In [60]:
X_train.shape

(30196, 225)

In [61]:
np.sum(mask)

array([False, False, False, ..., False, False, False])