<a href="https://colab.research.google.com/github/nelly-hateva/tardis/blob/master/notebooks/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Prerequisites: access to the data in Google Drive

In [None]:
from google.colab import drive

# Location inside the Colab runtime where Google Drive gets mounted.
MOUNT_POINT = "/content/drive/"
# Root directory for the data; later cells build their paths by appending to it.
DATA_DIR = MOUNT_POINT + "My Drive/Colab Notebooks/Thesis-Data"
drive.mount(MOUNT_POINT)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


Imports

In [None]:
import copy
import math
import os
import pickle
import random
import statistics
import time
from collections import defaultdict, Counter

import numpy
import torch
import torch.utils.data as tud
from torch import nn, optim

Set random seed and default dtype to double

In [None]:
def seed_torch(seed=666):
  """Seed every random number generator used in this notebook.

  Seeds Python's `random`, NumPy, PyTorch (CPU and the current CUDA device),
  exports PYTHONHASHSEED and switches cuDNN to its deterministic mode.

  Args:
    seed: integer seed shared by all generators.
  """
  os.environ['PYTHONHASHSEED'] = str(seed)

  seeders = (
    random.seed,
    numpy.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
  )
  for apply_seed in seeders:
    apply_seed(seed)

  # Trade cuDNN auto-tuning for reproducible kernels.
  cudnn = torch.backends.cudnn
  cudnn.deterministic = True
  cudnn.benchmark = False

# Seed all RNGs with the default seed, then make float64 the default
# floating-point dtype for tensors created from here on.
seed_torch()
torch.set_default_dtype(torch.float64) 

Define Pytorch Dataset

In [None]:
class NLDataset(tud.Dataset):
  """Dataset of padded symbol sequences with their lengths and labels.

  All three arrays are converted to int64 tensors and moved to `device` once,
  at construction time, so that batches need no further transfer.
  """

  def __init__(self, dataset, device=None):
    """
    Args:
      dataset: triple `(data, length, labels)` as returned by `load_data`.
      device: target device for the tensors. When omitted, CUDA is used if it
        is available (the previous hard-coded behaviour) and the CPU otherwise.

    Raises:
      ValueError: if the three arrays do not contain the same number of samples.
    """
    data, length, labels = dataset

    if device is None:
      device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.data = torch.tensor(data).long().to(device)
    self.length = torch.tensor(length).long().to(device)
    self.labels = torch.tensor(labels).long().to(device)

    # A mismatch would otherwise only surface later as an index error mid-epoch.
    if not (len(self.data) == len(self.length) == len(self.labels)):
      raise ValueError(
        "data, length and labels must have the same number of samples"
      )

  def __len__(self):
    """Number of samples."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return the sample at `idx` as a dict with keys 'x', 'length' and 'y'."""
    return {
      'x': self.data[idx],
      'length': self.length[idx],
      'y': self.labels[idx]
    }

Define the model

In [None]:
class NLNN(nn.Module):
  """Single-layer LSTM binary classifier over sequences of symbol ids.

  Pipeline: embedding (learned, or frozen one-hot) -> LSTM -> linear layer with
  two outputs -> softmax. Symbol id 0 is reserved for padding.

  NOTE(review): `forward` returns softmax probabilities, while `train_model`
  feeds them to `nn.CrossEntropyLoss`, which expects unnormalised logits.
  The behaviour is kept unchanged here; confirm that this is intended.
  """

  def __init__(self, params):
    """
    Args:
      params: dict with
        'alphabet_size'   - number of distinct symbols (padding excluded),
        'hidden_lstm_dim' - size of the LSTM hidden state,
        'embedding_size'  - optional; when present a trainable embedding of
                            that size is used, otherwise a frozen one-hot
                            encoding.
    """
    super(NLNN, self).__init__()

    # + 1 because of the padding with 0s
    num_embeddings = params['alphabet_size'] + 1

    if 'embedding_size' in params:
      self.embeddings = nn.Embedding(
        num_embeddings,
        params['embedding_size'],
        padding_idx = 0
      )
      self.embeddings.weight.data.uniform_(-0.05, 0.05)
      embedding_dim = params['embedding_size']
    else:
      # one hot encoding
      self.embeddings = nn.Embedding(
          num_embeddings,
          num_embeddings,
          padding_idx = 0
      )
      self.embeddings.weight.data = torch.eye(num_embeddings)
      self.embeddings.weight.requires_grad = False
      embedding_dim = num_embeddings

    self.lstm = nn.LSTM(
      input_size=embedding_dim,
      hidden_size=params['hidden_lstm_dim'],
      batch_first=True,
    )

    self.linear = nn.Linear(
      in_features=params['hidden_lstm_dim'],
      out_features=2,
      bias=True
    )

    self.softmax = nn.Softmax(dim=1)

  def _run_lstm(self, x, length, state=None):
    """Embed `x`, pack it by `length` and run the LSTM.

    Args:
      x: batch-first tensor of symbol ids.
      length: true length of every sequence in the batch.
      state: optional `(h_0, c_0)` initial state; zeros when omitted.

    Returns:
      `(packed_outputs, (h_t, c_t))` exactly as returned by `nn.LSTM`.
    """
    embeddings = self.embeddings(x)

    # pack_padded_sequence requires the lengths to live on the CPU in recent
    # PyTorch releases, while the datasets in this notebook keep them on the GPU.
    if isinstance(length, torch.Tensor):
      length = length.cpu()

    packed = nn.utils.rnn.pack_padded_sequence(
      embeddings, length, batch_first=True, enforce_sorted=False
    )
    if state is None:
      return self.lstm(packed)
    return self.lstm(packed, state)

  def forward(self, x, length):
    """Return the class probabilities (batch, 2) for every sequence."""
    _, (ht, _) = self._run_lstm(x, length)

    lstm_out = ht[-1,:,:] # get the last hidden state of the outmost layer

    return self.softmax(self.linear(lstm_out))

  def lstm_hidden_states(self, x, length, *args):
    """Run only the LSTM part of the model.

    Args:
      x: batch-first tensor of symbol ids.
      length: true length of every sequence in the batch.
      *args: optionally exactly two tensors `h_0, c_0` used as initial state;
        any other number of extra arguments is ignored and zeros are used.

    Returns:
      `(outputs, (h_t, c_t))` where `outputs` is the padded, batch-first
      sequence of hidden states.
    """
    state = tuple(args) if len(args) == 2 else None
    outputs, (ht, ct) = self._run_lstm(x, length, state)

    outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)

    return outputs, (ht, ct)

  def linear_output(self, lstm_out):
    """Apply the output linear layer (no softmax) to LSTM hidden states."""
    return self.linear(lstm_out)

  def vectors(self, x, length, h_0, c_0):
    """Advance the LSTM from state `(h_0, c_0)` and score the new state.

    Returns:
      `((h_t, c_t), linear)` - the new LSTM state and the output of the linear
      layer applied to the last hidden state.
    """
    _, (ht, ct) = self._run_lstm(x, length, (h_0, c_0))

    lstm_out = ht[-1,:,:] # get the last hidden state of the outmost layer

    linear = self.linear(lstm_out)

    return (ht, ct), linear

  def classify(self, linear):
    """Turn linear-layer outputs (batch, 2) into a numpy array of class ids."""
    softmax = self.softmax(linear)
    return softmax.argmax(dim=1).cpu().numpy()


Model Evaluation

In [None]:
def accuracy(predictions, refs):
  """Build the binary confusion matrix and the accuracy.

  Class 1 is the positive class; every other value counts as negative.

  Args:
    predictions: predicted class ids.
    refs: reference class ids, same length as `predictions`.

  Returns:
    Tuple `(tp, tn, fp, fn, accuracy)`.
  """
  assert(len(predictions) == len(refs))

  # Keyed by (label is positive, prediction is positive).
  confusion = {
    (True, True): 0, (True, False): 0,
    (False, True): 0, (False, False): 0,
  }
  for predicted, expected in zip(predictions, refs):
    confusion[(bool(expected == 1), bool(predicted == 1))] += 1

  tp = confusion[(True, True)]
  fn = confusion[(True, False)]
  fp = confusion[(False, True)]
  tn = confusion[(False, False)]

  return tp, tn, fp, fn, (tp + tn) / (tp + tn + fp + fn)

def evaluate_model(data_loader, model, set_name):
  """Run `model` over `data_loader` and print its classification metrics.

  Prints the confusion-matrix counts together with precision, recall, F1 and
  accuracy, prefixed by `set_name`.

  Args:
    data_loader: iterable of batches; each batch is a dict with keys
      'x', 'length' and 'y'.
    model: callable as `model(x, length)`, returning one score per class.
    set_name: label printed in front of the metrics (e.g. "train").
  """
  predictions, labels = [], []
  # Pure inference: do not record autograd graphs for every batch.
  with torch.no_grad():
    for data in data_loader:
      result = model(data['x'], data['length'])
      predictions.extend(result.argmax(dim=1).cpu().numpy().tolist())
      labels.extend(data['y'].cpu().numpy().tolist())

  tp, tn, fp, fn, acc = accuracy(predictions, labels)

  if tp == 0:
    if fn == 0 and fp == 0:
      # No positive samples and no positive predictions: perfect by convention.
      pr, r, f1 = 1, 1, 1
    else:
      pr, r, f1 = 0, 0, 0
  else:
    pr = tp / (tp + fp)
    r = tp / (tp + fn)
    f1 = 2 * ((pr * r) / (pr + r))
  print("{} : TP : {} TN : {} FP : {} FN : {} Pr : {} R : {} F1: {} ACC : {} ".format(
    set_name, tp, tn, fp, fn, pr, r, f1, acc
  ))

Load data

In [None]:
def load_data(filename_data, filename_length, filename_labels):
  """Load the three .npy arrays that make up one dataset split.

  Returns:
    Tuple `(data, length, labels)` of numpy arrays, in that order.
  """
  # NOTE: allow_pickle=True lets numpy unpickle arbitrary objects; only use it
  # on files from a trusted source.
  paths = (filename_data, filename_length, filename_labels)
  return tuple(numpy.load(path, allow_pickle=True) for path in paths)

# Load the train / dev / test splits stored under "numeral-50".
# Each variable holds a (data, length, labels) triple of numpy arrays.
train_data = load_data(
  DATA_DIR + "/numeral-50/train.data.npy",
  DATA_DIR + "/numeral-50/train.length.npy",
  DATA_DIR + "/numeral-50/train.labels.npy"
)
dev_data = load_data(
  DATA_DIR + "/numeral-50/dev.data.npy",
  DATA_DIR + "/numeral-50/dev.length.npy",
  DATA_DIR + "/numeral-50/dev.labels.npy"
)
test_data = load_data(
  DATA_DIR + "/numeral-50/test.data.npy",
  DATA_DIR + "/numeral-50/test.length.npy",
  DATA_DIR + "/numeral-50/test.labels.npy"
)


Train the model

In [None]:
def train_model(
    params=None, params_path=None, model_path=None,
    train_loader=None, dev_loader=None, test_loader=None
):
  """Train an `NLNN` and keep the weights of the best epoch on the dev set.

  After training, the best weights are restored, optionally saved, and the
  model is evaluated on the train, dev and test loaders.

  Args:
    params: model/training configuration; must contain 'num_epochs' plus the
      keys required by `NLNN`.
    params_path: path (relative to DATA_DIR) where `params` is pickled; only
      used when `model_path` is given.
    model_path: path (relative to DATA_DIR) where the best state dict is saved.
      Nothing is saved when it is empty or None.
    train_loader, dev_loader, test_loader: data loaders yielding dicts with
      keys 'x', 'length' and 'y'.
  """
  model = NLNN(params)
  model.cuda()
  # NOTE(review): NLNN.forward already applies a softmax, whereas
  # CrossEntropyLoss expects raw logits. Kept as is; confirm it is intended.
  loss_function = nn.CrossEntropyLoss()
  optimizer = optim.Adam(model.parameters()) #weight_decay=1e-5

  trainable_parameters = sum(
    p.numel() for p in model.parameters() if p.requires_grad
  )
  print(params)
  print("")
  print("Trainable parameters {0:,}".format(trainable_parameters))
  print("")

  best_dev_accuracy, best_epoch = 0, -1
  # Start from the initial weights so that restoring the "best" state works
  # even when no epoch ever improves on the initial accuracy of 0.
  best_state_dict = copy.deepcopy(model.state_dict())

  for epoch in range(params['num_epochs']):

    model.train() # set the model to training mode
    print('Epoch {}/{} : '.format(epoch + 1, params['num_epochs']))

    t0 = time.time()

    for data in train_loader:
      model.zero_grad()
      predictions = model(data['x'], data['length'])
      batch_loss = loss_function(predictions, data['y'])
      batch_loss.backward()
      optimizer.step()

    model.eval() # set the model to eval mode

    predictions_dev, labels_dev = [], []

    # Evaluation only: no autograd graph is needed.
    with torch.no_grad():
      for data in dev_loader:
        dev_predictions = model(data['x'], data['length'])

        argmax = dev_predictions.argmax(dim=1).cpu().numpy()
        predictions_dev.extend(list(argmax))
        labels_dev.extend(list(data['y'].cpu().numpy()))

    _, _, _, _, dev_accuracy = accuracy(predictions_dev, labels_dev)

    if dev_accuracy > best_dev_accuracy:
      # Remember the new best score; without this update every epoch with a
      # non-zero accuracy would overwrite the checkpoint.
      best_dev_accuracy = dev_accuracy
      best_state_dict = copy.deepcopy(model.state_dict())
      best_epoch = epoch

    print(
      "dev accuracy:{}\ttime:{:.2f}s"
      .format(dev_accuracy, time.time() - t0)
    )

  model.load_state_dict(best_state_dict)
  if model_path:
    # Reported 1-based, matching the "Epoch i/n" progress lines above
    # (0 means that no epoch improved on the initial weights).
    print("Best dev epoch {}".format(best_epoch + 1))
    torch.save(model.state_dict(), DATA_DIR + model_path)
    if params_path:
      with open(DATA_DIR + params_path, 'wb') as f:
        pickle.dump(params, f)

  model.eval()
  evaluate_model(train_loader, model, "train")
  evaluate_model(dev_loader, model, "dev")
  evaluate_model(test_loader, model, "test")

# Load the pickled alphabet; inv_alphabet in a later cell inverts it, so it is a dict.
alphabet = {}
with open(DATA_DIR + '/model/light/alphabet.dict', 'rb') as f:
  alphabet = pickle.load(f)

# Model and training configuration. With 'embedding_size' left commented out,
# NLNN falls back to its one-hot encoding branch.
params = {
  'batch_size': 5,
  'alphabet_size': len(alphabet),
  # 'embedding_size': 1,
  'hidden_lstm_dim': 3,
  'num_epochs': 2000
}
# Only the training loader shuffles its samples.
train_loader = tud.DataLoader(
  NLDataset(train_data), batch_size=params["batch_size"], shuffle = True
)
dev_loader = tud.DataLoader(
  NLDataset(dev_data), batch_size=params["batch_size"]
)
test_loader = tud.DataLoader(
  NLDataset(test_data), batch_size=params["batch_size"]
)

# Training is disabled here; the next cell loads a previously saved model instead.
# train_model(
#     params=params,
#     params_path="/model/light/model.params",
#     model_path="/model/light/model.pt",
#     train_loader=train_loader, dev_loader=dev_loader, test_loader=test_loader
# )

In [None]:
def load_model(params_path=None, model_path=None):
  """Restore a trained `NLNN` together with its parameters.

  Args:
    params_path: path of the pickled params dict, relative to DATA_DIR.
    model_path: path of the saved state dict, relative to DATA_DIR.

  Returns:
    `(model, params)`; the model is on the CUDA device and in eval mode.
  """
  # NOTE: pickle.load can execute arbitrary code; only load trusted files.
  with open(DATA_DIR + params_path, 'rb') as params_file:
    params = pickle.load(params_file)

  model = NLNN(params)
  saved_weights = torch.load(DATA_DIR + model_path)
  model.load_state_dict(saved_weights)

  model = model.to(torch.device("cuda"))
  model.eval()
  return model, params

# Load the saved "light" model and report its metrics on all three splits.
model, _ = load_model(
    params_path="/model/light/model.params",
    model_path="/model/light/model.pt"
)
evaluate_model(train_loader, model, "train")
evaluate_model(dev_loader, model, "dev")
evaluate_model(test_loader, model, "test")

# Reverse mapping of the alphabet (value -> key), used to print samples.
inv_alphabet = { v : k for k, v in alphabet.items() }

# Scratch experiment, left disabled. The dangling `data['x'] =` assignment had
# no right-hand side, which is a syntax error that kept the whole cell from
# running, so it is commented out together with the rest of the experiment.
# data['x'] =
#   h_0 = torch.zeros(1, 1, 3).cuda()
#   c_0 = torch.zeros(1, 1, 3).cuda()
#   vec = model.vectors(data['x'], data['length'], h_0, c_0)

#   result = model(data['x'], data['length'])
#   prediction = result.argmax(dim=1).cpu().numpy()[0]

#   label = data['y'].cpu().numpy()[0]
#   if label == 0 and prediction == 0:
#       # print (vec)
#       # print ("".join([inv_alphabet[c] for c in data['x'][0].cpu().numpy()[:data['length'][0]]]))
#       #break

# Classify a hand-written pair of linear-layer outputs.
mht = torch.tensor([5.8730, -5.9212]).cuda()
print(model.classify(mht.view(1, 2)))

# mht = torch.tensor([0.7337597, 0.26021487, -0.25838602]).cuda()
# print(model.classify(mht.view(1, 3)))

train : TP : 39 TN : 37 FP : 3 FN : 1 Pr : 0.9285714285714286 R : 0.975 F1: 0.951219512195122 ACC : 0.95 
dev : TP : 3 TN : 4 FP : 2 FN : 1 Pr : 0.6 R : 0.75 F1: 0.6666666666666665 ACC : 0.7 
test : TP : 4 TN : 3 FP : 1 FN : 2 Pr : 0.8 R : 0.6666666666666666 F1: 0.7272727272727272 ACC : 0.7 
[0]


Get the vectors for K-Means

In [None]:
def lstm_vectors(model, params, data_loader, filter_negative=False):
  """Collect the state vectors the model visits while reading the data.

  Every sequence is fed to the model one symbol at a time. After each symbol
  the vector `concat(h_t, c_t, linear_output)` is recorded. The vector of the
  all-zero initial state is added once per processed sample.

  Args:
    model: object exposing `__call__(x, length)`, `vectors(x, length, h_0, c_0)`
      and `linear_output(h)` like `NLNN`.
    params: dict with 'hidden_lstm_dim'.
    data_loader: iterable of batches (dicts with keys 'x' and 'length').
    filter_negative: when True, only samples which the model classifies as
      positive (class 1) contribute vectors.

  Returns:
    `(vectors, initial_state)` where `vectors` maps each vector (a tuple of
    floats) to the number of times it occurred and `initial_state` is the
    tuple of the initial state.
  """
  hidden_size = params['hidden_lstm_dim']
  total_samples, vectors = 0, defaultdict(int)

  # Nothing is trained here, so autograd bookkeeping is switched off.
  with torch.no_grad():
    for data in data_loader:

      if filter_negative:
        # get only vectors from samples which are classified as positive by the model
        argmax = model(data['x'], data['length']).argmax(dim=1)
        positive_samples = (argmax == 1).nonzero().reshape(-1)
        data_x = torch.index_select(data['x'], 0, positive_samples)
        data_length = data['length'][positive_samples]
      else:
        data_x = data['x']
        data_length = data['length']

      samples_in_batch = len(data_x)
      if samples_in_batch == 0:
        # Every sample of the batch was filtered out; max() below would fail
        # on an empty tensor.
        continue
      total_samples += samples_in_batch

      device = data_x.device
      # One symbol per step. Lengths are kept as an int64 CPU tensor because
      # that is what pack_padded_sequence expects.
      ones = torch.ones(samples_in_batch, dtype=torch.long)
      h_0 = torch.zeros(1, samples_in_batch, hidden_size, device=device)
      c_0 = torch.zeros(1, samples_in_batch, hidden_size, device=device)

      for j in range(int(data_length.max())):
        (h_t, c_t), linear = model.vectors(data_x[:, j : j + 1], ones, h_0, c_0)
        h_0, c_0 = h_t, c_t

        concat = torch.cat((h_t[-1,:,:], c_t[-1,:,:], linear), 1)
        # The words are padded with zero and we should filter out the vectors
        # of sequences which have already ended.
        still_running = j < data_length
        for vector in concat[still_running].cpu().numpy():
          vectors[tuple(vector)] += 1

    # The initial state lives on the model's device because the linear layer
    # is applied to it.
    first_parameter = next(model.parameters(), None)
    model_device = first_parameter.device if first_parameter is not None else "cpu"
    initial_state = torch.zeros(hidden_size, device=model_device)
    linear = model.linear_output(initial_state.view(1, 1, -1))
    initial_state = torch.cat(
      (initial_state, initial_state, linear[-1, -1, :]), 0
    ).cpu().numpy()

  initial_state = tuple(initial_state)
  vectors[initial_state] += total_samples
  return vectors, initial_state

def truncate(n, decimals=14):
  """Cut `n` off after `decimals` decimal places, without rounding."""
  scale = 10 ** decimals
  whole_part = int(n * scale)  # int() drops the fraction (truncates towards zero)
  return whole_part / scale

def test_lstm_vectors(model, params, data_loader, decimals=14):
  """Sanity-check `lstm_vectors` against a full-sequence run of the model.

  The vectors are compared after truncation to `decimals` decimal places.
  Three properties are asserted:
    * the final state of every sample is among the collected vectors;
    * the total number of collected vectors is `sum(length) + #samples`;
    * the initial-state vector occurs at least once per sample.

  Returns:
    `(vectors, initial_state)` with all components truncated; `vectors` maps
    each truncated vector to its count.
  """
  def truncated(vector):
    return tuple(truncate(v, decimals=decimals) for v in vector)

  raw_vectors, initial_state = lstm_vectors(model, params, data_loader)
  vectors = {}
  for vector, count in raw_vectors.items():
    key = truncated(vector)
    # Distinct raw vectors may collapse onto the same truncated key, so the
    # counts are accumulated rather than overwritten.
    vectors[key] = vectors.get(key, 0) + count
  initial_state = truncated(initial_state)

  hidden_size = params['hidden_lstm_dim']
  expected_vectors_count, samples_count = 0, 0

  with torch.no_grad():
    for data in data_loader:
      # for one word with length n, we should have n + 1 vectors
      expected_vectors_count += (torch.sum(data['length']).item() + len(data['length']))
      samples_count += len(data['x'])

      _, (ht, ct) = model.lstm_hidden_states(data['x'], data['length'])
      linear = model.linear_output(ht)

      for i in range(len(data['x'])):
        vector = torch.cat((ht[-1, i, :], ct[-1, i, :], linear[-1, i, :]), 0)
        vector = truncated(vector.cpu().numpy())
        # .get turns a missing vector into a failed assertion, not a KeyError
        assert vectors.get(vector, 0) >= 1

    vectors_count = sum(vectors.values())
    assert vectors_count == expected_vectors_count

    # each sample starts with the vector at time step 0
    first_parameter = next(model.parameters(), None)
    device = first_parameter.device if first_parameter is not None else "cpu"
    start = torch.zeros(hidden_size, device=device)
    linear = model.linear_output(start.view(1, 1, -1))
    start = torch.cat(
      (torch.zeros(hidden_size * 2, device=device), linear[-1, -1, :]), dim=0
    ).cpu().numpy()
  start = truncated(start)
  assert vectors.get(start, 0) >= samples_count

  return vectors, initial_state

# Reload the "light" model together with the parameters it was trained with.
model, params = load_model(
    params_path="/model/light/model.params",
    model_path="/model/light/model.pt"
)

# NOTE: this rebinds train_data / train_loader from the earlier cells to the
# data stored under /model/light.
train_data = load_data(
  DATA_DIR + "/model/light/train.data.npy",
  DATA_DIR + "/model/light/train.length.npy",
  DATA_DIR + "/model/light/train.labels.npy"
)
train_loader = tud.DataLoader(
  NLDataset(train_data), batch_size=10, shuffle = False
)

t0 = time.time()
#test_lstm_vectors(model, params, train_loader, decimals=9)
# Collect vectors only from the samples the model classifies as positive.
vectors, initial_state = lstm_vectors(model, params, train_loader, filter_negative=True)

print("Elapsed: {:.2f}s".format(time.time() - t0))
print("Number of unique vectors {0:,}".format(len(vectors)))
print("Number of total vectors {0:,}".format(sum(vectors.values())))

# Output format: one vector per line, "<count>\t<v1> <v2> ... " with a space
# written after every component.
with open(DATA_DIR + "/kMeans/light+dense/vectors-train-pos.txt", "w") as f:
  # initial state first (pop removes it so the loop below does not write it again)
  count = vectors.pop(initial_state)
  f.write(str(count))
  f.write("\t")
  for i in range(len(initial_state)):
    f.write(str(initial_state[i]))
    f.write(" ")
  f.write("\n")

  for vector, count in vectors.items():
    f.write(str(count))
    f.write("\t")
    for i in range(len(vector)):
      f.write(str(vector[i]))
      f.write(" ")
    f.write("\n")

Elapsed: 0.12s
Number of unique vectors 102
Number of total vectors 444


Covariance matrix 

In [None]:
# Read the vectors back from a "<count>\t<components>" file (the count column
# is ignored here) and save their covariance matrix as CSV with 3 decimals.
# NOTE: this rebinds the global `vectors` used by earlier cells.
vectors = []
with open(DATA_DIR + "/kMeans/large/vectors-dev-9.txt", "r") as f:
  lines = f.readlines()
  for line in lines:
    line = line.strip()
    if line:
      vec = line.split("\t")[1]
      vectors.append([float(p) for p in vec.split(" ")])

# numpy.cov treats rows as variables, hence the transpose (one row per vector
# becomes one row per component).
numpy.savetxt(
    DATA_DIR + "/kMeans/large/vectors-dev-9-cov.txt",
    numpy.cov(numpy.array(vectors).T),
    delimiter=',', fmt='%1.3f'
)

Load K centroids and build automata

In [None]:
def closest_centroid_naive(centroids, t):
  """Reference implementation: index of the centroid closest to `t`.

  Uses the squared Euclidean distance; on ties the first centroid wins.
  Returns -1 when `centroids` is empty.
  """
  best_index, best_distance = -1, float("inf")

  for index, centroid in enumerate(centroids):
    squared = numpy.sum([(c - p) ** 2 for (c, p) in zip(centroid, t)])
    if squared < best_distance:
      best_index, best_distance = index, squared

  return best_index

def closest_centroid(centroids, t):
  """Index (0-dim tensor) of the row of `centroids` closest to vector `t`.

  Vectorised squared Euclidean distance over all centroids at once.
  """
  difference = centroids - t.expand_as(centroids)
  squared_distance = (difference * difference).sum(dim=1)
  return squared_distance.argmin(dim=0)

def test_closest_centroid():
  """Randomised check of `closest_centroid` against the naive version.

  Also verifies that every centroid is its own closest centroid.
  """
  dim = random.randint(64, 128)
  centroid_count = random.randint(1024, 2048)
  centroids = torch.rand(centroid_count, dim)
  vector_count = random.randint(64, 128)

  centroids_numpy = centroids.cpu().detach().numpy()
  for _ in range(vector_count):
    probe = torch.rand(dim)

    fast = closest_centroid(centroids, probe).item()
    reference = closest_centroid_naive(centroids_numpy, probe.cpu().detach().numpy())
    assert fast == reference

  for index in range(centroid_count):
    assert closest_centroid(centroids, centroids[index]).item() == index

#test_closest_centroid()


In [None]:
def read_centroids(file, device=None):
  """Read k-means centroids from a text file and print how many were found.

  The file holds one centroid per line with whitespace-separated components;
  blank lines are skipped.

  Args:
    file: path of the centroids file.
    device: device of the returned tensor. When omitted, CUDA is used if it is
      available (the previous hard-coded behaviour) and the CPU otherwise.

  Returns:
    Tensor with one row per centroid.
  """
  centroids = []
  with open(file, "r") as f:
    # Stream the file instead of loading all lines into memory first.
    for line in f:
      components = line.split()
      if components:
        centroids.append([float(p) for p in components])

  print("Number of centroids: {0:,}".format(len(centroids)))

  if device is None:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  return torch.tensor(centroids).to(device)

def get_initial_state_centroid(assignmentsFile):
  """Return the integer on the first line of the assignments file.

  build_and_save_automaton writes this value out as the automaton's initial state.
  """
  with open(assignmentsFile, "r") as handle:
    first_line = handle.readlines()[0]
  return int(first_line)

def write_initial_state(f, initial_state_centroid):
  """Write the initial state on a line of its own."""
  f.write(str(initial_state_centroid) + "\n")

def write_final_states(f, final_states):
  """Write all final states on one line, each followed by a space."""
  line = "".join(str(state) + " " for state in final_states)
  f.write(line + "\n")

def write_transitions(f, transitions):
  """Write one transition per line as three space-separated fields.

  Each transition is indexable; only its first three items are written.
  """
  for transition in transitions:
    fields = (transition[0], transition[1], transition[2])
    f.write(" ".join(str(field) for field in fields) + "\n")

def build_and_save_automaton(model, params, centroidsFile, assignmentsFile, automatonFile):
  """Build the automaton induced by the centroids and write it to a file.

  File layout: the initial state on the first line, the final states
  (space separated) on the second line, then one transition per line.
  Prints the number of transitions and final states and the elapsed time.

  Args:
    model, params: passed on to `automaton`.
    centroidsFile: path of the k-means centroids file.
    assignmentsFile: path of the k-means assignments file; its first line
      provides the initial state.
    automatonFile: path of the output file.
  """
  started_at = time.time()

  transitions, final_states = automaton(
    model, params, read_centroids(centroidsFile)
  )

  with open(automatonFile, "w") as out:
    # line 1: the initial state
    initial_state_centroid = get_initial_state_centroid(assignmentsFile)
    write_initial_state(out, initial_state_centroid)
    # line 2: the final states, separated by spaces
    write_final_states(out, final_states)
    # remaining lines: one transition each
    write_transitions(out, transitions)

  print("Number of tranitions {0:,}".format(len(transitions)))
  print("Number of final states {0:,}".format(len(final_states)))
  print("Elapsed: {:.2f}s".format(time.time() - started_at))

In [None]:
def automaton(model, params, centroids):
  """Derive a finite automaton from the model and the k-means centroids.

  Every centroid is a state. A centroid is laid out as
  `[h (hidden_lstm_dim) | c (hidden_lstm_dim) | linear output]`. For every
  state and every symbol the LSTM is advanced by one step from the centroid's
  `(h, c)`, and the transition target is the centroid closest to the resulting
  vector. A state is final when the model classifies its linear-output part
  as class 1.

  Args:
    model: trained `NLNN`.
    params: dict with 'alphabet_size' and 'hidden_lstm_dim'.
    centroids: 2-D tensor with one centroid per row.

  Returns:
    `(transitions, final_states)` where `transitions` is a list of
    `(source_state, symbol, target_state)` tuples and `final_states` is a
    numpy array of state indices.
  """
  alphabet_size = params['alphabet_size']
  hidden_lstm_dim = params['hidden_lstm_dim']
  device = centroids.device

  transitions = []
  # Symbol ids start at 1 because 0 is the padding index.
  x = torch.tensor([[i] for i in range(1, alphabet_size + 1)], device=device)
  # Each input is a single symbol. Lengths are an int64 CPU tensor, which is
  # what pack_padded_sequence expects.
  ones = torch.ones(alphabet_size, dtype=torch.long)

  with torch.no_grad():
    for q1, centroid in enumerate(centroids):
      # Explicit slices instead of torch.split(centroid, hidden_lstm_dim):
      # splitting by chunk size yields four chunks when hidden_lstm_dim == 1.
      h_t = centroid[:hidden_lstm_dim]
      c_t = centroid[hidden_lstm_dim:2 * hidden_lstm_dim]
      h_t = h_t.repeat(alphabet_size, 1).view(1, alphabet_size, hidden_lstm_dim)
      c_t = c_t.repeat(alphabet_size, 1).view(1, alphabet_size, hidden_lstm_dim)

      _, (h_t, c_t) = model.lstm_hidden_states(x, ones, h_t, c_t)
      linear = model.linear_output(h_t)
      concat = torch.cat((h_t[-1,:,:], c_t[-1,:,:], linear[-1,:,:]), 1)

      for i in range(alphabet_size):
        # TODO closest centroid to return vector with size alphabet_size
        q2 = closest_centroid(centroids, concat[i])
        # row i of x holds symbol id i + 1
        transitions.append((q1, i + 1, q2.item()))

  linear = centroids[:, 2 * hidden_lstm_dim:]
  final_states = numpy.where(model.classify(linear) == 1)[0]

  return transitions, final_states

# Reload the "light" model and build one automaton per k-means run (k = 51, 82, 102).
model, params = load_model(
    params_path = "/model/light/model.params",
    model_path = "/model/light/model.pt"
)

# The k=34 run is currently disabled.
# build_and_save_automaton(
#     model, params,
#     DATA_DIR + "/kMeans/tardis/kmeans-standardized-euclidean-distance-centroids-k=34.txt",
#     DATA_DIR + "/kMeans/tardis/kmeans-standardized-euclidean-distance-assignments-k=34.txt",
#     DATA_DIR + "/kMeans/tardis/aut-standardized-euclidean-distance-k=34.txt"
# )
build_and_save_automaton(
    model, params,
    DATA_DIR + "/kMeans/tardis/kmeans-standardized-euclidean-distance-centroids-k=51.txt",
    DATA_DIR + "/kMeans/tardis/kmeans-standardized-euclidean-distance-assignments-k=51.txt",
    DATA_DIR + "/kMeans/tardis/aut-standardized-euclidean-distance-k=51.txt"
)
build_and_save_automaton(
    model, params,
    DATA_DIR + "/kMeans/tardis/kmeans-standardized-euclidean-distance-centroids-k=82.txt",
    DATA_DIR + "/kMeans/tardis/kmeans-standardized-euclidean-distance-assignments-k=82.txt",
    DATA_DIR + "/kMeans/tardis/aut-standardized-euclidean-distance-k=82.txt"
)
build_and_save_automaton(
    model, params,
    DATA_DIR + "/kMeans/tardis/kmeans-standardized-euclidean-distance-centroids-k=102.txt",
    DATA_DIR + "/kMeans/tardis/kmeans-standardized-euclidean-distance-assignments-k=102.txt",
    DATA_DIR + "/kMeans/tardis/aut-standardized-euclidean-distance-k=102.txt"
)

Number of centroids: 51
Number of tranitions 663
Number of final states 42
Elapsed: 1.31s
Number of centroids: 82
Number of tranitions 1,066
Number of final states 65
Elapsed: 1.43s
Number of centroids: 102
Number of tranitions 1,326
Number of final states 83
Elapsed: 1.42s


K Means Vectors Stats

In [None]:
def read_vectors(file):
  """Read a vectors file with lines of the form "<count>\\t<v1> <v2> ...".

  Blank lines are skipped.

  Returns:
    `(vectors, counts)`: a list of float lists and the parallel list of
    integer counts.
  """
  vectors, counts = [], []
  with open(file, "r") as handle:
    for raw_line in handle:
      record = raw_line.strip()
      if not record:
        continue
      count, vector = record.split("\t")
      vectors.append([float(component) for component in vector.split()])
      counts.append(int(count))

  return vectors, counts

def vectors_distances_stats_2(vectors, counts, model, params, batch_size=100):
  """Print statistics of the pairwise Euclidean distances between vectors.

  The distances from every vector to all later vectors are computed in chunks
  of `batch_size` rows. Printed are the closest and the farthest pair (with
  their counts, predicted classes and components) followed by the mean,
  population variance and population standard deviation of all distances.

  Args:
    vectors: list of equally long float lists; the components after the first
      `2 * params['hidden_lstm_dim']` are the two linear-layer outputs.
    counts: occurrence count of every vector (parallel to `vectors`).
    model: object with a `classify(linear)` method like `NLNN`.
    params: dict with 'hidden_lstm_dim'.
    batch_size: number of rows compared against one vector at a time.

  Raises:
    ValueError: if fewer than two vectors are given.
  """
  if len(vectors) < 2:
    raise ValueError("at least two vectors are needed to compute pairwise distances")

  with torch.no_grad():
    # Use the GPU when there is one (as before), otherwise stay on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vectors = torch.tensor(vectors).to(device)
    total = vectors.size()[0]
    linear_start = 2 * params['hidden_lstm_dim']

    def describe(index):
      # count, predicted class and the components of one vector
      vector = vectors[index]
      predicted = model.classify(vector[linear_start:].view(1, 2))[0]
      print("{} {} {}".format(
          counts[index], predicted, list(vector.cpu().detach().numpy())
      ))

    min_dist, min_i, min_j = float("inf"), -1, -1
    max_dist, max_i, max_j = -1, -1, -1
    distances = []

    for index in range(total - 1):
      for j in range(index + 1, total, batch_size):
        vec = vectors[j:j + batch_size, :]
        v = vectors[index].expand_as(vec)

        substract = vec - v
        distance = torch.sum(substract * substract, dim=1)

        min_d = torch.min(distance).item()
        if min_d < min_dist:
          min_dist = min_d
          min_i = index
          min_j = torch.argmin(distance).item() + j

        max_d = torch.max(distance).item()
        if max_d > max_dist:
          max_dist = max_d
          max_i = index
          max_j = torch.argmax(distance).item() + j

        # Plain Python floats keep the `statistics` functions well defined.
        distances.extend(distance.cpu().tolist())

    print("Min distance between 2 vectors {}".format(math.sqrt(min_dist)))
    describe(min_i)
    describe(min_j)

    print("Max distance between 2 vectors {}".format(math.sqrt(max_dist)))
    describe(max_i)
    describe(max_j)

    # Squared distances were collected above; take the root only once here.
    distances = [math.sqrt(d) for d in distances]
    print("Mean distance between 2 vectors {}".format(statistics.mean(distances)))
    print("Pvariance distance between 2 vectors {}".format(statistics.pvariance(distances)))
    print("Pstandard deviation distance between 2 vectors {}".format(statistics.pstdev(distances)))

def vectors_distances_stats(vectors, counts, model, params):
  """Print pairwise-distance statistics, comparing all rows in one batch.

  This is the unbatched special case of `vectors_distances_stats_2`: with a
  batch size that covers all vectors, each vector is compared against every
  later vector in a single slice, which is exactly what the former stand-alone
  implementation did. Delegating removes the duplicated logic.

  Args:
    vectors: list of equally long float lists.
    counts: occurrence count of every vector (parallel to `vectors`).
    model: object with a `classify(linear)` method like `NLNN`.
    params: dict with 'hidden_lstm_dim'.
  """
  vectors_distances_stats_2(
    vectors, counts, model, params, batch_size=max(1, len(vectors))
  )

# Pairwise-distance statistics for the vectors of the "large" model.
# NOTE: read_vectors must be the variant returning (vectors, counts); a later
# cell redefines it to return only the vectors.
model, params = load_model(
    params_path = "/model/large/model.params",
    model_path = "/model/large/model.pt"
)
vectors, counts = read_vectors(DATA_DIR + "/kMeans/large/vectors-dev-9.txt")
#vectors, counts = read_vectors(DATA_DIR + "/kMeans/light+dense/vectors-train-14.txt")
#vectors_distances_stats(vectors, counts, model, params)
vectors_distances_stats_2(vectors, counts, model, params)
# vectors_distances_stats(vectors, counts, model, params)
# print("===")
#vectors_distances_stats_2(vectors, counts, model, params, batch_size=)

KeyboardInterrupt: ignored

K Means Results Stats

In [None]:
def count_points_in_clusters(assignments):
  """Print statistics about the cluster sizes.

  Args:
    assignments: iterable with the cluster id of every point.
  """
  # A distinct local name avoids shadowing the function itself.
  cluster_sizes = numpy.array(list(Counter(assignments).values()))

  smallest = numpy.min(cluster_sizes)
  largest = numpy.max(cluster_sizes)
  print("Min points in cluster {}".format(smallest))
  print("Max points in cluster {}".format(largest))

  mean = numpy.mean(cluster_sizes)
  variance = numpy.var(cluster_sizes)
  deviation = numpy.std(cluster_sizes)
  print("Mean points in cluster {}".format(mean))
  print("Variance points in cluster {}".format(variance))
  print("Standard deviation points in cluster {}".format(deviation))
  print()

def distances_to_centroids(centroids, assignments, vectors, use_variance=False):
  """Print statistics of the distance of every vector to its centroid.

  Args:
    centroids: 2-D tensor with one centroid per row.
    assignments: list with the centroid index assigned to every vector.
    vectors: list of float lists, parallel to `assignments`.
    use_variance: when True, every component difference is divided by the
      standard deviation of that component over `vectors` before the
      Euclidean norm is taken.
  """
  # Work on whatever device the centroids already live on.
  device = centroids.device

  if use_variance:
    deviation = torch.tensor(numpy.std(vectors, axis=0)).to(device)

  assignments = torch.tensor(assignments).to(device)
  vectors = torch.tensor(vectors).to(device)

  # Row lookup by index; this replaces the throwaway nn.Embedding that was
  # used purely as a lookup table.
  substract = vectors - centroids[assignments]
  if use_variance:
    devision = substract / deviation
  else:
    devision = substract
  distance = torch.sqrt(torch.sum(devision * devision, dim=1)).cpu().detach().numpy()

  print("Min distance between centroid and vector {}".format(numpy.min(distance)))
  print("Max distance between centroid and vector {}".format(numpy.max(distance)))

  print("Mean distance between centroid and vector {}".format(numpy.mean(distance)))
  print("Variance distance between centroid and vector {}".format(numpy.var(distance)))
  print("Standard deviation distance between centroid and vector {}".format(numpy.std(distance)))
  print()

def k_means_stats(centroids, assignments, vectors, use_variance=False):
  """Print the cluster-size statistics followed by the distance statistics."""
  count_points_in_clusters(assignments)
  distances_to_centroids(centroids, assignments, vectors, use_variance)

def read_vectors(file):
  """Read only the vectors from a "<count>\\t<v1> <v2> ..." file.

  NOTE: this redefinition shadows the earlier `read_vectors`, which returns
  `(vectors, counts)`; this one drops the counts and returns just the list of
  float lists.
  """
  vectors = []
  with open(file, "r") as handle:
    for raw_line in handle:
      record = raw_line.strip()
      if not record:
        continue
      _count, vector = record.split("\t")
      vectors.append([float(component) for component in vector.split()])

  return vectors

def read_assignments(file):
  """Read one integer per non-blank line and return them as a list."""
  with open(file, "r") as handle:
    stripped = (raw_line.strip() for raw_line in handle)
    return [int(entry) for entry in stripped if entry]

# Statistics for the k=34 clustering of the light+dense training vectors.
# use_variance=True divides the differences by the per-component standard deviation.
assignments = read_assignments(DATA_DIR + "/kMeans/light+dense/kmeans-standardized-euclidean-distance-assignments-k=34.txt")
centroids = read_centroids(DATA_DIR + "/kMeans/light+dense/kmeans-standardized-euclidean-distance-centroids-k=34.txt")
vectors = read_vectors(DATA_DIR + "/kMeans/light+dense/vectors-train-14.txt")
k_means_stats(centroids, assignments, vectors, use_variance=True)

Number of centroids: 34
Min points in cluster 1
Max points in cluster 20
Mean points in cluster 8.088235294117647
Variance points in cluster 22.845155709342556
Standard deviation points in cluster 4.779660627005076

Min distance between centroid and vector 0.0
Max distance between centroid and vector 1.6243095065812312
Mean distance between centroid and vector 0.6980171488806557
Variance distance between centroid and vector 0.09505240878611079
Standard deviation distance between centroid and vector 0.30830570670376956



Properties of the covariance matrix of the vectors for clustering

In [None]:
import numpy

def read_vectors(file):
  """Read a vectors file with lines of the form "<count>\\t<v1> <v2> ...".

  NOTE: this is the third definition of `read_vectors` in the notebook; like
  the first one it returns `(vectors, counts)`.

  Returns:
    `(vectors, counts)`: a list of float lists and the parallel list of
    integer counts. Blank lines are skipped.
  """
  vectors, counts = [], []
  with open(file, "r") as handle:
    for raw_line in handle:
      record = raw_line.strip()
      if not record:
        continue
      count, vector = record.split("\t")
      vectors.append([float(component) for component in vector.split()])
      counts.append(int(count))

  return vectors, counts

# vectors, counts = read_vectors(DATA_DIR + "/kMeans/large/vectors-train.txt")
# covariance_matrix = numpy.cov(numpy.array(vectors).T)

# print ("Covariance matrix ", covariance_matrix)
# print ("Covariance matrix rank ", numpy.linalg.matrix_rank(covariance_matrix))
#print ("Vectors rank ", numpy.linalg.matrix_rank(numpy.array(vectors)))

from scipy.spatial import distance

# ainv = numpy.linalg.inv(covariance_matrix)
# print (ainv)
# numpy.allclose(numpy.dot(covariance_matrix, ainv), numpy.eye(202))

# print (numpy.linalg.det(covariance_matrix))
# print(numpy.isclose( numpy.linalg.det(covariance_matrix), 0))
# iv = [[1, 0.5, 0.5], [0.5, 1, 0.5], [0.5, 0.5, 1]]
# distance.mahalanobis([2, 0, 0], [0, 1, 0], iv)
# Toy check: the inverse of a diagonal matrix is the diagonal matrix of reciprocals.
a = numpy.array([[1, 0, 0, 0], [0, 2, 0, 0], [0, 0, 3, 0], [0, 0, 0, 4]])
print (numpy.linalg.inv(a))

[[1.         0.         0.         0.        ]
 [0.         0.5        0.         0.        ]
 [0.         0.         0.33333333 0.        ]
 [0.         0.         0.         0.25      ]]
