In [39]:
import numpy as np
import torch
from torch import nn
import matplotlib.pyplot as plt
import csv

import gdown
#from googledrivedownloader import GoogleDriveDownloader
#import GoogleDriveDownloader
import zipfile

eps_torch = torch.finfo(float).eps

torch.manual_seed(191090)

<torch._C.Generator at 0x1053bbef0>

# Core goals of the lab
1) Learn how to use torch.optim instead of manual parameter updates
2) Implement logistic regression (useful for classification)


In [40]:
gdown.download(f"https://drive.google.com/uc?id=1SagLh5XNSV4znhlnkLRkV7zHPSDbOAqv",
               output="./got.zip", quiet=False)

with zipfile.ZipFile("got.zip", 'r') as zip_ref:
  zip_ref.extractall()

Downloading...
From: https://drive.google.com/uc?id=1SagLh5XNSV4znhlnkLRkV7zHPSDbOAqv
To: /Users/francesco/Desktop/Courses/Notes/DeepLearning/labs/got.zip
100%|██████████| 84.6k/84.6k [00:00<00:00, 7.26MB/s]


In [42]:
def load_got_dataset(path, train_split=0.8, verbose=True):
  """
  Loads the Game of Thrones dataset.

  Parameters
  ----------
  path: str
      the relative path of the csv file.
  train_split: float
      percentage of training examples in [0, 1].

  Returns
  -------
  tuple
      x_train: np.array
          training characters. shape=(n_train_examples, n_features)
      y_train: np.array
          training labels. shape=(n_train_examples,)
      train_names: np.array
          training names. shape=(n_train_examples,)
      x_test: np.array
          test characters. shape=(n_test_examples, n_features)
      y_test: np.array
          test labels. shape=(n_test_examples,)
      test_names: np.array
          test names. shape=(n_test_examples,)
      feature_names: np.array
          an array explaining each feature. shape=(n_test_examples,)
  """

  # read file into string ndarray
  with open(path, 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    data = np.array([row for row in reader])

  if verbose:
    print(f"\nLoaded dataset from {path}")
    print(f"Shape: {data.shape[0]} rows Ã— {data.shape[1]} columns")

    # print header
    header = data[0]
    print("Columns:", ", ".join(header))

    # print a preview of first 5 rows
    print("\nSample rows:")
    for row in data[1:6]:
      print("  ", row)

  # extract feature names
  feature_names = data[0, 1:-1]

  # shuffle data
  data = data[1:]
  np.random.shuffle(data)

  # extract character names
  character_names = data[:, 0]

  # extract features X and targets Y
  X = np.float32(data[:, 1:-1])
  Y = np.float32(data[:, -1])

  # normalize X
  X -= np.min(X, axis=0)
  X /= np.max(X, axis=0)

  # add bias to X
  X = np.concatenate((X, np.ones(shape=(X.shape[0], 1))), axis=1)
  feature_names = np.concatenate((feature_names, np.array(['bias'])), axis=-1)

  total_characters = X.shape[0]
  test_sampling_probs = np.ones(shape=total_characters)
  test_sampling_probs[Y == 1] /= float(np.sum(Y == 1))
  test_sampling_probs[Y == 0] /= float(np.sum(Y == 0))
  test_sampling_probs /= np.sum(test_sampling_probs)

  # sample test people without replacement
  n_test_characters = int(total_characters * (1 - train_split))
  test_idx = np.random.choice(np.arange(0, total_characters), size=(n_test_characters,),
                              replace=False, p=test_sampling_probs)
  x_test = X[test_idx]
  y_test = Y[test_idx]
  test_names = character_names[test_idx]

  # sample train people
  train_sampling_probs = test_sampling_probs.copy()
  train_sampling_probs[test_idx] = 0
  train_sampling_probs /= np.sum(train_sampling_probs)

  n_train_characters = int(total_characters * train_split)
  train_idx = np.random.choice(np.arange(0, total_characters), size=(n_train_characters,),
                                replace=True, p=train_sampling_probs)
  x_train = X[train_idx]
  y_train = Y[train_idx]
  train_names = character_names[train_idx]

  return x_train, y_train, train_names, x_test, y_test, test_names, feature_names

In [43]:
x_train, y_train, train_names, x_test, y_test, test_names, feature_names = load_got_dataset(path='got.csv', train_split=0.8)

#convert from np_array to tensors
x_train = torch.from_numpy(x_train).to(dtype=torch.float32)
x_test = torch.from_numpy(x_test).to(dtype=torch.float32)
y_train = torch.from_numpy(y_train).to(dtype=torch.float32)
y_test = torch.from_numpy(y_test).to(dtype=torch.float32)


Loaded dataset from got.csv
Shape: 1947 rows Ã— 27 columns
Columns: name, male, numDeadRelations, book1, book2, book3, book4, book5, bookCount, isMarried, isPopular, witnessed_wins, witnessed_losses, hadMoreWinsThanLosses, wasAttackerCommander, wasDefenderCommander, wasCommander, witnessed_own_attacker_size_mean, witnessed_opponent_attacker_size_mean, witnessed_own_defender_size_mean, witnessed_opponent_defender_size_mean, witnessed_major_deaths, witnessed_major_capture, battleCountAsAttackerCommander, battleCountAsDefenderCommander, battleCountAsCommander, isAlive

Sample rows:
   ['Viserys II Targaryen' '1' '11' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
   ['Walder Frey' '1' '1' '1' '1' '1' '1' '1' '5' '1' '1' '3' '0' '1' '1' '0'
 '1' '3166' '0' '0' '1166' '1' '2' '3' '0' '3' '1']
   ['Addison Hill' '1' '0' '0' '0' '0' '1' '0' '1' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '1']
   ['Aemma Arryn' '0' '0

In [44]:
def sigmoid(x: torch.Tensor) -> torch.Tensor:
  e = torch.exp(x)
  return e / (1 + e)

In [None]:
class LogisticRegression:
  """ Models a logistic regression classifier. """

  def __init__(self):
    self._w = None
    self.optim = None
    self.loss = None

  def fit_sgd(self, X, Y, n_epochs, learning_rate, verbose=False):
      """
      Implements the stochastic gradient descent training procedure.

      Parameters
      ----------
      X: torch.tensor
          data. shape=(n_examples, n_features)
      Y: np.array
          labels. shape=(n_examples,)
      n_epochs: int
          number of gradient updates.
      learning_rate: float
          step towards the descent.
      verbose: bool
          whether or not to print the value of cost function.
      """

      n_samples, n_features = X.shape

      # weight initialization
      self._w = torch.randn(n_features, requires_grad=True)

      # optimizer initialization
      self.optim = torch.optim.SGD([self._w], learning_rate)

      # loss initialization
      self.loss_f = torch.nn.BCELoss()


      for e in range(n_epochs):
        # Empy optimizer gradient buffer
        self.optim.zero_grad()

        # Compute predictions
        preds = sigmoid(X @ self._w)

        # Print loss between Y and predictions p
        loss = self.loss_f(preds, Y)

        if verbose and e % 500 == 0:
          print(f'Epoch {e:4d}: loss={loss}')

        # Gradient backpropagation
        loss.backward()

        # Parameters update
        self.optim.step()

  def predict(self, X):
    """
    Function that predicts.

    Parameters
    ----------
    X: torch.tensor
        data to be predicted. shape=(n_test_examples, n_features)

    Returns
    -------
    prediction: torch.tensor
        prediction in {0, 1}.
        Shape is (n_test_examples,)
    """

    with torch.no_grad():
      cont_preds = sigmoid(X @ self._w.T)
      discrete_preds = torch.round(cont_preds)
      return discrete_preds

      

In [58]:
logistic_reg = LogisticRegression()

# train
logistic_reg.fit_sgd(x_train, y_train, n_epochs=10000, learning_rate=0.01, verbose=True)

# test
predictions = logistic_reg.predict(x_test)

accuracy = float(torch.sum(predictions == y_test)) / y_test.shape[0]
print(f'Test accuracy: {accuracy}')

Epoch    0: loss=1.2299299240112305
Epoch  500: loss=0.8214868307113647
Epoch 1000: loss=0.7468236088752747
Epoch 1500: loss=0.7009949088096619
Epoch 2000: loss=0.6715631484985352
Epoch 2500: loss=0.6519611477851868
Epoch 3000: loss=0.6383994221687317
Epoch 3500: loss=0.6287004947662354
Epoch 4000: loss=0.6215760111808777
Epoch 4500: loss=0.6162293553352356
Epoch 5000: loss=0.6121455430984497
Epoch 5500: loss=0.6089779734611511
Epoch 6000: loss=0.6064861416816711
Epoch 6500: loss=0.6044992208480835
Epoch 7000: loss=0.6028935313224792
Epoch 7500: loss=0.601578414440155
Epoch 8000: loss=0.6004866361618042
Epoch 8500: loss=0.5995681881904602
Epoch 9000: loss=0.5987850427627563
Epoch 9500: loss=0.5981085896492004
Test accuracy: 0.6915167095115681
