In [1]:
import torch
from torch.cuda import random
import torch.nn as nn
import torch.nn.functional as F
import os
import torch.optim as optim
from os import listdir
from os.path import isfile, join
from tracin.tracin_batched import save_tracin_checkpoint, load_tracin_checkpoint,  approximate_tracin_batched
import pandas as pd
from LSTM_clean.model import LSTM
import numpy as np
import re
from statistics import mean
import scipy.stats as stats
import pandas as pd
from sklearn.utils import shuffle
from copy import deepcopy

# Global Parameters

In [2]:
# Passed as `output_size` to the LSTM constructor and as `num_items` to
# `approximate_tracin_batched` in the cells below.
OUTPUT_SIZE: int = 1743

# Collect Checkpoints and Find the Latest One

In [3]:
def _checkpoint_epoch(checkpoint_path):
    """Return the last integer found in the checkpoint's file name, or -1 if none.

    Only the base name (without its extension) is inspected, so digits that
    appear in parent directory names or in the extension cannot leak into
    the epoch number.
    """
    stem = os.path.splitext(os.path.basename(checkpoint_path))[0]
    numbers = re.findall(r"\d+", stem)
    return int(numbers[-1]) if numbers else -1


curr_dir = os.getcwd()
path = curr_dir + "/checkpoints_subset/"

# Collect every regular file in the checkpoint directory.
with os.scandir(path) as dir_entries:
    checkpoints = [
        os.path.join(path, entry.name) for entry in dir_entries if entry.is_file()
    ]

if not checkpoints:
    # Fail early with a clear message instead of an opaque error from max()/indexing.
    raise FileNotFoundError(f"No checkpoint files found in {path}")

# os.scandir yields entries in arbitrary order; sort by numeric epoch (file
# name as tie-breaker) so the checkpoint list is deterministic across runs.
checkpoints.sort(key=lambda ckpt: (_checkpoint_epoch(ckpt), ckpt))

# Epochs are compared as integers so that e.g. epoch 20 ranks above epoch 9.
last_checkpoint = checkpoints[-1]
# Kept as a string, matching the type this variable had before.
last_checkpoint_epoch = str(_checkpoint_epoch(last_checkpoint))

# Set Up Devices

In [4]:
# Select which GPU CUDA exposes to this process: devices are ordered by PCI
# bus id and only the device with index 6 is made visible.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "6",
    }
)

# Explicit CPU device handle.
cpu_device = torch.device("cpu")
print("CPU Device is ", cpu_device)

# Main compute device: CUDA when torch reports it as available, otherwise CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("device is ", device)

CPU Device is  cpu
device is  cuda


# Load In Data

In [5]:
# Load the training data from the working directory.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, so
# this file must come from a trusted source.
train = np.load(
    os.path.join(os.getcwd(), "data/twitch_sequence/train.data"),
    allow_pickle=True,
)

In [6]:
# Build an LSTM instance on the CPU device with OUTPUT_SIZE outputs.
# NOTE(review): `model` is not referenced by the later cells shown in this
# notebook (the TracIn calls receive the LSTM class itself) — confirm it is needed.
model = LSTM(input_size=128, output_size=OUTPUT_SIZE, hidden_dim=64, n_layers=1, device=cpu_device)

In [7]:
# Each entry of `train` is indexed as [0] for the input and [1] for its label;
# split them into two parallel lists.
# NOTE: `train` is rebound here, so re-running this cell without reloading the
# data would index into the already-split inputs.
train_num = len(train)
train_labels = [train[idx][1] for idx in range(train_num)]
train = [train[idx][0] for idx in range(train_num)]


# Subset Data
Skip this step if you want to run the full experiment; note that the full run takes a long time (~24 hours).

In [8]:
# Shuffle inputs and labels together with a fixed seed, then keep the first
# 3000 items of each.
train, train_labels = (
    shuffled[:3000]
    for shuffled in shuffle(train, train_labels, random_state=201)
)

# Deepcopy to prevent overwriting

In [9]:
# Independent deep copies of the inputs and labels; the experiment loop
# shuffles these copies rather than the originals.
train_copy, train_labels_copy = deepcopy(train), deepcopy(train_labels)

# Run Experiments

In [None]:
# Per-iteration results: `si` collects the self-influence values and `ri` the
# influence values for the randomly paired samples.
si, ri = [], []

# Keyword arguments shared by every TracIn call in this cell.
tracin_settings = dict(
    optimizer="SGD",
    paths=checkpoints,
    batch_size=2048,
    num_items=OUTPUT_SIZE,
    device=device,
)

for i in range(20):
    print("___________________________________________________________________________________")
    print(f"Iteration {i}")

    # Self influence: the same shuffled set serves as both sources and targets.
    train_random, train_labels_random = shuffle(train_copy, train_labels_copy, random_state=i)
    self_influence = approximate_tracin_batched(
        LSTM,
        sources=train_random,
        targets=train_random,
        source_labels=train_labels_random,
        target_labels=train_labels_random,
        **tracin_settings,
    )
    print(f"Self influence is: {self_influence}")
    si.append(self_influence)

    # Random influence: sources are the unshuffled `train`, targets the shuffled copy.
    # NOTE(review): this repeats the shuffle above with the same inputs and seed,
    # so it presumably yields the same permutation — confirm whether it is needed.
    train_random, train_labels_random = shuffle(train_copy, train_labels_copy, random_state=i)
    rs = approximate_tracin_batched(
        LSTM,
        sources=train,
        targets=train_random,
        source_labels=train_labels,
        target_labels=train_labels_random,
        **tracin_settings,
    )
    print(f"Random Sample {i} Influence is {rs}")
    ri.append(rs)

___________________________________________________________________________________
Iteration 0
In checkpoint number: 0
Total time for checkpoint 0 : 24.67806100845337
In checkpoint number: 1
Total time for checkpoint 1 : 12.459105730056763
In checkpoint number: 2
Total time for checkpoint 2 : 11.063999652862549
In checkpoint number: 3
Total time for checkpoint 3 : 12.087036848068237
In checkpoint number: 4
Total time for checkpoint 4 : 11.126459836959839
In checkpoint number: 5
Total time for checkpoint 5 : 14.599980592727661
In checkpoint number: 6
Total time for checkpoint 6 : 10.92798137664795
In checkpoint number: 7
Total time for checkpoint 7 : 10.986692667007446
In checkpoint number: 8
Total time for checkpoint 8 : 11.02364706993103
In checkpoint number: 9
Total time for checkpoint 9 : 7.775242567062378
In checkpoint number: 10
Total time for checkpoint 10 : 9.68164348602295
In checkpoint number: 11
Total time for checkpoint 11 : 10.861163854598999
In checkpoint number: 12
Total

Total time for checkpoint 19 : 13.25405216217041
In checkpoint number: 20
Total time for checkpoint 20 : 13.242337465286255
Total time taken is 300.5848665237427
Self influence is: 0.08289601653814316
In checkpoint number: 0
Total time for checkpoint 0 : 1.4141178131103516
In checkpoint number: 1
Total time for checkpoint 1 : 1.6256227493286133
In checkpoint number: 2
Total time for checkpoint 2 : 1.3595144748687744
In checkpoint number: 3
Total time for checkpoint 3 : 1.6997575759887695
In checkpoint number: 4
Total time for checkpoint 4 : 2.9019246101379395
In checkpoint number: 5
Total time for checkpoint 5 : 4.243244647979736
In checkpoint number: 6
Total time for checkpoint 6 : 6.272408723831177
In checkpoint number: 7
Total time for checkpoint 7 : 6.016612529754639
In checkpoint number: 8
Total time for checkpoint 8 : 1.3535501956939697
In checkpoint number: 9
Total time for checkpoint 9 : 5.641200542449951
In checkpoint number: 10
Total time for checkpoint 10 : 9.005222082138062

In [None]:
# Coerce every collected influence value to a plain Python float before printing.
ri = list(map(float, ri))
si = list(map(float, si))
print("Random influences are \n", ri)
print("Self influences are \n", si)

# Perform Statistical Tests

In [None]:
# Two-sample t-test of self influences against random influences;
# equal_var=False means equal population variances are not assumed.
welch_result = stats.ttest_ind(a=np.array(si), b=np.array(ri), equal_var=False)
print(f"Difference in population is {welch_result}")