In [None]:
# https://docs.pytorch.org/tutorials/intermediate/FSDP_tutorial.html
# https://discuss.pytorch.org/t/gradient-accumulation-with-ddp-no-sync-interface/169593/3
# https://docs.pytorch.org/tutorials/intermediate/ddp_tutorial.html
# https://docs.pytorch.org/tutorials/beginner/dist_overview.html
# https://docs.pytorch.org/tutorials/intermediate/dist_tuto.html   <-- IMPORTANT
# https://sebarnold.net/dist_blog/

# access GPUs

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import gc
import torch
import numpy as np

def get_cuda_objects():
    """Print one summary line for every CUDA tensor tracked by the garbage collector.

    Each line shows the tensor's Python type, size, dtype and device, followed
    by its storage footprint in bits and in bytes. Nothing is returned.
    """
    for candidate in gc.get_objects():
        # Skip anything that is not a tensor living on a CUDA device.
        if not (torch.is_tensor(candidate) and candidate.is_cuda):
            continue

        n_bytes = candidate.element_size() * candidate.nelement()
        print(
            type(candidate),
            candidate.size(),
            candidate.dtype,
            candidate.device,
            "| bits: ",
            n_bytes * 8,
            "| bytes: ",
            n_bytes,
        )




In [None]:
# Sanity check: does this runtime expose CUDA to PyTorch?
torch.cuda.is_available()

In [None]:
# Describe every visible CUDA device: its name, the full properties record,
# and the index reported by the torch.cuda.device context object.
for device_idx in range(torch.cuda.device_count()):
   device_props = torch.cuda.get_device_properties(device_idx)
   print(device_props.name)
   print(device_props)
   print(torch.cuda.device(device_idx).idx)

In [None]:
# CUDA device indices 0 .. device_count - 1; later cells use these both as
# `.to(...)` targets and as list positions.
devices = list(range(torch.cuda.device_count()))
devices

In [None]:
# Smoke test: place one tiny tensor on each device.
[torch.tensor([1, 2, 3, 4], device=device) for device in devices]

In [None]:
# List the CUDA tensors that are alive right now (one printed line per tensor).
get_cuda_objects()

In [None]:
# Load the CSV without a header row. Later cells read column 0 as the class
# label and the remaining columns as the input features.
data = pd.read_csv("sample_data/mnist_train_small.csv", header=None)
# Display (n_rows, n_columns) as a quick sanity check.
data.shape

In [None]:
# One chunk of rows per device.
# Split the row *positions* with NumPy and select them with .iloc rather than
# passing the DataFrame itself to np.array_split: that path goes through
# DataFrame.swapaxes, which pandas has deprecated. Chunk sizes and row order
# are the same as np.array_split(data, len(devices)), and each chunk is still
# a DataFrame that keeps its original index labels.
data_chunks = [
    data.iloc[row_positions]
    for row_positions in np.array_split(np.arange(len(data)), len(devices))
]

In [None]:
# train_test_split returns [train, test] for each chunk, so the comprehension
# yields one pair per chunk. Unpacking that list straight into two names only
# runs when there are exactly two chunks, and then binds chunk 0's pair to
# `dataset` and chunk 1's pair to `dataset_test`. Separate the pairs explicitly
# so `dataset[k]` / `dataset_test[k]` are the train / test frames of chunk k
# for any number of chunks.
chunk_splits = [train_test_split(data_chunk, test_size=0.05) for data_chunk in data_chunks]
dataset = [train_part for train_part, _ in chunk_splits]
dataset_test = [test_part for _, test_part in chunk_splits]

In [None]:
# Renumber the rows of every split from 0. Rebinding to fresh frames (instead
# of mutating with inplace=True inside a side-effect loop) leaves the frames
# produced by the previous cell untouched.
dataset = [frame.reset_index(drop=True) for frame in dataset]
dataset_test = [frame.reset_index(drop=True) for frame in dataset_test]


In [None]:
def _features(frame):
    """Return every column after the first as a float32 tensor (still on CPU)."""
    return torch.tensor(frame.iloc[:, 1:].to_numpy(), dtype=torch.float32)


def _one_hot_labels(frame):
    """Return column 0 one-hot encoded over 10 classes as a float16 tensor (still on CPU)."""
    return F.one_hot(torch.tensor(frame.iloc[:, 0].to_numpy()), 10).to(torch.float16)


# Pair each split with its own device. Wrapping the comprehensions in a loop
# over the splits rebinds the four lists on every pass, so only the last split
# survives and it is copied to every device. Zipping splits with devices gives
# position k the data of split k, resident on device k.
X_train = [_features(d).to(device) for d, device in zip(dataset, devices)]
Y_train = [_one_hot_labels(d).to(device) for d, device in zip(dataset, devices)]
X_test = [_features(d_test).to(device) for d_test, device in zip(dataset_test, devices)]
Y_test = [_one_hot_labels(d_test).to(device) for d_test, device in zip(dataset_test, devices)]

In [None]:
# One shuffled, batch-size-128 loader per entry of the per-device tensor lists;
# loader k wraps the k-th feature/label pair.
train_loader = [
    DataLoader(TensorDataset(features, labels), batch_size=128, shuffle=True)
    for features, labels in zip(X_train, Y_train)
]
test_loader = [
    DataLoader(TensorDataset(features, labels), batch_size=128, shuffle=True)
    for features, labels in zip(X_test, Y_test)
]


In [None]:
# Preview one training sample: draw a batch, show its first row as a 28x28
# image and display the matching one-hot target.
# Use loader 1 when it exists (as before) but fall back to loader 0 so the
# cell also runs when only a single loader was built.
# NOTE(review): `input` shadows the Python builtin; the name is kept so any
# later cell that reads `input` / `target` keeps working.
preview_idx = min(1, len(train_loader) - 1)
input, target = next(iter(train_loader[preview_idx]))
plt.imshow(input[0].view(28, 28).to("cpu"))
target[0]

In [None]:

class MLP(nn.Module):
    """Three-layer fully connected network with ReLU activations.

    Layout: in_features -> hidden_features -> hidden_features -> num_classes.
    The defaults reproduce the original fixed sizes (784 -> 3136 -> 3136 -> 10),
    so ``MLP()`` builds exactly the same architecture as before.

    Args:
        in_features: size of each input row.
        hidden_features: width of both hidden layers.
        num_classes: number of output logits.
    """

    def __init__(self, in_features=28**2, hidden_features=28**2*4, num_classes=10):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, hidden_features)
        self.fc3 = nn.Linear(hidden_features, num_classes)

    def forward(self, x):
        """Return raw (un-normalised) logits for a batch ``x`` of shape (..., in_features)."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        logits = self.fc3(x)
        return logits

In [None]:
# One model replica per device.
# NOTE(review): this assumes the replicas are meant for data-parallel training,
# where every replica must start from the same weights. Calling MLP() once per
# device gives each replica its own random initialisation, so a single
# reference state is copied into every replica before it is moved to its device.
_reference_state = MLP().state_dict()
models = []
for device in devices:
    replica = MLP()
    replica.load_state_dict(_reference_state)  # copies values; storage is not shared
    models.append(replica.to(device))

In [None]:
# Show the first parameter tensor of one replica (its repr includes the device).
# Replica 1 is used when it exists, otherwise replica 0, so the cell also runs
# when only one model was created.
next(models[min(1, len(models) - 1)].parameters())

In [None]:
# Print each per-device DataLoader object (the loader itself, not its batches).
for loader in train_loader:
    print(loader)

# pytorch way

In [None]:
# look at the scripts here:
# https://drive.google.com/drive/folders/10uVLzn0tgllbjSLPHUKtVJNluwzT6DBD?usp=drive_link