# PyTorch test example

Source: ИАД (Intelligent Data Analysis course), ВШЭ (HSE University)

In [None]:
import torch
from torch import nn
import pandas as pd
import numpy as np
import random
from tqdm.notebook import tqdm
import torch.nn.functional as F

In [None]:
# %conda install wget

In [None]:
# Download the YearPredictionMSD archive from the UCI repository (~201 MB) and save it as data.txt.zip.
!wget -O data.txt.zip https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt.zip

--2022-10-10 16:27:15--  https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 211011981 (201M) [application/x-httpd-php]
Saving to: ‘data.txt.zip’


2022-10-10 16:27:21 (34.9 MB/s) - ‘data.txt.zip’ saved [211011981/211011981]



In [None]:
# Read the zipped CSV directly; header=None because the columns are addressed by position only.
df = pd.read_csv('data.txt.zip', header=None)
# Preview the first rows (column 0 is later used as the target, columns 1-90 as features).
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,81,82,83,84,85,86,87,88,89,90
0,2001,49.94357,21.47114,73.0775,8.74861,-17.40628,-13.09905,-25.01202,-12.23257,7.83089,...,13.0162,-54.40548,58.99367,15.37344,1.11144,-23.08793,68.40795,-1.82223,-27.46348,2.26327
1,2001,48.73215,18.4293,70.32679,12.94636,-10.32437,-24.83777,8.7663,-0.92019,18.76548,...,5.66812,-19.68073,33.04964,42.87836,-9.90378,-32.22788,70.49388,12.04941,58.43453,26.92061
2,2001,50.95714,31.85602,55.81851,13.41693,-6.57898,-18.5494,-3.27872,-2.35035,16.07017,...,3.038,26.05866,-50.92779,10.93792,-0.07568,43.2013,-115.00698,-0.05859,39.67068,-0.66345
3,2001,48.2475,-1.89837,36.29772,2.58776,0.9717,-26.21683,5.05097,-10.34124,3.55005,...,34.57337,-171.70734,-16.96705,-46.67617,-12.51516,82.58061,-72.08993,9.90558,199.62971,18.85382
4,2001,50.9702,42.20998,67.09964,8.46791,-15.85279,-16.81409,-12.48207,-9.37636,12.63699,...,9.92661,-55.95724,64.92712,-17.72522,-1.49237,-7.50035,51.76631,7.88713,55.66926,28.74903


In [None]:
# Column 0 is the regression target; every remaining column is a feature.
y = df.iloc[:, 0].values
X = df.iloc[:, 1:].values

# The first `train_size` rows form the training split, the remainder the test split.
train_size = 463715
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [None]:
# Baseline: ridge regression (alpha=10) fitted on the raw, unscaled features.
ridge = Ridge(10).fit(X_train, y_train)
# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` flag of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge.predict(X_test)))
print(f"Test: {ridge_rmse}")

Test: 9.510160746337895


In [None]:
def set_random_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random` module, NumPy's global generator and PyTorch
    (via `torch.manual_seed` and `torch.cuda.manual_seed`), and switches
    cuDNN to deterministic mode.

    params:
        seed - value passed unchanged to each seeding call
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [None]:
# Fix all RNG seeds up front so that the random steps that follow are repeatable.
set_random_seed(177)

In [None]:
class SongsDataset(torch.utils.data.Dataset):
  """Map-style dataset that pairs each feature row with its target value."""

  def __init__(self, X, y):
    # The inputs are stored as given: no copy and no tensor conversion happens here.
    self.X, self.y = X, y  # features, ground-truth answers

  def __len__(self):
    # The sample count is taken from the targets.
    return len(self.y)

  def __getitem__(self, idx):
    features = self.X[idx]
    target = self.y[idx]
    return features, target

In [None]:
from sklearn import preprocessing

In [None]:
# Normalisation recipe adapted from:
# https://intel.github.io/scikit-learn-intelex/samples/linear_regression.html
# Both scalers are fitted on the training split only and then applied to both splits.
x_scaler = preprocessing.MinMaxScaler()
x_scaler.fit(X_train)
y_scaler = preprocessing.StandardScaler()
y_scaler.fit(y_train.reshape(-1, 1))

X_train, X_test = [x_scaler.transform(part) for part in (X_train, X_test)]

# The target scaler works on 2-D input, so each target vector is turned into a
# column for the transform and flattened back afterwards.
y_train, y_test = [
    y_scaler.transform(part.reshape(-1, 1)).ravel() for part in (y_train, y_test)
]

In [None]:
# Re-seed right before building the loaders so later random operations
# (batch shuffling, weight initialisation) start from a known RNG state.
set_random_seed(177)
train_set = SongsDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)

# Evaluation data is served in its original order so that any predictions
# collected from this loader stay aligned with y_test.
test_set = SongsDataset(X_test, y_test)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=32, shuffle=False)

In [None]:
# To evaluate a generated model, replace the model defined below.

def _hidden_block(n_in, n_out):
    """Return one Linear -> BatchNorm1d -> ReLU stage mapping n_in features to n_out."""
    return [nn.Linear(n_in, n_out), nn.BatchNorm1d(n_out), nn.ReLU()]


# 90 input features are narrowed 90 -> 45 -> 15 -> 5 and mapped to one regression output.
# Layers are created in the same order as they appear in the network.
model = nn.Sequential(
    *_hidden_block(90, 45),
    *_hidden_block(45, 15),
    *_hidden_block(15, 5),
    nn.Linear(5, 1),
)

In [None]:
cnt_epochs = 10  # number of full passes over the training loader
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
criterion = F.mse_loss  # functional MSE loss
scheduler = None  # no learning-rate schedule; train() only steps a scheduler when one is given

In [None]:
# Per-batch training metric history; train() appends one get_loss(...) value per batch.
loss_values = []

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
def train(model, optimizer, criterion, scheduler, cnt_epochs, train_loader, test_loader):
    '''
    Fit `model` on `train_loader` for `cnt_epochs` epochs.

    After every batch, the value returned by get_loss (square root of
    `criterion` between the de-standardised predictions and targets) is
    appended to the module-level list `loss_values`.

    params:
        model - torch.nn.Module to be fitted
        optimizer - model optimizer
        criterion - loss callable, invoked as criterion(predictions, targets)
        scheduler - learning-rate scheduler stepped once per epoch, or None
        cnt_epochs - number of passes over train_loader
        train_loader - torch.utils.data.Dataloader with train set
        test_loader - torch.utils.data.Dataloader with test set
                      (accepted for interface compatibility; not used here)
    '''
    # Run on whatever device the model already lives on instead of relying on
    # a global `device` name that this notebook never defines.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Training mode: BatchNorm layers normalise with batch statistics and
    # update their running estimates.
    model.train()

    for epoch in range(cnt_epochs):
        for x_batch, y_batch in tqdm(train_loader):
            inputs = x_batch.float().to(device)
            targets = y_batch.float().to(device)

            # Clear gradients first so nothing stale leaks into this step.
            optimizer.zero_grad()
            y_pred = model(inputs).ravel()
            loss = criterion(y_pred, targets)
            loss.backward()
            optimizer.step()

            # Logging only: map predictions back to the original target scale
            # (as a column, which is what the scaler expects) and record the
            # batch metric. y_batch is still standardised; get_loss inverts it.
            pred_scaled = y_pred.detach().cpu().numpy().reshape(-1, 1)
            pred_original = torch.tensor(y_scaler.inverse_transform(pred_scaled))
            loss_values.append(get_loss(criterion, pred_original, y_batch, y_scaler))

        if scheduler is not None:
            scheduler.step()


def test(model, criterion, test_loader):
    '''
    Predict the test targets and return them in original (de-standardised) units.

    The model is switched to evaluation mode for the forward pass, so BatchNorm
    layers use their running statistics instead of statistics of the test
    batch, and its previous train/eval mode is restored afterwards.

    params:
        model - torch.nn.Module to be evaluated on test set
        criterion - loss function from torch.nn (not used here)
        test_loader - torch.utils.data.Dataloader with test set
                      (not used here: features are read from the module-level
                      X_test array and pushed through the model in one pass)
    ----------
    returns:
        predicts - float64 torch.tensor with one row per row of X_test and a
                   single column (the model's output width), mapped back
                   through y_scaler.inverse_transform
    '''
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            predictions = model(torch.from_numpy(X_test).float())
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    # Hand sklearn a float64 NumPy array rather than a torch tensor, so the
    # result is float64 regardless of how the scaler converts its input.
    predictions_np = predictions.double().cpu().numpy()
    return torch.from_numpy(y_scaler.inverse_transform(predictions_np))


def get_loss(criterion, predictions, y_test, y_scaler):
  """Square root of `criterion` between predictions and de-standardised targets.

  params:
      criterion - loss callable, invoked as criterion(predictions, answers)
      predictions - tensor compared against the restored targets
                    (expected to be in original target units already)
      y_test - scaled targets; reshaped into a column before inverse scaling
      y_scaler - fitted scaler exposing inverse_transform
  returns:
      `.tolist()` of the rooted loss (a plain float when the loss is a scalar)
  """
  scaled_column = y_test.reshape(-1, 1)
  restored = y_scaler.inverse_transform(scaled_column)
  answers = torch.from_numpy(restored)
  rooted = torch.sqrt(criterion(predictions, answers))
  return rooted.tolist()

In [None]:
# Fit the network; one metric value per batch is appended to loss_values.
train(model, optimizer, criterion, scheduler, cnt_epochs, train_loader, test_loader)

  0%|          | 0/14492 [00:00<?, ?it/s]

  0%|          | 0/14492 [00:00<?, ?it/s]

  0%|          | 0/14492 [00:00<?, ?it/s]

  0%|          | 0/14492 [00:00<?, ?it/s]

  0%|          | 0/14492 [00:00<?, ?it/s]

  0%|          | 0/14492 [00:00<?, ?it/s]

  0%|          | 0/14492 [00:00<?, ?it/s]

  0%|          | 0/14492 [00:00<?, ?it/s]

  0%|          | 0/14492 [00:00<?, ?it/s]

  0%|          | 0/14492 [00:00<?, ?it/s]

In [None]:
# Predictions for the test split, mapped back to the original target scale by test().
pred = test(model, criterion, test_loader)
pred

tensor([[1999.6498],
        [2003.9419],
        [2001.8908],
        ...,
        [1999.1221],
        [2001.6094],
        [2001.0342]], dtype=torch.float64)

In [None]:
# Test-set RMSE in original units: y_test is still standardised here and get_loss inverts the scaling.
get_loss(criterion, pred, y_test, y_scaler)

8.775110881783505