In [114]:
import pandas as pd
import numpy as np
import seaborn as sns

### Грибы: препроцессинг и реализация на PyTorch

In [None]:
from ucimlrepo import fetch_ucirepo 
  
# Download dataset id 73 through the ucimlrepo client.
mushroom = fetch_ucirepo(id=73)

# Pull the feature table and the target table (pandas dataframes) off the result.
X, y = mushroom.data.features, mushroom.data.targets

In [85]:
# Count the missing entries in every feature column.
X.isnull().sum()

cap-shape                      0
cap-surface                    0
cap-color                      0
bruises                        0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  2480
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-type                      0
veil-color                     0
ring-number                    0
ring-type                      0
spore-print-color              0
population                     0
habitat                        0
dtype: int64

In [None]:
# Treat a missing `stalk-root` as its own category, labelled 'n'.
# assign() returns a new frame, so this does not write into the object handed
# back by the dataset loader (which may be a slice of a larger table and can
# trigger pandas' chained-assignment warning), and re-running the cell is safe.
X = X.assign(**{'stalk-root': X['stalk-root'].fillna('n')})

In [87]:
# Number of feature rows that repeat an earlier row exactly.
X.duplicated(keep='first').sum()

0

In [117]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [118]:
from sklearn.preprocessing import TargetEncoder

# Target-encode every categorical feature.
# fit_transform() cross-fits internally with shuffling, so random_state is
# pinned to make the encoded training matrix reproducible between runs.
enc_auto = TargetEncoder(smooth="auto", random_state=42)

# Build fresh float frames from the encoder output instead of writing numbers
# into the original object-dtype frames in place; index and column labels are
# carried over so downstream cells see the same layout.
X_train = pd.DataFrame(
    enc_auto.fit_transform(X_train, y_train.values.ravel()),
    index=X_train.index,
    columns=X_train.columns,
    dtype=float,
)
# The test split only goes through transform(): it never sees its own labels.
X_test = pd.DataFrame(
    enc_auto.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
    dtype=float,
)

# One-hot the label and drop the first level, leaving a single 0/1 column.
y_train = pd.get_dummies(y_train, drop_first=True, dtype=int)
y_test = pd.get_dummies(y_test, drop_first=True, dtype=int)

In [119]:
import torch
from torch.utils.data import DataLoader, Dataset

class MyDataset(Dataset):
    """Map-style dataset pairing a float feature tensor with an int label tensor.

    Both arguments must expose a ``.values`` array (e.g. pandas objects).
    """

    def __init__(self, X, y):
        features, labels = X.values, y.values
        self.X = torch.tensor(features, dtype=torch.float32)
        self.y = torch.tensor(labels, dtype=torch.int32)

    def __len__(self):
        # One sample per row of the feature tensor.
        return len(self.X)

    def __getitem__(self, index):
        sample, label = self.X[index], self.y[index]
        return sample, label

In [120]:
BATCH_SIZE = 64

# Batch both splits with the same batch size; only the training loader
# reshuffles its samples.
train_loader = DataLoader(
    MyDataset(X_train, y_train),
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_loader = DataLoader(
    MyDataset(X_test, y_test),
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [121]:
from torch import nn
class MLP_Classification(nn.Module):
    """Fully connected classifier: Linear -> ReLU -> Linear -> Sigmoid -> Linear.

    Args:
        in_features: width of each input row.
        num_classes: width of the output layer.
        hidden_size: width of both hidden layers.
    """

    def __init__(self, in_features, num_classes, hidden_size):
        super().__init__()
        layers = [
            nn.Linear(in_features, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.Sigmoid(),
            nn.Linear(hidden_size, num_classes),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the last layer's raw output."""
        return self.model(x)

In [122]:
NUM_CLASSES = 2
HIDDEN_SIZE = 32

# Seed torch's global RNG so the weight initialisation below is repeatable
# when the notebook is re-run from this cell.
torch.manual_seed(42)

# Size the input layer from the encoded training matrix that is actually fed
# to the network.
model = MLP_Classification(X_train.shape[1], NUM_CLASSES, HIDDEN_SIZE)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
criterion = nn.CrossEntropyLoss()

In [None]:
from tqdm.notebook import tqdm

num_epochs = 100
pbar = tqdm(range(1, num_epochs + 1))

for epoch in pbar:
    # Running sums of batch loss weighted by batch size; turned into
    # per-sample averages at the end of the epoch.
    train_loss, test_loss = 0.0, 0.0

    model.train()
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        predictions = model(X_batch)
        # Labels are expected as an int column of shape (batch, 1) while the
        # loss needs int64 class indices of shape (batch,). squeeze(1) removes
        # only the label axis, so a trailing batch with a single row keeps its
        # batch dimension.
        loss = criterion(predictions, y_batch.squeeze(1).long())
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * X_batch.shape[0]

    model.eval()
    # Evaluation needs no gradients: no_grad() avoids building the autograd
    # graph for every test batch.
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch.squeeze(1).long())
            test_loss += loss.item() * X_batch.shape[0]

    train_loss /= len(train_loader.dataset)
    test_loss /= len(test_loader.dataset)
    pbar.set_postfix({'train loss': train_loss, 'test loss': test_loss})

In [124]:
NUM_CLASSES = 2
HIDDEN_SIZE = 32

# Seed torch's global RNG so this second model starts from a repeatable
# initialisation when the cell is re-run.
torch.manual_seed(42)

# Fresh network for the Adam experiment; the input layer is sized from the
# encoded training matrix that is actually fed to it.
model = MLP_Classification(X_train.shape[1], NUM_CLASSES, HIDDEN_SIZE)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-2)
criterion = nn.CrossEntropyLoss()

In [None]:
from tqdm.notebook import tqdm

num_epochs = 100
pbar = tqdm(range(1, num_epochs + 1))

for epoch in pbar:
    # Running sums of batch loss weighted by batch size; turned into
    # per-sample averages at the end of the epoch.
    train_loss, test_loss = 0.0, 0.0

    model.train()
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        predictions = model(X_batch)
        # CrossEntropyLoss needs int64 class indices of shape (batch,), but
        # the loader yields an int32 label column of shape (batch, 1).
        # Passing y_batch through unchanged raises, so drop the label axis
        # and cast, exactly as the SGD loop does.
        loss = criterion(predictions, y_batch.squeeze(1).long())
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * X_batch.shape[0]

    model.eval()
    # Evaluation needs no gradients: no_grad() avoids building the autograd
    # graph for every test batch.
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch.squeeze(1).long())
            test_loss += loss.item() * X_batch.shape[0]

    train_loss /= len(train_loader.dataset)
    test_loss /= len(test_loader.dataset)
    pbar.set_postfix({'train loss': train_loss, 'test loss': test_loss})

### Собственная реализация

In [125]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules (such as the local `modules` package used below) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [126]:
import modules as mm
from tqdm.notebook import tqdm

IN_FEATURES, NUM_CLASSES, HIDDEN_SIZE = X.shape[1], 2, 32

# Loaders from the hand-written `modules` package are given raw arrays;
# the label column is flattened with squeeze() first.
train_loader = mm.DataLoader(
    X_train.values,
    y_train.values.squeeze(),
    batch_size=64,
    shuffle=True,
)
test_loader = mm.DataLoader(
    X_test.values,
    y_test.values.squeeze(),
    batch_size=64,
    shuffle=False,
)

# Same layer layout as the PyTorch classifier: Linear -> ReLU -> Linear -> Sigmoid -> Linear.
model = mm.Sequential(
    mm.Linear(in_features=IN_FEATURES, out_features=HIDDEN_SIZE),
    mm.ReLU(),
    mm.Linear(in_features=HIDDEN_SIZE, out_features=HIDDEN_SIZE),
    mm.Sigmoid(),
    mm.Linear(in_features=HIDDEN_SIZE, out_features=NUM_CLASSES),
)
optimizer = mm.SGD(model, lr=1e-3, momentum=0.9)
criterion = mm.CrossEntropyLoss()

In [None]:
num_epochs = 100
pbar = tqdm(range(1, num_epochs + 1))

for epoch in pbar:
    train_loss = 0.0
    test_loss = 0.0

    # Optimisation pass over the training batches.
    model.train()
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        predictions = model(X_batch)
        loss = criterion(predictions, y_batch)
        # Gradient of the loss w.r.t. the predictions, pushed back through
        # the model together with the batch input.
        grad_output = criterion.backward(predictions, y_batch)
        model.backward(X_batch, grad_output)
        optimizer.step()

        train_loss += loss * X_batch.shape[0]

    # Forward-only pass over the held-out batches.
    model.eval()
    for X_batch, y_batch in test_loader:
        predictions = model(X_batch)
        loss = criterion(predictions, y_batch)
        test_loss += loss * X_batch.shape[0]

    # Turn the size-weighted sums into per-sample averages.
    train_loss /= train_loader.num_samples()
    test_loss /= test_loader.num_samples()
    pbar.set_postfix({'train loss': train_loss, 'test loss': test_loss})

### Компьютеры: реализация на PyTorch

In [1]:
# Download the laptop-price dataset archive with the Kaggle CLI, then unpack
# it into the current working directory.
# NOTE(review): presumably needs Kaggle API credentials configured on this machine — confirm.
! kaggle datasets download -d mrsimple07/laptoppriceprediction
! tar -xf laptoppriceprediction.zip

Downloading laptoppriceprediction.zip to c:\Users\alex\vsCodeProjects\intro-to-dl-hse-2022-2023\intro-to-dl-hse-2022-2023\homeworks-small




  0%|          | 0.00/39.1k [00:00<?, ?B/s]
100%|██████████| 39.1k/39.1k [00:00<00:00, 586kB/s]


In [135]:
# Load the extracted CSV from the working directory.
data = pd.read_csv(filepath_or_buffer="Laptop_price.csv")

In [136]:
from sklearn.model_selection import train_test_split

# `Price` is the regression target; every other column is a feature.
# 20% of the rows are held out, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['Price']),
    data['Price'],
    test_size=0.2,
    random_state=42,
)

In [137]:
from category_encoders import TargetEncoder

# Fit the encoder on the training split only, so the test prices never
# influence the encoding.
enc = TargetEncoder(return_df=True).fit(X_train['Brand'], y_train)

# assign() returns new frames instead of writing into the frames produced by
# train_test_split, which can trigger pandas' SettingWithCopyWarning.
X_train = X_train.assign(Brand=enc.transform(X_train['Brand']))
X_test = X_test.assign(Brand=enc.transform(X_test['Brand']))

In [138]:
import torch
from torch.utils.data import DataLoader, Dataset

class MyDataset(Dataset):
    """Map-style dataset pairing a float feature tensor with a float target tensor.

    Both arguments must expose a ``.values`` array (e.g. pandas objects).
    """

    def __init__(self, X, y):
        features, targets = X.values, y.values
        self.X = torch.tensor(features, dtype=torch.float32)
        self.y = torch.tensor(targets, dtype=torch.float32)

    def __len__(self):
        # One sample per row of the feature tensor.
        return len(self.X)

    def __getitem__(self, index):
        sample, target = self.X[index], self.y[index]
        return sample, target

In [139]:
BATCH_SIZE = 64

# Batch both splits with the same batch size; only the training loader
# reshuffles its samples.
train_loader = DataLoader(
    MyDataset(X_train, y_train),
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_loader = DataLoader(
    MyDataset(X_test, y_test),
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [140]:
from torch import nn
class MLP_Regression(nn.Module):
    """Fully connected regressor: Linear -> ReLU -> Linear -> ReLU -> Linear(1).

    Args:
        in_features: width of each input row.
        hidden_size: width of both hidden layers.
    """

    def __init__(self, in_features, hidden_size):
        super().__init__()
        layers = [
            nn.Linear(in_features, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the layer stack; the output has one column per row."""
        return self.model(x)

In [141]:
HIDDEN_SIZE = 128

# Seed torch's global RNG so the weight initialisation below is repeatable
# when the notebook is re-run from this cell.
torch.manual_seed(42)

# Input width comes from the prepared training frame.
model = MLP_Regression(X_train.shape[1], HIDDEN_SIZE)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=0.1)
criterion = nn.MSELoss()

In [None]:
from tqdm.notebook import tqdm

num_epochs = 200
pbar = tqdm(range(1, num_epochs + 1))

for epoch in pbar:
    # Running sums of batch loss weighted by batch size; turned into
    # per-sample averages at the end of the epoch.
    train_loss, test_loss = 0.0, 0.0

    model.train()
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        predictions = model(X_batch)
        # The model emits (batch, 1); give the 1-D targets the same trailing
        # axis so the loss compares element-wise instead of broadcasting.
        loss = criterion(predictions, y_batch.unsqueeze(1))
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * X_batch.shape[0]

    model.eval()
    # Evaluation needs no gradients: no_grad() avoids building the autograd
    # graph for every test batch.
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch.unsqueeze(1))
            test_loss += loss.item() * X_batch.shape[0]

    train_loss /= len(train_loader.dataset)
    test_loss /= len(test_loader.dataset)
    pbar.set_postfix({'train loss': train_loss, 'test loss': test_loss})

### Моя реализация

In [143]:
import modules as mm
from tqdm.notebook import tqdm

# NUM_CLASSES doubles as the output width here: a single regression output.
IN_FEATURES, NUM_CLASSES, HIDDEN_SIZE = X_train.shape[1], 1, 32

# Loaders from the hand-written `modules` package are given raw arrays.
train_loader = mm.DataLoader(
    X_train.values,
    y_train.values.squeeze(),
    batch_size=64,
    shuffle=True,
)
test_loader = mm.DataLoader(
    X_test.values,
    y_test.values.squeeze(),
    batch_size=64,
    shuffle=False,
)

# Layer layout: Linear -> ReLU -> Linear -> ReLU -> Linear.
model = mm.Sequential(
    mm.Linear(in_features=IN_FEATURES, out_features=HIDDEN_SIZE),
    mm.ReLU(),
    mm.Linear(in_features=HIDDEN_SIZE, out_features=HIDDEN_SIZE),
    mm.ReLU(),
    mm.Linear(in_features=HIDDEN_SIZE, out_features=NUM_CLASSES),
)
optimizer = mm.Adam(model, lr=1e-3, weight_decay=0.1)
criterion = mm.MSELoss()

In [None]:
num_epochs = 100
pbar = tqdm(range(1, num_epochs + 1))

for epoch in pbar:
    train_loss = 0.0
    test_loss = 0.0

    # Optimisation pass over the training batches.
    model.train()
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        predictions = model(X_batch)
        # Give the targets the same (rows, columns) layout as the predictions.
        target_shape = (predictions.shape[0], predictions.shape[1])
        y_batch = np.reshape(y_batch, target_shape)
        loss = criterion(predictions, y_batch)
        # Gradient of the loss w.r.t. the predictions, pushed back through
        # the model together with the batch input.
        grad_output = criterion.backward(predictions, y_batch)
        model.backward(X_batch, grad_output)
        optimizer.step()

        train_loss += loss * X_batch.shape[0]

    # Forward-only pass over the held-out batches.
    model.eval()
    for X_batch, y_batch in test_loader:
        predictions = model(X_batch)
        target_shape = (predictions.shape[0], predictions.shape[1])
        y_batch = np.reshape(y_batch, target_shape)
        loss = criterion(predictions, y_batch)
        test_loss += loss * X_batch.shape[0]

    # Turn the size-weighted sums into per-sample averages.
    train_loss /= train_loader.num_samples()
    test_loss /= test_loader.num_samples()
    pbar.set_postfix({'train loss': train_loss, 'test loss': test_loss})