In [1]:
import pandas as pd
import numpy as np
import nltk
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from torch.utils.data import Dataset, DataLoader, random_split
import torch
import torch.nn as nn
import torch.functional as F
from tqdm import tqdm
import gensim.downloader as api

CPU = torch.device('cpu')
GPU = torch.device('cuda')

In [2]:
def to_device(data, device):
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    if isinstance(data, dict):
        return dict((k, to_device(v, device)) for k, v in data.items())
    return data.to(device, non_blocking=True)

class DeviceDataLoader():
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device
        
    def __iter__(self):
        for b in self.dl: 
            yield to_device(b, self.device)

    def __len__(self):
        return len(self.dl)

# Предварительная обработка данных

In [3]:
DATA_DIR = './train_data.csv'

class SalaryDataset(Dataset):
    MIN_COUNT = 10
    PAD, UNK = 'PAD', 'UNK'
    PAD_IX, UNK_IX = 0, 1
    TEXT_COLS = ['Title', 'FullDescription']
    CATEGORIAL_COLS = ['Category', 'Company', 'LocationNormalized', 'ContractType', 'ContractTime']
    TARGET_COL = 'Log1pSalary'
    MAX_TITLE_LENGHT = 20
    MAX_DESC_LENGHT = 500

    def _process_data(self):
        self._data[self.TARGET_COL] = np.log1p(self._data['SalaryNormalized']).astype('float32')
        self._data[self.CATEGORIAL_COLS] = self._data[self.CATEGORIAL_COLS].fillna('NaN')
        self._data[self.TEXT_COLS] = self._data[self.TEXT_COLS].fillna('NaN')
        tokenizer = nltk.tokenize.WordPunctTokenizer()
        self._data[self.TEXT_COLS] = self._data[self.TEXT_COLS].applymap(lambda x: " ".join(tokenizer.tokenize(x.lower())))
        for col in self._data[self.TEXT_COLS]:
            for line in self._data[col].values:
                self._tok_cntr.update(line.split(" "))

        self._tokens = sorted(t for t, c in self._tok_cntr.items() if c >= self.MIN_COUNT)
        self._tokens = [self.PAD, self.UNK] + self._tokens
        self._token_to_id = {t: i for i, t in enumerate(self._tokens)}
        top_companies, top_counts = zip(*Counter(self._data['Company']).most_common(1000))
        recognized_companies = set(top_companies)
        self._data["Company"] = self._data["Company"].apply(lambda comp: comp if comp in recognized_companies else "Other")
        self._categorical_vectorizer = DictVectorizer(dtype=np.float32, sparse=False).fit(self._data[self.CATEGORIAL_COLS].apply(dict, axis=1))

    def __init__(self, path: str):
        self._data = pd.read_csv(path)
        self._tok_cntr = Counter()
        self._process_data()

    def __getitem__(self, i):
        row = self._data[i:i+1]

        title = row['Title'].values[0]
        desc = row['FullDescription'].values[0]

        title_vals_encoded = [self._token_to_id.get(tok, self.UNK_IX) for tok in str.split(title, ' ')]
        desc_vals_encoded = [self._token_to_id.get(tok, self.UNK_IX) for tok in str.split(desc, ' ')]

        return {
            'Title': title_vals_encoded,
            'FullDescription': desc_vals_encoded,
            self.TARGET_COL: row[self.TARGET_COL].values[0],
            'Categorical': self._categorical_vectorizer.transform(row[self.CATEGORIAL_COLS].apply(dict, axis=1)).flatten().tolist()
        }

    def __len__(self):
        return len(self._data)

### Создаем датасет и разбиваем на тренировки / тесты / подмножества проверки

In [4]:
dataset = SalaryDataset(DATA_DIR)
NUM_TOKENS = len(dataset._tokens)
NUM_CAT_FEATURES = len(dataset._categorical_vectorizer.vocabulary_)

train_size = round(len(dataset)*0.7)
val_size = round((len(dataset) - train_size) * (1/3))
test_size = (len(dataset) - train_size) - val_size
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size], torch.Generator().manual_seed(12))


def collate_fn(data):
    collated = dict(zip(data[0].keys(), [[], [], [], []]))

    for d in data:
        for k, v in d.items():
            if k == 'Title':
                v.extend([SalaryDataset.PAD_IX] * (SalaryDataset.MAX_TITLE_LENGHT - len(v))) # padding
                v = v[:SalaryDataset.MAX_TITLE_LENGHT]
                collated[k].append(v)
            elif k == 'FullDescription':
                v.extend([SalaryDataset.PAD_IX] * (SalaryDataset.MAX_DESC_LENGHT - len(v)))  # padding
                v = v[:SalaryDataset.MAX_DESC_LENGHT]
                collated[k].append(v)
            else:
                collated[k].append(v)

    for k, v in collated.items():
        t = torch.float32 if k in [SalaryDataset.TARGET_COL, 'Categorical'] else torch.int32
        collated[k] = torch.as_tensor(v, dtype=t)
        
    return collated


train_loader = DataLoader(train_dataset, batch_size=128, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=128, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=128, collate_fn=collate_fn)

# Глубокое обучение

In [5]:
class SalaryPredictor(nn.Module):
    def __init__(self, n_tokens=NUM_TOKENS, n_cat_features=NUM_CAT_FEATURES, hid_size=8):
        super().__init__()
        self.n_tokens = n_tokens
        self.n_cat_features = n_cat_features
        self.hid_size = hid_size
        self.embedder = nn.Embedding(n_tokens, hid_size)
        self.title_encoder = nn.Sequential(
            nn.Conv1d(hid_size, hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.description_encoder = nn.Sequential(
            nn.Conv1d(hid_size, hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.categorical_encoder = nn.Sequential(
            nn.Linear(n_cat_features, hid_size * 2),
            nn.ReLU(),
            nn.Linear(hid_size * 2, hid_size * 2),
            nn.ReLU()
        )
        self.final_predictor = nn.Sequential(
            nn.Linear(hid_size * 4, hid_size),
            nn.ReLU(),
            nn.Linear(hid_size, 1)
        )

    def forward(self, batch):
        title_embeddings = self.embedder(batch['Title']).permute(0, 2, 1)
        title_features = self.title_encoder(title_embeddings).squeeze()
        description_embeddings = self.embedder(batch['FullDescription']).permute(0, 2, 1)
        description_features = self.description_encoder(description_embeddings).squeeze()
        categorical_features = self.categorical_encoder(batch['Categorical'])
        features = torch.cat([title_features, description_features, categorical_features], dim=1)
        return self.final_predictor(features).squeeze()

### Evaluate

In [6]:
def evaluate(model, device=None):
    squared_error = abs_error = num_samples = 0.0

    loader = val_loader if not device else DeviceDataLoader(val_loader, device)

    model.eval()
    with torch.no_grad():
        for batch in loader:
            pred = model(batch)
            squared_error += torch.mean(torch.square(pred - batch[SalaryDataset.TARGET_COL]))
            abs_error += torch.mean(torch.abs(pred - batch[SalaryDataset.TARGET_COL]))
            num_samples += len(batch)
    mse = squared_error.detach().cpu().numpy() / num_samples
    mae = abs_error.detach().cpu().numpy() / num_samples

    return mse, mae

### Train

In [7]:
def train(model, optimizer, epoches=5, device=None, criterion=nn.MSELoss(reduction='mean')):
    loader = train_loader

    if device:
        model.to(device)
        loader = DeviceDataLoader(train_loader, device)

    for epoch in range(epoches):
        model.train()
        for batch in tqdm(loader):
            optimizer.zero_grad(set_to_none=True)
            pred = model(batch)
            loss = criterion(pred, batch[SalaryDataset.TARGET_COL])
            loss.backward()
            optimizer.step()

        mse, mae = evaluate(model, device)
        print(f'Epoch: {epoch+1} | Loss: {loss.item()} | Validation: MSE={mse}/MAE={mae}')

    if device:
        model.cpu()

### Test

In [8]:
def test(model, device=None):
    squared_error = abs_error = num_samples = 0.0
    loader = test_loader

    if device:
        model.to(device)
        loader = DeviceDataLoader(test_loader, device)

    model.eval()
    with torch.no_grad():
        for x in loader:
            pred = model(x)
            squared_error += torch.mean(torch.square(pred - x[SalaryDataset.TARGET_COL]))
            abs_error += torch.mean(torch.abs(pred - x[SalaryDataset.TARGET_COL]))
            num_samples += len(x)

    mse = squared_error.detach().cpu().numpy() / num_samples
    mae = abs_error.detach().cpu().numpy() / num_samples

    if device:
        model.cpu()

    return mse, mae

# Задание 1

### Стартовые показатели

In [9]:
model = SalaryPredictor()
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), epoches=3)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1339/1339 [04:20<00:00,  5.14it/s]


Epoch: 1 | Loss: 0.2022046446800232 | Validation: MSE=1.1587030092875164/MAE=0.5278991063435873


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1339/1339 [04:26<00:00,  5.02it/s]


Epoch: 2 | Loss: 0.1151353195309639 | Validation: MSE=0.9814110596974691/MAE=0.48653586705525714


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1339/1339 [04:30<00:00,  4.95it/s]


Epoch: 3 | Loss: 0.1103941947221756 | Validation: MSE=0.9770181179046631/MAE=0.48610806465148926


### Вместе с BatchNorm и LayerNorm:

In [10]:
class BatchLayerNormSalaryPredictor(SalaryPredictor):
    def __init__(self):
        super().__init__()
        self.title_encoder = nn.Sequential(
            nn.Conv1d(self.hid_size, self.hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.LayerNorm(SalaryDataset.MAX_TITLE_LENGHT-1),
            nn.ReLU(),
            nn.BatchNorm1d(self.hid_size),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.description_encoder = nn.Sequential(
            nn.Conv1d(self.hid_size, self.hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.LayerNorm(SalaryDataset.MAX_DESC_LENGHT-1),
            nn.ReLU(),
            nn.BatchNorm1d(self.hid_size),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.categorical_encoder = nn.Sequential(
            nn.Linear(self.n_cat_features, self.hid_size * 2),
            nn.ReLU(),
            nn.LayerNorm(self.hid_size * 2),
            nn.Linear(self.hid_size * 2, self.hid_size * 2),
            nn.ReLU()
        )
        self.final_predictor = nn.Sequential(
            nn.Linear(self.hid_size * 4, self.hid_size),
            nn.ReLU(),
            nn.LayerNorm(self.hid_size),
            nn.Linear(self.hid_size, 1)
        )

### После интеграции BatchNorm и LayerNorm сеть стала справляться удачнее, лучше всего видно по значениям MSE/MAE

In [11]:
model = BatchLayerNormSalaryPredictor()
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), epoches=3)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1339/1339 [04:40<00:00,  4.77it/s]


Epoch: 1 | Loss: 0.2048853039741516 | Validation: MSE=0.05279832581679026/MAE=0.09047077099482219


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1339/1339 [04:43<00:00,  4.72it/s]


Epoch: 2 | Loss: 0.1174137145280838 | Validation: MSE=0.0376193051536878/MAE=0.07444887359937032


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1339/1339 [05:01<00:00,  4.44it/s]


Epoch: 3 | Loss: 0.10806610435247421 | Validation: MSE=0.03722138206164042/MAE=0.07187482217947642


In [12]:
class ParrallelConvSalaryPredictor(SalaryPredictor):
    def __init__(self):
        super().__init__()
        self.title_encoder2 = nn.Sequential(
            nn.Conv1d(self.hid_size, self.hid_size, kernel_size=4),
            nn.Dropout(p=0.33),
            nn.ReLU(),
            nn.BatchNorm1d(self.hid_size),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.description_encoder2 = nn.Sequential(
            nn.Conv1d(self.hid_size, self.hid_size, kernel_size=4),
            nn.Dropout(p=0.33),
            nn.ReLU(),
            nn.BatchNorm1d(self.hid_size),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.final_predictor = nn.Sequential(
            nn.Linear(self.hid_size * 6, self.hid_size * 3),
            nn.ReLU(),
            nn.Linear(self.hid_size * 3, int(self.hid_size * 1.5)),
            nn.ReLU(),
            nn.Linear(int(self.hid_size * 1.5), 1)
        )
        
    def forward(self, batch):
            title_embeddings = self.embedder(batch['Title']).permute(0, 2, 1)
            title_features = self.title_encoder(title_embeddings).squeeze()
            title_features2 = self.title_encoder2(title_embeddings).squeeze()
            description_embeddings = self.embedder(batch['FullDescription']).permute(0, 2, 1)
            description_features = self.description_encoder(description_embeddings).squeeze()
            description_features2 = self.description_encoder2(description_embeddings).squeeze()
            categorical_features = self.categorical_encoder(batch['Categorical'])
            title_features = torch.cat((title_features, title_features2), dim=1)
            description_features = torch.cat((description_features, description_features2), dim=1)
            features = torch.cat([title_features, description_features, categorical_features], dim=1)
            return self.final_predictor(features).squeeze()

### MSE и MAE улучшаются при добавлении параллельных сверточных слоев, однако совсем незначительно

In [13]:
model = ParrallelConvSalaryPredictor()
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), 3)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1339/1339 [05:24<00:00,  4.13it/s]


Epoch: 1 | Loss: 0.12488577514886856 | Validation: MSE=0.7543275356292725/MAE=0.42392945289611816


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1339/1339 [05:09<00:00,  4.33it/s]


Epoch: 2 | Loss: 0.0923488512635231 | Validation: MSE=0.6474840641021729/MAE=0.3924413522084554


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1339/1339 [05:18<00:00,  4.20it/s]


Epoch: 3 | Loss: 0.10246707499027252 | Validation: MSE=0.5841095447540283/MAE=0.3726075490315755


In [14]:
class NewSalaryPredictor(SalaryPredictor):
    def __init__(self):
        super().__init__()

        self.embedder = nn.Embedding(self.n_tokens, self.hid_size * 2)
        self.title_encoder = nn.Sequential(
            nn.Conv1d(self.hid_size * 2, self.hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.ReLU(),
            nn.BatchNorm1d(self.hid_size),                             
            nn.Conv1d(self.hid_size, self.hid_size, kernel_size=2),     
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.title_encoder2 = nn.Sequential(                           
            nn.Conv1d(self.hid_size * 2, self.hid_size, kernel_size=4),
            nn.Dropout(p=0.33),
            nn.ReLU(),
            nn.BatchNorm1d(self.hid_size),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.description_encoder = nn.Sequential(
            nn.Conv1d(self.hid_size * 2, self.hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.ReLU(),
            nn.BatchNorm1d(self.hid_size),                             
            nn.Conv1d(self.hid_size, self.hid_size, kernel_size=2),    
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.description_encoder2 = nn.Sequential(                      
            nn.Conv1d(self.hid_size * 2, self.hid_size, kernel_size=4),
            nn.Dropout(p=0.33),
            nn.ReLU(),
            nn.BatchNorm1d(self.hid_size),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.categorical_encoder = nn.Sequential(                       
            nn.Linear(self.n_cat_features, self.hid_size * 4),
            nn.ReLU(),
            nn.LayerNorm(self.hid_size * 4),                           
            nn.Linear(self.hid_size * 4, self.hid_size * 2),
            nn.ReLU()
        )
        self.final_predictor = nn.Sequential(                           
            nn.Linear(self.hid_size * 6, self.hid_size * 3),
            nn.ReLU(),
            nn.LayerNorm(self.hid_size * 3),                            
            nn.Linear(self.hid_size * 3, int(self.hid_size * 1.5)),
            nn.ReLU(),
            nn.BatchNorm1d(int(self.hid_size * 1.5)),                   
            nn.Linear(int(self.hid_size * 1.5), 1)
        )

    def forward(self, batch):
        title_embeddings = self.embedder(batch['Title']).permute(0, 2, 1)
        title_features = self.title_encoder(title_embeddings).squeeze()
        title_features2 = self.title_encoder2(title_embeddings).squeeze()
        description_embeddings = self.embedder(batch['FullDescription']).permute(0, 2, 1)
        description_features = self.description_encoder(description_embeddings).squeeze()
        description_features2 = self.description_encoder2(description_embeddings).squeeze()
        categorical_features = self.categorical_encoder(batch['Categorical'])
        title_features = torch.cat((title_features, title_features2), dim=1)
        description_features = torch.cat((description_features, description_features2), dim=1)
        features = torch.cat((title_features, description_features, categorical_features), dim=1)
        return self.final_predictor(features).squeeze()

### Результаты стали еще лучше после того как я использовал и нормализацию и параллельные энкодеры

In [15]:
model = NewSalaryPredictor()
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), 3)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1339/1339 [05:53<00:00,  3.78it/s]


Epoch: 1 | Loss: 0.10126442462205887 | Validation: MSE=0.029981888830661774/MAE=0.0659247636795044


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1339/1339 [09:09<00:00,  2.44it/s]


Epoch: 2 | Loss: 0.09590458869934082 | Validation: MSE=0.02421173204978307/MAE=0.05799533426761627


100%|██████████| 1339/1339 [08:57<00:00,  2.49it/s]


Epoch: 3 | Loss: 0.08416073024272919 | Validation: MSE=0.02155221253633499/MAE=0.0541977733373642


### Раняя остановка

In [16]:
def avg(lst: list):
    return sum(lst) / len(lst)

def delta(x):
    def delta_impl(y):
        return x - y
    return delta_impl

def early_stop_train(model, optimizer, epoches=5, device=None, criterion=nn.MSELoss(reduction='mean')):
    EXIT_CRITERION = 0.01
    loader = train_loader
    outputs: dict[str, list] = {}

    if device:
        model.to(device)
        loader = DeviceDataLoader(train_loader, device)

    for epoch in range(epoches):
        model.train()
        for batch in tqdm(loader):
            optimizer.zero_grad(set_to_none=True)
            pred = model(batch)
            loss = criterion(pred, batch[SalaryDataset.TARGET_COL])
            loss.backward()
            optimizer.step()
        mse, mae = evaluate(model, device)
        means = {k: avg(v[-5:]) for k,v in outputs.items()}
        deltas = [means['Loss'] - loss.item(), means['mse'] - mse, means['mae'] - mae]

        if any(x < EXIT_CRITERION for x in deltas):
            return

        outputs['Losses'].append(loss.item())
        outputs['mse'].append(mse)
        outputs['mae'].append(mae)
    if device:
        model.cpu()

# Задание 2

### Как работает pooling layer

Pooling слой это - фильтр заданного размера, он обрабатывает данную матрицу и возвращает число, которое зависит от алгоритма.

Его основные цели: 
 - Уменьшение изображения. Нужно для того чтобы последующие конволюции могли проводить операции над большей областью картинки.
 - Увеличение инвариантности выхода сети по отношению к малому переносу входа.
 - Оптимизация вычислений.

### Максимум по временной компоненте (незавимисо для каждой фичи)

In [17]:
class MaxPoolSalaryPredictor(SalaryPredictor):
    def __init__(self):
        super().__init__()

        self.title_encoder = nn.Sequential(
            nn.Conv1d(self.hid_size, self.hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.ReLU()
        )
        self.description_encoder = nn.Sequential(
            nn.Conv1d(self.hid_size, self.hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.ReLU()
        )
        self.final_predictor = nn.Sequential(
            nn.Linear(534, self.hid_size),
            nn.ReLU(),
            nn.Linear(self.hid_size, 1)
        )

    def forward(self, batch):
        max_pool = nn.AdaptiveMaxPool1d(output_size=1)

        title_embeddings = self.embedder(batch['Title']).permute(0, 2, 1)
        title_features = self.title_encoder(title_embeddings).squeeze().permute(0, 2, 1)
        title_features = max_pool(title_features).squeeze() # Max pooling

        description_embeddings = self.embedder(batch['FullDescription']).permute(0, 2, 1)
        description_features = self.description_encoder(description_embeddings).squeeze().permute(0, 2, 1)
        description_features = max_pool(description_features).squeeze() # Max pooling

        categorical_features = self.categorical_encoder(batch['Categorical'])

        features = torch.cat((title_features, description_features, categorical_features), dim=1)
        return self.final_predictor(features).squeeze()

 Все что нам дал MaxPool - незначительно меньшее значение по loss  
 Остальные переменные показали лишь худший результат

In [18]:
model = MaxPoolSalaryPredictor()
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), 3)

100%|██████████| 1339/1339 [04:36<00:00,  4.84it/s]


Epoch: 1 | Loss: 0.1306263953447342 | Validation: MSE=0.8713125387827555/MAE=0.4564642111460368


100%|██████████| 1339/1339 [04:28<00:00,  5.00it/s]


Epoch: 2 | Loss: 0.0991913452744484 | Validation: MSE=0.985065221786499/MAE=0.4879465103149414


100%|██████████| 1339/1339 [04:28<00:00,  4.99it/s]


Epoch: 3 | Loss: 0.09645353257656097 | Validation: MSE=0.948194662729899/MAE=0.4796563386917114


Среднее по временной компоненте

In [19]:
class AvgPoolSalaryPredictor(SalaryPredictor):
    def __init__(self):
        super().__init__()
        self.title_encoder = nn.Sequential(
            nn.Conv1d(self.hid_size, self.hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.ReLU(),
        )
        self.title_avgPooler = nn.AvgPool1d(19, stride=19, count_include_pad=False)
        self.description_encoder = nn.Sequential(
            nn.Conv1d(self.hid_size, self.hid_size, kernel_size=2),
            nn.Dropout(p=0.25),
            nn.ReLU(),
        )
        self.desc_avgPooler = nn.AvgPool1d(499, stride=499, count_include_pad=False)

    def forward(self, batch):
        title_embeddings = self.embedder(batch['Title']).permute(0, 2, 1)
        title_features = self.title_encoder(title_embeddings).squeeze()
        title_features = self.title_avgPooler(title_features).squeeze()

        description_embeddings = self.embedder(batch['FullDescription']).permute(0, 2, 1)
        description_features = self.description_encoder(description_embeddings).squeeze()
        description_features = self.desc_avgPooler(description_features).squeeze()

        categorical_features = self.categorical_encoder(batch['Categorical'])
        features = torch.cat((title_features, description_features, categorical_features), dim=1)
        return self.final_predictor(features).squeeze()

C AveragePooling результаты заметно лучше

In [20]:
model = AvgPoolSalaryPredictor()
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), 3)

100%|██████████| 1339/1339 [04:26<00:00,  5.03it/s]


Epoch: 1 | Loss: 0.1536521464586258 | Validation: MSE=0.042825246850649513/MAE=0.0805329829454422


100%|██████████| 1339/1339 [04:54<00:00,  4.55it/s]


Epoch: 2 | Loss: 0.14230093359947205 | Validation: MSE=0.03107058008511861/MAE=0.06736072897911072


100%|██████████| 1339/1339 [04:29<00:00,  4.97it/s]


Epoch: 3 | Loss: 0.11169379204511642 | Validation: MSE=0.026431108514467876/MAE=0.061901321013768516


# Задание 3

Предобученный эмбеддинг (весы заморожены)

In [32]:
class PretrainedFrozenSalaryPredictor(SalaryPredictor):
    def __init__(self, wordvec: torch.FloatTensor, hid_size=100):
        super().__init__(hid_size=100)
        self.embedder = nn.Embedding.from_pretrained(wordvec, freeze=True) 

In [33]:
kv = api.load('glove-wiki-gigaword-100')

Ошибка все еще высокая

In [23]:
model = PretrainedFrozenSalaryPredictor(torch.FloatTensor(kv.vectors))
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), 3)

100%|██████████| 1339/1339 [06:18<00:00,  3.54it/s]


Epoch: 1 | Loss: 0.09619984775781631 | Validation: MSE=1.437311013539632/MAE=0.594262440999349


100%|██████████| 1339/1339 [06:19<00:00,  3.53it/s]


Epoch: 2 | Loss: 0.08519542962312698 | Validation: MSE=1.4581607182820637/MAE=0.5993243058522543


100%|██████████| 1339/1339 [06:27<00:00,  3.46it/s]


Epoch: 3 | Loss: 0.07579339295625687 | Validation: MSE=1.46799898147583/MAE=0.6016062498092651


Предобученный эмбеддинг (весы обучаются)

In [24]:
class PretrainedEmbedSalaryPredictor(SalaryPredictor):
    def __init__(self, wordvec: torch.FloatTensor, hid_size=100):
        super().__init__(hid_size=hid_size)
        self.embedder = nn.Embedding.from_pretrained(wordvec, freeze=False) 

Тут ситуация та же

In [25]:
model = PretrainedEmbedSalaryPredictor(torch.FloatTensor(kv.vectors))
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), 3)

100%|██████████| 1339/1339 [12:31<00:00,  1.78it/s]


Epoch: 1 | Loss: 0.09913275390863419 | Validation: MSE=1.3804882367451985/MAE=0.5826433499654134


100%|██████████| 1339/1339 [12:35<00:00,  1.77it/s]


Epoch: 2 | Loss: 0.08199077844619751 | Validation: MSE=1.323238452275594/MAE=0.5709047714869181


100%|██████████| 1339/1339 [12:40<00:00,  1.76it/s]


Epoch: 3 | Loss: 0.07381653040647507 | Validation: MSE=1.379287560780843/MAE=0.5832666953404745


# Задание 4

LSTM

In [26]:
class LSTMSalaryPredictor(SalaryPredictor):
    def __init__(self, bidirectional=False):
        super().__init__()
        self.title_lstm = nn.LSTM(20, self.hid_size, bidirectional=bidirectional)
        self.title_encoder = nn.Sequential(
            nn.Dropout(p=0.25),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.desc_lstm = nn.LSTM(500, self.hid_size, bidirectional=bidirectional)
        self.description_encoder = nn.Sequential(
            nn.Dropout(p=0.25),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(output_size=1)
        )

    def forward(self, batch):
        title_embeddings = self.embedder(batch['Title']).permute(0, 2, 1)
        lstm_out, (hn, cn) = self.title_lstm(title_embeddings)
        title_features = self.title_encoder(lstm_out).squeeze()
        description_embeddings = self.embedder(batch['FullDescription']).permute(0, 2, 1)
        lstm_out, (hn, cn) = self.desc_lstm(description_embeddings)
        description_features = self.description_encoder(lstm_out).squeeze()
        categorical_features = self.categorical_encoder(batch['Categorical'])
        features = torch.cat(
            [title_features, description_features, categorical_features], dim=1)
        
        return self.final_predictor(features).squeeze()

LSTM && (bidirectional = False)

In [27]:
model = LSTMSalaryPredictor()
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), 3)

100%|██████████| 1339/1339 [04:51<00:00,  4.59it/s]


Epoch: 1 | Loss: 0.158545583486557 | Validation: MSE=0.1733523408571879/MAE=0.18522685766220093


100%|██████████| 1339/1339 [04:52<00:00,  4.57it/s]


Epoch: 2 | Loss: 0.15152324736118317 | Validation: MSE=0.16154774030049643/MAE=0.1777780850728353


100%|██████████| 1339/1339 [04:53<00:00,  4.57it/s]


Epoch: 3 | Loss: 0.14832158386707306 | Validation: MSE=0.12742688258488974/MAE=0.15458577871322632


LSTM && (bidirectional = True)

In [28]:
model = LSTMSalaryPredictor(bidirectional=True)
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), 3)

100%|██████████| 1339/1339 [05:47<00:00,  3.86it/s]


Epoch: 1 | Loss: 0.13777832686901093 | Validation: MSE=0.4909350474675496/MAE=0.3366178274154663


100%|██████████| 1339/1339 [05:54<00:00,  3.78it/s]


Epoch: 2 | Loss: 0.14693059027194977 | Validation: MSE=0.5267475843429565/MAE=0.3501235644022624


100%|██████████| 1339/1339 [05:57<00:00,  3.75it/s]


Epoch: 3 | Loss: 0.11374281346797943 | Validation: MSE=0.4604838689168294/MAE=0.3277652859687805


GRU

In [29]:
class GRUSalaryPredictor(SalaryPredictor):
    def __init__(self, bidirectional=False):
        super().__init__()

        self.title_gru = nn.GRU(20, self.hid_size, bidirectional=bidirectional)
        self.title_encoder = nn.Sequential(
            nn.Dropout(p=0.25),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(output_size=1)
        )
        self.desc_gru = nn.GRU(500, self.hid_size, bidirectional=bidirectional)
        self.description_encoder = nn.Sequential(
            nn.Dropout(p=0.25),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(output_size=1)
        )

    def forward(self, batch):
        title_embeddings = self.embedder(batch['Title']).permute(0, 2, 1)
        gru_out, hn = self.title_gru(title_embeddings)
        title_features = self.title_encoder(gru_out).squeeze()
        description_embeddings = self.embedder(batch['FullDescription']).permute(0, 2, 1)
        gru_out, hn = self.desc_gru(description_embeddings)
        description_features = self.description_encoder(gru_out).squeeze()
        categorical_features = self.categorical_encoder(batch['Categorical'])
        features = torch.cat([title_features, description_features, categorical_features], dim=1)
        return self.final_predictor(features).squeeze()

GRU && (bidirectional = False)

In [30]:
model = GRUSalaryPredictor()
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), 3)

100%|██████████| 1339/1339 [04:59<00:00,  4.47it/s]


Epoch: 1 | Loss: 0.148161843419075 | Validation: MSE=0.23544867833455405/MAE=0.22281877199808756


100%|██████████| 1339/1339 [04:58<00:00,  4.49it/s]


Epoch: 2 | Loss: 0.14018207788467407 | Validation: MSE=0.24832107623418173/MAE=0.23035502433776855


100%|██████████| 1339/1339 [05:53<00:00,  3.78it/s]


Epoch: 3 | Loss: 0.14000648260116577 | Validation: MSE=0.2331114411354065/MAE=0.22225689888000488


GRU && (bidirectional = True)

In [31]:
model = GRUSalaryPredictor(bidirectional=True)
train(model, torch.optim.Adam(model.parameters(), lr=1e-3), 3)

100%|██████████| 1339/1339 [06:39<00:00,  3.35it/s]


Epoch: 1 | Loss: 0.14410187304019928 | Validation: MSE=0.3469020128250122/MAE=0.2782339851061503


100%|██████████| 1339/1339 [06:36<00:00,  3.38it/s]


Epoch: 2 | Loss: 0.13265176117420197 | Validation: MSE=0.3661278486251831/MAE=0.2873084743817647


100%|██████████| 1339/1339 [06:37<00:00,  3.37it/s]


Epoch: 3 | Loss: 0.12114362418651581 | Validation: MSE=0.36182117462158203/MAE=0.287251075108846


LSTM сработал лучше