In [1]:
import pandas as pd
import numpy as np
import spacy
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from transformers import BertTokenizer, BertForMaskedLM, AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from util import preprocess_data

SEED = 2137

data = pd.read_csv('../data/recruitment_data_modified_python.csv',
                   encoding='utf-8',
                   sep=',',
                   on_bad_lines='warn',
                   quotechar='"',
                   doublequote=True,
                   names=['URL', 'Voivodeship', 'Scrap_time', 'Name', 'Price', 'Brand', 'Condition', 'Offer_from', 'Type', 'Description', 'Added_at', 'Views', 'User_since'],
                   skiprows=1)

preprocessed_data = preprocess_data(data)

Skipping line 346: expected 13 fields, saw 14
Skipping line 367: expected 13 fields, saw 15
Skipping line 466: expected 13 fields, saw 19
Skipping line 467: expected 13 fields, saw 19
Skipping line 494: expected 13 fields, saw 15
Skipping line 615: expected 13 fields, saw 15
Skipping line 616: expected 13 fields, saw 15
Skipping line 617: expected 13 fields, saw 15
Skipping line 618: expected 13 fields, saw 15
Skipping line 619: expected 13 fields, saw 15
Skipping line 620: expected 13 fields, saw 15
Skipping line 712: expected 13 fields, saw 14
Skipping line 713: expected 13 fields, saw 14
Skipping line 739: expected 13 fields, saw 14
Skipping line 747: expected 13 fields, saw 16
Skipping line 867: expected 13 fields, saw 15
Skipping line 956: expected 13 fields, saw 14
Skipping line 1028: expected 13 fields, saw 14
Skipping line 1234: expected 13 fields, saw 15
Skipping line 1282: expected 13 fields, saw 17
Skipping line 1326: expected 13 fields, saw 14
Skipping line 1327: expected 1

In [2]:
preprocessed_data

Unnamed: 0,Price,Days_passed_name,Days_passed_name_desc
0,2799.0,52 days iphone 11 64 jak nowy 95% gwarancja wy...,52 days iphone 11 64 jak nowy 95% gwarancja wy...
1,2700.0,"51 days iphone 11 64 gb czarny, idealny z gwar...","51 days iphone 11 64 gb czarny, idealny z gwar..."
2,2899.0,51 days jak nowy apple iphone 11 256gbgb white...,51 days jak nowy apple iphone 11 256gbgb white...
3,2500.0,51 days apple iphone 11 biały 64gb - jak nowy ...,51 days apple iphone 11 biały 64gb - jak nowy ...
4,2150.0,51 days iphone 11 64 gb + gwarancja,"51 days iphone 11 64 gb + gwarancja witam, mam..."
...,...,...,...
2667,2299.0,51 days iphone 11 black 64gb,51 days iphone 11 black 64gb sprzedam iphone 1...
2668,1900.0,51 days i phone 11 64 gb cena tylko dzis,51 days i phone 11 64 gb cena tylko dzis cena ...
2669,2800.0,"51 days iphone 11 128 gb gwarancja , 100% bat...","51 days iphone 11 128 gb gwarancja , 100% bat..."
2670,1650.0,50 days iphone 11 white 64gb,50 days iphone 11 white 64gb na sprzedaż posia...


In [3]:
preprocessed_data['Days_passed_name'][:10]

0    52 days iphone 11 64 jak nowy 95% gwarancja wy...
1    51 days iphone 11 64 gb czarny, idealny z gwar...
2    51 days jak nowy apple iphone 11 256gbgb white...
3    51 days apple iphone 11 biały 64gb - jak nowy ...
4                  51 days iphone 11 64 gb + gwarancja
5                  51 days iphone 11 64 gb + gwarancja
6             51 days iphone 11 w bardzo dobrym stanie
7    51 days iphone 11 * idealny stan * 100% bateri...
8    51 days iphone 11 128 gb stan idealny etui gratis
9     52 days iphone 11 64 gb prawie nowy super zestaw
Name: Days_passed_name, dtype: object

In [3]:
# tokenizer = AutoTokenizer.from_pretrained('dkleczek/bert-base-polish-uncased-v1')

In [4]:
# encoded_corpus = tokenizer(text=preprocessed_data['Days_passed_name'].tolist(),
#                             add_special_tokens=True,
#                             padding='max_length',
#                             truncation='longest_first',
#                             max_length=50,
#                             return_attention_mask=True)

In [7]:
# input_ids = np.array(encoded_corpus['input_ids'])
# attention_mask = np.array(encoded_corpus['attention_mask'])
# labels = np.asarray(preprocessed_data['Price'])

In [8]:
# train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_ids, labels, train_size=0.8, random_state=SEED)
# train_inputs, val_inputs, train_labels, val_labels = train_test_split(train_inputs, train_labels, train_size=0.8, random_state=SEED)
# train_masks, test_masks, train_masks_labels, _ = train_test_split(attention_mask, labels, train_size=0.8, random_state=SEED)
# train_masks, val_masks, _, _ = train_test_split(train_masks, train_masks_labels, train_size=0.8, random_state=SEED)

In [9]:
# scaler = StandardScaler()
# scaler.fit(train_labels.reshape(-1, 1))
#
# train_labels = scaler.transform(train_labels.reshape(-1,1))
# test_labels = scaler.transform(test_labels.reshape(-1,1))
# val_labels = scaler.transform(val_labels.reshape(-1,1))

In [10]:
# batch_size = 32
# def create_dataloaders(inputs, masks, labels, batch_size):
#     input_tensor = torch.tensor(inputs)
#     mask_tensor = torch.tensor(masks)
#     labels_tensor = torch.tensor(labels)
#     dataset = TensorDataset(input_tensor, mask_tensor, labels_tensor)
#     dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
#     return dataloader
#
# train_dataloader = create_dataloaders(train_inputs, train_masks, train_labels, batch_size)
# test_dataloader = create_dataloaders(test_inputs, test_masks, test_labels, batch_size)
# val_dataloader = create_dataloaders(val_inputs, val_masks, val_labels, batch_size)

In [9]:
# class BertRegressor(nn.Module):
#
#     def __init__(self, drop_rate=0.2):
#
#         super(BertRegressor, self).__init__()
#         D_in, D_out = 768, 1
#
#         self.bert = AutoModelForSequenceClassification.from_pretrained('dkleczek/bert-base-polish-uncased-v1', num_labels=1)
#         self.regressor = nn.Sequential(
#             nn.Dropout(drop_rate),
#             nn.Linear(self.bert.config.hidden_size, D_out))
#
#     def forward(self, input_ids, attention_masks):
#
#         outputs = self.bert(input_ids, attention_masks, return_dict=False)
#         outputs = self.regressor(outputs[0])
#
#         return outputs
#
# model = BertRegressor()
# optimizer = torch.optim.AdamW(model.parameters(),
#                   lr=5e-5,
#                   eps=1e-8)

Some weights of the model checkpoint at dkleczek/bert-base-polish-uncased-v1 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [31]:
# from transformers import get_linear_schedule_with_warmup
#
# EPOCHS = 5
# total_steps = len(train_dataloader) * EPOCHS
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
# loss_function = nn.MSELoss()

In [29]:
# from torch.nn.utils.clip_grad import clip_grad_norm
#
# def train(model, optimizer, scheduler, loss_function, epochs,
#           train_dataloader, device, clip_value=2):
#
#     for epoch in range(epochs):
#         print(epoch)
#         print("-----")
#         best_loss = 1e10
#         model.train()
#         for step, batch in enumerate(train_dataloader):
#             print(step)
#             batch_inputs, batch_masks, batch_labels = tuple(b.to(device) for b in batch)
#             model.zero_grad()
#             outputs = model(batch_inputs, batch_masks)
#             loss = loss_function(outputs.squeeze(),
#                              batch_labels.squeeze())
#             loss.backward()
#             clip_grad_norm(model.parameters(), clip_value)
#             optimizer.step()
#             scheduler.step()
#
#     return model
#
# model = train(model, optimizer, scheduler, loss_function, EPOCHS,
#               train_dataloader, torch.device("cpu"), clip_value=2)

0
-----
0


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1600x60000 and 768x1)

In [11]:
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_error, mean_absolute_error
model = AutoModelForSequenceClassification.from_pretrained('dkleczek/bert-base-polish-uncased-v1', num_labels = 1).to("cpu")

def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)

    mse = mean_squared_error(labels, logits)
    rmse = mean_squared_error(labels, logits, squared=False)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    smape = 1/len(labels) * np.sum(2 * np.abs(logits-labels) / (np.abs(labels) + np.abs(logits))*100)

    return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "smape": smape}

Some weights of the model checkpoint at dkleczek/bert-base-polish-uncased-v1 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassificatio

In [16]:
tokenizer = AutoTokenizer.from_pretrained('dkleczek/bert-base-polish-uncased-v1')

X_train, X_test, y_train, y_test = train_test_split(preprocessed_data['Days_passed_name'], preprocessed_data['Price'], train_size=0.8)


# encoded_corpus = tokenizer(text=preprocessed_data['Days_passed_name'].tolist(),
#                             add_special_tokens=True,
#                             padding='max_length',
#                             truncation='longest_first',
#                             max_length=50,
#                             return_attention_mask=True)

train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=50)
valid_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=50)


class MakeTorchData(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        item["labels"] = float(item["labels"])
        return item

    def __len__(self):
        return len(self.labels)

# convert our tokenized data into a torch Dataset
train_dataset = MakeTorchData(X_train, y_train.ravel())
valid_dataset = MakeTorchData(X_test, y_test.ravel())

# Specifiy the arguments for the trainer
training_args = TrainingArguments(
    output_dir ='../results',
    num_train_epochs = 5,
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 20,
    weight_decay = 0.01,
    learning_rate = 2e-5,
    logging_dir = './logs',
    save_total_limit = 10,
    load_best_model_at_end = True,
    metric_for_best_model = 'rmse',
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
)

# Call the Trainer
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = valid_dataset,
    compute_metrics = compute_metrics_for_regression,
)

# Train the model
trainer.train()

# Call the summary
trainer.evaluate()

loading configuration file config.json from cache at C:\Users\cubix/.cache\huggingface\hub\models--dkleczek--bert-base-polish-uncased-v1\snapshots\62be9821055981deafb23f217b68cc41f38cdb76\config.json
Model config BertConfig {
  "_name_or_path": "dkleczek/bert-base-polish-uncased-v1",
  "architectures": [
    "BertForMaskedLM",
    "BertForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 60000
}

loading file vocab.txt from cache at C:\Users\cubix/.cache\huggingface\hub\models--dkleczek--bert-base-pol

IndexError: string index out of range