In [1]:
import pickle

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

from classes import MakeTorchData
from util import preprocess_data, compute_metrics, predict_text

pd.options.mode.chained_assignment = None  # this turns off insignificant warning when processing dataset

SEED = 2137
PATH_FOR_MODEL = '../models/auto_model/test_model'
EPOCHS = 5

data = pd.read_csv('../data/recruitment_data_standardized.csv',
                   encoding='utf-8',
                   sep=',',
                   on_bad_lines='skip',
                   quotechar='"',
                   doublequote=True,
                   names=['URL', 'Voivodeship', 'Scrap_time', 'Name', 'Price', 'Brand', 'Condition', 'Offer_from',
                          'Type', 'Description', 'Added_at', 'Views', 'User_since'],
                   skiprows=1)

preprocessed_data = preprocess_data(data)

In [2]:
preprocessed_data

Unnamed: 0,Price,Days_passed_name,Days_passed_name_desc
0,2799.0,52 days iphone 11 64 jak nowy 95% gwarancja wy...,52 days iphone 11 64 jak nowy 95% gwarancja wy...
1,2700.0,"51 days iphone 11 64 gb czarny, idealny z gwar...","51 days iphone 11 64 gb czarny, idealny z gwar..."
2,2899.0,51 days jak nowy apple iphone 11 256gbgb white...,51 days jak nowy apple iphone 11 256gbgb white...
3,2500.0,51 days apple iphone 11 biały 64gb - jak nowy ...,51 days apple iphone 11 biały 64gb - jak nowy ...
4,2150.0,51 days iphone 11 64 gb + gwarancja,"51 days iphone 11 64 gb + gwarancja witam, mam..."
...,...,...,...
2667,2299.0,51 days iphone 11 black 64gb,51 days iphone 11 black 64gb sprzedam iphone 1...
2668,1900.0,51 days i phone 11 64 gb cena tylko dzis,51 days i phone 11 64 gb cena tylko dzis cena ...
2669,2800.0,"51 days iphone 11 128 gb gwarancja , 100% bat...","51 days iphone 11 128 gb gwarancja , 100% bat..."
2670,1650.0,50 days iphone 11 white 64gb,50 days iphone 11 white 64gb na sprzedaż posia...


In [3]:
preprocessed_data['Days_passed_name'].tolist()

['52 days iphone 11 64 jak nowy 95% gwarancja wyświetlacz',
 '51 days iphone 11 64 gb czarny, idealny z gwarancją. wymiana',
 '51 days jak nowy apple iphone 11 256gbgb white gwarancja',
 '51 days apple iphone 11 biały 64gb - jak nowy gwarancja paragon 4xetui',
 '51 days iphone 11 64 gb + gwarancja',
 '51 days iphone 11 64 gb + gwarancja',
 '51 days iphone 11 w bardzo dobrym stanie',
 '51 days iphone 11 * idealny stan * 100% bateria * gwarancja * zamiana *',
 '51 days iphone 11 128 gb stan idealny etui gratis',
 '52 days iphone 11 64 gb prawie nowy super zestaw',
 '52 days iphone 11 czarny 128 gb',
 '51 days oryginalny | apple iphone 11 64/128gb | różne kolory | rok gwarancji',
 '51 days sprzedam iphone 11 128gb red 23msc gwarancji',
 '52 days iphone 11 64gb fioletowy perfekcyjny purple',
 '51 days apple iphone 11 64gb red poznań długa 14',
 '51 days iphone 11 ideał gwarancja 256gb',
 '51 days telefon iphone se (2020) gw do 11.09.2021',
 '51 days iphone  11 ( rezerwacja )',
 '51 days ip

In [3]:
preprocessed_data['Days_passed_name'].apply(len).mean()  # optimal number of characters

43.5437874251497

In [4]:
preprocessed_data['Days_passed_name_desc'].apply(len).mean()  # too many characters to process, needs truncation

577.8701347305389

In [7]:
# polish language model loaded into basic transformer framework
model = AutoModelForSequenceClassification.from_pretrained('dkleczek/bert-base-polish-uncased-v1', num_labels=1).to("cpu") # num_labels=1 sets model to regression problem

Some weights of the model checkpoint at dkleczek/bert-base-polish-uncased-v1 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassificatio

In [2]:
# polish language model loaded into basic transformer framework
tokenizer = AutoTokenizer.from_pretrained('dkleczek/bert-base-polish-uncased-v1')

# scaling data
scaler = StandardScaler()
y_dataset = scaler.fit_transform(np.asarray(preprocessed_data['Price']).reshape(-1, 1))

X_train, X_test, y_train, y_test = train_test_split(preprocessed_data['Days_passed_name'].tolist(), y_dataset, train_size=0.8, random_state=SEED)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.8, random_state=SEED)

#converting data to tokens
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=100)
valid_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=100)
test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=100)

# convert our tokenized data into a torch Dataset
train_dataset = MakeTorchData(train_encodings, y_train.ravel())
valid_dataset = MakeTorchData(valid_encodings, y_val.ravel())
test_dataset = MakeTorchData(test_encodings, y_test.ravel())

In [11]:
# Specify arguments for the trainer
training_args = TrainingArguments(
    output_dir=PATH_FOR_MODEL + '/checkpoints',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=20,
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_dir='../logs',
    save_total_limit=10,
    load_best_model_at_end=True,
    metric_for_best_model='rmse',
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Create trainer
trainer = Trainer( # check if verbose = 0
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Call the summary
_ = trainer.evaluate()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 1709
  Num Epochs = 20
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1080
  Number of trainable parameters = 132122113


Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 428
  Batch size = 20
Saving model checkpoint to ../models/auto_model/days_name_desc_20_ml100/checkpoints\checkpoint-54
Configuration saved in ../models/auto_model/days_name_desc_20_ml100/checkpoints\checkpoint-54\config.json
Model weights saved in ../models/auto_model/days_name_desc_20_ml100/checkpoints\checkpoint-54\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 428
  Batch size = 20
Saving model checkpoint to ../models/auto_model/days_name_desc_20_ml100/checkpoints\checkpoint-108
Configuration saved in ../models/auto_model/days_name_desc_20_ml100/checkpoints\checkpoint-108\config.json
Model weights saved in ../models/auto_model/days_name_desc_20_ml100/checkpoints\checkpoint-108\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 428
  Batch size = 20
Saving model checkpoint to ../models/auto_model/days_name_desc_20_ml100/checkpoints\checkpoint-162
Configuration saved in ../models/auto_model/days_name_de

{'eval_loss': 0.6470615267753601,
 'eval_mse': 0.6470615267753601,
 'eval_rmse': 0.8044013381004333,
 'eval_mae': 0.5976303219795227,
 'eval_r2': 0.32811412840604026,
 'eval_runtime': 81.6857,
 'eval_samples_per_second': 5.24,
 'eval_steps_per_second': 0.269,
 'epoch': 20.0}

In [12]:
# save model and scaler
trainer.save_model(PATH_FOR_MODEL)
pickle.dump(scaler, open(PATH_FOR_MODEL + '/scaler.pkl', 'wb'))

Saving model checkpoint to ../models/auto_model/days_name_desc_20_ml100
Configuration saved in ../models/auto_model/days_name_desc_20_ml100\config.json
Model weights saved in ../models/auto_model/days_name_desc_20_ml100\pytorch_model.bin


In [13]:
# try model on test data
predictions = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 535
  Batch size = 20


In [14]:
# check rmse
inverse_predictions = scaler.inverse_transform(np.asarray(predictions[0]).reshape(-1, 1))
inverse_test = scaler.inverse_transform(np.asarray(predictions[1]).reshape(-1, 1))
mean_squared_error(inverse_test, inverse_predictions, squared=False)

436.6896

In [21]:
# moved function to util.py
prediction = predict_text(trainer, scaler, tokenizer, 'iphone 11 64')

***** Running Prediction *****
  Num examples = 89
  Batch size = 20


In [23]:
fig = px.scatter(prediction, trendline='lowess',
                 title='Changes in price in relation to days passed since 01.01.2021 for phrase "iphone 11 64"').update_layout(
    xaxis_title='Days passed', yaxis_title='Price', title_x=0.5)
fig.add_vline(55) # legend for this

In [19]:
# saving scaler data
# pickle.dump(scaler, open(PATH_FOR_MODEL + '/scaler.pkl','wb'))
# scaler = pickle.load(open('../results/scaler_auto_model.pkl','rb'))