# Структура работы над проектом
I) EDA (в отдельном Fork'e)

II) Построение "наивной"/baseline модели, предсказывающей цену по модели и году выпуска (для сравнения с другими моделями)  

III) Обработка и нормировка признаков -> получение таблицы данных для подачи в ML и DL - модели  

IV) Создание модели ML с помощью CatBoost (градиентный бустинг), тренировка, оценка результата    

V) Создание моделей DL:

Va) SimpleNN - Создание простой модели DL (на основе полносвязной нейронной сети), тренировка, оценка результата  

Vb) NLP:
- Работа с текстом: обработка и приведение к векторному виду  
- NLP - Создание модели DL для работы с текстом (блоки LSTM, GRU, Transformer)  

Vc) Создание multi-input нейронной сети (SimpleNN + NLP) для анализа табличных данных и текста одновременно, тренировка, оценка результата  

Vd) EFN:
- обработка изображений с помощью библиотеки albumentations
- EFN - Создание модели DL для работы с изображениями на основе Transfer Learning + использование техники Fine-Tuning

Ve) Создание multi-input нейронной сети (SimpleNN + NLP + EFN) для анализа табличных данных, текста и картинок одновременно, тренировка, оценка результата 

VI) Ансамблирование градиентного бустинга и нейронной сети (усреднение их предсказаний)

# Installing and Imports libraries

In [None]:
!pip install -q tensorflow==2.3

In [None]:
# аугментация изображений
!pip install albumentations -q

In [None]:
# морфологический анализатор
!pip install pymorphy2

In [None]:
!pip install -q efficientnet

In [None]:
# Imports
import random
import numpy as np
import pandas as pd
import os
import sys
import codecs
import PIL
import cv2
import re
import string

import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords, wordnet
from pymorphy2 import MorphAnalyzer

# Input data files are available in the read-only "../input/" directory

# ML libraries
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# keras
import tensorflow as tf
import tensorflow.keras.layers as L
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import efficientnet.tfkeras as efn
#from keras_bert import load_trained_model_from_checkpoint
import albumentations

# plt
import matplotlib.pyplot as plt
# увеличим дефолтный размер графиков
from pylab import rcParams
rcParams['figure.figsize'] = 10, 5
%config InlineBackend.figure_format = 'svg' 
%matplotlib inline

In [None]:
print('Python         :', sys.version.split('\n')[0])
print('Numpy          :', np.__version__)
print('Tensorflow     :', tf.__version__)
print('Albumentations :', albumentations.__version__)

In [None]:
def mape(y_true, y_pred):
    """Return the mean absolute percentage error as a fraction.

    The result is the mean of |(y_pred - y_true) / y_true| and is NOT
    multiplied by 100; callers scale it themselves when printing.
    """
    relative_error = (y_pred - y_true) / y_true
    return np.mean(np.abs(relative_error))

In [None]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [None]:
!pip freeze > requirements.txt

# DATA

In [None]:
DATA_DIR = '../input/sf-dst-car-price-prediction-part2/'
train = pd.read_csv(DATA_DIR + 'train.csv')
test = pd.read_csv(DATA_DIR + 'test.csv')
sample_submission = pd.read_csv(DATA_DIR + 'sample_submission.csv')

In [None]:
train.columns

In [None]:
train.head()

In [None]:
train.info()

In [None]:
train.nunique()

Типы признаков:

* bodyType - категориальный
* brand - категориальный
* color - категориальный
* description - текстовый
* engineDisplacement - числовой, представленный как текст
* enginePower - числовой, представленный как текст
* fuelType - категориальный
* mileage - числовой
* modelDate - числовой
* model_info - категориальный
* name - категориальный, желательно сократить размерность
* numberOfDoors - категориальный
* price - числовой, целевой
* productionDate - числовой
* sell_id - изображение (файл доступен по адресу, основанному на sell_id)
* vehicleConfiguration - не используется (комбинация других столбцов)
* vehicleTransmission - категориальный
* Владельцы - категориальный
* Владение - числовой, представленный как текст
* ПТС - категориальный
* Привод - категориальный
* Руль - категориальный

# II) Model 1: Создадим "наивную" модель 
Эта модель будет предсказывать среднюю цену по модели и году выпуска. 
С ней будем сравнивать другие модели.

In [None]:
# split данных
data_train, data_test = train_test_split(train, test_size=0.13, shuffle=True, random_state=RANDOM_SEED)

In [None]:
# Naive model: predict, for every hold-out row, the median training price of
# cars with the same model and production year.
group_keys = ['model_info', 'productionDate']

# One groupby replaces a string-built DataFrame.query per test row: it is not
# broken by quotes inside model_info, compares productionDate using its real
# dtype, and scans the training frame once instead of once per row.
group_medians = data_train.groupby(group_keys, as_index=False)['price'].median()

# A left merge keeps the row order of data_test; pairs that never occur in the
# training split come back as NaN.
matched = data_test[group_keys].merge(group_medians, how='left', on=group_keys)
predicts = pd.DataFrame(matched['price'].to_numpy())

# fill the pairs that were not found with the median of the found predictions
predicts = predicts.fillna(predicts.median())

# round down to whole thousands
predicts = (predicts // 1000) * 1000

# evaluate the accuracy
print(f"Точность наивной модели по метрике MAPE: {(mape(data_test['price'], predicts.values[:, 0]))*100:0.2f}%")

# III) Обработка и преобразование признаков

### Функции для преобразований

In [None]:
# Convert the 'Владение' (ownership duration) text into a number of months
def prepare_vladenie(string, col='Владение', pattern_1=r'\d+'):
    """Parse an ownership-duration string into a whole number of months.

    Args:
        string: Raw cell value. Missing values (None or any float, which is
            how pandas represents NaN in a text column) map to 1.
        col: Unused; kept so existing callers keep working.
        pattern_1: Regular expression used to extract the numbers.

    Returns:
        int: ``years * 12 + months`` when two numbers are present, the number
        itself when the text mentions months ('мес'), otherwise the number
        treated as years. Text without any number is handled like a missing
        value and yields 1.
    """
    # isinstance (not ``type(...) == float``) also covers float subclasses.
    if string is None or isinstance(string, float):
        return 1

    nums = re.findall(pattern_1, string)
    if not nums:
        # Nothing to parse: fall back to the same default as for NaN.
        return 1
    if len(nums) >= 2:
        # "<years> ... и <months> ..." form
        return int(nums[0]) * 12 + int(nums[1])
    if 'мес' in string:
        return int(nums[0])
    return int(nums[0]) * 12

In [None]:
# Shorten the 'name' column to reduce its cardinality
def ch_name_col(d_frame_name):
    """Reduce a full modification name to a short category label.

    Rules, checked in order:
      * if the name contains " <digit>.<digit>" (a number preceded by a
        space), keep only the text before it;
      * otherwise, if it contains "<digit>.<digit>" at all, return 'no_val';
      * otherwise keep the name unchanged.
    In every case ' 4WD' is appended when '4WD' occurs anywhere in the name.

    Args:
        d_frame_name: Original value of the 'name' column.

    Returns:
        str: The shortened label.
    """
    full_drive = ' 4WD' if '4WD' in d_frame_name else ''

    # Raw strings avoid the invalid-escape warning raised for '\d' literals.
    spaced_match = re.search(r' \d\.\d', d_frame_name)
    if spaced_match:
        return d_frame_name[:spaced_match.start()] + full_drive
    if re.search(r'\d\.\d', d_frame_name):
        return 'no_val' + full_drive
    return d_frame_name + full_drive

### Выбросы

In [None]:
# Replace the outlier mileage 999999 with 99999 (see EDA: judging by the ad's description it is a typo)
# NOTE(review): the row is located through the current column maximum, so running this cell
#  a second time would overwrite the next-largest mileage — confirm it is executed only once
train.loc[train['mileage']==train['mileage'].max(), 'mileage'] = 99999

# Replace the outlier mileage 1000000 with 100000: the listed roadsters from 1989
#  have mileages up to 230000, some have 100000, so a single ad probably has a typo
test.loc[test['mileage']==test['mileage'].max(), 'mileage'] = 100000
#test.loc[(test['modelDate']<=1995)&(test['brand']=='MERCEDES')&(test['bodyType']=='родстер')]

# PreProc Data

In [None]:
# Merge train and test into one dataset so features are processed consistently
train['sample'] = 1  # marks train rows
test['sample'] = 0  # marks test rows
test['price'] = 0  # test has no price, so it is filled with zeros

# pd.concat instead of DataFrame.append: append was deprecated in pandas 1.4
# and removed in 2.0, while concat gives the same result on every version.
data = pd.concat([test, train], sort=False).reset_index(drop=True)
print(train.shape, test.shape, data.shape)

In [None]:
# признаки, которые не дали улучшение при расчетах в CatBoost

#генерация числовых фичей полиномиальные признаки
#numerical_features = list(set(df_output.dtypes[df_output.dtypes!=object].index) - \
#                          set(['price', 'sample'])) # 'mileage', 'enginePower' 'productionDate'

#poly = PolynomialFeatures(2)
#poly_feat = poly.fit_transform(df_output[numerical_features])[:, 1:]
#poly_cols = poly.get_feature_names(df_output[numerical_features].columns)[1:]
#for i, col in enumerate(poly_cols):
#    df_output[col] = poly_feat[:, i]

# создаем чиловые признаки
## средний провег за год:calс_mean_year_mile
#calс_mean_year_mile = lambda df: np.round(df.mileage/(1 + 2020 - df.productionDate), 1)
#df_output['mean_year_mileage'] = df_output.loc[:, ['mileage', 'productionDate']].apply(
#    calс_mean_year_mile, axis=1)

#df_output['log_enginePower'] = df_output['enginePower'].apply(log_df)
#df_output['log_engineDisplacement'] = df_output['engineDisplacement'].apply(log_df)
#df_output['sqrt_mileage'] = df_output['mileage'].apply(np.sqrt)
#df_output['sqrt_productionDate'] = df_output['productionDate'].apply(np.sqrt)
#df_output['sqrt_enginePower'] = df_output['enginePower'].apply(np.sqrt)
#df_output['sqrt_engineDisplacement'] = df_output['engineDisplacement'].apply(np.sqrt)
#df_output['sqrt_ch_Владение'] = df_output['ch_Владение'].apply(np.sqrt)
#df_output['log_sell_id'] = df_output['sell_id'].apply(log_df)
#df_output['log_ch_Владение'] = df_output['ch_Владение'].apply(log_df)
#calс_pow_div_disp = lambda df: df.engineDisplacement/df.enginePower
#df_output['pow_div_disp'] = df_output.loc[:, ['enginePower', 'engineDisplacement']].apply(
#    calс_pow_div_disp, axis=1)

In [None]:
# Tabular feature pre-processing
def preproc_data(df_input):
    """Build the model-ready feature table from the combined train+test frame.

    Steps, in order: drop unused columns; parse 'Владение', 'enginePower' and
    'engineDisplacement' into numbers; add sqrt/log transforms; shorten
    'name'; drop rows with a missing 'Владельцы'; standardise the numeric
    columns; label-encode, scale and one-hot encode the categorical columns;
    add the 'is_nan_vladenie' flag; drop 'Владение' and 'modelDate'.

    Args:
        df_input: Combined DataFrame; it is copied, not modified.

    Returns:
        A new DataFrame. 'price' and 'sample' are excluded from scaling and
        passed through; the index labels of the surviving rows are kept.
    """
    df_output = df_input.copy()

    # ################### 1. Pre-processing #########################
    # columns the tabular model does not use
    df_output.drop(['description','vehicleConfiguration', 'Руль', 'sell_id'], axis = 1, inplace=True)

    # Parse numbers that are stored as text
    df_output['ch_Владение'] = df_output.Владение.apply(prepare_vladenie)
    df_output['enginePower'] = df_output.enginePower.apply(lambda x: int(x[:-4]))

    # Assign the result back instead of calling replace(inplace=True) on the
    # attribute accessor: the chained in-place form is not guaranteed to write
    # through to the frame (pandas warns about it and Copy-on-Write drops it),
    # which would leave 'undefined LTR' in place and break float() below.
    df_output['engineDisplacement'] = df_output['engineDisplacement'].replace(
        'undefined LTR', '0.0 LTR')
    df_output['engineDisplacement'] = df_output.engineDisplacement.apply(lambda x: float(x[:-4]))

    # Numeric transforms (sqrt)
    df_output['sqrt_modelDate'] = df_output['modelDate'].apply(np.sqrt)

    # Numeric transforms (log)
    df_output['log_mileage'] = df_output['mileage'].apply(lambda x: np.log(1+x))
    # NOTE(review): log(2020 - x) is -inf when modelDate == 2020 — confirm the data has no such rows
    df_output['log_modelDate'] = df_output['modelDate'].apply(lambda x: np.log(2020-x))
    df_output['log_productionDate'] = df_output['productionDate'].apply(lambda x: np.log(1+x))

    # Categorical features: shorten 'name'
    df_output['name'] = df_output.name.apply(ch_name_col)

    # ################### Numerical Features ########################
    # Collected in column order (not through a set) so that the feature order
    # is identical from one interpreter session to the next.
    numerical_features = [column for column in df_output.columns
                          if df_output[column].dtype != object
                          and column not in ('price', 'sample')]

    # NaN handling: drop the ROWS whose 'Владельцы' value is missing
    df_output.dropna(subset=['Владельцы'], inplace=True)

    # Standardise and downcast to float32; the scaler is refit for every column
    scaler = StandardScaler()
    for column in numerical_features:
        df_output[column] = scaler.fit_transform(df_output[[column]])[:,0].astype('float32')

    # Categorical Features ('Владение' is handled separately below)
    categorical_features = [column for column in df_output.columns
                            if df_output[column].dtype == object and column != 'Владение']

    # Label Encoding followed by scaling of the integer codes
    for column in categorical_features:
        df_output[column] = df_output[column].astype('category').cat.codes
        df_output[column] = StandardScaler().fit_transform(np.array(df_output[column]).reshape(-1, 1))

    # One-Hot Encoding of the (scaled) category codes
    df_output = pd.get_dummies(df_output, columns=categorical_features, dummy_na=False)

    # Feature Engineering
    # Flag for a missing 'Владение'. Despite the name, the value is 0 when the
    # field is NaN and 1 when it is present; kept as is so feature values are unchanged.
    df_output['is_nan_vladenie'] = np.where(df_output['Владение'].isna(), 0, 1)

    # Clean: drop the raw columns that were replaced by derived features
    df_output.drop(['Владение', 'modelDate'], axis = 1, inplace=True)

    return df_output

In [None]:
# Resources for NLP processing of data.description (models 4 and 5)
# NOTE(review): stopwords.words() needs the NLTK 'stopwords' corpus; no nltk.download call is visible here — confirm it is available
stopwords_list = stopwords.words('english') + stopwords.words('russian')
morph = MorphAnalyzer()

# Vocabulary size limit passed to the Keras Tokenizer (num_words)
MAX_WORDS = 100000
# Length every tokenized description is padded/truncated to (maxlen)
MAX_SEQUENCE_LENGTH = 256

In [None]:
# One-call loader: reads the CSV files and returns everything the models need
def create_data():
    """Load, clean and split the data set.

    Reads train.csv and test.csv, patches the two mileage outliers, stacks
    test on top of train, runs ``preproc_data`` on the tabular part, cleans
    and filters ``description``, and finally makes the train/hold-out split.

    Returns:
        tuple: ``(X, y, X_sub, X_train, X_test, y_train, y_test, data)`` where
        ``X``/``y`` are the processed train features and prices, ``X_sub`` the
        processed test features, the four split parts come from
        ``train_test_split(test_size=0.13, random_state=RANDOM_SEED)``, and
        ``data`` is the combined raw frame with the cleaned ``description``.
    """
    DATA_DIR = '../input/sf-dst-car-price-prediction-part2/'
    train = pd.read_csv(DATA_DIR + 'train.csv')
    test = pd.read_csv(DATA_DIR + 'test.csv')

    # replace the outlier mileage 999999 with 99999 (see EDA)
    train.loc[train['mileage']==train['mileage'].max(), 'mileage'] = 99999

    # replace the outlier mileage 1000000 with 100000 (probably an extra zero typed in one ad)
    test.loc[test['mileage']==test['mileage'].max(), 'mileage'] = 100000

    # Merge train and test into one dataset
    train['sample'] = 1  # marks train rows
    test['sample'] = 0  # marks test rows
    test['price'] = 0  # test has no price, fill with zeros

    # pd.concat instead of DataFrame.append (removed in pandas 2.0)
    data = pd.concat([test, train], sort=False).reset_index(drop=True)

    df_preproc = preproc_data(data)

    print('tabular preproc done')

    ## Start of the data.description block
    data['description'] = data['description'].apply(data_prepare)
    print('clean data.description done')

    tokenize = Tokenizer(num_words=MAX_WORDS)
    tokenize.fit_on_texts(data.description)

    ## Words to filter out. The values of word_index are compared against
    ## 100 and 5000: entries below 100 or above 5000 are removed.
    ## NOTE(review): Keras documents word_index values as frequency ranks
    ## (1 = most frequent), not counts — confirm the thresholds mean ranks.
    filtered_values = set(filtetred_freq_words(100, 5000, tokenize.word_index).keys())

    ## drop the filtered words from every description
    data['description'] = data['description'].apply(
        lambda x: " ".join([word for word in x.split() if word not in filtered_values]))
    print('filter data.description done')
    ## End of the data.description block

    # Split the processed table back into train and test parts
    train_data = df_preproc.query('sample == 1').drop(['sample'], axis=1)
    test_data = df_preproc.query('sample == 0').drop(['sample'], axis=1)

    y = train_data.price.values     # target
    X = train_data.drop(['price'], axis=1)
    X_sub = test_data.drop(['price'], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.13, shuffle=True, random_state=RANDOM_SEED)

    print('data preproc finish')

    return X, y, X_sub, X_train, X_test, y_train, y_test, data

In [None]:
### Clean a description: drop digits and punctuation, collapse repeated characters, lemmatize
def data_prepare(str_2_clean):
    """Normalise one description into a space-separated string of lemmas.

    Punctuation and digits become spaces, the text is lower-cased, runs of
    spaces are collapsed, and any character repeated three or more times in a
    row is reduced to one. Words of length <= 2 and words present in
    ``stopwords_list`` are dropped; the rest are replaced by the first normal
    form returned by ``morph``.

    Args:
        str_2_clean: Raw description. A non-string value (e.g. NaN for a
            missing description) yields an empty string instead of raising.

    Returns:
        str: The cleaned, lemmatized text.
    """
    if not isinstance(str_2_clean, str):
        return ""

    text = re.sub(r'[^\w\s]|([0-9])', ' ', str_2_clean)
    text = re.sub(' +', ' ', text.lower()).strip(' ')
    text = re.sub(r"(.)\1{2,}", r"\1", text)

    # A set makes each stop-word test O(1) instead of a scan of the list.
    stop_words = set(stopwords_list)
    clean_words = [morph.normal_forms(word)[0] for word in text.split()
                   if word not in stop_words and len(word) > 2]
    return " ".join(clean_words)

In [None]:
### Select the dictionary entries whose value lies outside the given band
def filtetred_freq_words(max_freq_to_remove, min_freq_to_remove, dict_from_data):
    """Return the items of ``dict_from_data`` with out-of-band values.

    An item is kept in the result when its value is strictly below
    ``max_freq_to_remove`` or strictly above ``min_freq_to_remove``.
    """
    return {
        key: value
        for key, value in dict_from_data.items()
        if value < max_freq_to_remove or value > min_freq_to_remove
    }

In [None]:
# Turn the cleaned descriptions into padded integer sequences
def data_descr_to_nlp():
    """Tokenize ``data.description`` and build padded sequences per split.

    Uses the module-level ``data``, ``X_train``, ``X_test`` and ``X_sub``; the
    index labels of the three feature tables are used as positions in ``data``.

    Returns:
        tuple: ``(train sequences, hold-out sequences, submission sequences,
        fitted Tokenizer)``.
    """
    ## Fit the tokenizer on every cleaned description
    tokenize = Tokenizer(num_words=MAX_WORDS)
    tokenize.fit_on_texts(data.description)

    def to_padded(row_positions):
        # select the descriptions of one split and pad them to a fixed length
        texts = data.description.iloc[row_positions]
        return sequence.pad_sequences(
            tokenize.texts_to_sequences(texts), maxlen=MAX_SEQUENCE_LENGTH)

    text_train_sequences = to_padded(X_train.index)
    text_test_sequences = to_padded(X_test.index)
    text_sub_sequences = to_padded(X_sub.index)

    return text_train_sequences, text_test_sequences, text_sub_sequences, tokenize

# Model 2: CatBoostRegressor

In [None]:
X, y, X_sub, X_train, X_test, y_train, y_test, data = create_data()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.13, shuffle=True, random_state=RANDOM_SEED)

In [None]:
def calc_cat_boost():
    """Train a CatBoostRegressor on the global split and report hold-out MAPE.

    Fits on ``X_train``/``y_train`` with ``X_test``/``y_test`` as the
    evaluation set, prints the hold-out MAPE in percent and returns the
    fitted model.
    """
    params = dict(
        iterations=10000,
        depth=8,
        learning_rate=0.019,
        random_seed=RANDOM_SEED,
        eval_metric='MAPE',
        custom_metric=['RMSE', 'MAE'],
        od_wait=500,
        grow_policy='Lossguide',
        l2_leaf_reg=3.7,
        model_size_reg=2,
        loss_function='MAE',
    )
    model = CatBoostRegressor(**params)
    model.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        verbose_eval=100,
        use_best_model=True,
    )

    test_predict_catboost = model.predict(X_test)
    print(f"TEST mape: {(mape(y_test, test_predict_catboost))*100:0.2f}%")
    return model

model = calc_cat_boost()

### Submission

In [None]:
sub_predict_catboost = model.predict(X_sub)
sample_submission['price'] = sub_predict_catboost
sample_submission.to_csv('catboost_submission.csv', index=False)

# Model 3: Tabular NN

## Simple Dense NN

In [None]:
X, y, X_sub, X_train, X_test, y_train, y_test, data = create_data()

In [None]:
def callbacks(lr):
    """Return the Keras callbacks for the tabular network.

    All three callbacks monitor 'val_MAPE': a best-only checkpoint written to
    '../working/best_model.hdf5', early stopping after 85 epochs without
    improvement (restoring the best weights), and a learning-rate halving
    after 25 such epochs with a floor of ``lr / 1000``.
    """
    monitored = 'val_MAPE'
    return [
        ModelCheckpoint('../working/best_model.hdf5', monitor=monitored,
                        verbose=1, mode='min', save_best_only=True),
        EarlyStopping(monitor=monitored, patience=85, restore_best_weights=True),
        ReduceLROnPlateau(monitor=monitored, factor=0.5, patience=25, verbose=1,
                          min_lr=lr/1000, mode='min'),
    ]

In [None]:
# Fully connected regression network over the tabular features
model = Sequential([
    L.Dense(1024, input_dim=X_train.shape[1], activation="relu"),
    L.BatchNormalization(),
    L.Dropout(0.5),
    L.Dense(512, activation="relu"),
    L.Dropout(0.25),
    L.Dense(256, activation="relu"),
    L.Dropout(0.5),
    L.Dense(1, activation="linear"),
])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

# Compile model: Adam with non-default beta_1/epsilon; MAPE is both the loss and the tracked metric
LR = 0.01
optimizer = tf.keras.optimizers.Adam(learning_rate=LR,
                                     beta_1=0.992, 
                                     beta_2=0.999,
                                     epsilon=2e-06,
                                     amsgrad=False,
                                    )
model.compile(loss='MAPE',optimizer=optimizer, metrics=['MAPE'])

# checkpoint / early stopping / LR schedule, all driven by val_MAPE
callbacks_list = callbacks(lr=LR)

# fit: the hold-out split doubles as the validation set watched by the callbacks
history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=500, 
                    validation_data=(X_test, y_test),
                    callbacks=callbacks_list,
                    verbose=1,
                    )

# plot history: training vs. validation MAPE per epoch
plt.title('Loss')
plt.plot(history.history['MAPE'], label='train')
plt.plot(history.history['val_MAPE'], label='test')
# without legend() the labels passed to plot() are never displayed
plt.legend()
plt.show();

# save_model: reload the checkpoint written by ModelCheckpoint, then store
# model-specific copies of the weights and of the full model
model.load_weights('../working/best_model.hdf5')
model.save_weights('../working/best_model_nn1.hdf5')
model.save('../working/nn_1.hdf5')

# test predict: MAPE on the hold-out split, printed in percent
test_predict_nn1 = model.predict(X_test)
print(f"TEST mape: {(mape(y_test, test_predict_nn1[:,0]))*100:0.2f}%")

In [None]:
sub_predict_nn1 = model.predict(X_sub)
sample_submission['price'] = sub_predict_nn1[:,0]
sample_submission.to_csv('nn1_submission.csv', index=False)

# Model 4: NLP + Multiple Input

## Work with data.description

In [None]:
X, y, X_sub, X_train, X_test, y_train, y_test, data = create_data()

In [None]:
text_train_sequences, text_test_sequences, text_sub_sequences, tokenize = data_descr_to_nlp()
# текст после преобразования
#print(text_train.iloc[0])
#print(text_train_sequences[0])

### Создаем модель MultiNN: SimpleNN + NLP

### RNN NLP

In [None]:
# Text branch: embedding -> two stacked LSTM layers -> dense layer (no output head; it is attached later)
model_nlp = Sequential()
model_nlp.add(L.Input(shape=MAX_SEQUENCE_LENGTH, name="seq_description"))
# NOTE(review): the second Embedding argument reuses MAX_SEQUENCE_LENGTH as the vector size —
# confirm this is intentional rather than a stand-in for a separate embedding-size constant
model_nlp.add(L.Embedding(len(tokenize.word_index)+1, MAX_SEQUENCE_LENGTH,))
model_nlp.add(L.BatchNormalization())
# return_sequences=True so the second LSTM receives the whole sequence
model_nlp.add(L.LSTM(1024, return_sequences=True))
model_nlp.add(L.Dropout(0.25))
model_nlp.add(L.LSTM(512,))
model_nlp.add(L.Dropout(0.5))
model_nlp.add(L.Dense(256, activation="relu"))
model_nlp.add(L.Dropout(0.5))

### MLP

In [None]:
# Tabular branch: two dense layers over the processed features (no output head; it is attached later)
model_mlp = Sequential()
model_mlp.add(L.Dense(1024, input_dim=X_train.shape[1], activation="relu"))
model_mlp.add(L.BatchNormalization())
model_mlp.add(L.Dropout(0.25))
model_mlp.add(L.Dense(512, activation="relu"))
model_mlp.add(L.Dropout(0.5))
#model_mlp.add(L.Dense(256, activation="relu"))
#model_mlp.add(L.Dropout(0.5))

### Multiple Inputs NN

In [None]:
# Join the outputs of the text branch and the tabular branch
combinedInput = L.concatenate([model_nlp.output, model_mlp.output])

# begin our regression head
head = L.Dense(256, activation="relu")(combinedInput)
#head = L.BatchNormalization()(head)
head = L.Dropout(0.5)(head)
head = L.Dense(1, activation="linear")(head)

# Inputs are ordered [text, tabular]; fit/predict calls must pass data in the same order
model = Model(inputs=[model_nlp.input, model_mlp.input], outputs=head)

### Fit

In [None]:
def callbacks(lr):
    """Return the Keras callbacks for the multi-input network.

    Both callbacks monitor 'val_MAPE': a best-only checkpoint written to
    '../working/best_model_m_inps.hdf5' and early stopping after 10 epochs
    without improvement (restoring the best weights). ``lr`` is accepted for
    signature compatibility and is not used.
    """
    monitored = 'val_MAPE'
    best_only_checkpoint = ModelCheckpoint(
        '../working/best_model_m_inps.hdf5', monitor=monitored,
        verbose=1, mode='min', save_best_only=True)
    stop_when_stalled = EarlyStopping(
        monitor=monitored, patience=10, restore_best_weights=True)
    return [best_only_checkpoint, stop_when_stalled]

In [None]:
# Stage 1: compile the multi-input model with Adam at the initial learning rate
LR = 0.01
optimizer = tf.keras.optimizers.Adam(learning_rate=LR)

# MAPE is both the optimized loss and the metric the callbacks monitor (val_MAPE)
model.compile(loss='MAPE',optimizer=optimizer, metrics=['MAPE'])
callbacks_list = callbacks(lr=LR)

In [None]:
# Stage 1 training; inputs are passed in the [text, tabular] order the model was built with
history = model.fit([text_train_sequences, X_train], y_train,
                    batch_size=512,
                    epochs=500,
                    validation_data=([text_test_sequences, X_test], y_test),
                    callbacks=callbacks_list,
                    verbose=1
                   )

# training vs. validation MAPE per epoch
plt.title('Loss')
plt.plot(history.history['MAPE'], label='train')
plt.plot(history.history['val_MAPE'], label='test')
# without legend() the labels passed to plot() are never displayed
plt.legend()
plt.show();

In [None]:
# Reload the checkpoint written during the fit and keep stage-1 copies of the weights and the full model
model.load_weights('../working/best_model_m_inps.hdf5')
model.save_weights('../working/best_model_m_inps_st1.hdf5')
model.save('../working/nn_mlp_nlp_st1.hdf5')

# MAPE on the hold-out split, printed in percent
test_predict_nn2 = model.predict([text_test_sequences, X_test])
print(f"TEST mape: {(mape(y_test, test_predict_nn2[:,0]))*100:0.2f}%")

In [None]:
model.load_weights('../working/best_model_m_inps_st1.hdf5')

In [None]:
# Stage 2: recompile with a new Adam optimizer at a lower learning rate and continue training the same model
# NOTE(review): callbacks() builds a new ModelCheckpoint for every fit, so best_model_m_inps.hdf5 presumably
# holds only this stage's best epoch, even if an earlier stage scored better — confirm this is intended
LR = 0.005 
optimizer = tf.keras.optimizers.Adam(learning_rate=LR)

model.compile(loss='MAPE',optimizer=optimizer, metrics=['MAPE'])
callbacks_list = callbacks(lr=LR)

history = model.fit([text_train_sequences, X_train], y_train,
                    batch_size=512,
                    epochs=500, 
                    validation_data=([text_test_sequences, X_test], y_test),
                    callbacks=callbacks_list,
                    verbose=1
                   )

# Reload the checkpoint written during this fit and keep stage-2 copies
model.load_weights('../working/best_model_m_inps.hdf5')
model.save_weights('../working/best_model_m_inps_st2.hdf5')
model.save('../working/nn_mlp_nlp_st2.hdf5')

# MAPE on the hold-out split, printed in percent
test_predict_nn2 = model.predict([text_test_sequences, X_test])
print(f"TEST mape: {(mape(y_test, test_predict_nn2[:,0]))*100:0.2f}%")

In [None]:
model.load_weights('../working/best_model_m_inps_st2.hdf5')

In [None]:
# Stage 3: recompile with a new Adam optimizer at a lower learning rate and continue training the same model
# NOTE(review): the checkpoint file is shared by all stages and rewritten by this fit — confirm that a
# worse stage overwriting an earlier, better checkpoint is acceptable
LR = 0.002 
optimizer = tf.keras.optimizers.Adam(learning_rate=LR)

model.compile(loss='MAPE',optimizer=optimizer, metrics=['MAPE'])
callbacks_list = callbacks(lr=LR)

history = model.fit([text_train_sequences, X_train], y_train,
                    batch_size=512,
                    epochs=500, 
                    validation_data=([text_test_sequences, X_test], y_test),
                    callbacks=callbacks_list,
                    verbose=1
                   )

# Reload the checkpoint written during this fit and keep stage-3 copies
model.load_weights('../working/best_model_m_inps.hdf5')
model.save_weights('../working/best_model_m_inps_st3.hdf5')
model.save('../working/nn_mlp_nlp_st3.hdf5')

# MAPE on the hold-out split, printed in percent
test_predict_nn2 = model.predict([text_test_sequences, X_test])
print(f"TEST mape: {(mape(y_test, test_predict_nn2[:,0]))*100:0.2f}%")

In [None]:
model.load_weights('../working/best_model_m_inps_st3.hdf5')

In [None]:
# Stage 4: recompile with a new Adam optimizer at a lower learning rate and continue training the same model
# NOTE(review): the checkpoint file is shared by all stages and rewritten by this fit — confirm that a
# worse stage overwriting an earlier, better checkpoint is acceptable
LR = 0.0002
optimizer = tf.keras.optimizers.Adam(learning_rate=LR)

model.compile(loss='MAPE',optimizer=optimizer, metrics=['MAPE'])
callbacks_list = callbacks(lr=LR)

history = model.fit([text_train_sequences, X_train], y_train,
                    batch_size=512,
                    epochs=500, 
                    validation_data=([text_test_sequences, X_test], y_test),
                    callbacks=callbacks_list,
                    verbose=1
                   )

# Reload the checkpoint written during this fit and keep stage-4 copies
model.load_weights('../working/best_model_m_inps.hdf5')
model.save_weights('../working/best_model_m_inps_st4.hdf5')
model.save('../working/nn_mlp_nlp_st4.hdf5')

# MAPE on the hold-out split, printed in percent
test_predict_nn2 = model.predict([text_test_sequences, X_test])
print(f"TEST mape: {(mape(y_test, test_predict_nn2[:,0]))*100:0.2f}%")

In [None]:
model.load_weights('../working/best_model_m_inps_st4.hdf5')

In [None]:
# Stage 5: recompile with a new Adam optimizer at a lower learning rate and continue training the same model
# NOTE(review): the checkpoint file is shared by all stages and rewritten by this fit — confirm that a
# worse stage overwriting an earlier, better checkpoint is acceptable
LR = 0.0001 
optimizer = tf.keras.optimizers.Adam(learning_rate=LR)

model.compile(loss='MAPE',optimizer=optimizer, metrics=['MAPE'])
callbacks_list = callbacks(lr=LR)

history = model.fit([text_train_sequences, X_train], y_train,
                    batch_size=512,
                    epochs=500, 
                    validation_data=([text_test_sequences, X_test], y_test),
                    callbacks=callbacks_list,
                    verbose=1
                   )

# Reload the checkpoint written during this fit and keep stage-5 copies
model.load_weights('../working/best_model_m_inps.hdf5')
model.save_weights('../working/best_model_m_inps_st5.hdf5')
model.save('../working/nn_mlp_nlp_st5.hdf5')

# MAPE on the hold-out split, printed in percent
test_predict_nn2 = model.predict([text_test_sequences, X_test])
print(f"TEST mape: {(mape(y_test, test_predict_nn2[:,0]))*100:0.2f}%")

In [None]:
model.load_weights('../working/best_model_m_inps_st5.hdf5')

In [None]:
# Stage 6: recompile with a new Adam optimizer at a much lower learning rate (5e-7) and continue training
# NOTE(review): the checkpoint file is shared by all stages and rewritten by this fit — confirm that a
# worse stage overwriting an earlier, better checkpoint is acceptable
LR = 0.0000005 
optimizer = tf.keras.optimizers.Adam(learning_rate=LR)

model.compile(loss='MAPE',optimizer=optimizer, metrics=['MAPE'])
callbacks_list = callbacks(lr=LR)

history = model.fit([text_train_sequences, X_train], y_train,
                    batch_size=512,
                    epochs=500, 
                    validation_data=([text_test_sequences, X_test], y_test),
                    callbacks=callbacks_list,
                    verbose=1
                   )

# Reload the checkpoint written during this fit and keep stage-6 copies
model.load_weights('../working/best_model_m_inps.hdf5')
model.save_weights('../working/best_model_m_inps_st6.hdf5')
model.save('../working/nn_mlp_nlp_st6.hdf5')

# MAPE on the hold-out split, printed in percent
test_predict_nn2 = model.predict([text_test_sequences, X_test])
print(f"TEST mape: {(mape(y_test, test_predict_nn2[:,0]))*100:0.2f}%")

In [None]:
model.load_weights('../working/best_model_m_inps_st6.hdf5')

In [None]:
# Stage 7: recompile with a new Adam optimizer at the lowest learning rate (5e-8) and continue training
# NOTE(review): the checkpoint file is shared by all stages and rewritten by this fit — confirm that a
# worse stage overwriting an earlier, better checkpoint is acceptable
LR = 0.00000005 
optimizer = tf.keras.optimizers.Adam(learning_rate=LR)

model.compile(loss='MAPE',optimizer=optimizer, metrics=['MAPE'])
callbacks_list = callbacks(lr=LR)

history = model.fit([text_train_sequences, X_train], y_train,
                    batch_size=512,
                    epochs=500, 
                    validation_data=([text_test_sequences, X_test], y_test),
                    callbacks=callbacks_list,
                    verbose=1
                   )

# Reload the checkpoint written during this fit and keep stage-7 copies
model.load_weights('../working/best_model_m_inps.hdf5')
model.save_weights('../working/best_model_m_inps_st7.hdf5')
model.save('../working/nn_mlp_nlp_st7.hdf5')

# MAPE on the hold-out split, printed in percent
test_predict_nn2 = model.predict([text_test_sequences, X_test])
print(f"TEST mape: {(mape(y_test, test_predict_nn2[:,0]))*100:0.2f}%")

In [None]:
model.load_weights('../working/best_model_m_inps_st7.hdf5')

In [None]:
# Predict prices for the submission rows with the multi-input model
sub_predict_nn2 = model.predict([text_sub_sequences, X_sub])
#sample_submission['price'] = sub_predict_nn2[:,0]
#sample_submission.to_csv('nn2_submission.csv', index=False)

sample_submission['price'] = np.round(sub_predict_nn2[:,0],-4) # decimals=-4: round to the nearest 10,000
sample_submission.to_csv('nn2_submission_wth_round.csv', index=False)

# Model 5: Добавляем картинки

In [None]:
X, y, X_sub, X_train, X_test, y_train, y_test, data = create_data()
print(X.shape, y.shape, X_sub.shape, X_train.shape, X_test.shape, y_train.shape, y_test.shape, data.shape)

text_train_sequences, text_test_sequences, text_sub_sequences, tokenize = data_descr_to_nlp()
print(text_train_sequences.shape, text_test_sequences.shape, text_sub_sequences.shape)

In [None]:
# смотрим примеры изображений и таргет (цена)
plt.figure(figsize = (12,8))

random_image = train.sample(n = 9)
random_image_paths = random_image['sell_id'].values
random_image_cat = random_image['price'].values

for index, path in enumerate(random_image_paths):
    im = PIL.Image.open(DATA_DIR+'img/img/' + str(path) + '.jpg')
    plt.subplot(3, 3, index + 1)
    plt.imshow(im)
    plt.title('price: ' + str(random_image_cat[index]))
    plt.axis('off')
plt.show()

In [None]:
# пример картинки и ее размеры, чтобы понимать, как лучше обрабатывать и сжимать картинки
image = PIL.Image.open(DATA_DIR+'img/img/' + '35782220.jpg')
imgplot = plt.imshow(image)
plt.show()
image.size

In [None]:
# Target size handed to cv2.resize for every image
size = (320, 240)

def get_image_array(index):
    """Load and resize the listing photos for the given rows of ``data``.

    Args:
        index: Positions (used with ``.iloc``) of the rows in the global
            ``data`` frame whose ``sell_id`` images should be loaded.

    Returns:
        numpy.ndarray: The resized images stacked along a new first axis, in
        the channel order produced by ``cv2.imread``.

    Raises:
        FileNotFoundError: If an image file cannot be read.
    """
    images = []
    # Iterate the ids directly: the old loop variable shadowed the ``index``
    # parameter and the enumerate counter was never used.
    for sell_id in data['sell_id'].iloc[index].values:
        image_path = DATA_DIR + 'img/img/' + str(sell_id) + '.jpg'
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure with None; an explicit raise (unlike
            # assert) is not stripped under ``python -O`` and names the file.
            raise FileNotFoundError(f'cannot read image: {image_path}')
        images.append(cv2.resize(image, size))
    images_train = np.array(images)
    print('images shape', images_train.shape, 'dtype', images_train.dtype)
    return images_train

images_train = get_image_array(X_train.index)
images_test = get_image_array(X_test.index)
images_sub = get_image_array(X_sub.index)

In [None]:
# Image augmentation pipeline; every transform carries its own probability p,
# and the commented-out entries are transforms that are currently disabled
augmentation = albumentations.Compose([
    albumentations.CLAHE(p=0.25, clip_limit=(1, 10), tile_grid_size=(10, 10)),
    albumentations.ChannelShuffle(p=0.25),
    #albumentations.ElasticTransform(p=0.25, alpha=1.0, sigma=10, alpha_affine=10, 
    #                                interpolation=1, border_mode=1, value=(0, 0, 0), 
    #                                mask_value=None, approximate=False),
    albumentations.Equalize(p=0.25, mode='cv', by_channels=True),
    albumentations.GaussNoise(p=0.25, var_limit=(10.0, 500.0), mean=-10),
    #albumentations.GridDistortion(p=0.25, num_steps=15, distort_limit=(-0.3, 0.3), 
    #                              interpolation=3, border_mode=1, value=(0, 0, 0), 
    #                              mask_value=None),
    albumentations.HorizontalFlip(p=0.5),
    albumentations.HueSaturationValue(p=0.5, hue_shift_limit=(-20, 20), 
                                      sat_shift_limit=(-20, 20), val_shift_limit=(-20, 20)),
    #albumentations.ISONoise(p=0.5, intensity=(0.1, 0.4), color_shift=(0.01, 0.3)),
    albumentations.MotionBlur(p=0.25, blur_limit=(3, 7)),
    #albumentations.OpticalDistortion(p=0.25, distort_limit=(-0.3, 0.3), 
    #                                 shift_limit=(-0.2, 0.2), interpolation=2, 
    #                                 border_mode=1, value=(0, 0, 0), mask_value=None),
    albumentations.RGBShift(p=0.5),
    # exactly one of the two brightness/contrast ranges is chosen when this group fires
    albumentations.OneOf([
        albumentations.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2),
        albumentations.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.1),
        ],p=0.5),
    albumentations.RandomGamma(p=0.25, gamma_limit=(100, 130), eps=1e-07),
    #albumentations.RandomSnow(p=0.25, snow_point_lower=0.25, snow_point_upper=0.75, 
    #                          brightness_coeff=1.2),
    albumentations.Rotate(p=0.5, limit=(-5, 5), interpolation=2, border_mode=2)
    ])

In [None]:
# example of applying the augmentations to one image
img_1 = cv2.imread("../input/sf-dst-car-price-prediction-part2/img/img/35782220.jpg")
plt.figure(figsize=(12,8))
for i in range(0,6):
    x = augmentation(image = img_1)['image']
    plt.subplot(3,3, i+1)
    # cv2.imread returns BGR while matplotlib expects RGB; convert only for
    # display so the augmentation still sees the same array layout as in training
    plt.imshow(cv2.cvtColor(x, cv2.COLOR_BGR2RGB))
plt.show()

In [None]:
def make_augmentations(images):
    """Run the global ``augmentation`` pipeline over every item of ``images``.

    Args:
        images: array whose first axis enumerates the images.

    Returns:
        A new array with the same shape as ``images`` holding the augmented
        images. It is allocated with ``np.empty(images.shape)``, i.e. with
        NumPy's default dtype rather than the dtype of ``images``.
    """
    print('применение аугментаций', end = '')
    result = np.empty(images.shape)
    total = images.shape[0]
    for position in range(total):
        # progress indicator: one dot for every 200 images
        if not position % 200:
            print('.', end = '')
        result[position] = augmentation(image = images[position])['image']
    print('')
    return result

In [None]:
def process_image(image):
    """Augment one image tensor and return the augmented array.

    The tensor is converted with ``.numpy()`` and passed to the global
    ``augmentation`` pipeline, which is looked up at call time.
    """
    pixels = image.numpy()
    augmented = augmentation(image = pixels)
    return augmented['image']

def tokenize_(descriptions):
    """Convert texts to token-id sequences padded/truncated to MAX_SEQUENCE_LENGTH.

    Uses the fitted global ``tokenize`` object to map the texts to id sequences
    and ``sequence.pad_sequences`` to bring them to a common length.
    """
    token_ids = tokenize.texts_to_sequences(descriptions)
    return sequence.pad_sequences(token_ids, maxlen = MAX_SEQUENCE_LENGTH)

def tokenize_text(text):
    """Tokenize a single description held in a string tensor.

    The tensor's bytes are decoded as UTF-8, wrapped into a one-element batch
    for ``tokenize_`` and the only row of the result is returned.
    """
    decoded = text.numpy().decode('utf-8')
    padded_batch = tokenize_([decoded])
    return padded_batch[0]

def tf_process_train_dataset_element(image, table_data, text, price):
    """Prepare one training example inside a ``tf.data`` pipeline.

    The image is augmented and the description is tokenized; both steps run as
    Python code through ``tf.py_function``. Outputs of ``py_function`` carry no
    static shape, so the shapes are restored explicitly afterwards.

    Args:
        image: image tensor of one example (the py_function output is declared
            as uint8).
        table_data: tabular features, passed through unchanged.
        text: string tensor holding the description.
        price: target value, passed through unchanged.

    Returns:
        ``((image, table_data, text), price)`` with the augmented image and the
        tokenized text.
    """
    im_shape = image.shape
    [image,] = tf.py_function(process_image, [image], [tf.uint8])
    image.set_shape(im_shape)
    [text,] = tf.py_function(tokenize_text, [text], [tf.int32])
    # tokenize_ pads every sequence to MAX_SEQUENCE_LENGTH, so the length is known
    text.set_shape([MAX_SEQUENCE_LENGTH])
    return (image, table_data, text), price

def tf_process_val_dataset_element(image, table_data, text, price):
    """Prepare one validation/submission example inside a ``tf.data`` pipeline.

    Only the description is transformed (tokenized through ``tf.py_function``);
    the image is passed through without augmentation. The ``py_function``
    output carries no static shape, so it is restored explicitly.

    Args:
        image: image tensor of one example, passed through unchanged.
        table_data: tabular features, passed through unchanged.
        text: string tensor holding the description.
        price: target value, passed through unchanged.

    Returns:
        ``((image, table_data, text), price)`` with the tokenized text.
    """
    [text,] = tf.py_function(tokenize_text, [text], [tf.int32])
    # tokenize_ pads every sequence to MAX_SEQUENCE_LENGTH, so the length is known
    text.set_shape([MAX_SEQUENCE_LENGTH])
    return (image, table_data, text), price

# Training pipeline: every element is augmented and tokenized on the fly
train_dataset = tf.data.Dataset.from_tensor_slices(
    (images_train, X_train, data.description.iloc[X_train.index], y_train)
).map(tf_process_train_dataset_element)

# Validation pipeline: tokenization only
test_dataset = tf.data.Dataset.from_tensor_slices(
    (images_test, X_test, data.description.iloc[X_test.index], y_test)
).map(tf_process_val_dataset_element)

# The submission rows get an all-zero placeholder in the price slot
y_sub = np.zeros(len(X_sub))
sub_dataset = tf.data.Dataset.from_tensor_slices(
    (images_sub, X_sub, data.description.iloc[X_sub.index], y_sub)
).map(tf_process_val_dataset_element)

# Smoke test: pull one element from each pipeline so that a mapping error
# raises here instead of in the middle of training
next(iter(train_dataset));
next(iter(test_dataset));
next(iter(sub_dataset));

In [None]:
def callbacks(lr):
    """Build the list of Keras callbacks shared by the training stages.

    Args:
        lr: starting learning rate of the stage; the learning-rate scheduler
            is not allowed to go below ``lr/1000``.

    Returns:
        ``[ModelCheckpoint, EarlyStopping, ReduceLROnPlateau]``, all monitoring
        ``val_MAPE``. The checkpoint keeps only the best model at
        '../working/best_model_all.hdf5'.
    """
    return [
        ModelCheckpoint('../working/best_model_all.hdf5', monitor='val_MAPE',
                        verbose=1, mode='min', save_best_only=True),
        EarlyStopping(monitor='val_MAPE', patience=15, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_MAPE', factor=0.3, patience=5, verbose=1,
                          min_lr=lr/1000, mode='min'),
    ]

In [None]:
# Step 1 (model 5, fine-tuning): train only the head, starting from LR = 0.01.
# ImageNet-pretrained EfficientNetB3 without its classification top; `size` is
# indexed as (size[1], size[0]) to form the first two input dimensions.
efficientnet_model = tf.keras.applications.efficientnet.EfficientNetB3(
    weights = 'imagenet', include_top = False, input_shape = (size[1], size[0], 3))
# Freeze the whole backbone for this stage
efficientnet_model.trainable = False

# Image branch: pooled backbone features -> Dense(512, relu) -> Dropout(0.5)
efficientnet_output = L.GlobalAveragePooling2D()(efficientnet_model.output)
efficientnet_output = L.Dense(512, activation="relu")(efficientnet_output)
efficientnet_output = L.Dropout(0.5)(efficientnet_output)

In [None]:
# Tabular branch: two fully connected layers, each followed by dropout.
# The input width is the number of columns of X_train.
tabular_model = Sequential([
    L.Dense(1024, input_dim=X_train.shape[1], activation="relu"),
    L.Dropout(0.25),
    L.Dense(512, activation="relu"),
    L.Dropout(0.5),
])

In [None]:
# Text branch: embedding -> two stacked LSTMs -> Dense(256), with dropout in between
nlp_model  = Sequential()
# One token-id sequence of length MAX_SEQUENCE_LENGTH per example
nlp_model.add(L.Input(shape=MAX_SEQUENCE_LENGTH, name="seq_description"))
# Vocabulary size is the tokenizer's word index plus one.
# NOTE(review): the embedding width reuses MAX_SEQUENCE_LENGTH — confirm this is intended
nlp_model.add(L.Embedding(len(tokenize.word_index)+1, MAX_SEQUENCE_LENGTH,))
# The first LSTM returns the full sequence so that the second LSTM can consume it
nlp_model.add(L.LSTM(1024, return_sequences=True))
nlp_model.add(L.Dropout(0.25))
nlp_model.add(L.LSTM(512,))
nlp_model.add(L.Dropout(0.5))
nlp_model.add(L.Dense(256, activation="relu"))
nlp_model.add(L.Dropout(0.5))

In [None]:
# Concatenate the outputs of the three networks (image, tabular, text)
combinedInput = L.concatenate([efficientnet_output, tabular_model.output, nlp_model.output])

# Regression head: Dense(256, relu) -> Dropout(0.5) -> a single linear output unit
head = L.Dense(256, activation="relu")(combinedInput)
head = L.Dropout(0.5)(head)
head = L.Dense(1,)(head)

# Multi-input model; the input order is (image, tabular features, text)
model = Model(inputs=[efficientnet_model.input, tabular_model.input, nlp_model.input], outputs=head)
#model.summary()

In [None]:
# Compile and fit the model (step 1)
LR = 0.01
optimizer = tf.keras.optimizers.Adam(LR)
# MAPE is used both as the training loss and as the reported metric
model.compile(loss='MAPE',optimizer=optimizer, metrics=['MAPE'])

# The callbacks receive the starting learning rate of this stage
callbacks_list = callbacks(lr=LR)

# Train and validate in batches of 30 for up to 100 epochs
history = model.fit(train_dataset.batch(30),
                    epochs=100,
                    validation_data = test_dataset.batch(30),
                    callbacks=callbacks_list
                   )

In [None]:
# Reload the weights file written by the checkpoint callback and store a
# stage-specific copy, since the later stages write to the same checkpoint path
model.load_weights('../working/best_model_all.hdf5')
model.save_weights('../working/best_model_all_st1.hdf5')
#model.save('../working/nn_final_st1.hdf5')

In [None]:
# Step 2 (model 5, fine-tuning): train the last ~33% of the EfficientNetB3 layers
# plus the head, starting from LR = 0.005
efficientnet_model = tf.keras.applications.efficientnet.EfficientNetB3(
    weights = 'imagenet', include_top = False, input_shape = (size[1], size[0], 3))

# Unfreeze the backbone, then freeze its first len(layers)//1.5 layers again
efficientnet_model.trainable = True
fine_tune_at = int(len(efficientnet_model.layers)//1.5)
for layer in efficientnet_model.layers[:fine_tune_at]:
    layer.trainable =  False

# Image branch: pooled backbone features -> Dense(512, relu) -> Dropout(0.5)
efficientnet_output = L.GlobalAveragePooling2D()(efficientnet_model.output)
efficientnet_output = L.Dense(512, activation="relu")(efficientnet_output)
efficientnet_output = L.Dropout(0.5)(efficientnet_output)

# Concatenate the outputs of the three networks; the existing tabular_model and
# nlp_model objects are reused as they are
combinedInput = L.concatenate([efficientnet_output, tabular_model.output, nlp_model.output])

# Build the regression head
head = L.Dense(256, activation="relu")(combinedInput)
head = L.Dropout(0.5)(head)
head = L.Dense(1,)(head)

model = Model(inputs=[efficientnet_model.input, tabular_model.input, nlp_model.input], outputs=head)

LR = 0.005
optimizer = tf.keras.optimizers.Adam(LR)
model.compile(loss='MAPE',optimizer=optimizer, metrics=['MAPE'])

callbacks_list = callbacks(lr=LR)

# Start from the stage-1 weights.
# NOTE(review): they are read from an attached input dataset, not from
# '../working/best_model_all_st1.hdf5' — confirm both hold the same weights
model.load_weights('../input/weightsl-simple-albu-aft-st1/best_model_all_st1(simple albu).hdf5')

# Train and validate in batches of 20 for up to 100 epochs
history = model.fit(train_dataset.batch(20),
                    epochs=100,
                    validation_data = test_dataset.batch(20),
                    callbacks=callbacks_list
                   )

In [None]:
# Reload the weights file written by the checkpoint callback and store a stage-2 copy
model.load_weights('../working/best_model_all.hdf5')
model.save_weights('../working/best_model_all_st2.hdf5')
#model.save('../working/nn_final_st2.hdf5')

In [None]:
# Step 3 (model 5, fine-tuning): train the last ~50% of the EfficientNetB3 layers
# plus the head, starting from LR = 0.001
efficientnet_model = tf.keras.applications.efficientnet.EfficientNetB3(
    weights = 'imagenet', include_top = False, input_shape = (size[1], size[0], 3))

# Unfreeze the backbone, then freeze the first half of its layers again
efficientnet_model.trainable = True
fine_tune_at = int(len(efficientnet_model.layers)//2)
for layer in efficientnet_model.layers[:fine_tune_at]:
    layer.trainable =  False

# Image branch: pooled backbone features -> Dense(512, relu) -> Dropout(0.5)
efficientnet_output = L.GlobalAveragePooling2D()(efficientnet_model.output)
efficientnet_output = L.Dense(512, activation="relu")(efficientnet_output)
efficientnet_output = L.Dropout(0.5)(efficientnet_output)

# Concatenate the outputs of the three networks; the existing tabular_model and
# nlp_model objects are reused as they are
combinedInput = L.concatenate([efficientnet_output, tabular_model.output, nlp_model.output])

# Build the regression head
head = L.Dense(256, activation="relu")(combinedInput)
head = L.Dropout(0.5)(head)
head = L.Dense(1,)(head)

model = Model(inputs=[efficientnet_model.input, tabular_model.input, nlp_model.input], outputs=head)

LR = 0.001
optimizer = tf.keras.optimizers.Adam(LR)
model.compile(loss='MAPE',optimizer=optimizer, metrics=['MAPE'])

callbacks_list = callbacks(lr=LR)

# Start from the stage-2 weights.
# NOTE(review): this path is relative to the current directory, unlike the
# '../working/' prefix used elsewhere — confirm it resolves to the stage-2 file
model.load_weights('./best_model_all_st2.hdf5')

# Train and validate in batches of 20 for up to 100 epochs
history = model.fit(train_dataset.batch(20),
                    epochs=100,
                    validation_data = test_dataset.batch(20),
                    callbacks=callbacks_list
                   )

In [None]:
# Reload the weights file written by the checkpoint callback and store a stage-3 copy
model.load_weights('../working/best_model_all.hdf5')
model.save_weights('../working/best_model_all_st3.hdf5')
#model.save('../working/nn_final_st3.hdf5')

In [None]:
# Step 4_1 (model 5, fine-tuning): train the last ~75% of the EfficientNetB3 layers
# plus the head, starting from LR = 0.0005
efficientnet_model = tf.keras.applications.efficientnet.EfficientNetB3(
    weights = 'imagenet', include_top = False, input_shape = (size[1], size[0], 3))

# Unfreeze the backbone, then freeze the first quarter of its layers again
efficientnet_model.trainable = True
fine_tune_at = int(len(efficientnet_model.layers)//4)
for layer in efficientnet_model.layers[:fine_tune_at]:
    layer.trainable =  False

# Image branch: pooled backbone features -> Dense(512, relu) -> Dropout(0.5)
efficientnet_output = L.GlobalAveragePooling2D()(efficientnet_model.output)
efficientnet_output = L.Dense(512, activation="relu")(efficientnet_output)
efficientnet_output = L.Dropout(0.5)(efficientnet_output)

# Concatenate the outputs of the three networks; the existing tabular_model and
# nlp_model objects are reused as they are
combinedInput = L.concatenate([efficientnet_output, tabular_model.output, nlp_model.output])

# Build the regression head
head = L.Dense(256, activation="relu")(combinedInput)
head = L.Dropout(0.5)(head)
head = L.Dense(1,)(head)

model = Model(inputs=[efficientnet_model.input, tabular_model.input, nlp_model.input], outputs=head)

LR = 0.0005
optimizer = tf.keras.optimizers.Adam(LR)
model.compile(loss='MAPE',optimizer=optimizer, metrics=['MAPE'])

callbacks_list = callbacks(lr=LR)

# Start from the stage-3 weights
model.load_weights('../working/best_model_all_st3.hdf5')

# Train and validate in batches of 20 for up to 100 epochs
history = model.fit(train_dataset.batch(20),
                    epochs=100,
                    validation_data = test_dataset.batch(20),
                    callbacks=callbacks_list
                   )

In [None]:
# Reload the weights file written by the checkpoint callback and store a stage-4_1 copy
model.load_weights('../working/best_model_all.hdf5')
model.save_weights('../working/best_model_all_st4_1.hdf5')
#model.save('../working/nn_final_st4_1.hdf5')

In [None]:
# Step 4_2 (model 5, fine-tuning): same trainable layers as step 4_1 (last ~75% of
# EfficientNetB3 plus the head), restarted with a lower LR = 0.0001
efficientnet_model = tf.keras.applications.efficientnet.EfficientNetB3(
    weights = 'imagenet', include_top = False, input_shape = (size[1], size[0], 3))

# Unfreeze the backbone, then freeze the first quarter of its layers again
efficientnet_model.trainable = True
fine_tune_at = int(len(efficientnet_model.layers)//4)
for layer in efficientnet_model.layers[:fine_tune_at]:
    layer.trainable =  False

# Image branch: pooled backbone features -> Dense(512, relu) -> Dropout(0.5)
efficientnet_output = L.GlobalAveragePooling2D()(efficientnet_model.output)
efficientnet_output = L.Dense(512, activation="relu")(efficientnet_output)
efficientnet_output = L.Dropout(0.5)(efficientnet_output)

# Concatenate the outputs of the three networks; the existing tabular_model and
# nlp_model objects are reused as they are
combinedInput = L.concatenate([efficientnet_output, tabular_model.output, nlp_model.output])

# Build the regression head
head = L.Dense(256, activation="relu")(combinedInput)
head = L.Dropout(0.5)(head)
head = L.Dense(1,)(head)

model = Model(inputs=[efficientnet_model.input, tabular_model.input, nlp_model.input], outputs=head)

LR = 0.0001
optimizer = tf.keras.optimizers.Adam(LR)
model.compile(loss='MAPE',optimizer=optimizer, metrics=['MAPE'])

callbacks_list = callbacks(lr=LR)

# Start from the stage-4_1 weights
model.load_weights('../working/best_model_all_st4_1.hdf5')

# Train and validate in batches of 20 for up to 100 epochs
history = model.fit(train_dataset.batch(20),
                    epochs=100,
                    validation_data = test_dataset.batch(20),
                    callbacks=callbacks_list
                   )

In [None]:
# Reload the weights file written by the checkpoint callback and store a stage-4_2 copy
model.load_weights('../working/best_model_all.hdf5')
model.save_weights('../working/best_model_all_st4_2.hdf5')
#model.save('../working/nn_final_st4_2.hdf5')

In [None]:
# Step 5_1 (fine-tuning): train all layers of EfficientNetB3 plus the head,
# starting from LR = 0.00005
efficientnet_model = tf.keras.applications.efficientnet.EfficientNetB3(
    weights = 'imagenet', include_top = False, input_shape = (size[1], size[0], 3))
# The whole backbone is trainable in this stage
efficientnet_model.trainable = True

# Image branch: pooled backbone features -> Dense(512, relu) -> Dropout(0.5)
efficientnet_output = L.GlobalAveragePooling2D()(efficientnet_model.output)
efficientnet_output = L.Dense(512, activation="relu")(efficientnet_output)
efficientnet_output = L.Dropout(0.5)(efficientnet_output)

# Concatenate the outputs of the three networks; the existing tabular_model and
# nlp_model objects are reused as they are
combinedInput = L.concatenate([efficientnet_output, tabular_model.output, nlp_model.output])

# Build the regression head
head = L.Dense(256, activation="relu")(combinedInput)
head = L.Dropout(0.5)(head)
head = L.Dense(1,)(head)

model = Model(inputs=[efficientnet_model.input, tabular_model.input, nlp_model.input], outputs=head)

LR = 0.00005
optimizer = tf.keras.optimizers.Adam(LR)
model.compile(loss='MAPE',optimizer=optimizer, metrics=['MAPE'])

callbacks_list = callbacks(lr=LR)

# Start from the stage-4_2 weights
model.load_weights('../working/best_model_all_st4_2.hdf5')

# Train and validate in batches of 20 for up to 100 epochs
history = model.fit(train_dataset.batch(20),
                    epochs=100,
                    validation_data = test_dataset.batch(20),
                    callbacks=callbacks_list
                   )

In [None]:
# Reload the weights file written by the checkpoint callback and store a stage-5_1 copy
model.load_weights('../working/best_model_all.hdf5')
model.save_weights('../working/best_model_all_st5_1.hdf5')
#model.save('../working/nn_final_st5_1.hdf5')

In [None]:
# Load the stage-5_1 weights before predicting
model.load_weights('../working/best_model_all_st5_1.hdf5')
# Prediction without rounding
sub_predict_nn3 = model.predict(sub_dataset.batch(20))
# The model has one output unit, so column 0 holds the predicted price
sample_submission['price'] = sub_predict_nn3[:,0]
sample_submission.to_csv('nn3_submission_st5_1.csv', index=False)

In [None]:
# Prediction with rounding (the prediction is computed again in this cell)
sub_predict_nn3 = model.predict(sub_dataset.batch(20))
# decimals=-4 rounds the prices to the nearest 10 000
sample_submission['price'] = np.round(sub_predict_nn3[:,0],-4)
sample_submission.to_csv('nn3_submission_r4_st5_1.csv', index=False)

In [None]:
## Step 6 (model 5, fine-tuning): train all layers of EfficientNetB3 & head on larger pictures
size = (448, 336)

def get_image_array(index):
    """Load and resize the listing photos for the requested rows of ``data``.

    Args:
        index: row positions handed to ``data['sell_id'].iloc[...]``.

    Returns:
        np.ndarray stacking one image per requested row, in the same order,
        each resized with ``cv2.resize(image, size)`` using the global ``size``.

    Raises:
        FileNotFoundError: if an image file cannot be read.
    """
    images = []
    for sell_id in data['sell_id'].iloc[index].values:
        image_path = DATA_DIR + 'img/img/' + str(sell_id) + '.jpg'
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None
            raise FileNotFoundError(image_path)
        images.append(cv2.resize(image, size))
    images = np.array(images)
    print('images shape', images.shape, 'dtype', images.dtype)
    return images

images_train = get_image_array(X_train.index)
images_test = get_image_array(X_test.index)
images_sub = get_image_array(X_sub.index)

# The tf.data pipelines created earlier were built from the previous image
# arrays, so they have to be rebuilt; otherwise this stage would keep feeding
# the old-size images and the new `size` would have no effect.
train_dataset = tf.data.Dataset.from_tensor_slices((
    images_train, X_train, data.description.iloc[X_train.index], y_train
    )).map(tf_process_train_dataset_element)

test_dataset = tf.data.Dataset.from_tensor_slices((
    images_test, X_test, data.description.iloc[X_test.index], y_test
    )).map(tf_process_val_dataset_element)

sub_dataset = tf.data.Dataset.from_tensor_slices((
    images_sub, X_sub, data.description.iloc[X_sub.index], y_sub
    )).map(tf_process_val_dataset_element)

In [None]:
# Simpler augmentation pipeline with fewer transforms than the earlier definition.
# Rebinding `augmentation` also changes what process_image() applies from now on,
# because that function looks the name up on every call.
augmentation = albumentations.Compose([
    albumentations.CLAHE(p=0.25, clip_limit=(1, 10), tile_grid_size=(10, 10)),
    albumentations.ChannelShuffle(p=0.25),
    albumentations.Equalize(p=0.25, mode='cv', by_channels=True),
    albumentations.HorizontalFlip(p=0.5),
    albumentations.RGBShift(p=0.5),
    # One of the two brightness/contrast variants (stronger or weaker limits)
    albumentations.OneOf([
        albumentations.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2),
        albumentations.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.1),
        ],p=0.5),
    # Small rotation only, limited to the range -5..5
    albumentations.Rotate(p=0.5, limit=(-5, 5), interpolation=2, border_mode=2)
    ])

In [None]:
# Step 6: rebuild the model for the current `size` with a fully trainable backbone
# NOTE(review): train_dataset/test_dataset must have been built from images of the
# current `size` for the new input shape to take effect — verify
efficientnet_model = tf.keras.applications.efficientnet.EfficientNetB3(weights = 'imagenet', include_top = False, input_shape = (size[1], size[0], 3))
efficientnet_model.trainable = True

# Image branch: pooled backbone features -> Dense(512, relu) -> Dropout(0.5)
efficientnet_output = L.GlobalAveragePooling2D()(efficientnet_model.output)
efficientnet_output = L.Dense(512, activation="relu")(efficientnet_output)
efficientnet_output = L.Dropout(0.5)(efficientnet_output)

# Concatenate the outputs of the three networks; the existing tabular_model and
# nlp_model objects are reused as they are
combinedInput = L.concatenate([efficientnet_output, tabular_model.output, nlp_model.output])

# Build the regression head
head = L.Dense(256, activation="relu")(combinedInput)
head = L.Dropout(0.5)(head)
head = L.Dense(1,)(head)

model = Model(inputs=[efficientnet_model.input, tabular_model.input, nlp_model.input], outputs=head)

LR = 0.000001
optimizer = tf.keras.optimizers.Adam(LR)
model.compile(loss='MAPE',optimizer=optimizer, metrics=['MAPE'])

callbacks_list = callbacks(lr=LR)

# Start from the stage-5_1 weights
model.load_weights('../working/best_model_all_st5_1.hdf5')

# Train and validate in batches of 10 for up to 100 epochs
history = model.fit(train_dataset.batch(10),
                    epochs=100,
                    validation_data = test_dataset.batch(10),
                    callbacks=callbacks_list
                   )

In [None]:
# Reload the weights file written by the checkpoint callback and store a stage-6 copy
model.load_weights('../working/best_model_all.hdf5')
model.save_weights('../working/best_model_all_st6.hdf5')
#model.save('../working/nn_final_st6.hdf5')

In [None]:
# Load the stage-6 weights before evaluation and prediction
model.load_weights('../working/best_model_all_st6.hdf5')

In [None]:
# Evaluate on the hold-out split; column 0 is the model's single output
test_predict_nn3 = model.predict(test_dataset.batch(10))
# Printed as a percentage; assumes mape() returns a fraction — TODO confirm
print(f"TEST mape: {(mape(y_test, test_predict_nn3[:,0]))*100:0.2f}%")

In [None]:
# Prediction without rounding
sub_predict_nn3 = model.predict(sub_dataset.batch(10))
# The model has one output unit, so column 0 holds the predicted price
sample_submission['price'] = sub_predict_nn3[:,0]
sample_submission.to_csv('nn3_submission_st6.csv', index=False)

In [None]:
# Prediction with rounding (the prediction is computed again in this cell)
sub_predict_nn3_rounded = model.predict(sub_dataset.batch(10))
# decimals=-4 rounds the prices to the nearest 10 000
sample_submission['price'] = np.round(sub_predict_nn3_rounded[:,0], -4)
sample_submission.to_csv('nn3_submission_r4_st6.csv', index=False)

# Blend

In [None]:
# Load the 'price' column of previously saved submissions (CatBoost and the three
# neural networks) from attached input datasets.
# Note that sub_predict_nn3 is rebound here to the values read from the CSV file.
sub_predict_catboost = np.array(pd.read_csv('../input/best-cat-boost/catboost_submission(2).csv')['price'])
sub_predict_nn1 = np.array(pd.read_csv('../input/best-sub-nn1/nn1_submission(2).csv')['price'])
sub_predict_nn2 = np.array(pd.read_csv('../input/best-sub-nn2/nn2_submission(2)(1).csv')['price'])
sub_predict_nn3 = np.array(pd.read_csv('../input/best-sub-nn3/nn3_submission_st5_1(1).csv')['price'])

In [None]:
# Blend: unweighted mean of the four models' predictions
blend_sub_predict = (sub_predict_catboost + sub_predict_nn1 + sub_predict_nn2 + sub_predict_nn3)/4
sample_submission['price'] = blend_sub_predict
sample_submission.to_csv('blend_submission.csv', index=False)

In [None]:
# Inputs for the "rounded" blend. The files hold unrounded prices, so each
# prediction is rounded to the nearest 10 000 here (the same np.round(..., -4)
# used for the *_r4 submissions); without this the "rounded" blend would be
# identical to the plain blend.
sub_predict_catboost_r4 = np.round(np.array(pd.read_csv('../input/best-cat-boost/catboost_submission(2).csv')['price']), -4)
sub_predict_nn1_r4 = np.round(np.array(pd.read_csv('../input/best-sub-nn1/nn1_submission(2).csv')['price']), -4)
sub_predict_nn2_r4 = np.round(np.array(pd.read_csv('../input/best-sub-nn2/nn2_submission(2)(1).csv')['price']), -4)
sub_predict_nn3_r4 = np.round(np.array(pd.read_csv('../input/best-sub-nn3/nn3_submission_st5_1(1).csv')['price']), -4)

In [None]:
# Blend: unweighted mean of the four *_r4 prediction arrays
blend_sub_predict_rounded = (sub_predict_catboost_r4 + sub_predict_nn1_r4 + sub_predict_nn2_r4 + sub_predict_nn3_r4)/4
sample_submission['price'] = blend_sub_predict_rounded
sample_submission.to_csv('blend_submission_sum_rounded.csv', index=False)

In [None]:
# Blend without CatBoost: unweighted mean of the three neural-network predictions
blend_sub_predict_w_out_cat = (sub_predict_nn1 + sub_predict_nn2 + sub_predict_nn3)/3
sample_submission['price'] = blend_sub_predict_w_out_cat
sample_submission.to_csv('blend_submission_w_out_cat.csv', index=False)