In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from sklearn.metrics import make_scorer
import spacy
import re
import string
from gensim.models import KeyedVectors
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.corpus import stopwords
from pymystem3 import Mystem
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
warnings.filterwarnings("ignore")

In [2]:
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.models import load_model
from keras.layers import Dropout
from keras.layers import Bidirectional
from keras.layers import GRU,Conv1D,MaxPooling1D,GlobalMaxPooling1D, Flatten,Embedding,BatchNormalization
from keras.optimizers import Adam, SGD
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [3]:
import tensorflow as tf

In [4]:
!python -m nltk.downloader all

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   U

In [5]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
df = pd.read_csv('drive/MyDrive/cleaned_data_final3.csv')
df_new2 = df[['area_id', 'prof_name', 'name',
            'mean_salary', 'description','schedule', 'experience_rus','employment','quick_responses_allowed','premium',
            'allow_messages','accept_temporary', 'first_language', 'key_skills', 'mean_salary_all', 'city']]

In [8]:
df_new2 = df_new2.drop_duplicates(subset=['description'])

In [9]:
names_to_remove = ['Технический писатель', 'Продуктовый аналитик' , 'Директор по информационным технологиям (CIO)','Дата-сайентист',
                   'Руководитель отдела аналитики', 'Методолог', 'Гейм-дизайнер', 'Арт-директор, креативный директор']
df_new2 = df_new2[~df_new2['prof_name'].isin(names_to_remove)]

In [10]:
class PreprocessData:
    def __init__ (self, data, y, stem_method=Mystem()):
        """Clean data and preprocess text"""
        self.data = data
        self.y = y
        self.mystem = stem_method
        # self.show_stats = show_stats

    def fill_nans(self):
        self.data[y] = self.data[y].fillna(0)
        for col in self.data.select_dtypes(['object']).columns:
            self.data[col] = self.data[col].fillna('')

    def drop_duplicates(self):
        self.data = self.data.drop_duplicates(subset=['description'])

    def clean_and_lemmatize(self, text):
        # clean punctuation and stop words
        text = text.lower()
        words = re.findall(r'\b\w+\b', text)
        stop_words = set(stopwords.words('russian'))
        words = [w for w in words if w not in stop_words]
        # clean_text = ' '.join(words)

        # lemmatize
        # words = text.split()
        lemmas = [self.mystem.lemmatize(word)[0] for word in words]
        lemmatized_text = ' '.join(lemmas)
        return lemmatized_text

    def get_cleaned_data(self):
        self.drop_duplicates()
        self.fill_nans()
        self.data['descr_clean'] = self.data['description'].progress_apply(self.clean_and_lemmatize)
        return self.data.reset_index(drop=True)

    def get_stats(self, cat_features):
        print(self.data.info(), '\n')
        print("Numerical features' analysis")
        plt.figure()
        self.data[self.y].hist(bins=30)
        plt.title(f'{self.y} distribution')
        plt.show;
        display(self.data.describe().T)

        print('\n', "Categorical features' analysis")
        display(self.data.describe(include=object).T)
        for col in cat_features:
            plt.figure()
            self.data[col].value_counts().head(10).plot(kind='barh', color='pink')
            plt.title(f'{col} frequency')
            plt.show();

Installing mystem to /root/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz


In [17]:
y = 'mean_salary_all'
cat_features = ['prof_name', 'city', 'schedule', 'employment', 'experience_rus']
text_features = ['key_skills', 'descr_clean', 'name']

In [18]:
data_processed = PreprocessData(df_new2, y)
df_new2 = data_processed.get_cleaned_data()

  0%|          | 0/14335 [00:00<?, ?it/s]

We have several text columns: vacancy name, key skills and description. Let's try to use Bi-GRU-CNN on this data.

In [11]:
df_new2['all_text'] = df_new2['name'].str.lower()  + '. '+ df_new2['key_skills'].fillna('').str.lower() +  '. ' + df_new2['description'].str.lower()

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df_new2['all_text'].values, df_new2.mean_salary_all.values.astype(int), test_size=0.2, random_state=10)

In [13]:
tokenizer = Tokenizer()
num_rows = df_new2['all_text'].shape[0]
tokenizer.fit_on_texts(df_new2['all_text'].values)
row_max_length = max([len(x.split()) for x in df_new2['all_text'].values])
vocabulary_size = len(tokenizer.word_index) + 1
X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_test_tokens = tokenizer.texts_to_sequences(X_test)

In [14]:
X_train_pad = pad_sequences(X_train_tokens, maxlen=row_max_length, padding='post')
X_test_pad = pad_sequences(X_test_tokens, maxlen=row_max_length, padding='post')

In [None]:
EMBEDDING_DIM = 256

learning_rate = 0.01
optimizer = Adam(learning_rate=learning_rate)

model=Sequential()
model.add(Embedding(vocabulary_size, EMBEDDING_DIM, input_length=row_max_length))
model.add(Bidirectional(GRU(units=128, return_sequences=True)))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='linear'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(units = 1, activation='linear'))
model.compile(optimizer=optimizer, loss='mean_absolute_error', metrics=['mae'])


epochs=15
batch_size=128

In [None]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1145, 256)         21398272  
                                                                 
 bidirectional (Bidirection  (None, 1145, 256)         296448    
 al)                                                             
                                                                 
 conv1d (Conv1D)             (None, 1145, 64)          49216     
                                                                 
 max_pooling1d (MaxPooling1  (None, 572, 64)           0         
 D)                                                              
                                                                 
 flatten (Flatten)           (None, 36608)             0         
                                                                 
 dropout (Dropout)           (None, 36608)            

In [None]:
with tf.device('/gpu:0'):
  model.fit(X_train_pad,y_train,epochs=epochs,batch_size=batch_size)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
scores = model.evaluate(X_test_pad, y_test)
print('Test MAE:', scores[0])
y_pred = model.predict(X_test_pad).astype(int)
r2 = r2_score(y_test, y_pred)
print('R2 score:', round(r2 * 100, 2))

Test MAE: 30529.525390625
R2 score: 52.76


Let's see model results without description. Only with name, key skills.

In [None]:
df_new2['all_text_wo_descr'] = df_new2['name'].str.lower()  + '. '+ df_new2['key_skills'].fillna('').str.lower()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_new2['all_text_wo_descr'].values, df_new2.mean_salary_all.values.astype(int), test_size=0.2, random_state=10)

In [None]:
tokenizer = Tokenizer()
num_rows = df_new2['all_text_wo_descr'].shape[0]
tokenizer.fit_on_texts(df_new2['all_text_wo_descr'].values)
row_max_length = max([len(x.split()) for x in df_new2['all_text_wo_descr'].values])
vocabulary_size = len(tokenizer.word_index) + 1
X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_test_tokens = tokenizer.texts_to_sequences(X_test)

In [None]:
X_train_pad1 = pad_sequences(X_train_tokens, maxlen=row_max_length, padding='post')
X_test_pad1 = pad_sequences(X_test_tokens, maxlen=row_max_length, padding='post')

In [None]:
EMBEDDING_DIM = 256

learning_rate = 0.01
optimizer = Adam(learning_rate=learning_rate)

model=Sequential()
model.add(Embedding(vocabulary_size, EMBEDDING_DIM, input_length=row_max_length))
model.add(Bidirectional(GRU(units=128, return_sequences=True)))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='linear'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(units = 1, activation='linear'))
model.compile(optimizer=optimizer, loss='mean_absolute_error', metrics=['mae'])


epochs=15
batch_size=128

In [None]:
with tf.device('/gpu:0'):
  model.fit(X_train_pad1, y_train, epochs=epochs, batch_size=batch_size)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
scores = model.evaluate(X_test_pad1, y_test)
print('Test MAE:', scores[0])
y_pred = model.predict(X_test_pad1).astype(int)
r2 = r2_score(y_test, y_pred)
print('R2 score:', round(r2 * 100, 2))

Test MAE: 34189.57421875
R2 score: 41.74


Let's see other variables like text except the description

In [None]:
df_new2['total'] = df_new2['city'] + '. ' + df_new2['schedule'].str.lower() +'. '+ df_new2['name'] + '. ' + df_new2['employment'].str.lower() +'. ' +  df_new2['prof_name'].str.lower()  + '. ' + df_new2['experience_rus'].str.lower() +'. '+ df_new2['key_skills'].fillna('')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_new2['total'].values, df_new2.mean_salary_all.values.astype(int), test_size=0.2, random_state=10)

In [None]:
tokenizer = Tokenizer()
num_rows = df_new2['total'].shape[0]
tokenizer.fit_on_texts(df_new2['total'].values)
row_max_length = max([len(x.split()) for x in df_new2['total'].values])
vocabulary_size = len(tokenizer.word_index) + 1
X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_test_tokens = tokenizer.texts_to_sequences(X_test)

In [None]:
X_train_pad2 = pad_sequences(X_train_tokens, maxlen=row_max_length, padding='post')
X_test_pad2 = pad_sequences(X_test_tokens, maxlen=row_max_length, padding='post')

In [None]:
EMBEDDING_DIM = 256

learning_rate = 0.01
optimizer = Adam(learning_rate=learning_rate)

model=Sequential()
model.add(Embedding(vocabulary_size, EMBEDDING_DIM, input_length=row_max_length))
model.add(Bidirectional(GRU(units=128, return_sequences=True)))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='linear'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(units = 1, activation='linear'))
model.compile(optimizer=optimizer, loss='mean_absolute_error', metrics=['mae'])


epochs=15
batch_size=128

In [None]:
EMBEDDING_DIM = 256

learning_rate = 0.01
optimizer = Adam(learning_rate=learning_rate)

model=Sequential()
model.add(Embedding(vocabulary_size, EMBEDDING_DIM, input_length=row_max_length))
model.add(Bidirectional(GRU(units=128, return_sequences=True)))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='linear'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(units = 1, activation='linear'))
model.compile(optimizer=optimizer, loss='mean_absolute_error', metrics=['mae'])


epochs=15
batch_size=128

In [None]:
with tf.device('/gpu:0'):
  model.fit(X_train_pad2, y_train, epochs=epochs, batch_size=batch_size)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
scores = model.evaluate(X_test_pad2, y_test)
print('Test MAE:', scores[0])
y_pred = model.predict(X_test_pad2).astype(int)
r2 = r2_score(y_test, y_pred)
print('R2 score:', round(r2 * 100, 2))

Test MAE: 26546.19921875
R2 score: 63.74


Let's see add description

In [19]:
df_new2['total'] = df_new2['city'] + '. ' + df_new2['schedule'].str.lower() +'. '+ df_new2['name'] + '. ' + df_new2['employment'].str.lower() +'. ' +  df_new2['prof_name'].str.lower()  + '. ' + df_new2['experience_rus'].str.lower() +'. '+ df_new2['key_skills'].fillna('') + '. '+ df_new2['descr_clean'].str.lower()

In [20]:
tokenizer = Tokenizer()
num_rows = df_new2['total'].shape[0]
tokenizer.fit_on_texts(df_new2['total'].values)
row_max_length = max([len(x.split()) for x in df_new2['total'].values])
vocabulary_size = len(tokenizer.word_index) + 1
X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_test_tokens = tokenizer.texts_to_sequences(X_test)

In [21]:
X_train_pad3 = pad_sequences(X_train_tokens, maxlen=row_max_length, padding='post')
X_test_pad3 = pad_sequences(X_test_tokens, maxlen=row_max_length, padding='post')

In [22]:
EMBEDDING_DIM = 256

learning_rate = 0.01
optimizer = Adam(learning_rate=learning_rate)

model=Sequential()
model.add(Embedding(vocabulary_size, EMBEDDING_DIM, input_length=row_max_length))
model.add(Bidirectional(GRU(units=128, return_sequences=True)))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(units = 1, activation='linear'))
model.compile(optimizer=optimizer, loss='mean_absolute_error', metrics=['mae'])


epochs=15
batch_size=128

In [23]:
with tf.device('/gpu:0'):
  model.fit(X_train_pad3, y_train, epochs=epochs, batch_size=batch_size)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [25]:
scores = model.evaluate(X_test_pad3, y_test)
print('Test MAE:', scores[0])
y_pred = model.predict(X_test_pad3).astype(int)
r2 = r2_score(y_test, y_pred)
print('R2 score:', round(r2, 2))

Test MAE: 28740.3828125
R2 score: 0.57


In [26]:
def clean_review_text(text):

    text = text.lower()
    words = re.findall(r'\b\w+\b', text)
    stop_words = set(stopwords.words('russian'))
    words = [word for word in words if word not in stop_words]
    clean_text = ' '.join(words)

    return clean_text


mystem = Mystem()

def lemmatize_text(text):
    words = text.split()
    lemmas = [mystem.lemmatize(word)[0] for word in words]
    lemmatized_text = ' '.join(lemmas)

    return lemmatized_text
df_new2['text_clean'] = df_new2['description'].apply(clean_review_text)

df_new2['lemmatized_text'] = df_new2['text_clean'].apply(lemmatize_text)

In [27]:
df_new2['total'] = df_new2['city'] + '. ' + df_new2['schedule'].str.lower() +'. '+ df_new2['name'] + '. ' + df_new2['employment'].str.lower() +'. ' +  df_new2['prof_name'].str.lower()  + '. ' + df_new2['experience_rus'].str.lower() +'. '+ df_new2['key_skills'].fillna('') + '. '+ df_new2['lemmatized_text'].str.lower()

In [28]:
X_train, X_test, y_train, y_test = train_test_split(df_new2['total'].values, df_new2.mean_salary_all.values.astype(int), test_size=0.2, random_state=10)

In [29]:
tokenizer = Tokenizer()
num_rows = df_new2['total'].shape[0]
tokenizer.fit_on_texts(df_new2['total'].values)
row_max_length = max([len(x.split()) for x in df_new2['total'].values])
vocabulary_size = len(tokenizer.word_index) + 1
X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_test_tokens = tokenizer.texts_to_sequences(X_test)

In [30]:
X_train_pad5 = pad_sequences(X_train_tokens, maxlen=row_max_length, padding='post')
X_test_pad5 = pad_sequences(X_test_tokens, maxlen=row_max_length, padding='post')

In [31]:
EMBEDDING_DIM = 256

learning_rate = 0.01
optimizer = Adam(learning_rate=learning_rate)

model=Sequential()
model.add(Embedding(vocabulary_size, EMBEDDING_DIM, input_length=row_max_length))
model.add(Bidirectional(GRU(units=128, return_sequences=True)))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(units = 1, activation='linear'))
model.compile(optimizer=optimizer, loss='mean_absolute_error', metrics=['mae'])


epochs=15
batch_size=64

In [32]:
with tf.device('/gpu:0'):
  model.fit(X_train_pad5, y_train, epochs=epochs, batch_size=batch_size)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [33]:
scores = model.evaluate(X_test_pad5, y_test)
print('Test MAE:', scores[0])
y_pred = model.predict(X_test_pad5).astype(int)
r2 = r2_score(y_test, y_pred)
print('R2 score:', round(r2 * 100, 2))

Test MAE: 25705.912109375
R2 score: 64.25


In [None]:
EMBEDDING_DIM = 256

learning_rate = 0.0101
optimizer = Adam(learning_rate=learning_rate)

model=Sequential()
model.add(Embedding(vocabulary_size, EMBEDDING_DIM, input_length=row_max_length))
model.add(Bidirectional(GRU(units=128, return_sequences=True)))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(units = 1, activation='linear'))
model.compile(optimizer=optimizer, loss='mean_absolute_error', metrics=['mae'])


epochs=20
batch_size=64

In [None]:
with tf.device('/gpu:0'):
  model.fit(X_train_pad5, y_train, epochs=epochs, batch_size=batch_size)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
scores = model.evaluate(X_test_pad5, y_test)
print('Test MAE:', scores[0])
y_pred = model.predict(X_test_pad5).astype(int)
r2 = r2_score(y_test, y_pred)
print('R2 score:', round(r2, 3))

Test MAE: 25967.314453125
R2 score: 0.656


In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_new2['lemmatized_text'].values, df_new2.mean_salary_all.values.astype(int), test_size=0.2, random_state=10)

In [None]:
tokenizer = Tokenizer()
num_rows = df_new2['lemmatized_text'].shape[0]
tokenizer.fit_on_texts(df_new2['lemmatized_text'].values)
row_max_length = max([len(x.split()) for x in df_new2['lemmatized_text'].values])
vocabulary_size = len(tokenizer.word_index) + 1
X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_test_tokens = tokenizer.texts_to_sequences(X_test)

In [None]:
X_train_pad4 = pad_sequences(X_train_tokens, maxlen=row_max_length, padding='post')
X_test_pad4 = pad_sequences(X_test_tokens, maxlen=row_max_length, padding='post')

In [None]:
EMBEDDING_DIM = 256

learning_rate = 0.01
optimizer = Adam(learning_rate=learning_rate)

model=Sequential()
model.add(Embedding(vocabulary_size, EMBEDDING_DIM, input_length=row_max_length))
model.add(Bidirectional(GRU(units=128, return_sequences=True)))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(units = 1, activation='linear'))
model.compile(optimizer=optimizer, loss='mean_absolute_error', metrics=['mae'])


epochs=15
batch_size=64

In [None]:
with tf.device('/gpu:0'):
  model.fit(X_train_pad4, y_train, epochs=epochs, batch_size=batch_size)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
scores = model.evaluate(X_test_pad4, y_test)
print('Test MAE:', scores[0])
y_pred = model.predict(X_test_pad4).astype(int)
r2 = r2_score(y_test, y_pred)
print('R2 score:', round(r2 * 100, 2))

Test MAE: 25851.345703125
R2 score: 65.7


In [None]:
EMBEDDING_DIM = 256

learning_rate = 0.01
optimizer = Adam(learning_rate=learning_rate)

model1=Sequential()
model1.add(Embedding(vocabulary_size, EMBEDDING_DIM, input_length=row_max_length))
model1.add(Bidirectional(GRU(units=128, return_sequences=True)))
model1.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model1.add(MaxPooling1D(pool_size=2))
model1.add(Flatten())
model1.add(Dropout(0.2))
model1.add(Dense(units = 1, activation='relu'))
model1.compile(optimizer=optimizer, loss='mean_absolute_error', metrics=['mae'])


epochs=12
batch_size=128

In [None]:
with tf.device('/gpu:0'):
  model1.fit(X_train_pad4, y_train, epochs=epochs, batch_size=batch_size)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [None]:
scores = model1.evaluate(X_test_pad4, y_test)
print('Test MAE:', scores[0])
y_pred = model1.predict(X_test_pad4).astype(int)
r2 = r2_score(y_test, y_pred)
print('R2 score:', round(r2 * 100, 2))

Test MAE: 27076.8515625
R2 score: 63.79


In [None]:
EMBEDDING_DIM = 256

learning_rate = 0.01
optimizer = Adam(learning_rate=learning_rate)

model=Sequential()
model.add(Embedding(vocabulary_size, EMBEDDING_DIM, input_length=row_max_length))
model.add(Bidirectional(GRU(units=128, return_sequences=True)))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(units = 1, activation='linear'))
model.compile(optimizer=optimizer, loss='mean_absolute_error', metrics=['mae'])


epochs=15
batch_size=64

In [None]:
with tf.device('/gpu:0'):
  model.fit(X_train_pad3, y_train, epochs=epochs, batch_size=batch_size)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
scores = model.evaluate(X_test_pad3, y_test)
print('Test MAE:', scores[0])
y_pred = model.predict(X_test_pad3).astype(int)
r2 = r2_score(y_test, y_pred)
print('R2 score:', round(r2, 2))

Test MAE: 27104.01953125
R2 score: 0.62
