In [75]:
import numpy as np 

import pandas as pd 
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 1000)

import matplotlib.pyplot as plt

from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model

import tensorflow as tf
from tensorflow.keras import backend as K

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from transformers import *
from tokenizers import BertWordPieceTokenizer

import re
import random
import time
from tqdm import tqdm

In [34]:
# Load the competition train/test splits from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/rucode-fake-job-postings/train_data.csv')
test = pd.read_csv('/kaggle/input/rucode-fake-job-postings/test_data.csv')
# Stack train on top of test so the text preprocessing below is applied to both.
# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in 2.0);
# like append, it keeps each frame's original index labels.
df = pd.concat([train, test])

In [35]:
def seed_everything(seed):
    """Seed Python's ``random``, NumPy's global RNG and ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to every random source handled here.

    Note:
        ``PYTHONHASHSEED`` is read by the interpreter at start-up, so setting
        it here only affects processes launched afterwards.
        TensorFlow's own random generator is not seeded by this function.
    """
    # Imported locally because the notebook's import cell does not import
    # ``os``; without this the PYTHONHASHSEED line raises NameError on a
    # fresh kernel.
    import os

    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)


SEED = 2020
seed_everything(SEED)

In [36]:
# Percentage of training rows per label value.
# The "Positive" line reports label 0 and the "Negative" line reports label 1.
label_counts = train['Фейк'].value_counts()
n_train = len(train)
print("Positive: ", label_counts[0] / n_train * 100, "%")
print("Negative: ", label_counts[1] / n_train * 100, "%")

Positive:  95.12490678598061 %
Negative:  4.8750932140193886 %


Чтение данных и конкатенация информативных столбцов

In [37]:
cat_to_union = ['Название', 'Описание компании', 'Описание вакансии', 'Требования', 'Соцпакет']
# Replace missing text with '' in all of these columns at once so they can be
# concatenated as plain strings afterwards.
df[cat_to_union] = df[cat_to_union].fillna('')

In [38]:
# Build one free-text field per posting by joining the descriptive columns with
# single spaces. Column-wise string addition gives the same strings as the
# previous row-by-row df.apply(..., axis=1) without a Python call per row.
# NOTE(review): 'Название' is NaN-filled in cat_to_union but is not part of this
# concatenation — confirm whether leaving the title out is intended.
df['Инфо'] = (
    df['Описание компании'] + ' '
    + df['Описание вакансии'] + ' '
    + df['Требования'] + ' '
    + df['Соцпакет']
)

In [39]:
# `df` is `train` stacked on top of `test`, so the first len(train) rows are the
# training rows and the remainder are the test rows. Deriving the boundary from
# the data avoids the hard-coded row count (10728) silently going stale.
n_train_rows = len(train)
X_train = df[:n_train_rows]
X_test = df[n_train_rows:]

In [40]:
# Keep only the combined text field and the target column.
model_columns = ['Инфо', 'Фейк']
X_train = X_train[model_columns]
X_test = X_test[model_columns]

In [41]:
# Give both splits the generic column names used by the rest of the notebook.
for _split in (X_train, X_test):
    _split.columns = ['text', 'labels']

In [42]:
# Cast the training labels to integers. astype returns a new frame, so nothing is
# written into a frame that was derived by slicing `df` (the in-place column
# assignment used before triggers pandas' SettingWithCopyWarning).
X_train = X_train.astype({'labels': 'int'})

In [44]:
# Preview the prepared training frame: a 'text' column and an integer 'labels' column.
X_train

Unnamed: 0,text,labels
0,"PEI Media (#URL_4cffe430b8da4c0d7e6074d11e638d28a1e165c8782b891e111fc17606c3d961#) provides a range of highly respected publications and market leading events to its financial sector audience. The leading information provider in private capital markets, PEI Media also has a strong international presence, with offices in London, New York and Hong Kong. Seeking savvy business reporter Are you a news-hungry journalist? Have a passion for finance, a talent for breaking news, networking and developing sophisticated sources?If so, there’s an exciting full-time reporter position available with PEI Media, the leading global B2B financial publishing and information group focused exclusively on alternative asset classes.Based in Manhattan, this opening is a rare opportunity for an ambitious individual seeking to advance their career in one of the most dynamic segments of global finance.The position is tied to Agri Investor (#URL_a2fcb2906ec9e228792ec6fd689398146109ba55c512a7a9c45b9224c88c1da...",0
1,"Roka Security is a boutique security firm that specializes in full-scale network protection and defending against advanced, targeted attacks. Our staff members have background in governmental and intelligence fields as well as large-scale data center and network deployments.We leverage our in-depth experience to aid our customers in protecting their data, and their intellectual property, andtheir customer's data, We help our clients with the full breadth of services whether it's consulting on general security issues or performing security assessments or assisting them with their latest network design or datacenter build-out. We also provide the full breadth of managed services including managed security perimeters, Managed Infrastructure / private cloud, or 24x7 security monitoring in our state-of-the-art Security Operations Center. Roka Security is looking for Software Engineers with a variety of talents. The position is for a full time employee at our office in Herndon, Virginia...",0
2,"Job tasks:Analyse project specifications on structural design related project items and define the scope of work;Prepare design calculations for all structural design work within the scope, budget and schedule defined at the start of the project. Main emphasis should be to minimise fabrication cost and vessel time;Prepare procedures for using and testing the installation aids and discuss these procedures with the offshore personnel responsible for the execution and the project team;Perform detailed calculations using the appropriate in-house tools;Review subcontractor equipment design and installation methods;Co-ordinate fabrication work of installation aids fabricated in-house or by a subcontractor when required;Assist the Field Engineer onshore or offshore when required during the execution phase of the project.Responsible for the hand-over to the Field Engineer;Check reports of structural design work performed within the Engineering Department. Job requirements:HTS/TU Civil Eng...",0
3,"The Senior Publishing Specialist will be part of the Regulatory Operations Publishing Group and to ensure that dossier is prepared ready to submit to the FDA and other agencies. This includes, Publishing(Bookmarking and Hyperlinking) and Peer Reviewing different types of document like (eg Clinical Study Report, Annual Report, Protocol, Amendments, PSUR, DMF, IMPD, CRFs and eCRF Pre-Meeting packages, Labeling) Knowledge in building different types of submission. Peer Reviewing of other group members’ documents. Experience in publishing the documents globally. Knowledge in preparing Paper submission, scanning documents and Printing Desk copies (Internal or Health Authority). Experience in formatting documents as per Eisai Standards. Minimum of 3 – 4 years experience in Regulatory Affairs publishing and reviewing the documents in a pharmaceutical environment, with strong familiarity with regulatory eCTD submission process. Must be highly proficient in Microsoft office, Adobe Acrobat...",0
4,"BCCC Enterprises provides management consulting services for Commissionaires BC, Kinetic Security and Lexxon Training. Position Overview As the Sales Representative your main focus will be on all aspects of the sales cycle. You are responsible for the development and retention of new business leading to revenue growth within the Fraser Valley territory. The primary focus of this position is lead generation, provide quotes on services required, close contracts and provide account management services. This position is required to be home-based, ideally in the Fraser Valley and reports to the Director of Business Development Key Responsibilities:Contribute to the financial success of the organization by coordinating and monitoring leads through the sales cycle to close.Qualify leads and generate proposals to close the sale on various services; primarily Security, Enforcement, Training, and Identification Services.Introduce new services and expand business opportunities within existi...",0
...,...,...
10723,"We help teachers get safe &amp; secure jobs abroad :) Play with kids, get paid for it Love travel? Jobs in Asia$1,500+ USD monthly ($200 Cost of living)Housing provided (Private/Furnished)Airfare ReimbursedExcellent for student loans/credit cardsGabriel Adkins : #URL_ed9094c60184b8a4975333957f05be37e69d3cdb68decc9dd9a4242733cfd7f7##URL_75db76d58f7994c7db24e8998c2fc953ab9a20ea9ac948b217693963f78d2e6b#12 month contract : Apply today University degree required. TEFL / TESOL / CELTA or teaching experience preferred but not necessaryCanada/US passport holders only See job description",0
10724,"Addy's mission is to make every delivery efficient and delightful.Trak, our first B2B product, makes it easy for businesses to manage and analyze their delivery operations. Trak includes a clean web dashboard, rich APIs, out-of-the-box driver apps for iOS and Android, and customer notifications, accurate ETAs and a map that lets recipients track their orders in real-time.Our customers include food, beverage, and grocery delivery businesses as well as local couriers.Addy is based in San Francisco, California and was accelerated by Stanford's StartX. Here at Addy we are building the easiest way for businesses to manage their delivery operations. We have an incredible team, are well funded by awesome investors, and graduated from Stanford's StartX accelerator program.Our products are meticulously engineered, and we need Account Managers to help our customers make the most of them. Our customers include grocery and beverage delivery services, restaurants, home services businesses, and ...",0
10725,"With over 1,300 investment professionals located throughout the United States and Canada, Marcus &amp; Millichap (NYSE: MMI) is a leading specialist in commercial real estate investment sales, financing, research and advisory services. Founded in 1971, the firm closed over 6,600 transactions in 2013 with a value of approximately $24 billion.Marcus &amp; Millichap has been a pioneer in the real estate investment industry and has established a reputation for maximizing value for its clients while fostering long-term relationships built on integrity, trust and service. The company has perfected a powerful system for marketing properties that combines investment specialization, local market expertise, the industry's most comprehensive research, state-of-the-art technology, and relationships with the largest pool of qualified investors. Marcus &amp; Millichap also offers clients access to the most competitive real estate financing through Marcus &amp; Millichap Capital Corporation (MM...",0
10726,"For more than two decades, the TRAK Companies have been providing temporary and direct-hire professional support staff to some of the most high-powered, influential organizations in the Northern Virginia and metropolitan Washington, DC area. The TRAK Companies consist of TRAK Legal, TRAK Services, and TRAK's Records and Library Division. TRAK Legal places legal and project support staff in law firms and corporate legal departments. TRAK Services supplies executive, administrative, and conference support to corporations, associations, and other organizations. And TRAK's Records and Library Division offers experienced personnel in the information services arena. TRAK recruiters select candidates from an extensive network of highly qualified professionals, many of whom we represent on an exclusive basis. Our in-depth understanding of support roles in the business environment allows us to find the right staffing and recruiting solutions for you. TRAK Services is currently recruiting fo...",0


Токенизация

In [45]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """Tokenize ``texts`` in chunks and return the token ids as one array.

    Args:
        texts: Sequence supporting ``len``, slicing and ``.tolist()`` on the
            slice (e.g. a pandas Series or a NumPy array of strings).
        tokenizer: Object exposing ``enable_truncation``, ``enable_padding``
            and ``encode_batch``; each encoding must have an ``ids`` attribute.
        chunk_size: Number of texts passed to ``encode_batch`` per call.
        maxlen: Length passed to both truncation and padding.

    Returns:
        ``np.array`` built from the list of per-text id lists, in input order.
    """
    # Truncate and pad to the same length on the tokenizer itself.
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)

    token_rows = []
    chunk_starts = range(0, len(texts), chunk_size)
    for start in tqdm(chunk_starts):
        batch = texts[start:start + chunk_size].tolist()
        for encoding in tokenizer.encode_batch(batch):
            token_rows.append(encoding.ids)

    return np.array(token_rows)

Using the TPU

In [46]:
# Let tf.data choose the prefetch buffer size (passed to .prefetch() below).
AUTO = tf.data.experimental.AUTOTUNE

# TPU bring-up: resolve the cluster, connect to it, initialise the TPU system,
# then build the distribution strategy that later cells use via strategy.scope().
# These four calls are order-dependent.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
strategy = tf.distribute.experimental.TPUStrategy(tpu)

#https://huggingface.co/distilbert-base-uncased
# NOTE(review): the link and the vocab path refer to distilbert-base-uncased, but
# the tokenizer is created with lowercase=False and the model loaded later is
# 'bert-base-cased'. The recorded repr reports vocabulary_size=28996, which
# looks like a cased BERT vocabulary — confirm the file really matches the model.
fast_tokenizer = BertWordPieceTokenizer('distilbert_base_uncased/vocab.txt', lowercase=False)
# Last expression of the cell: display the tokenizer configuration.
fast_tokenizer

Tokenizer(vocabulary_size=28996, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=False, wordpieces_prefix=##)

In [47]:
MAX_SEQ_LENGTH = 512

# Stratified 80/20 train/validation split. random_state pins the split to SEED so
# it no longer depends on how much of NumPy's global random stream earlier cells
# (or re-runs of this cell) have consumed.
train_df, valid_df = train_test_split(
    X_train,
    test_size=0.2,
    stratify=X_train.labels,
    shuffle=True,
    random_state=SEED,
)

# Encode every split to fixed-length token-id arrays of width MAX_SEQ_LENGTH.
x_train = fast_encode(train_df.text.astype(str), fast_tokenizer, maxlen=MAX_SEQ_LENGTH)
x_valid = fast_encode(valid_df.text.astype(str), fast_tokenizer, maxlen=MAX_SEQ_LENGTH)
x_test = fast_encode(X_test.text.astype(str), fast_tokenizer, maxlen=MAX_SEQ_LENGTH)

y_train = train_df.labels.values
y_valid = valid_df.labels.values

100%|██████████| 34/34 [00:05<00:00,  6.21it/s]
100%|██████████| 9/9 [00:01<00:00,  6.77it/s]
100%|██████████| 28/28 [00:04<00:00,  6.22it/s]


In [48]:
# Batch size shared by all three pipelines.
BATCH_SIZE = 64

# Training pipeline. shuffle() is applied BEFORE repeat() so that every example
# of one pass is emitted before the next pass starts; with repeat() first, the
# shuffle buffer mixes examples from different passes. The dataset is infinite,
# so model.fit must be given steps_per_epoch.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .shuffle(2048)
    .repeat()
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Validation pipeline: fixed order, cached after the first pass.
valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_valid, y_valid))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

# NOTE(review): this is a one-element list wrapping the dataset, kept exactly as
# before. Nothing visible in this notebook consumes `test_dataset` (predictions
# are made from x_test directly) — confirm whether the list wrapper is intended.
test_dataset = [(
    tf.data.Dataset
    .from_tensor_slices(x_test)
    .batch(BATCH_SIZE)
)]

In [None]:
def focal_loss(gamma=2., alpha=.2):
    """Build a binary focal-loss function usable as a Keras ``loss``.

    Args:
        gamma: Exponent of the modulating factor; larger values down-weight
            well-classified examples more strongly.
        alpha: Weight of the label-1 term; the label-0 term is weighted by
            ``1 - alpha``.

    Returns:
        A function ``focal_loss_fixed(y_true, y_pred)`` returning a scalar loss.
    """
    def focal_loss_fixed(y_true, y_pred):
        # Keep probabilities strictly inside (0, 1): a saturated sigmoid output
        # of exactly 0 or 1 would otherwise hit log(0) and yield inf/NaN.
        eps = K.epsilon()
        y_pred = K.clip(y_pred, eps, 1. - eps)

        # For label-1 positions keep the prediction, elsewhere use 1 so that
        # log(1) = 0 removes the term; symmetrically 0 for label-0 positions.
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))

        positive_term = K.mean(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1))
        negative_term = K.mean((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))
        return -positive_term - negative_term
    return focal_loss_fixed

In [49]:
def build_lrfn(lr_start=0.000001, lr_max=0.000004,
               lr_min=0.0000001, lr_rampup_epochs=7,
               lr_sustain_epochs=0, lr_exp_decay=.87):
    """Create an epoch -> learning-rate function: linear ramp, hold, exp decay.

    Args:
        lr_start: Learning rate at epoch 0.
        lr_max: Per-replica peak rate; multiplied by
            ``strategy.num_replicas_in_sync`` to get the actual peak.
        lr_min: Floor that the decay phase approaches.
        lr_rampup_epochs: Number of epochs of linear ramp-up.
        lr_sustain_epochs: Number of epochs held at the peak after the ramp.
        lr_exp_decay: Per-epoch multiplicative decay factor after the hold.

    Returns:
        ``lrfn(epoch)`` returning the learning rate for that epoch.
    """
    # Scale the peak by the number of replicas of the global `strategy`.
    peak_lr = lr_max * strategy.num_replicas_in_sync
    decay_begins_at = lr_rampup_epochs + lr_sustain_epochs

    def lrfn(epoch):
        # Phase 1: linear interpolation from lr_start towards the peak.
        if epoch < lr_rampup_epochs:
            return (peak_lr - lr_start) / lr_rampup_epochs * epoch + lr_start
        # Phase 2: hold at the peak.
        if epoch < decay_begins_at:
            return peak_lr
        # Phase 3: exponential decay from the peak towards lr_min.
        epochs_into_decay = epoch - lr_rampup_epochs - lr_sustain_epochs
        return (peak_lr - lr_min) * lr_exp_decay**epochs_into_decay + lr_min

    return lrfn

In [50]:
import numpy as np
# Take Callback from tensorflow.keras so the custom callback shares the same
# Keras implementation as the tf.keras model it is passed to; mixing the
# standalone `keras` package with tf.keras models is fragile across versions.
from tensorflow.keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

In [66]:
def build_model(transformer, loss='binary_crossentropy', max_len=512):
    """Wrap a transformer encoder with a small binary-classification head.

    Args:
        transformer: Callable layer/model applied to the token-id tensor; the
            first element of its output is indexed as
            ``[batch, position, hidden]``.
        loss: Keras loss identifier or callable used when compiling.
        max_len: Fixed input sequence length (number of token ids per example).

    Returns:
        A compiled Keras ``Model`` mapping ``(max_len,)`` int32 token ids to a
        single sigmoid probability, tracking recall and precision.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Use the representation at position 0 as the pooled sequence embedding.
    cls_token = sequence_output[:, 0, :]

    x = tf.keras.layers.Dropout(0.35)(cls_token)
    x = Dense(256, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.1)(x)  # 0.2 is noted as an alternative in the original

    out = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias
    # that newer Keras releases reject.
    model.compile(
        Adam(learning_rate=3e-5),
        loss=loss,
        metrics=[tf.keras.metrics.Recall(), tf.keras.metrics.Precision()],  # tf.keras.metrics.AUC()
    )

    return model

In [52]:
class Metrics(Callback):
    """Keras callback that scores the model on held-out data after every epoch.

    After each epoch it predicts on the validation inputs, thresholds the
    outputs at 0.5 and records macro F1 plus (binary) recall and precision
    in ``val_f1s``, ``val_recalls`` and ``val_precisions``.

    Args:
        val_data: ``(inputs, targets)`` pair; ``inputs`` is passed to
            ``model.predict`` and ``targets`` to the sklearn metric functions.
    """

    def __init__(self, val_data):
        super().__init__()
        self.validation_data = val_data

    def on_train_begin(self, logs=None):
        # Start with empty histories on every fit() call.
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs=None):
        val_inputs, val_targ = self.validation_data[0], self.validation_data[1]
        # Flatten the (n, 1) model output to 1-D and binarise with the same
        # `>= 0.5` rule the notebook uses for its final predictions (np.round
        # would send an exact 0.5 to 0 instead).
        val_prob = np.asarray(self.model.predict(val_inputs)).ravel()
        val_predict = (val_prob >= 0.5).astype(int)

        _val_f1 = f1_score(val_targ, val_predict, average='macro')
        _val_recall = recall_score(val_targ, val_predict)
        _val_precision = precision_score(val_targ, val_predict)

        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print("  val_f1: %f" %(_val_f1))


metrics_callback = Metrics((x_valid, y_valid))

In [67]:
# Create the encoder and the classifier inside the TPU strategy scope so their
# variables are placed under that strategy.
with strategy.scope():
    transformer_layer = TFBertModel.from_pretrained('bert-base-cased')
    # max_len is tied to MAX_SEQ_LENGTH, the width the inputs were encoded to,
    # instead of repeating the literal 512, so the two cannot drift apart.
    # (The original also notes gamma=1.5 as an alternative setting.)
    model = build_model(transformer_layer, loss=focal_loss(gamma=1.4), max_len=MAX_SEQ_LENGTH)
model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 512)]             0         
_________________________________________________________________
tf_bert_model_3 (TFBertModel ((None, 512, 768), (None, 108310272 
_________________________________________________________________
tf_op_layer_strided_slice_3  [(None, 768)]             0         
_________________________________________________________________
dropout_154 (Dropout)        (None, 768)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 256)               196864    
_________________________________________________________________
dropout_155 (Dropout)        (None, 256)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 257 

In [68]:
%%time
lrfn = build_lrfn()
lr_schedule = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=1)

train_history = model.fit(
    train_dataset,
    steps_per_epoch=134,
    validation_data=valid_dataset,
    callbacks=[metrics_callback, lr_schedule],
    epochs=20
)


Epoch 00001: LearningRateScheduler reducing learning rate to 1e-06.
Epoch 1/20

Epoch 00002: LearningRateScheduler reducing learning rate to 5.428571428571429e-06.
Epoch 2/20

Epoch 00003: LearningRateScheduler reducing learning rate to 9.857142857142859e-06.
Epoch 3/20

Epoch 00004: LearningRateScheduler reducing learning rate to 1.4285714285714289e-05.
Epoch 4/20

Epoch 00005: LearningRateScheduler reducing learning rate to 1.8714285714285717e-05.
Epoch 5/20

Epoch 00006: LearningRateScheduler reducing learning rate to 2.3142857142857145e-05.
Epoch 6/20

Epoch 00007: LearningRateScheduler reducing learning rate to 2.7571428571428577e-05.
Epoch 7/20

Epoch 00008: LearningRateScheduler reducing learning rate to 3.2e-05.
Epoch 8/20

Epoch 00009: LearningRateScheduler reducing learning rate to 2.7852999999999996e-05.
Epoch 9/20

Epoch 00010: LearningRateScheduler reducing learning rate to 2.4245109999999997e-05.
Epoch 10/20

Epoch 00011: LearningRateScheduler reducing learning rate to 2

In [69]:
# Model outputs for the validation texts, binarised in place at 0.5.
preds_valid = model.predict(x_valid)
valid_at_or_above = preds_valid >= 0.5
valid_below = preds_valid < 0.5
preds_valid[valid_at_or_above] = 1
preds_valid[valid_below] = 0

In [70]:
import sklearn.metrics as metrics
# Macro-averaged F1 of the thresholded validation predictions.
valid_macro_f1 = metrics.f1_score(y_true=y_valid, y_pred=preds_valid, average='macro')
valid_macro_f1

0.9560493309576215

In [71]:
# Predict on the encoded test texts; the raw outputs are thresholded in the next cell.
res = model.predict(x_test)

In [72]:
# Binarise the test predictions in place with the same 0.5 cut-off.
test_at_or_above = res >= 0.5
test_below = res < 0.5
res[test_at_or_above] = 1
res[test_below] = 0

In [73]:
# Start from the sample submission so the Id column and row order are preserved.
result = pd.read_csv('/kaggle/input/rucode-fake-job-postings/sample_submission.csv')
# model.predict returns one column per output; flatten to 1-D explicitly rather
# than relying on pandas to accept a 2-D array for a single column, and cast to
# integer labels in the same step.
result['Фейк'] = np.asarray(res).ravel().astype('int')
result.head()

Unnamed: 0,Id,Фейк
0,10728,0
1,10729,0
2,10730,0
3,10731,0
4,10732,0


In [74]:
# Write the submission file without the DataFrame index column.
result.to_csv('result.csv', index=False)