In [1]:
!pip install -q sentence-transformers scikit-multilearn nlpaug skorch torch scikit-plot tensorflow-addons

[K     |████████████████████████████████| 85 kB 2.6 MB/s 
[K     |████████████████████████████████| 89 kB 7.4 MB/s 
[K     |████████████████████████████████| 405 kB 38.6 MB/s 
[K     |████████████████████████████████| 128 kB 49.2 MB/s 
[K     |████████████████████████████████| 1.1 MB 31.4 MB/s 
[K     |████████████████████████████████| 2.6 MB 24.9 MB/s 
[K     |████████████████████████████████| 1.2 MB 35.5 MB/s 
[K     |████████████████████████████████| 50 kB 5.8 MB/s 
[K     |████████████████████████████████| 3.3 MB 37.4 MB/s 
[K     |████████████████████████████████| 636 kB 29.5 MB/s 
[K     |████████████████████████████████| 895 kB 38.1 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [2]:
!pip install -q comet-ml==3.2.10

[K     |████████████████████████████████| 241 kB 5.2 MB/s 
[K     |████████████████████████████████| 52 kB 1.2 MB/s 
[K     |████████████████████████████████| 530 kB 39.2 MB/s 
[?25h  Building wheel for configobj (setup.py) ... [?25l[?25hdone


In [3]:
# Import libraries
from comet_ml import Experiment
import numpy as np
import os
import pandas as pd
import random
import seaborn as sns
import scikitplot as skplt
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import initializers
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertModel, DistilBertConfig



# Import matplotlib
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

In [4]:
def Accuracy(y_true, y_pred):
    """
    Accuracy based on Jaccard Similarity Score
    :param y_true: ground truth
    :param y_pred: prediction
    :return: Jaccard Similarity Score
    """
    jaccard = np.minimum(y_true, y_pred).sum(axis=1) / np.maximum(y_true, y_pred).sum(axis=1)
    return jaccard.mean()


def print_ml_score(y_test, y_pred, clf):
    print('Classifier: ', clf.__class__.__name__)
    print('Accuracy Score: {}'.format(Accuracy(y_test, y_pred)))
    print("-----------------------------------")


def train_model(classifier, feature_vector_train, label_train, feature_vector_test, label_test):
    # fit the training set on the classifier
    clf = ClassifierChain(classifier)
    clf.fit(feature_vector_train, label_train)

    # predict the labels on test set
    predictions = clf.predict(feature_vector_test)
    #print(pd.DataFrame(predictions, columns = multilabel.classes_))
    return print_ml_score(label_test, predictions, classifier)

In [5]:
# Notebook parameters
data_name = 'papers_w_JELcode.csv'
data_path = 'data/'

In [6]:
# load data
import numpy as np
import pandas as pd
import ast

df = pd.read_csv(data_path + data_name)
#df.drop(columns=['Unnamed: 0'], inplace=True)
df['tags'] = df['tags'].apply(lambda x: ast.literal_eval(x))
print(df.shape)
df.head()

(3126, 71)


Unnamed: 0,title_x,idpaper_2,journal,journal_num,year,month,volume,issue,idpaper,jelcodes,keywords,abstract,start_page,end_page,num_words,num_words_90_flag,idauthor,author,prop_women,num_authors,gender_group_type,sole_or_coauthors,num_pages,num_pages_dmean,flesch_kincaid_grade_level,log_flesch_kincaid_grade_level,flesch_kincaid_reading_ease,log_flesch_kincaid_reading_ease,dale_chall,log_dale_chall,coleman_liau_index,log_coleman_liau_index,automated_readability_index,log_automated_readability_index,american_economic_review,econometrica,journal_of_pol_economy,quarterly_journal_of_economics,review_of_economic_studies,coauthors,single_author,both_genders,only_females,only_males,num_sentences,num_syllables,observation,authors,jelcodes_letter,jelcodes_text,jel_dummy_A,jel_dummy_B,jel_dummy_C,jel_dummy_D,jel_dummy_E,jel_dummy_F,jel_dummy_G,jel_dummy_H,jel_dummy_I,jel_dummy_J,jel_dummy_K,jel_dummy_L,jel_dummy_M,jel_dummy_N,jel_dummy_O,jel_dummy_P,jel_dummy_Q,jel_dummy_R,jel_dummy_Y,jel_dummy_Z,tags
0,optimal adoption of complementary technologies,120009011529,American Economic Review,1,2000,March,90,1,1200090000.0,E22|G31|O33|D24,,When a production process requires two extreme...,15,29,92,1,"['p00681', 'p01338']","['boyan jovanovic', 'dmitriy stolyarov']",0.0,2,only_males,coauthors,14,-7.0,14.799565,2.694598,29.922609,3.398614,10.269474,2.329176,14.741739,2.690683,15.565435,2.745053,1,0,0,0,0,1,0,0,0,1,4,167,,,ODGE,"Economic Development, Innovation, Technologica...",0,0,0,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,"[O, D, G, E]"
1,collateral damage: effects of the japanese ban...,120009013045,American Economic Review,1,2000,March,90,1,1200090000.0,G21|E44,,The Japanese banking crisis provides a natural...,30,45,95,1,"['p01546', 'p02544']","['eric rosengren', 'joe peek']",0.0,2,only_males,coauthors,15,-6.0,19.863158,2.988867,9.055439,2.203366,12.52043,2.527362,16.131368,2.780766,20.729754,3.03157,1,0,0,0,0,1,0,0,0,1,3,186,,,GE,Financial Economics; Macroeconomics and Moneta...,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,"[G, E]"
2,endogenous inequality in integrated labor mark...,120009014672,American Economic Review,1,2000,March,90,1,1200090000.0,J41| J71,,We consider a market with red and green worker...,46,72,99,1,"['p00544', 'p01874', 'p03092']","['avner shaked', 'george mailath', 'larry samu...",0.0,3,only_males,coauthors,26,5.0,11.34601,2.428866,43.105682,3.763655,10.037223,2.306301,15.369697,2.732398,13.224545,2.582075,1,0,0,0,0,1,0,0,0,1,6,172,,,J,Labor and Demographic Welfare;,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,[J]
3,"labor-market integration, investment in risky ...",120009017395,American Economic Review,1,2000,March,90,1,1200090000.0,R23|J24|J31|J61,,This paper presents a general-equilibrium mode...,73,95,97,1,['p01266'],['david wildasin'],0.0,1,only_males,single_author,22,1.0,20.508737,3.020851,-8.782874,,13.629609,2.612245,21.532784,3.069577,21.577062,3.071631,1,0,0,0,0,0,1,0,0,1,4,219,,,JR,"Labor and Demographic Welfare; Urban, Rural, R...",0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,"[J, R]"
4,unequal societies: income distribution and the...,1200090196129,American Economic Review,1,2000,March,90,1,1200090000.0,D31|P16|I22|E62,,This paper develops a theory of inequality and...,96,129,99,1,['p04639'],['roland benabou'],0.0,1,only_males,single_author,33,12.0,21.714343,3.077973,-1.841818,,13.407542,2.595817,19.117576,2.950608,23.758182,3.167927,1,0,0,0,0,0,1,0,0,1,3,205,,,PEID,Economic Systems; Macroeconomics and Monetary ...,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,"[P, E, I, D]"


In [7]:
df['keep'] = df.tags.apply(lambda tags: 'Y' not in tags)

df = df[df.keep == True]
df['all_text'] = df.title_x + df.abstract
df = df[['all_text', 'tags']]
df.head()

Unnamed: 0,all_text,tags
0,optimal adoption of complementary technologies...,"[O, D, G, E]"
1,collateral damage: effects of the japanese ban...,"[G, E]"
2,endogenous inequality in integrated labor mark...,[J]
3,"labor-market integration, investment in risky ...","[J, R]"
4,unequal societies: income distribution and the...,"[P, E, I, D]"


In [8]:
from sklearn.preprocessing import MultiLabelBinarizer

multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(df['tags'])

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.all_text.to_list(), y,
                                          test_size=0.2,
                                          random_state=123)


In [10]:
def batch_encode(tokenizer, texts, batch_size=256, max_length=128):
    """""""""
    A function that encodes a batch of texts and returns the texts'
    corresponding encodings and attention masks that are ready to be fed 
    into a pre-trained transformer model.
    
    Input:
        - tokenizer:   Tokenizer object from the PreTrainedTokenizer Class
        - texts:       List of strings where each string represents a text
        - batch_size:  Integer controlling number of texts in a batch
        - max_length:  Integer controlling max number of words to tokenize in a given text
    Output:
        - input_ids:       sequence of texts encoded as a tf.Tensor object
        - attention_mask:  the texts' attention mask encoded as a tf.Tensor object
    """""""""
    
    input_ids = []
    attention_mask = []
    
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer.batch_encode_plus(batch,
                                             max_length=max_length,
                                             padding='longest', #implements dynamic padding
                                             truncation=True,
                                             return_attention_mask=True,
                                             return_token_type_ids=False
                                             )
        input_ids.extend(inputs['input_ids'])
        attention_mask.extend(inputs['attention_mask'])
    

In [11]:
# Instantiate DistilBERT tokenizer...we use the Fast version to optimize runtime
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode X_train
#X_train_ids, X_train_attention = tokenizer(X_train, padding="max_length", truncation=True)
inputs = tokenizer(X_train, padding="max_length", truncation=True)
X_train_ids, X_train_attention = inputs['input_ids'], inputs['attention_mask']
# Encode X_test
#X_test_ids, X_test_attention = tokenizer(X_test, padding="max_length", truncation=True)
test_inputs = tokenizer(X_test, padding="max_length", truncation=True)
X_test_ids, X_test_attention = test_inputs['input_ids'], test_inputs['attention_mask']

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [12]:
np.array(X_train_attention)

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

In [13]:
import tensorflow_addons as tfa

def build_model(transformer, num_clases, max_length=512):
    """
    Template for building a model off of the BERT or DistilBERT architecture
    for a binary classification task.
    
    Input:
      - transformer:  a base Hugging Face transformer model object (BERT or DistilBERT)
                      with no added classification head attached.
      - max_length:   integer controlling the maximum number of encoded tokens 
                      in a given sequence.
    
    Output:
      - model:        a compiled tf.keras.Model with added classification layers 
                      on top of the base pre-trained model architecture.
    """
    
    weight_initializer = tf.keras.initializers.GlorotNormal(seed=123) 
    
    # Define input layers
    input_ids_layer = tf.keras.layers.Input(shape=(max_length,), 
                                            name='input_ids', 
                                            dtype='int32')
    input_attention_layer = tf.keras.layers.Input(shape=(max_length,), 
                                                  name='input_attention', 
                                                  dtype='int32')
    
    last_hidden_state = transformer([input_ids_layer, input_attention_layer])[0]

    cls_token = last_hidden_state[:, 0, :]
    
    ##                                                 ##
    ## Define additional dropout and dense layers here ##
    ##                                                 ##
    

    output = tf.keras.layers.Dense(num_clases, 
                                   activation='sigmoid',
                                   kernel_initializer=weight_initializer,  
                                   kernel_constraint=None,
                                   bias_initializer='zeros'
                                   )(cls_token)
    
    # Define the model
    model = tf.keras.Model([input_ids_layer, input_attention_layer], output)
    
    # Compile the model
    model.compile(tf.keras.optimizers.Adam(lr=0.01), 
                  loss=tfa.losses.SigmoidFocalCrossEntropy(),
                  )
    
    return model

In [18]:

# The bare, pretrained DistilBERT transformer model outputting raw hidden-states 
# and without any specific head on top.
config = DistilBertConfig(dropout=0.2, 
                          attention_dropout=0.2, 
                          output_hidden_states=True)
distilBERT = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

# Freeze DistilBERT layers to preserve pre-trained weights 
for layer in distilBERT.layers:
    layer.trainable = False

# Build model
model = build_model(distilBERT, 19)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_projector', 'vocab_layer_norm', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.




  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [19]:
# Train the model
train_history1 = model.fit(
    x = [np.asarray(X_train_ids), np.asarray(X_train_attention)],
    y = y_train,
    epochs = 10,
    batch_size = 16,
    validation_data = ([np.asarray(X_test_ids), np.asarray(X_test_attention)], y_test),
    verbose=1
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Unfreeze DistilBERT weights to enable fine-tuning
for layer in distilBERT.layers:
    layer.trainable = True

# Lower the learning rate to prevent destruction of pre-trained weights
optimizer = tf.keras.optimizers.Adam(lr=0.01)

# Recompile model after unfreezing


# Define callbacks
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  mode='min',
                                                  min_delta=0,
                                                  patience=0,
                                                  restore_best_weights=True)

# Train the model
train_history2 = model.fit(
    x = [np.asarray(X_train_ids), np.asarray(X_train_attention)],
    y = y_train,
    epochs = 10,
    batch_size = 16,
    validation_data = ([np.asarray(X_test_ids), np.asarray(X_test_attention)], y_test),
    verbose=1
)

  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
 30/157 [====>.........................] - ETA: 1:24 - loss: 0.4277

In [20]:

# Generate predictions
y_pred = model.predict([np.asarray(X_test_ids), np.asarray(X_test_attention)])
y_pred_thresh = np.where(y_pred >= 0.3, 1, 0)

# Get evaluation results
accuracy = Accuracy(y_test, y_pred_thresh)

# Log evaluation metrics




print('Accuracy:  ', accuracy)    # 0.8801
    # 0.9656

Accuracy:   0.45977777777777773


In [None]:

# Build train_history
history_df1 = pd.DataFrame(train_history1.history)
history_df2 = pd.DataFrame(train_history2.history)
history_df = history_df1.append(history_df2, ignore_index=True)

# Plot training and validation loss over each epoch
history_df.loc[:, ['loss', 'val_loss']].plot()
plt.title(label='Training + Validation Loss Over Time', fontsize=17, pad=19)
plt.xlabel('Epoch', labelpad=14, fontsize=14)
plt.ylabel('Binary Crossentropy Loss', labelpad=16, fontsize=14)
print("Minimum Validation Loss: {:0.4f}".format(history_df['val_loss'].min()))

# Save figure
plt.savefig('figures/balanced_trainvalloss.png', dpi=300.0, transparent=True)

# Log the figure
experiment.log_image('figures/balanced_trainvalloss.png', name='Train Validation Loss')