In [None]:
!nvidia-smi

In [None]:
# import tensorflow as tf
# tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
# tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
# with tpu_strategy.scope():
#     model = tf.keras.Sequential( … ) # define your model normally
#     model.compile( … )
    
# model.fit(training_dataset, epochs=EPOCHS, steps_per_epoch=…)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the competition input directory and print every file's full path.
# The containing directory is printed after each file, so a directory name
# appears once per file it holds.
for dirname, _, filenames in os.walk('/kaggle/input/abstract'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        print(dirname)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split
# Load the three competition CSVs. A comma separator and an inferred header
# row are pandas' read_csv defaults, so they are not spelled out here.
trainData = pd.read_csv('/kaggle/input/abstract/train.csv')
testData = pd.read_csv('/kaggle/input/abstract/test.csv')
testLabels = pd.read_csv('/kaggle/input/abstract/sample_submission.csv')

In [None]:
# Keep 85% of the labelled rows for training, then split the remaining 15% 70/30.
# Here the 70% share is bound to `test_data` and the 30% share to `val_data`.
# NOTE(review): DistilBertTrain.preprocess() binds the 70% share to validation
# instead, and its return values later rebind these same three names — confirm
# which proportion is intended.
train_data, remaining = train_test_split(trainData, train_size=0.85, random_state=34)
test_data, val_data = train_test_split(remaining, train_size=0.7, random_state=34)
# Display the (rows, columns) shape of each split.
train_data.shape, val_data.shape, test_data.shape

# Dataset Exploration

In [None]:
# Preview the first rows of the raw training frame.
trainData.head()

In [None]:
# Summary statistics of the training frame (pandas' default describe() output).
trainData.describe()

In [None]:
# Number of training rows whose 'Computer Science' indicator equals 1.
len(trainData[trainData['Computer Science'] == 1])

In [None]:
# Group rows by their full combination of the six label indicators and list the groups.
# NOTE(review): `.aggregate('Quantitative Finance')` passes a column name where an
# aggregation name is expected; presumably the goal was a count per label
# combination (e.g. `.size()`) — confirm the intent.
list(trainData.groupby(['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']).aggregate('Quantitative Finance'))

In [None]:
# Column names of the training frame; the plotting cells treat positions 3
# onward as the label indicator columns.
trainData.columns.values

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# -> Plot of class distributions

In [None]:
# Bar chart of how many abstracts carry each subject label.
# The label indicator columns start at position 3 of the training frame, so
# summing each of them gives the per-label abstract count.
labels = trainData.iloc[:, 3:].sum().values
categories = list(trainData.columns.values[3:])

sns.set(font_scale=1)
plt.figure(figsize=(15, 8))
ax = sns.barplot(x=categories, y=labels)
ax.set_title("Abstract of each category", fontsize=24)
ax.set_ylabel('Number of abstracts', fontsize=18)
ax.set_xlabel('Abstract Type ', fontsize=18)

# Write each bar's count just above the bar, centred horizontally.
rects = ax.patches
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(
        rect.get_x() + rect.get_width() / 2,
        height + 5,
        label,
        ha='center',
        va='bottom',
        fontsize=18,
    )

plt.show()


In [None]:
# Bar chart of how many abstracts carry 1, 2, 3, ... labels at once.
sns.set(font_scale = 1)
plt.figure(figsize=(15,8))
# Row-wise sum over the label columns = number of labels per abstract.
# value_counts() orders by frequency, while seaborn orders numeric categories
# ascending; sort_index() keeps the bars and the text written on them aligned.
multiLabel_counts = trainData.iloc[:,3:].sum(axis=1).value_counts().sort_index()

ax = sns.barplot(x = multiLabel_counts.index, y = multiLabel_counts.values)

plt.title("Abstracts with multiple labels ")
plt.ylabel('Number of Abstracts', fontsize=18)
plt.xlabel('Number of labels', fontsize=18)
#adding the text labels
rects = ax.patches
labels = multiLabel_counts.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')
plt.show()

# Transformer Model Implementation From Hugging Face

In [None]:
!pip install focal_loss
from focal_loss import BinaryFocalLoss
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification, DistilBertTokenizerFast
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import tensorflow as tf
import pickle

# Report the TensorFlow version, whether eager execution is enabled, and
# whether TensorFlow can see at least one GPU device.
print("TF Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("GPU is", "available" if tf.config.experimental.list_physical_devices("GPU") else "NOT AVAILABLE")


class DistilBertTrain:
    """Fine-tunes DistilBERT as a multi-label classifier over paper abstracts.

    Intended call order: ``preprocess`` -> ``distilBertTokenization`` ->
    ``distilBertPipelineGeneration`` -> ``fit`` -> ``save``.
    """

    def __init__(self, train_csv='/kaggle/input/abstract/train.csv'):
        """Read the training CSV and load the pretrained tokenizer and model.

        Args:
            train_csv: Path of the labelled training CSV. Defaults to the
                Kaggle input path used by the rest of the notebook.
        """
        self.trainData = pd.read_csv(train_csv, sep=',', header="infer")
        # Working copy, so preprocessing never touches the frame as loaded.
        self.trainData_cp = self.trainData.copy()
        self.MODEL_NAME = 'distilbert-base-uncased'
        self.X = 'ABSTRACT'           # name of the text column
        self.Y = [3,4,5,6,7,8]        # positional indices of the label columns
        self.num_classes = len(self.Y)
        self.BATCH_SIZE = 16
        self.MAX_LENGTH = 512         # token limit applied when encoding
        self.N_EPOCHS = 5
        self.lr=1e-5
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(self.MODEL_NAME)
        self.model = TFDistilBertForSequenceClassification.from_pretrained(self.MODEL_NAME, num_labels = self.num_classes)

    def preprocess(self):
        """Clean the abstracts and split the rows into train/validation/test.

        Newlines inside each abstract are replaced by spaces. 85% of the rows
        become the training split; the remaining 15% is split 70/30 into
        validation and test. Both splits use ``random_state=34``.

        Returns:
            Tuple ``(train_data, val_data, test_data)`` of DataFrames.
        """
        self.trainData_cp[self.X] = self.trainData[self.X].apply(lambda x: re.sub("\n"," ",x))
        train_data, holdout = train_test_split(self.trainData_cp, train_size=0.85, random_state=34)
        val_data, test_data = train_test_split(holdout, train_size=0.7, random_state=34)
        return train_data, val_data, test_data

    def _encode(self, frame):
        """Tokenize the text column of ``frame``, truncating to MAX_LENGTH tokens."""
        return self.tokenizer(frame[self.X].to_list(), truncation=True, padding=True,
                              max_length=self.MAX_LENGTH)

    def distilBertTokenization(self, train_data, val_data, test_data):
        """Tokenize the abstracts of all three splits.

        Args:
            train_data, val_data, test_data: DataFrames holding the text column.

        Returns:
            Tuple ``(train_encodings, val_encodings, test_encodings)``.
        """
        train_encodings = self._encode(train_data)
        val_encodings = self._encode(val_data)
        test_encodings = self._encode(test_data)
        return train_encodings, val_encodings, test_encodings

    def distilBertPipelineGeneration(self, train_encodings, val_encodings, train_data, val_data):
        """Build batched ``tf.data`` pipelines for training and validation.

        Labels are read from the column positions listed in ``self.Y``. The
        training pipeline is shuffled over the whole split; both pipelines drop
        the final incomplete batch and prefetch.

        Returns:
            Tuple ``(tr_pipe, val_pipe)``.
        """
        train_dataset = tf.data.Dataset.from_tensor_slices(
            (dict(train_encodings), train_data.iloc[:, self.Y].values))
        val_dataset = tf.data.Dataset.from_tensor_slices(
            (dict(val_encodings), val_data.iloc[:, self.Y].values))

        # train
        tr_pipe = (train_dataset.shuffle(len(train_data))
                   .batch(self.BATCH_SIZE, drop_remainder=True)
                   .prefetch(tf.data.experimental.AUTOTUNE))

        # valid
        val_pipe = (val_dataset.batch(self.BATCH_SIZE, drop_remainder=True)
                    .prefetch(tf.data.experimental.AUTOTUNE))

        return tr_pipe, val_pipe

    def fit(self, tr_data, vl_data):
        """Compile and train the model.

        Args:
            tr_data: Batched training dataset.
            vl_data: Batched validation dataset.

        Returns:
            The trained Keras model (``self.model``). The History object
            returned by Keras is kept on ``self.history``.
        """
        def scheduler(epoch, lr):
            # Hold the rate for the first two epochs, then shrink it by
            # exp(-0.1) at every following epoch.
            if epoch < 2:
                return lr
            return lr * tf.math.exp(-0.1)

        optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr)

        earlystp = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            min_delta=0,
            patience=3,
            verbose=1,
            mode='auto',
            baseline=None,
            restore_best_weights=True)

        lr_schedule = tf.keras.callbacks.LearningRateScheduler(scheduler)

        # NOTE(review): BinaryFocalLoss is built without from_logits=True, yet
        # the classification head presumably emits raw logits — confirm.
        # Left unchanged because the notebook thresholds the raw outputs at 0.5.
        loss = BinaryFocalLoss(gamma=2)

        self.model.compile(optimizer, loss, metrics=['accuracy'])
        # batch_size is not passed: the datasets are already batched, and Keras
        # expects the argument to be omitted for dataset inputs.
        self.history = self.model.fit(tr_data, epochs=self.N_EPOCHS,
                                      validation_data=vl_data,
                                      callbacks=[lr_schedule, earlystp], verbose=1)
        return self.model

    def save(self, output_dir='./model'):
        """Save the fine-tuned model and a small metadata pickle.

        The model goes to ``<output_dir>/distilbert_base_uncased_model`` and the
        tuple ``(model_name, MAX_LENGTH)`` to ``<output_dir>/info.pkl``.

        Args:
            output_dir: Directory receiving both artefacts; created if missing.
        """
        model_name = 'distilbert_base_uncased_model'
        os.makedirs(output_dir, exist_ok=True)
        # Previously a plain string containing '+{model_name}+' was used, so
        # the model was written under that literal directory name.
        self.model.save_pretrained(f'{output_dir}/{model_name}')
        with open(f'{output_dir}/info.pkl', 'wb') as f:
            pickle.dump((model_name, self.MAX_LENGTH), f)
            
#     def load(self):
#         new_model = TFDistilBertForSequenceClassification.from_pretrained('./model/distilbert_base_uncased_model')
#         self.model_name, self.MAX_LENGTH = pickle.load(open('./model/info.pkl', 'rb'))
            
    



# Class instance creation and invocation

In [None]:
# Build the training helper: its constructor reads train.csv and loads the
# pretrained tokenizer and model. Note that `model` is the DistilBertTrain
# helper object, not the Keras model itself.
model = DistilBertTrain()

# Data Preprocessing

In [None]:
# Clean the abstracts and split them. This rebinds `train_data`, `val_data`
# and `test_data`, replacing the frames produced by the earlier exploratory split.
train_data, val_data, test_data = model.preprocess()

# DistilBERT Tokenization

In [None]:
# Tokenize the abstracts of each split with the trainer's DistilBERT tokenizer.
train_encodings, val_encodings, test_encodings = model.distilBertTokenization(train_data, val_data, test_data)

# TensorFlow IO Pipeline creation using tf.data

In [None]:
# Wrap the train/validation encodings and their label columns in batched
# tf.data pipelines (the test split is not part of these pipelines).
tr_pipe, val_pipe = model.distilBertPipelineGeneration(train_encodings, val_encodings, train_data, val_data)

# Model Training

In [68]:
# Train the model. DistilBertTrain.fit() returns the trained Keras model
# (its `self.model`), not a Keras History object.
distilBert=model.fit(tr_pipe, val_pipe)

In [69]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
    """Plot a training metric next to its validation counterpart.

    Args:
        history: Object whose ``.history`` attribute maps metric names to
            per-epoch value sequences.
        metric: Name of the training metric; the validation curve is looked up
            under ``'val_' + metric``.
    """
    val_metric = 'val_'+metric
    curves = history.history
    plt.plot(curves[metric])
    plt.plot(curves[val_metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])
    plt.show()
    
# Plot training vs. validation loss. `distilBert` is the model returned by fit(),
# so plot_graphs reads the model's own `.history` attribute.
# NOTE(review): relies on the Keras model exposing its last History as `.history` — confirm.
plot_graphs(distilBert, 'loss')


In [87]:
# Show the first three abstracts of the test split.
test_data.ABSTRACT[:3].to_list()

['  We present a new proof of a fundamental result concerning cycles of random\npermutations which gives some intuition for the connection between Touchard\npolynomials and the Poisson distribution. We also introduce a rather novel\npermutation statistic and study its distribution. This quantity, indexed by\n$m$, is the number of sets of size $m$ fixed by the permutation. This leads to\na new and simpler derivation of the exponential generating function for the\nnumber of covers of certain multisets.\n',
 '  This paper is a contribution to the study of the universal Horn fragment of\npredicate fuzzy logics, focusing on the proof of the existence of free models\nof theories of Horn clauses over Rational Pavelka predicate logic. We define\nthe notion of a term structure associated to every consistent theory T over\nRational Pavelka predicate logic and we prove that the term models of T are\nfree on the class of all models of T. Finally, it is shown that if T is a set\nof Horn clauses, th

In [165]:
# Run the fine-tuned model over the test split so that `preds` is defined for
# the label conversion in this cell (it was previously assigned only in
# commented-out code, which made the cell fail on a fresh kernel).
# Reuses `test_encodings` from the tokenization cell and `test_data` from the
# preprocessing cell; the label columns are carried along but unused by predict().
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings),
                                    list(test_data.iloc[:,3:9].values)))
test_da = test_dataset.batch(model.BATCH_SIZE)
preds = distilBert.predict(test_da)
def logits_to_labels(preds, threshold=0.5):
    """Turn per-label model outputs into a 0/1 indicator matrix.

    The raw values under ``preds['logits']`` are compared with ``threshold``
    directly; no sigmoid is applied first.

    Args:
        preds: Mapping with a ``'logits'`` entry holding an array-like of
            per-label scores, one row per sample.
        threshold: Scores strictly greater than this value map to 1.

    Returns:
        Float array of the same shape as the scores: 1.0 where the score
        exceeds ``threshold``, 0.0 elsewhere.
    """
    logits = np.asarray(preds['logits'])
    # Vectorised comparison works for any number of label columns, not only 6.
    return (logits > threshold).astype(float)
        
# Convert the model outputs to 0/1 labels. This rebinds `labels`, which the
# plotting cells earlier in the notebook used for bar annotations.
labels = logits_to_labels(preds)

In [185]:
# Put the predicted 0/1 labels into a frame with one column per subject and
# write it to predictions.csv in the working directory.
predDF =pd.DataFrame(labels, columns = ['Computer Science', 'Physics', 'Mathematics',
       'Statistics', 'Quantitative Biology', 'Quantitative Finance'])
predDF.to_csv('predictions.csv')

In [172]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
# Confusion matrix over a single-label view of the data: each row is reduced to
# the position of its first maximal label via argmax.
# NOTE(review): rows with several true labels keep only the first one, and a
# prediction row of all zeros maps to column 0, so this under-represents
# multi-label behaviour — confirm this approximation is acceptable.
cm = confusion_matrix(test_data.iloc[:,3:].values.argmax(axis=1), labels.argmax(axis=1))
cm

array([[831,  10,  36,  24,   2,   0],
       [ 34, 519,  20,   2,   6,   0],
       [ 45,  22, 380,  24,   0,   0],
       [121,   4,  14,  41,   5,   0],
       [ 16,   9,   0,   2,  16,   0],
       [ 17,   1,   1,   0,   0,   0]])

In [191]:
# Per-label precision, recall and F1 on the test split. The assignment has to be
# live code: the prints below otherwise fail with NameError on a fresh kernel.
precision, recall, fscore, _ = precision_recall_fscore_support(test_data.iloc[:,3:].values, labels)
print('precision', precision)
print('Recall', recall)
print('F1 score', fscore)

precision [0.80765456 0.91840278 0.86705202 0.79044118 0.63636364 0.        ]
Recall [0.91140642 0.84775641 0.73529412 0.78181818 0.36842105 0.        ]
F1 score [0.85639958 0.88166667 0.79575597 0.78610603 0.46666667 0.        ]


In [None]:
# Save the fine-tuned model under ./model/distilbert_base_uncased_model and
# pickle the (model name, max token length) pair next to it.
# model_name = 'distilbert_base_uncased_model'
distilBert.save_pretrained('./model/distilbert_base_uncased_model')
with open('./model/info.pkl', 'wb') as f:
    pickle.dump(('distilbert_base_uncased_model', 512), f)

# Model Prediction

In [None]:
class DistilBertPredict:
    """Holds a trained model together with the data it should be applied to.

    Only construction is implemented; the prediction logic is still to be written.
    """

    def __init__(self, model, test_data):
        """Store the model and the test data for a later prediction step.

        Args:
            model: The trained model to predict with.
            test_data: The data to run predictions on.
        """
        self.model = model
        self.test_data = test_data

    # TODO: implement the prediction step sketched below.
#     def predict(self, test_data):

In [None]:
# DistilBertConfig is not among the names imported from transformers earlier
# in the notebook, so import it here before use.
from transformers import DistilBertConfig

# Display the default DistilBERT configuration.
DistilBertConfig()

# Dynamic sentence splitting and check for the confidence on each consecutive addition to last sentence

In [None]:
import spacy
# ! python -m spacy download en_core_web_lg
# spacy_lg = spacy.load('en_core_web_lg')
! python -m spacy download en_core_web_sm
spacy_sm = spacy.load('en_core_web_sm')

In [None]:

def dynamicSentenceSelection(text, model, nlp=None):
    """Score growing sentence prefixes of ``text`` with ``model``.

    The text is split into sentences; the model is then called on the first
    sentence, the first two sentences, and so on, so the caller can see how
    the prediction changes as each sentence is added.

    Args:
        text: The text to split and score.
        model: Object with a ``predict`` method that is called once per
            prefix with the prefix as a single string.
            NOTE(review): the expected input type of ``predict`` is not shown
            anywhere in this notebook — confirm that it accepts raw text.
        nlp: Callable returning an object with a ``.sents`` iterable whose
            items have ``.text``. Defaults to the notebook's ``spacy_sm``
            pipeline.

    Returns:
        NumPy array with one entry per sentence prefix, in order; empty when
        no sentences are found.
    """
    if nlp is None:
        nlp = spacy_sm
    sentences = [sent.text.strip() for sent in nlp(text).sents]

    predictions = []
    for end in range(1, len(sentences) + 1):
        prefix = " ".join(sentences[:end])
        predictions.append(model.predict(prefix))

    return np.array(predictions)
        
# Try the sentence-prefix scoring on the abstract labelled 0 in the training split.
# NOTE(review): the function also requires a `model` argument, which is not
# passed here, and label 0 may not be present in `train_data` after the split.
dynamicSentenceSelection(train_data.ABSTRACT[0])

In [None]:
# Hyperparameter Tuning
#
# Candidate settings to tune:
# 1. Learning rate
# 2. Early stopping
# 3. Gradient clipping
# 4. Number of epochs to train
# 5. Dropout
# 6. Regularization