# 1. Importing Tools

First, I used TFAutoModelForCausalLM and AutoTokenizer to automatically load the correct model and tokenizer classes for a specific checkpoint. A checkpoint contains the weights of a pre-trained model.

In this case, I imported the DistilGPT-2 checkpoint. I also set the end-of-sequence token as a padding token.



In [1]:
from transformers import TFAutoModelForCausalLM, AutoTokenizer, AdamWeightDecay, pipeline, create_optimizer
from transformers import DefaultDataCollator
import plotly.express as px
import plotly.io as pio
import pandas as pd
import math
import os
from sklearn import tree, metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
# Set TOKENIZERS_PARALLELISM to "false" for this process.
# NOTE(review): presumably to avoid the tokenizers parallelism/fork warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Make 'notebook_connected' the default plotly renderer for figures shown in this notebook.
pio.renderers.default = 'notebook_connected'

In [3]:
import tqdm

In [4]:

# Load the DistilGPT-2 tokenizer and reuse its end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

# Load the TensorFlow causal-LM model from the same checkpoint and tell it
# which token id is used for padding.
model = TFAutoModelForCausalLM.from_pretrained(
    "distilgpt2",
    pad_token_id=tokenizer.eos_token_id,
)

Downloading: 100%|██████████| 762/762 [00:00<00:00, 281kB/s]
Downloading: 100%|██████████| 1.04M/1.04M [00:00<00:00, 2.56MB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 1.18MB/s]
Downloading: 100%|██████████| 1.36M/1.36M [00:00<00:00, 3.09MB/s]
Downloading: 100%|██████████| 328M/328M [00:23<00:00, 14.2MB/s] 
2023-01-29 22:20:40.264872: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-01-29 22:20:40.264948: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Metal device set to: Apple M2

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at distilgpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


# check systems

In [1]:
import platform
import sys

import pandas as pd
import scipy as sp
import sklearn as sk
import tensorflow as tf
import tensorflow.keras

# Report the platform and the library versions this notebook is running against.
print(f"Python Platform: {platform.platform()}")
print(f"Tensor Flow Version: {tf.__version__}")
print(f"Keras Version: {tensorflow.keras.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")
print(f"SciPy {sp.__version__}")

# A non-empty list of physical GPU devices means TensorFlow can see a GPU.
gpu = bool(tf.config.list_physical_devices('GPU'))
print("GPU is", "available" if gpu else "NOT AVAILABLE")

Python Platform: macOS-13.1-arm64-arm-64bit
Tensor Flow Version: 2.11.0
Keras Version: 2.11.0

Python 3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:25:29) [Clang 14.0.6 ]
Pandas 1.5.3
Scikit-Learn 1.2.1
SciPy 1.10.0
GPU is available


# 2. Loading Data

In [26]:
# Load the publications spreadsheet, using its first column as the index.
dataframe = pd.read_excel('data/publications/final_database_of_papers.xlsx',index_col=0)

In [27]:
# Summary statistics for the numeric columns.
dataframe.describe()

Unnamed: 0,article_id
count,3140.0
mean,34613920.0
std,1953457.0
min,29289380.0
25%,33580380.0
50%,35298630.0
75%,36242750.0
max,36564900.0


In [28]:
# List the column names available in the spreadsheet.
dataframe.columns

Index(['company_name', 'article_id', 'title', 'keywords', 'publication_date',
       'abstract', 'journal', 'doi', 'keyword_display'],
      dtype='object')

In [29]:
# Per-column dtypes and non-null counts.
dataframe.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3140 entries, 0 to 3139
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   company_name      3140 non-null   object        
 1   article_id        3140 non-null   int64         
 2   title             3140 non-null   object        
 3   keywords          3140 non-null   object        
 4   publication_date  3140 non-null   datetime64[ns]
 5   abstract          3140 non-null   object        
 6   journal           3140 non-null   object        
 7   doi               3135 non-null   object        
 8   keyword_display   3140 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 245.3+ KB


# gather useful columns

In [16]:
# Keep only the abstract, title and article id columns.
df = dataframe.loc[:, ['abstract', 'title', 'article_id']]

In [17]:
def split_train_test(df, test_size=0.2, random_state=42):
    """Shuffle ``df`` and split it into train and test frames.

    Args:
        df: DataFrame to split.
        test_size: Forwarded to sklearn's ``train_test_split``.
        random_state: Seed used for both the initial shuffle and the split,
            so repeated calls on the same data return the same partition.

    Returns:
        A ``(train, test)`` tuple of DataFrames.
    """
    # Seed the shuffle as well: an unseeded ``sample`` reorders the rows
    # differently on every run, which makes the seeded split below
    # non-reproducible.
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    train, test = train_test_split(shuffled, test_size=test_size, random_state=random_state)
    return train, test

In [34]:
# Split with the default test_size=0.2 (20% of the rows go to the test frame).
train_gpt, test_gpt = split_train_test(df)

In [21]:

# Whitespace-delimited word count of every abstract.
abstracts = [len(text.split()) for text in df["abstract"]]

# Distribution of abstract lengths, with a rug plot in the margin.
px.histogram(
    abstracts,
    nbins=400,
    marginal="rug",
    labels={"value": "Article Length (words)"},
)

# Tokenize then lock and load into training

To tokenize the data I defined a generic tokenization function, and then I applied this function to all the samples by using map(). Inside the tokenization function, I used the tokenizer imported in the beginning.

The tokenizer has some important parameters to set:

- The column to tokenize. In this case "abstract".
- `padding`. In this case "max_length", which pads every sequence to the length given by the `max_length` parameter.
- `truncation`. If true, truncates sequences longer than the maximum length given by the `max_length` parameter.
- `max_length`. Specifies the maximum length of a sequence.

Please note that the `map()` used below is pandas' `Series.map`, which calls the tokenization function once per abstract. The default batches of 1000 samples apply to the Hugging Face `datasets` `map()` method in batched mode, which is not what is used here.

In [37]:

# The tokenization function
def tokenization(data, max_length=300):
    """Tokenize ``data`` with the notebook-level ``tokenizer``.

    Every sequence is padded or truncated to exactly ``max_length`` tokens.

    Args:
        data: Text handed straight to ``tokenizer``.
        max_length: Target sequence length. Defaults to 300, the value that
            was previously hard-coded, so existing single-argument calls
            (including ``Series.map(tokenization)``) behave as before.

    Returns:
        Whatever ``tokenizer(...)`` returns for ``data``.
    """
    return tokenizer(data, padding="max_length", truncation=True, max_length=max_length)
# Renumber both splits from 0 so their indices no longer carry pre-split positions.
train_gpt, test_gpt = (split.reset_index(drop=True) for split in (train_gpt, test_gpt))

# Single-column frames holding only the abstracts.
train = train_gpt[['abstract']]
val = test_gpt[['abstract']]

# Tokenize every abstract individually: Series.map calls `tokenization` once
# per row, so each entry of the result is the tokenizer output for one abstract.
train_token = train_gpt['abstract'].map(tokenization)
val_token = test_gpt['abstract'].map(tokenization)

In [39]:
def create_labels(text):
    """Attach a ``"labels"`` entry that is a copy of ``text["input_ids"]``.

    The mapping is modified in place and the same object is returned, so the
    function can be used directly with ``map``.
    """
    label_ids = text["input_ids"].copy()
    text["labels"] = label_ids
    return text

# Add a "labels" entry to every tokenized sample using map().
# create_labels modifies each sample in place, so the objects held by
# train_token / val_token gain the "labels" entry as well.
lm_train = train_token.map(create_labels)
lm_val = val_token.map(create_labels)

In [76]:
def _encodings_to_columns(encodings):
    """Collate per-sample encodings into one ``{feature: [row, row, ...]}`` dict.

    Args:
        encodings: Iterable of dict-like objects that all share the same keys
            (here: one tokenizer output per abstract).

    Returns:
        A plain dict mapping each key to the list of that key's values, in
        input order.

    Raises:
        ValueError: If ``encodings`` is empty.
    """
    rows = list(encodings)
    if not rows:
        raise ValueError("cannot build a dataset from an empty collection of encodings")
    return {key: [row[key] for row in rows] for key in rows[0].keys()}


# `dict(series)` on a pandas Series yields {row_index: encoding} rather than
# {feature_name: values}, which from_tensor_slices would slice along the wrong
# structure. Build the feature columns explicitly so each dataset element is
# one sample with its input_ids / attention_mask / labels.
train_set = tf.data.Dataset.from_tensor_slices(_encodings_to_columns(lm_train))
validation_set = tf.data.Dataset.from_tensor_slices(_encodings_to_columns(lm_val))

In [43]:
# Shuffle individual samples *before* batching. Shuffling an already-batched
# dataset only permutes whole batches, so the same samples would always be
# grouped together. The buffer spans the whole dataset for a uniform shuffle,
# and the seed keeps the order reproducible.
train_set = train_set.shuffle(buffer_size=train_set.cardinality(), seed=42).batch(16)

# Validation data is only evaluated, so it is batched without shuffling.
validation_set = validation_set.batch(16)


In [77]:
# Learning-rate schedule: exponential decay starting at 0.0005, with a decay
# rate of 0.95 per 500 steps, applied continuously (staircase=False).
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.0005,
    decay_steps=500,
    decay_rate=0.95,
    staircase=False,
)

# AdamWeightDecay optimizer driven by the schedule above, with a weight-decay rate of 0.01.
optimizer = AdamWeightDecay(
    learning_rate=lr_schedule,
    weight_decay_rate=0.01,
)

In [78]:
# Compile with the AdamWeightDecay optimizer, passing the model's own
# compute_loss method as the Keras loss.
# NOTE(review): recent transformers releases can use the model's internal loss
# when `loss` is omitted — confirm against the installed version before changing.
model.compile(optimizer=optimizer, loss=model.compute_loss)
# Print the layer and parameter summary.
model.summary()

Model: "tfgpt2lm_head_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLaye  multiple                 81912576  
 r)                                                              
                                                                 
Total params: 81,912,576
Trainable params: 81,912,576
Non-trainable params: 0
_________________________________________________________________


In [79]:
# This cell is optional
# NOTE(review): PushToHubCallback is imported here but is not instantiated in
# the cells visible in this section.
from transformers.keras_callbacks import PushToHubCallback

# Build the model id string "<model_name>-finetuned-papers".
model_name = "GPT-2_PubMed"
push_to_hub_model_id = f"{model_name}-finetuned-papers"



In [80]:

from huggingface_hub import login
# Interactive Hugging Face Hub login (renders a login widget as the cell output),
# so no token is hardcoded in the notebook.
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [85]:

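# Fit with callbacks
# NOTE: training is disabled here. As written, the call passes `lm_train`
# (a pandas Series) rather than the tf.data `train_set`, and no callbacks are supplied.
#model.fit(lm_train, validation_data=validation_set, epochs=1, workers=9, use_multiprocessing=True)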

# try using pytorch

In [84]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re


In [86]:
df = pd.read_excel(
    'data/publications/final_database_of_papers.xlsx',
    index_col=0,
)

# Two-stage split: 90% of the rows are kept for train+validation (10% test),
# then 7/9 of that 90% becomes the training set, i.e. roughly
# 70% train / 20% validation / 10% test overall.
train_test_ratio = 0.9
train_valid_ratio = 7/9
df_full_train, df_test = train_test_split(
    df, train_size=train_test_ratio, random_state=1
)
df_train, df_valid = train_test_split(
    df_full_train, train_size=train_valid_ratio, random_state=1
)

In [89]:
def build_dataset(df, dest_path):
    """Write the abstracts in ``df`` to ``dest_path``, one per line.

    Each abstract is converted to ``str``, stripped, has every whitespace
    character replaced by a space (so embedded newlines or tabs cannot break
    the one-abstract-per-line layout) and is written as
    ``<BOS> abstract <EOS>`` followed by a newline.

    Args:
        df: DataFrame with an ``abstract`` column.
        dest_path: Path of the text file to create or overwrite.
    """
    bos_token = '<BOS>'
    eos_token = '<EOS>'

    # Collect the lines and join once instead of growing one string in a loop.
    lines = []
    for summary in df['abstract'].tolist():
        summary = re.sub(r"\s", " ", str(summary).strip())
        lines.append(bos_token + ' ' + summary + ' ' + eos_token + '\n')

    # The context manager guarantees the file is flushed and closed (the
    # handle was previously left open). UTF-8 is set explicitly so non-ASCII
    # characters in abstracts do not depend on the platform's default encoding.
    with open(dest_path, 'w', encoding='utf-8') as f:
        f.write(''.join(lines))

In [90]:
# Write each split to its own text file in the current working directory.
build_dataset(df_train, 'train.txt')
build_dataset(df_valid, 'valid.txt')
build_dataset(df_test, 'test.txt')
