In [4]:
!pip install transformers

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split


In [5]:
dataset = pd.read_csv('dataset_merged_notes.csv')
dataset.head()

Unnamed: 0.1,Unnamed: 0,input,output
0,0,Carl Whitebeck died in July 2021 Carl Whitebec...,Carl Whitbeck’s sudden death in July 2021 leav...
1,3,Carl Whitbeck took over management in June 201...,"Carl ""Pepper"" Whitbeck formally took over the ..."
2,5,Carl Whitbeck has been a member of the team si...,Whitbeck had been a member of the team since 2...
3,8,Michael Graham worked alongside Carl Whitbeck ...,"Michael Graham, who worked alongside Whitbeck ..."
4,13,Robert Houle is the US high-yield portfolio ma...,"Robert Houle, US high-yield portfolio manager ..."


In [6]:
# given input text
dataset['input'].iloc[0]

'Carl Whitebeck died in July 2021 Carl Whitebeck’s death leaves a significant gap for the team The People Pillar was downgraded from Above Average to Average'

In [7]:
# target text
dataset['output'].iloc[0]

'Carl Whitbeck’s sudden death in July 2021 leaves a significant gap in overall experience for the team and leads to a downgrade of the People Pillar to Average from Above Average. '

In [8]:
dataset['max_length_input'] = dataset['input'].apply(lambda x:len(x.split(' ')))
dataset['max_length_input'].max()

80

In [9]:
dataset['max_length_output'] = dataset['output'].apply(lambda x:len(x.split(' ')))
dataset['max_length_output'].max()

60

In [7]:
X = dataset['input']
y = dataset['output']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10,random_state = 42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(803,) (90,) (803,) (90,)


In [8]:
train_df = pd.concat([X_train,y_train],axis=1, ignore_index=True)
test_df = pd.concat([X_test, y_test],axis=1, ignore_index=True)

In [9]:
train_df= train_df.rename(columns={0:'input',1:'output'})
test_df= test_df.rename(columns={0:'input',1:'output'})

In [10]:
if torch.cuda.is_available():
   dev = torch.device("cuda:0")
   print("Running on the GPU")
else:
   dev = torch.device("cpu")
   print("Running on the CPU")

Running on the GPU


In [11]:
batch_size=8
num_of_batches=int(len(X_train)/batch_size)

In [13]:
from IPython.display import HTML, display
def progress(loss,value, max=100):
 return HTML(""" Batch loss :{loss}
      <progress    
value='{value}'max='{max}',style='width: 100%'>{value}
      </progress>
             """.format(loss=loss,value=value, max=max))

In [16]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("JulesBelveze/t5-small-headline-generator")
model = AutoModelForSeq2SeqLM.from_pretrained("JulesBelveze/t5-small-headline-generator")

In [17]:
model.to(dev)

T5ForConditionalGeneration(
  (shared): Embedding(32100, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

In [18]:
from transformers import Adafactor
optimizer = Adafactor(model.parameters(),lr=1e-3,
                      eps=(1e-30, 1e-3),
                      clip_threshold=1.0,
                      decay_rate=-0.8,
                      beta1=None,
                      weight_decay=0.0,
                      relative_step=False,
                      scale_parameter=False,
                      warmup_init=False)

2022-11-15 11:36:21.629321: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-15 11:36:21.781249: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-11-15 11:36:21.813154: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [19]:
def trainfun(model,train_df,batch_size,num_of_epochs,optimizer):
  model.train()
  num_of_batches=int(len(train_df)/batch_size)
  loss_per_10_steps=[]
  for epoch in range(1,num_of_epochs+1):
    print('Running epoch: {}'.format(epoch))
    running_loss=0
    out = display(progress(1, num_of_batches+1), display_id=True)
    for i in range(num_of_batches):
      # print(f'Batch {i}')
      inputbatch=[]
      labelbatch=[]
      new_df=train_df[i*batch_size:i*batch_size+batch_size]
      for indx,row in new_df.iterrows():
        input = str(row['input']) + '[SEP]' 
        labels = row['output']+'[SEP]'   
        inputbatch.append(input)
        labelbatch.append(labels)
      inputbatch=tokenizer.batch_encode_plus(inputbatch,padding=True,max_length=512,return_tensors='pt')["input_ids"]
      labelbatch=tokenizer.batch_encode_plus(labelbatch,padding=True,max_length=512,return_tensors="pt") ["input_ids"]
      inputbatch=inputbatch.to(dev)
      labelbatch=labelbatch.to(dev)

    # clear out the gradients of all Variables 
      optimizer.zero_grad()

    # Forward propogation
      outputs = model(input_ids=inputbatch, labels=labelbatch)
      loss = outputs.loss
      loss_num=loss.item()
      logits = outputs.logits
      running_loss+=loss_num
      if i%10 ==0:      
        loss_per_10_steps.append(loss_num)
      out.update(progress(loss_num,i, num_of_batches+1))

    # calculating the gradients
      loss.backward()

    #updating the params
      optimizer.step()
    
    running_loss=running_loss/int(num_of_batches)
    print('Epoch: {} , Running loss: {}'.format(epoch,running_loss))
  return model,loss_per_10_steps

In [20]:
model,loss_per_10_steps=trainfun(model,train_df,batch_size,15,optimizer)

Running epoch: 1


  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


Epoch: 1 , Running loss: 1.3118797218799592
Running epoch: 2


Epoch: 2 , Running loss: 0.936604059934616
Running epoch: 3


Epoch: 3 , Running loss: 0.7693475010991097
Running epoch: 4


Epoch: 4 , Running loss: 0.6386855405569076
Running epoch: 5


Epoch: 5 , Running loss: 0.5482774794101715
Running epoch: 6


Epoch: 6 , Running loss: 0.4483312977850437
Running epoch: 7


Epoch: 7 , Running loss: 0.3845948837697506
Running epoch: 8


Epoch: 8 , Running loss: 0.3208825723826885
Running epoch: 9


Epoch: 9 , Running loss: 0.2683559723198414
Running epoch: 10


Epoch: 10 , Running loss: 0.22361986547708512
Running epoch: 11


Epoch: 11 , Running loss: 0.18686548475176096
Running epoch: 12


Epoch: 12 , Running loss: 0.1673224437981844
Running epoch: 13


Epoch: 13 , Running loss: 0.14868485525250436
Running epoch: 14


Epoch: 14 , Running loss: 0.1278131880238652
Running epoch: 15


Epoch: 15 , Running loss: 0.1067809484526515


In [21]:
# save model
torch.save(model.state_dict(), "t5_model.pt")

In [22]:
def prediction(model, sent):
  model.eval()
  sent1 = str(sent) + "[SEP]"
  input_ids = tokenizer.encode(sent1, return_tensors="pt",max_length=512,padding=True)
  input_ids=input_ids.to(dev)
  outputs = model.generate(input_ids,num_beams=8, do_sample=True, min_length=10, max_length=512)
  #print(outputs)
  z=tokenizer.decode(outputs[0],skip_special_tokens=True,min_length=512)
  return(z)

In [None]:
test_df['predicted']=test_df['input'].apply(lambda x: prediction(model,x))
test_df['predicted']=test_df['predicted'].apply(lambda x: x.replace('[SEP]',''))
test_df.to_csv('res_testset.csv')

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained("ramsrigouthamg/t5-large-paraphraser-diverse-high-quality")
tokenizer = AutoTokenizer.from_pretrained("ramsrigouthamg/t5-large-paraphraser-diverse-high-quality")


def paraphrase(s):   
    context = s
    text = "paraphrase: "+context + " </s>"

    encoding = tokenizer.encode_plus(text,max_length =128, padding=True, return_tensors="pt")
    input_ids,attention_mask  = encoding["input_ids"].to(dev), encoding["attention_mask"].to(dev)
    model.eval()
    diverse_beam_outputs = model.generate(
        input_ids=input_ids,attention_mask=attention_mask,
        max_length=128,
        early_stopping=True,
        num_beams=5,
        num_beam_groups = 5,
        num_return_sequences=1,
        diversity_penalty = 0.70
    )
#     print ("Original: ",context)
    for beam_output in diverse_beam_outputs:
        sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
        return (sent)
    
test_df['paraphrased'] = test_df['predicted'].apply(paraphrase)