In [1]:
! pip install simplet5

import random
import torch
import re
import os
import string
import numpy as np
import pandas as pd

from simplet5 import SimpleT5

DEPRECATION: pytorch-lightning 1.5.10 has a non-standard dependency specifier torch>=1.7.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063

[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip




  from .autonotebook import tqdm as notebook_tqdm





Global seed set to 42


In [2]:
class Settings:

    MODEL_TYPE = "t5"
    MODEL_NAME = "t5-base"

    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # training data directory
    TRAIN_DATA = "../data/train.csv"
    TEST_DATA = "../data/test_text.csv"
    VAL_DATA = "../data/validation.csv"

    Columns = ['titles', 'text']

    USE_GPU = None
    if str(DEVICE) == "cuda":
        USE_GPU=True
    else:
        USE_GPU = False

    EPOCHS = 5

    encoding = 'utf-8'
    columns_dict = {"titles": "target_text", "text": "source_text"}
    df_column_list = ['source_text', 'target_text']
    SUMMARIZE_KEY = "summarize: "
    SOURCE_TEXT_KEY = 'source_text'
    BATCH_SIZE = 4
    source_max_token_len = 128
    target_max_token_len = 50
    train_df_len = 5000
    test_df_len = 100

In [3]:
class Preprocess:
    def __init__(self):
        self.settings = Settings

    def clean_text(self, text):
        text = text.lower()
        text = re.sub('^.*?- ', '', text)
        text = re.sub('\[.*?\]', '', text)
        text = re.sub('https?://\S+|www\.\S+', '', text)
        text = re.sub('<.*?>+', '', text)
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
        text = re.sub('\n', '', text)
        text = re.sub('\w*\d\w*', '', text)
        return text

    def preprocess_data_training(self, data_path):
        df = pd.read_csv(data_path, encoding=self.settings.encoding)
        # simpleT5 expects dataframe to have 2 columns: "source_text" and "target_text"
        df = df.rename(columns=self.settings.columns_dict)
        df = df[self.settings.df_column_list]
        df[self.settings.SOURCE_TEXT_KEY]=df[self.settings.SOURCE_TEXT_KEY].map(lambda x: self.clean_text(x))
        # T5 model expects a task related prefix: since it is a summarization task, we will add a prefix "summarize: "
        df[self.settings.SOURCE_TEXT_KEY] = self.settings.SUMMARIZE_KEY + df[self.settings.SOURCE_TEXT_KEY]
        return df

    def preprocess_data_testing(self, data_path):
        df = df = pd.read_csv(data_path, encoding=self.settings.encoding)
        df[self.settings.Columns[1]]=df[self.settings.Columns[1]].map(lambda x: self.clean_text(x))
        return df

In [4]:
class T5Model:
    def __init__(self, model_type, model_name):
        self.model = SimpleT5()
        self.model.from_pretrained(model_type=model_type,
                                   model_name=model_name)

    def load_model(self, model_type, model_path, use_gpu: bool):
        try:
            self.model.load_model(
                model_type=model_type,
                model_dir=model_path,
                use_gpu=use_gpu
            )

        except BaseException as ex:
            print("error occurred while loading model ", str(ex))

In [5]:
class Train:
    def __init__(self):
        # initialize required class
        self.settings = Settings
        self.preprocess = Preprocess()

        # initialize required variables
        self.t5_model = None
        self.train_df = None
        self.test_df = None
        self.val_df = None
        self.df_ = None

    def __initialize(self):
        try:
            self.t5_model = T5Model(model_name=self.settings.MODEL_NAME,
                                    model_type=self.settings.MODEL_TYPE)

        except BaseException as ex:
            print("error occurred while loading model ", str(ex))

    def set_seed(self, seed_value=42):
        random.seed(seed_value)
        np.random.seed(seed_value)
        torch.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)

    def train(self):
        try:
            self.t5_model.model.train(train_df=self.train_df,
                                      eval_df=self.val_df,
                                      source_max_token_len=self.settings.source_max_token_len,
                                      target_max_token_len=self.settings.target_max_token_len,
                                      batch_size=self.settings.BATCH_SIZE, max_epochs=self.settings.EPOCHS,
                                      use_gpu=self.settings.USE_GPU)
        except BaseException as ex:
            print("error occurred while loading model ", str(ex))

    def run(self):
        try:
            print("Loading and Preparing the Dataset-----!! ")
            self.test_df = self.preprocess.preprocess_data_testing(self.settings.TEST_DATA)
            self.train_df = self.preprocess.preprocess_data_training(self.settings.TRAIN_DATA)
            self.val_df = self.preprocess.preprocess_data_training(self.settings.VAL_DATA)
            print(self.train_df.head())
            print("Dataset Successfully Loaded and Prepared-----!! ")
            print("Loading and Initializing the T5 Model -----!! ")
            self.__initialize()
            print("Model Successfully Loaded and Initialized-----!! ")

            print("------------------Starting Training-----------!!")
            self.set_seed()
            self.train()
            print("Training complete-----!!!")

        except BaseException as ex:
            print("Following Exception Occurred---!! ", str(ex))

In [6]:
print(Settings.USE_GPU)
print(Settings.DEVICE)

True
cuda


In [7]:
t= Train()
t.run()

Loading and Preparing the Dataset-----!! 
                                         source_text  \
0  summarize: thierry mariani sur la liste du ras...   
1  summarize: cest désormais officiel  alain jupp...   
2  summarize: surnommés les juges des pauvres  tr...   
3  summarize: dans une interview accordée au figa...   
4  summarize: le préjudice est estimé à  millions...   

                                         target_text  
0  L'information n'a pas été confirmée par l'inté...  
1  Le maire de Bordeaux ne fait plus partie des R...  
2  En 2020, les tribunaux d'instance fusionnent a...  
3  Les médecins jugés "gros prescripteurs d'arrêt...  
4  Il aura fallu mobiliser 90 gendarmes pour cett...  
Dataset Successfully Loaded and Prepared-----!! 
Loading and Initializing the T5 Model -----!! 


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Model Successfully Loaded and Initialized-----!! 
------------------Starting Training-----------!!


Missing logger folder: c:\Users\mouha\Desktop\3A\Projet NLP NEW ARTICLE GENERATION\News-Articles-Title-Generation\lightning_logs

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(


                                                                      

Global seed set to 42
  rank_zero_warn(


Epoch 1:   7%|▋         | 405/5726 [01:22<18:04,  4.91it/s, loss=2.21, v_num=0, train_loss_step=2.360, val_loss_step=2.270, val_loss_epoch=2.120, train_loss_epoch=2.420] error occurred while loading model  unique_by_key: failed to synchronize: cudaErrorIllegalAddress: an illegal memory access was encountered
Training complete-----!!!


In [8]:
! ( cd outputs; ls )

Le chemin d'acc�s sp�cifi� est introuvable.


In [9]:
t.test_df

Unnamed: 0,ID,text
0,0,laccès à leurs origines une fois la majorité a...
1,1,en françois bayrou sétait associé à emmanuel ...
2,2,ils ne passeront pas noël ensemble le quotidie...
3,3,dans un message publié pour noël le fondateur ...
4,4,le suspense a duré jusquau bout le mardi sept...
...,...,...
1495,1495,jeanluc guillotel nétait pas venu les mains vi...
1496,1496,des dizaines de véhicules utilitaires volés da...
1497,1497,la totalité du produit de la vente ira à ce no...
1498,1498,alors que lépidémie gagne du terrain outreatla...


In [10]:
t5_model = t.t5_model
t5_model.model.device= torch.device("cpu")
t5_model.model.predict(t.test_df['text'].iloc[0])

RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
from tqdm import tqdm
pred=[]
for i in tqdm(range(t.test_df.shape[0])):
  pred.append(t5_model.model.predict(t.test_df['text'].iloc[i])[0])

  0%|          | 0/1500 [00:00<?, ?it/s]


RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
t.test_df['titles'] = pred
t.test_df.to_csv('submission.csv',index=False)

In [None]:
t5_model.model.save('model_path',save_format="h5")


AttributeError: 'SimpleT5' object has no attribute 'save'