In [47]:
import torch
import torch.nn as nn
from datasets import load_dataset
from tokenizers import	Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from pathlib import Path
from torch.utils.data import Dataset,DataLoader,random_split
import warnings
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import warnings
import math
from torch.utils.tensorboard import SummaryWriter


In [48]:
# Install TensorBoard into the environment via a shell escape.
# NOTE(review): presumably required by the `torch.utils.tensorboard` import in the first cell — confirm,
# and consider running this before that import (and using `%pip` so the active kernel is targeted).
!pip install tensorboard




In [49]:
def get_config():
    """Return the hyper-parameters and file-naming settings of a training run.

    A new dictionary is built on every call, so callers may mutate the result.

    Returns:
        dict: settings with the keys
            - ``batch_size``, ``num_epochs``, ``lr``: optimisation settings.
            - ``seq_len``: fixed length every sentence is padded to.
            - ``d_model``: embedding dimension of the model.
            - ``lang_src`` / ``lang_tgt``: source and target language codes.
            - ``model_folder``: directory where checkpoints are written.
            - ``model_basename``: checkpoint file-name prefix read by
              ``get_weights_file_path``.
            - ``model_filename``: historical alias of ``model_basename``.
            - ``preload``: epoch tag of a checkpoint to resume from, or ``None``.
            - ``tokenizer_file``: tokenizer path template, formatted with the language.
            - ``experiment_name``: TensorBoard log directory.
    """
    return {
        "batch_size": 8,
        "num_epochs": 20,
        "lr": 10**-4,
        "seq_len": 350,
        "d_model": 512,
        "lang_src": "en",
        "lang_tgt": 'fr',
        "model_folder": "weights",
        # get_weights_file_path() looks up 'model_basename'; without this key
        # saving the first checkpoint raised KeyError.
        "model_basename": "t_model_",
        # Kept so that code relying on the old key name keeps working.
        "model_filename": "t_model_",
        "preload": None,
        "tokenizer_file": "tokenizer_{0}.json",
        "experiment_name": 'runs/t_model_'
    }
def get_weights_file_path(config, epoch: str):
    """Build the path of the checkpoint file for a given epoch.

    Parameters:
        config: mapping holding ``model_folder`` and a file-name prefix under
            ``model_basename`` (preferred) or ``model_filename`` (fallback).
        epoch (str): epoch tag appended to the prefix, e.g. ``"05"``.

    Returns:
        str: ``<model_folder>/<prefix><epoch>.pt`` relative to the current directory.

    Raises:
        KeyError: if ``model_folder`` or both prefix keys are missing.
    """
    model_folder = config['model_folder']
    # The configuration in this notebook only defined 'model_filename', so the
    # unconditional lookup of 'model_basename' raised KeyError; accept either.
    if 'model_basename' in config:
        model_basename = config['model_basename']
    else:
        model_basename = config['model_filename']
    model_filename = f"{model_basename}{epoch}.pt"
    return str(Path('.') / model_folder / model_filename)

In [50]:

class InputEmbedding(nn.Module):
    """Look up a learned vector for every token id and scale it by sqrt(d_model)."""

    def __init__(self, d_model: int, vocab_size: int):
        """
        Create the embedding table.

        Parameters:
        - d_model (int): Size of each embedding vector.
        - vocab_size (int): Number of rows of the table, i.e. how many distinct
                            token ids can be embedded.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """
        Embed token ids and multiply the vectors by the square root of d_model.

        Parameters:
        - x: Tensor of token ids.

        Returns:
        - Tensor with one trailing dimension of size d_model added, holding the
          scaled embedding vectors.
        """
        scale = math.sqrt(self.d_model)
        vectors = self.embedding(x)
        return vectors * scale

# Next, the positional encoding has to be added:
# the embedding layer maps the original sentence to a list of vectors,
# but the model also needs information about the position of each word
# in the sentence.
# To provide it, we build one position vector per position and add it
# to the corresponding embedding vector.


In [51]:
# Next comes the positional encoding:
# the embedding layer maps the original sentence to a list of vectors,
# and we then want to give the model information
# about the position of each word in the sentence,
# so we create position vectors and add them to the embedding vectors.

# Example: take the sentence "your cat is".
# We use PE(pos, 2i) = sin(pos / 10000^(2i / d_model))

# PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))

In [52]:
class PositionEncoding(nn.Module):
    """
    Sinusoidal positional encoding: adds a fixed, position-dependent vector to
    every embedding so the model can tell word positions apart.

    Parameters:
    - d_model (int): Size of the embedding vectors (odd values are supported).
    - seq_len (int): Maximum sentence length the table is pre-computed for.
    - dropout (float): Dropout rate applied after the addition.
    """
    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)
        # Encoding table of shape (seq_len, d_model).
        pe = torch.zeros(seq_len, d_model)
        # Column vector of positions: (seq_len, 1).
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # 1 / 10000^(2i / d_model), computed in log space: (ceil(d_model / 2),).
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (seq_len, ceil(d_model / 2))
        # Sine on the even columns.
        pe[:, 0::2] = torch.sin(angles)
        # Cosine on the odd columns. With an odd d_model there is one odd column
        # fewer than even columns, so the angle matrix is trimmed to match
        # (the untrimmed assignment raised a shape-mismatch error).
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)  # (1, seq_len, d_model)

        # A buffer is stored in the state dict and moved with the module,
        # but it is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Add the positional encoding to the input and apply dropout.

        Parameters:
        - x: Tensor of embedding vectors; its second dimension is used as the
             sequence length and must not exceed seq_len.

        Returns:
        - The input with the encoding of the first x.shape[1] positions added,
          passed through dropout.
        """
        # The buffer does not require gradients, so no explicit detach is needed.
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x)






In [53]:
"""
# La normalisation de couche est une technique utilisée dans les réseaux de neurones
# pour stabiliser et améliorer l'apprentissage des modèles profonds. Contrairement à la
# normalisation par lot qui normalise à travers la dimension de lot, la normalisation de couche
# normalise à travers les caractéristiques pour chaque exemple individuel. Cela peut être
# particulièrement utile lorsque les tailles de lot sont petites ou varient, comme dans le cas
# des réseaux de neurones récurrents (RNN) ou des modèles de transformateur.
"""


class LayerNormalization(nn.Module):
    """Normalize the last dimension of the input, then apply a learned scale and shift.

    Parameters:
    - eps (float): Constant added to the standard deviation to avoid division by zero.
    """

    def __init__(self, eps: float = 10**-6) -> None:
        super().__init__()
        self.eps = eps
        # Learned multiplicative factor (a single scalar).
        self.alpha = nn.Parameter(torch.ones(1))
        # Learned additive offset (a single scalar).
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        """Return alpha * (x - mean) / (std + eps) + bias, statistics taken over dim -1."""
        mu = x.mean(dim=-1, keepdim=True)
        sigma = x.std(dim=-1, keepdim=True)
        scaled = self.alpha * (x - mu)
        return scaled / (sigma + self.eps) + self.bias


class FeedFordwarBlock(nn.Module):
    """Two linear layers with a ReLU and dropout in between.

    Parameters:
    - d_model (int): Input and output feature size.
    - d_ff (int): Size of the hidden layer.
    - dropout (float): Dropout rate applied after the ReLU.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)  # first weight matrix and bias
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)  # second weight matrix and bias

    def forward(self, x):
        """Map (..., d_model) -> (..., d_ff) -> (..., d_model)."""
        hidden = torch.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)




class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention.

    Parameters:
    - d_model: Feature size of the inputs; must be divisible by h.
    - h (int): Number of attention heads.
    - dropout (float): Dropout rate applied to the attention weights.
    """

    def __init__(self, d_model, h: int, dropout: float) -> None:
        super().__init__()
        self.h = h
        assert d_model % h == 0 , "d_model is not divisible by h"
        # Feature size handled by each head.
        self.d_k = d_model // h
        # Query, key and value projections.
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)

        # Output projection applied to the concatenated heads.
        self.w_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        """Compute softmax(Q K^T / sqrt(d_k)) V.

        Positions where ``mask == 0`` receive a score of -1e9 before the softmax.
        ``mask`` and ``dropout`` may each be ``None`` to skip that step.

        Returns:
            tuple: (weighted values, attention weights after optional dropout).
        """
        head_dim = query.shape[-1]
        # (batch, h, seq_len, d_k) @ (batch, h, d_k, seq_len) -> (batch, h, seq_len, seq_len)
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(head_dim)
        if mask is not None:
            scores.masked_fill_(mask == 0, -1e9)
        weights = scores.softmax(dim=-1)
        if dropout is not None:
            weights = dropout(weights)

        return torch.matmul(weights, value), weights

    def _split_heads(self, projected):
        """Reshape (batch, seq_len, d_model) into (batch, h, seq_len, d_k).

        The embedding dimension is cut into h pieces; the sentence itself is not split.
        """
        batch, length = projected.shape[0], projected.shape[1]
        return projected.view(batch, length, self.h, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask):
        """Project q, k and v, attend per head, merge the heads and project the result.

        The attention weights of the last call are kept in ``self.attention_scores``.
        """
        # (batch, seq_len, d_model) -> (batch, seq_len, d_model) for each projection.
        projected_q = self.w_q(q)
        projected_k = self.w_k(k)
        projected_v = self.w_v(v)

        query = self._split_heads(projected_q)
        key = self._split_heads(projected_k)
        value = self._split_heads(projected_v)

        context, self.attention_scores = MultiHeadAttentionBlock.attention(
            query, key, value, mask, self.dropout
        )
        # (batch, h, seq_len, d_k) -> (batch, seq_len, h, d_k) -> (batch, seq_len, d_model)
        merged = context.transpose(1, 2).contiguous().view(context.shape[0], -1, self.h * self.d_k)
        return self.w_o(merged)




In [54]:
class ResidualConnection(nn.Module):
    """Skip connection around a sub-layer: x + dropout(sublayer(norm(x))).

    Parameters:
    - dropout (float): Dropout rate applied to the sub-layer output.
    """

    def __init__(self, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        # The input is normalized before it is handed to the sub-layer.
        self.norm = LayerNormalization()

    def forward(self, x, sublayer):
        """Apply ``sublayer`` to the normalized input and add the result back onto ``x``."""
        normalized = self.norm(x)
        transformed = sublayer(normalized)
        return x + self.dropout(transformed)


In [55]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention followed by a feed-forward block,
    each wrapped in its own residual connection.

    It is called self-attention because every word of the sentence interacts
    with the other words of the same sentence. In the decoder, by contrast, the
    queries come from the decoder while the keys and values are supplied by the
    encoder, which lets the decoder focus on the relevant parts of the encoded input.
    """

    def __init__(self, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedFordwarBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        residuals = [ResidualConnection(dropout) for _ in range(2)]
        self.residual_connection = nn.ModuleList(residuals)

    def forward(self, x, src_mask):
        """Run both sub-layers on ``x``; ``src_mask`` is forwarded to the attention block."""
        def self_attend(normed):
            # Query, key and value are all the same (normalized) tensor.
            return self.self_attention_block(normed, normed, normed, src_mask)

        attention_residual, feed_forward_residual = self.residual_connection
        x = attention_residual(x, self_attend)
        return feed_forward_residual(x, self.feed_forward_block)



class Encoder(nn.Module):
    """Stack of encoder layers followed by a final layer normalization.

    Parameters:
    - layers (nn.ModuleList): Layers applied in order; each is called as layer(x, mask).
    """

    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, mask):
        """Feed ``x`` through every layer with the same ``mask`` and normalize the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)






In [64]:
class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, cross-attention over the encoder
    output and a feed-forward block, each wrapped in its own residual connection.
    """

    def __init__(self, self_attention_block: MultiHeadAttentionBlock, cross_attention: MultiHeadAttentionBlock, feed_forward_block: FeedFordwarBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        # The attribute name is misspelled; it is left unchanged because
        # renaming it would change the keys of the module's state dict.
        self.corss_attention = cross_attention
        self.feed_forward_block = feed_forward_block
        # One residual connection per sub-layer.
        residuals = [ResidualConnection(dropout) for _ in range(3)]
        self.residual_connections = nn.ModuleList(residuals)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Run the three sub-layers on ``x``.

        ``tgt_mask`` is used by the self-attention, ``src_mask`` by the
        cross-attention whose keys and values are ``encoder_output``.
        """
        def self_attend(normed):
            return self.self_attention_block(normed, normed, normed, tgt_mask)

        def cross_attend(normed):
            return self.corss_attention(normed, encoder_output, encoder_output, src_mask)

        self_residual, cross_residual, feed_forward_residual = self.residual_connections
        x = self_residual(x, self_attend)
        x = cross_residual(x, cross_attend)
        return feed_forward_residual(x, self.feed_forward_block)



class Decoder(nn.Module):
    """Stack of decoder layers followed by a final layer normalization.

    Parameters:
    - layers (nn.ModuleList): Layers applied in order; each is called as
      layer(x, encoder_output, src_mask, tgt_mask).
    """

    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Feed ``x`` through every layer with the same context and normalize the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoder_output, src_mask, tgt_mask)
        return self.norm(hidden)

# A projection layer is still needed to map (seq_len, d_model) onto positions in the vocabulary.


class ProjectionLayer(nn.Module):
    """Linear map from d_model features to vocabulary scores, returned as log-probabilities.

    Parameters:
    - d_model (int): Size of the incoming feature vectors.
    - vocab_size (int): Number of output scores per position.
    """

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.proj = nn.Linear(in_features=d_model, out_features=vocab_size)

    def forward(self, x):
        """Project ``x`` and apply log-softmax over the last dimension."""
        logits = self.proj(x)
        return torch.log_softmax(logits, dim=-1)



In [57]:
# Transformers block

class Transformer(nn.Module):
    """Container tying together embeddings, positional encodings, encoder,
    decoder and the output projection.

    The three stages are exposed as separate methods (``encode``, ``decode``,
    ``project``) instead of a single ``forward``.
    """

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbedding, tgt_embed: InputEmbedding, src_pos: PositionEncoding, tgt_pos: PositionEncoding, projection_layer: ProjectionLayer):
        super().__init__()

        # Sub-modules are registered in this fixed order; it determines the
        # order of ``parameters()``.
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        """Embed the source ids, add positions and run the encoder."""
        embedded = self.src_pos(self.src_embed(src))
        return self.encoder(embedded, src_mask)

    def decode(self, encoder_output, src_mask, tgt, tgt_mask):
        """Embed the target ids, add positions and run the decoder on top of ``encoder_output``."""
        embedded = self.tgt_pos(self.tgt_embed(tgt))
        return self.decoder(embedded, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        """Apply the projection layer to decoder output."""
        return self.projection_layer(x)


def build_transformer(src_vocab_size:int,tgt_vocab_size:int,
                      src_seq_len:int,
                      tgt_seq_len:int,
                      d_model=512,N: int=6,
                      h:int=8,
                     dropout:float=0.1,d_ff=2048) -> Transformer:
    """Assemble a complete Transformer and initialise its weight matrices.

    Parameters:
    - src_vocab_size / tgt_vocab_size (int): Vocabulary sizes of both languages.
    - src_seq_len / tgt_seq_len (int): Maximum sequence lengths for the positional encodings.
    - d_model: Embedding dimension.
    - N (int): Number of encoder layers and of decoder layers.
    - h (int): Number of attention heads.
    - dropout (float): Dropout rate used throughout the model.
    - d_ff: Hidden size of the feed-forward blocks.

    Returns:
    - Transformer whose parameters with more than one dimension have been
      re-initialised with Xavier uniform initialisation.
    """
    # Embedding layers first, then the positional encodings.
    src_embed = InputEmbedding(d_model, src_vocab_size)
    tgt_embed = InputEmbedding(d_model, tgt_vocab_size)
    src_pos = PositionEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionEncoding(d_model, tgt_seq_len, dropout)

    # N encoder layers; within a layer the attention block is created before
    # the feed-forward block (arguments are evaluated left to right).
    encoder_blocks = [
        EncoderBlock(
            MultiHeadAttentionBlock(d_model, h, dropout),
            FeedFordwarBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    # N decoder layers: self-attention, cross-attention, feed-forward.
    decoder_blocks = [
        DecoderBlock(
            MultiHeadAttentionBlock(d_model, h, dropout),
            MultiHeadAttentionBlock(d_model, h, dropout),
            FeedFordwarBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    # Wrap the layer lists, then create the projection onto the target vocabulary.
    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Re-initialise every parameter that has more than one dimension.
    for weight in (p for p in transformer.parameters() if p.dim() > 1):
        nn.init.xavier_uniform_(weight)

    return transformer



















In [58]:
class BilingualDataset(Dataset):
    """Turn raw translation pairs into fixed-length tensors for training.

    Parameters:
    - ds: Indexable collection whose items look like
      ``{'translation': {src_lang: str, tgt_lang: str}}``.
    - tokenizer_src / tokenizer_tgt: Objects offering ``encode(text).ids`` and
      ``token_to_id(token)``.
    - src_lang / tgt_lang: Keys selecting the two sentences of a pair.
    - seq_len: Length every returned sequence is padded to.
    """

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len):
        super().__init__()
        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt

        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.seq_len = seq_len  # maximum sequence length

        # Ids of the special tokens, looked up in the target tokenizer and kept
        # as one-element int64 tensors so they can be concatenated directly.
        self.sos_token = torch.tensor([tokenizer_tgt.token_to_id('[SOS]')], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_tgt.token_to_id('[EOS]')], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_tgt.token_to_id('[PAD]')], dtype=torch.int64)

    def __len__(self):
        """Number of sentence pairs in the wrapped collection."""
        return len(self.ds)

    def __getitem__(self, index):
        """Build the model inputs for the pair at ``index``.

        Returns:
            dict with
            - ``encoder_input``: [SOS] + source ids + [EOS] + padding, shape (seq_len,).
            - ``decoder_input``: [SOS] + target ids + padding, shape (seq_len,).
            - ``label``: target ids + [EOS] + padding, shape (seq_len,).
            - ``encoder_mask``: 1 where encoder_input is not padding, shape (1, 1, seq_len).
            - ``decoder_mask``: padding mask combined with the causal mask,
              shape (1, seq_len, seq_len).
            - ``src_text`` / ``tgt_text``: the raw sentences.
            - ``tgr_text``: misspelled alias of ``tgt_text`` kept for existing callers.

        Raises:
            ValueError: if either sentence does not fit into ``seq_len`` together
                with its special tokens.
        """
        # Fetch the source/target pair.
        src_target_pair = self.ds[index]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]

        # Convert both sentences to token ids.
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # Padding needed on each side.
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2  # room for [SOS] and [EOS]
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1  # room for [SOS] (or [EOS] in the label)

        # A negative count means the sentence is too long for seq_len.
        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError("La phrase est trop longue pour la séquence définie.")

        # Build the padding runs directly instead of converting a Python list
        # of one-element tensors on every access.
        pad_id = self.pad_token.item()
        enc_padding = torch.full((enc_num_padding_tokens,), pad_id, dtype=torch.int64)
        dec_padding = torch.full((dec_num_padding_tokens,), pad_id, dtype=torch.int64)

        enc_tokens = torch.tensor(enc_input_tokens, dtype=torch.int64)
        dec_tokens = torch.tensor(dec_input_tokens, dtype=torch.int64)

        # Encoder input: [SOS] tokens [EOS] padding
        encoder_input = torch.cat([self.sos_token, enc_tokens, self.eos_token, enc_padding])

        # Decoder input: [SOS] tokens padding
        decoder_input = torch.cat([self.sos_token, dec_tokens, dec_padding])

        # Expected decoder output: tokens [EOS] padding (no [SOS])
        label = torch.cat([dec_tokens, self.eos_token, dec_padding])

        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input": encoder_input,  # (seq_len)
            "decoder_input": decoder_input,  # (seq_len)
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(),  # (1, 1, seq_len)
            # (1, 1, seq_len) & (1, seq_len, seq_len) -> (1, seq_len, seq_len)
            "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int() & causal_mask(decoder_input.size(0)),
            "label": label,
            "src_text": src_text,
            "tgt_text": tgt_text,
            # Original key name (typo), retained for backward compatibility.
            "tgr_text": tgt_text,
        }


def causal_mask(size):
    """Return a (1, size, size) boolean mask that is True on and below the diagonal.

    Entry [0, i, j] is True exactly when j <= i, i.e. position i may look at
    itself and at earlier positions only.
    """
    above_diagonal = torch.triu(torch.ones(1, size, size), diagonal=1).type(torch.int)
    return above_diagonal.eq(0)

In [59]:
# pip install datasets
# pip install tokenizers

In [60]:
def get_all_sentences(ds, lang):
    """Lazily yield the ``lang`` sentence of every item in ``ds``.

    Each item is expected to be indexable as ``item['translation'][lang]``.
    """
    yield from (entry['translation'][lang] for entry in ds)


def get_or_build_tokenizer(config, ds, lang):
    """Load the word-level tokenizer for ``lang``, training and saving it first if needed.

    Parameters:
    - config: Mapping with a ``tokenizer_file`` path template that is formatted
      with the language code, e.g. "tokenizer_{0}.json".
    - ds: Dataset whose items expose ``item['translation'][lang]``; only used
      when the tokenizer has to be trained.
    - lang: Language code selecting both the sentences and the file name.

    Returns:
    - The trained or loaded tokenizer.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))
    if not tokenizer_path.exists():
        tokenizer = Tokenizer(WordLevel(unk_token='[UNK]'))
        tokenizer.pre_tokenizer = Whitespace()
        # Words seen fewer than two times are left out of the vocabulary.
        trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", '[EOS]'], min_frequency=2)
        tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
        # The template may point into a directory that does not exist yet;
        # create it so that saving does not fail.
        tokenizer_path.parent.mkdir(parents=True, exist_ok=True)
        tokenizer.save(str(tokenizer_path))
    else:
        tokenizer = Tokenizer.from_file(str(tokenizer_path))

    return tokenizer


def get_ds(config):
    """Load the parallel corpus and build the training and validation data loaders.

    Parameters:
    - config: Mapping with ``lang_src``, ``lang_tgt``, ``seq_len``, ``batch_size``
      and the settings read by ``get_or_build_tokenizer``.

    Returns:
    - tuple: (train_dataloader, val_dataloader, source tokenizer, target tokenizer).
    """
    ds_raw = load_dataset('opus_books', f"{config['lang_src']}-{config['lang_tgt']}", split='train')

    # Build (or load) one tokenizer per language.
    tokinzer_src = get_or_build_tokenizer(config, ds_raw, config['lang_src'])
    tokinzer_tgt = get_or_build_tokenizer(config, ds_raw, config['lang_tgt'])

    # 90 % of the pairs for training, the remainder for validation.
    train_ds_size = int(0.9 * len(ds_raw))
    val_ds_size = len(ds_raw) - train_ds_size
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])
    # The target sentences must be encoded with the target tokenizer; the source
    # tokenizer was previously passed twice, so target text was encoded with the
    # source vocabulary.
    train_ds = BilingualDataset(train_ds_raw, tokinzer_src, tokinzer_tgt, config['lang_src'], config['lang_tgt'], config["seq_len"])
    val_ds = BilingualDataset(val_ds_raw, tokinzer_src, tokinzer_tgt, config['lang_src'], config['lang_tgt'], config["seq_len"])

    # Report the longest sentence on each side, as a guide for choosing seq_len.
    max_len_src = 0
    max_len_tgt = 0
    for item in ds_raw:
        src_ids = tokinzer_src.encode(item["translation"][config['lang_src']]).ids
        tgt_ids = tokinzer_tgt.encode(item["translation"][config['lang_tgt']]).ids
        max_len_src = max(max_len_src, len(src_ids))
        max_len_tgt = max(max_len_tgt, len(tgt_ids))

    print(f"Max length of source sentence {max_len_src}")
    print(f'Max length of target sentence :{max_len_tgt}')
    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    # Validation uses a batch size of 1.
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)
    return train_dataloader, val_dataloader, tokinzer_src, tokinzer_tgt









In [61]:
def get_model(config, vocab_src_len, vocab_tgt_len):
    """Build the transformer for the given vocabulary sizes.

    ``config['seq_len']`` is used as the maximum length of both source and
    target sequences, ``config['d_model']`` as the embedding dimension.
    """
    max_len = config["seq_len"]
    return build_transformer(vocab_src_len, vocab_tgt_len, max_len, max_len, d_model=config['d_model'])


In [66]:
from tqdm import  tqdm
def train_model(config):
    """Train the translation model and write a checkpoint after every epoch.

    Parameters:
    - config: Mapping as returned by ``get_config``. When ``config['preload']``
      is set, model weights, optimizer state, epoch counter and global step are
      restored from that checkpoint before training continues.

    Side effects:
    - Creates ``config['model_folder']`` and writes one ``.pt`` file per epoch.
    - Logs the training loss to TensorBoard under ``config['experiment_name']``.
    """
    # Pick the device.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f" using device {device}")
    Path(config['model_folder']).mkdir(parents=True, exist_ok=True)

    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
    model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)
    # TensorBoard writer used to plot the loss.
    writer = SummaryWriter(config['experiment_name'])
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)

    # Optionally restore a previous run.
    initial_epoch = 0
    global_step = 0
    if config['preload']:
        model_filename = get_weights_file_path(config, config['preload'])
        print(f'Preloading model {model_filename}')
        # map_location lets a checkpoint saved on GPU be resumed on CPU and vice versa.
        state = torch.load(model_filename, map_location=device)
        # The weights were previously never restored, so a "resumed" run
        # continued from a freshly initialised model.
        model.load_state_dict(state['model_state_dict'])
        initial_epoch = state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state['global_step']

    # Labels are target-language ids, so the padding id to ignore must come
    # from the target tokenizer.
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_tgt.token_to_id('[PAD]'), label_smoothing=0.1).to(device)
    try:
        for epoch in range(initial_epoch, config['num_epochs']):
            model.train()
            batch_iterator = tqdm(train_dataloader, desc=f"Processing epoch {epoch:02d}")
            for batch in batch_iterator:
                encoder_input = batch['encoder_input'].to(device)  # (B, seq_len)
                decoder_input = batch['decoder_input'].to(device)  # (B, seq_len)
                encoder_mask = batch['encoder_mask'].to(device)  # (B, 1, 1, seq_len)
                decoder_mask = batch['decoder_mask'].to(device)  # (B, 1, seq_len, seq_len)
                encoder_output = model.encode(encoder_input, encoder_mask)  # (B, seq_len, d_model)
                decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)  # (B, seq_len, d_model)
                proj_ouput = model.project(decoder_output)  # (B, seq_len, tgt_vocab_size)
                label = batch['label'].to(device)  # (B, seq_len): expected vocabulary index per position
                # (B, seq_len, tgt_vocab_size) -> (B * seq_len, tgt_vocab_size)
                loss = loss_fn(proj_ouput.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))

                batch_iterator.set_postfix(loss=f"{loss.item():6.3f}")

                # Log the loss.
                writer.add_scalar('train loss', loss.item(), global_step)
                writer.flush()

                # Back-propagate, update the weights, reset the gradients.
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            # Checkpoint at the end of every epoch.
            model_filename = get_weights_file_path(config, f'{epoch:02d}')
            torch.save({
                'epoch': epoch,
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "global_step": global_step,
            }, model_filename)
    finally:
        # Release the event file even if training is interrupted.
        writer.close()








In [None]:
# Silence all warnings, then train with the default configuration.
warnings.filterwarnings(action="ignore")
config = get_config()
train_model(config=config)

In [1]:
# Sanity check: ask PyTorch whether a CUDA device is available in this kernel.
import torch
torch.cuda.is_available()

False