In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import urllib.request


path_to_save_the_file = "Data/text"
urllib.request.urlretrieve(
    "https://www.gutenberg.org/cache/epub/5711/pg5711.txt", path_to_save_the_file
)


('Data/text', <http.client.HTTPMessage at 0x28fcdaa85d0>)

In [2]:
with open(path_to_save_the_file, "r", encoding="utf8") as the_file:
    text = the_file.read()
print(text[0:500] + "...")

﻿The Project Gutenberg eBook of Germinal
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: Ger...


In [4]:
start_index = text.find("Première Partie")
end_index = text.find("*** END OF THE PROJECT GUTENBERG EBOOK GERMINAL ***")
text = text[start_index:end_index]
print(f"Size of the book (as the number of characters): {len(text)}")
text_array = np.array(list(text))

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(text_array)
text_encoded=le.transform(text_array)
print(text_encoded)

Size of the book (as the number of characters): 1011350
[35 63 50 ...  1  1  1]


In [36]:
from sklearn.preprocessing import MinMaxScaler
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset


def create_sequences(data, seq_length):
    sequences = []
    labels=[]
    for i in range(len(data) - seq_length):
        sequence = data[i+seq_length:i:-1]
        sequences.append(sequence)
        labels.append(data[i+seq_length])
    return np.array(sequences),np.array(labels)

seq_length =10 

sequences,labels = create_sequences(text_encoded, seq_length)

train_val_size = int(0.8 * len(sequences))
X_train_val, y_train_val = sequences[:train_val_size],labels[:train_val_size]
X_test,y_test = sequences[train_val_size:],labels[train_val_size:]

train_size = int(0.8 * len(X_train_val))
X_train,y_train=sequences[:train_size],labels[:train_size]
X_val,y_val=sequences[train_size:train_val_size],labels[train_size:train_val_size]

X_train,y_train = torch.tensor(X_train),torch.tensor(y_train)
X_val,y_val = torch.tensor(X_val),torch.tensor(y_val)

torch.manual_seed(1)


batch_size = 32  
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True,drop_last=True)
val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size, shuffle=False,drop_last=True)

In [63]:
import torch.nn as nn


class RNN(nn.Module):
    def __init__(self, token_size: int, embed_dim: int, rnn_hidden_size: int,num_stack_layers:int):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(token_size, embed_dim)
        self.num_stack_layers=num_stack_layers

        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True,num_layers=num_stack_layers)
        self.fc = nn.Linear(rnn_hidden_size, 1)

    def forward(
        self, X: torch.Tensor, h: torch.Tensor, c: torch.Tensor
    ) -> torch.Tensor:
       



        out = self.embedding(X).squeeze(1)
        out,(h,c)= self.rnn(out, (h, c))
        out = self.fc(out[:,-1,:])
        return out,h,c


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

input_size = 1 
hidden_size = 32  
num_epochs = 30
learning_rate = 0.001
num_stack_layers=2
token_size = le.classes_.size
embed_dim = 32



model = RNN(token_size,embed_dim, hidden_size,num_stack_layers)
model = model.to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Entraînement du modèle
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0 

    for inputs, labels in train_loader:
        inputs, labels=inputs.to(device), labels.to(device)
        h0=torch.zeros(num_stack_layers,inputs.size(0),hidden_size)
        c0=torch.zeros(num_stack_layers,inputs.size(0),hidden_size)
        h0,c0 = h0.to(device),c0.to(device)

        output,_,_=model(inputs,h0,c0)


        loss=loss_fn(output.squeeze(1),labels.float())
        total_loss+=loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()            
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader)}')

# Évaluation du modèle
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels=inputs.to(device), labels.to(device)
            h0=torch.zeros(num_stack_layers,inputs.size(0),hidden_size)
            c0=torch.zeros(num_stack_layers,inputs.size(0),hidden_size)
            h0,c0 = h0.to(device),c0.to(device)
            output,_,_=model(inputs,h0,c0)
            loss=criterion(output,labels)
            test_loss+=loss.item()
        print(f'Test Loss: {test_loss/len(val_loader)}')

Epoch [1/30], Loss: 4653.509662650429
Test Loss: 2080.4442312989054
Epoch [2/30], Loss: 4650.185901908531
Test Loss: 1644.7905549882335
Epoch [3/30], Loss: 4650.044820674812
Test Loss: 1225.0638917488388
Epoch [4/30], Loss: 4650.140255711051
Test Loss: 812.2780590057373
Epoch [5/30], Loss: 4650.0000203993695
Test Loss: 642.9327829880051


In [None]:
model.eval()
with torch.no_grad():
    predicted_std=model(X_test_std).numpy()
    predicted_mean=model(X_test_mean).numpy()