<a href="https://colab.research.google.com/github/mounibnasr45/AmiAgri.github.io/blob/main/Untitled15.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import sentencepiece as spm
import numpy as np
import tensorflow as tf

# Load data from article and summary directories
def load_data(article_dir, summary_dir):
    articles, summaries = [], []
    for filename in os.listdir(article_dir):
        article_path = os.path.join(article_dir, filename)
        summary_path = os.path.join(summary_dir, filename)

        if os.path.exists(article_path) and os.path.exists(summary_path):
            # Use 'ISO-8859-1' to avoid decoding errors
            with open(article_path, 'r', encoding='ISO-8859-1') as f:
                article = f.read().strip()
            with open(summary_path, 'r', encoding='ISO-8859-1') as f:
                summary = f.read().strip()

            if article and summary:
                articles.append(article)
                summaries.append(summary)

    return articles, summaries

articles, summaries = load_data(
    r'/content/data/articles',
    r'/content/data/summarize'
)

# Load SentencePiece model for tokenization
sp = spm.SentencePieceProcessor(model_file=r'/content/en_32k.sentencepiece')

# Update the max_length parameter in the tokenize function
def tokenize(text, max_length=256):
    # Encode the text using SentencePiece
    tokenized_input = sp.encode(text, out_type=int)

    # Create a mask (1 for actual tokens, 0 for padding)
    input_length = len(tokenized_input)
    input_mask = np.ones(input_length, dtype=int)  # Initialize mask with 1s

    # Add padding if necessary
    if input_length < max_length:
        padding_length = max_length - input_length
        tokenized_input = np.pad(tokenized_input, (0, padding_length), 'constant', constant_values=0)
        input_mask = np.pad(input_mask, (0, padding_length), 'constant', constant_values=0)

    return tokenized_input, input_mask
def detokenize(ids):
    return sp.decode(ids)
# Encode articles and summaries with the updated max_length
encoded_articles = [tokenize(article) for article in articles]
encoded_summaries = [tokenize(summary) for summary in summaries]

In [3]:
# Function to pad sequences to a fixed length
def pad_sequences(sequences, max_length):
    padded_sequences = []
    for seq in sequences:
        if len(seq) < max_length:
            # Pad with zeros if sequence is shorter than max_length
            padded_seq = np.pad(seq, (0, max_length - len(seq)), mode='constant', constant_values=0)
        else:
            # Truncate the sequence if it's longer than max_length
            padded_seq = seq[:max_length]
        padded_sequences.append(padded_seq)
    return np.array(padded_sequences)

# Convert the data into a Dataset object
def create_tf_dataset(articles, summaries, batch_size=32, buffer_size=10000, max_length=256):
    # Convert lists to tensors with padding
    input_ids = pad_sequences([article[0] for article in articles], max_length)
    input_masks = pad_sequences([article[1] for article in articles], max_length)
    target_ids = pad_sequences([summary[0] for summary in summaries], max_length)
    target_masks = pad_sequences([summary[1] for summary in summaries], max_length)

    # Create a dataset from the tokenized inputs and summaries
    dataset = tf.data.Dataset.from_tensor_slices(((input_ids, input_masks), (target_ids, target_masks)))

    # Shuffle and batch the dataset
    dataset = dataset.shuffle(buffer_size).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

    return dataset

# Example usage
batch_size = 64
max_length = 256  # Set the maximum length for padding
dataset = create_tf_dataset(encoded_articles, encoded_summaries, batch_size=batch_size, max_length=max_length)

# Display one line of the data
for data in dataset.take(1):
    (input_data, target_data) = data
    input_ids, input_masks = input_data
    target_ids, target_masks = target_data

    # Display the first line of input and target data
    print("Input IDs:", input_ids.numpy()[0])   # First line of input ids
    print("Input Masks:", input_masks.numpy()[0])   # First line of input masks
    print("Target IDs:", target_ids.numpy()[0])  # First line of target ids
    print("Target Masks:", target_masks.numpy()[0])  # First line of target masks
    break


Input IDs: [ 1290    63     9  2870     7   223    21  2557  2233 19783  1290    63
     9  1632     8   166   388    12  4234 11220     8 23940  2384  2233
    57 16201   662    18   715     7   804   343  2180  7923    29  8642
  3441  8237     3 25342     3 25618   489  5783 13649 16936   137    37
 24146    52    26   258 13207   112     3     2 19853   357 16235  6441
   540    12  4956  2231    21     8  8926    13     8  6578 28583     5
    37   804    47   885  1019     6    28 12806   511  6677  8642  3441
  8237     3    18     8  3888  4668     3    18  6733    91     8   166
   356     5   299  1290    63     9   808     8   511   356     6   274
     3  4076  4733   223    45     3 24279   323    16     8  2204    52
    12  2054    11  1369     3     9  6177    18 14577     5  1290    63
     9  5899 15627    10    96   196    31    51     3 28004    82  6441
   540    45    48  5892    21     8 28583  8926     6  6055    34   656
   128  1750    12    70  1342     5   9

In [4]:
def data_generator(articles, summaries, batch_size, sp, max_length=512):
    num_samples = len(articles)

    while True:
        for offset in range(0, num_samples, batch_size):
            batch_articles = articles[offset:offset + batch_size]
            batch_summaries = summaries[offset:offset + batch_size]

            # Tokenize the batch data
            tokenized_articles = [tokenize(article, max_length) for article in batch_articles]
            tokenized_summaries = [tokenize(summary, max_length) for summary in batch_summaries]

            # Pad tokenized sequences to ensure uniform shape
            article_inputs = tf.keras.preprocessing.sequence.pad_sequences(
                [t[0] for t in tokenized_articles], maxlen=max_length, padding='post'
            )
            article_masks = tf.keras.preprocessing.sequence.pad_sequences(
                [t[1] for t in tokenized_articles], maxlen=max_length, padding='post'
            )
            summary_inputs = tf.keras.preprocessing.sequence.pad_sequences(
                [t[0] for t in tokenized_summaries], maxlen=max_length, padding='post'
            )
            summary_masks = tf.keras.preprocessing.sequence.pad_sequences(
                [t[1] for t in tokenized_summaries], maxlen=max_length, padding='post'
            )

            # Yield a tuple of inputs and a tuple of outputs
            yield (
                (article_inputs, article_masks),
                (summary_inputs, summary_masks)
            )


In [None]:
import tensorflow as tf

def data_generator(articles, summaries, batch_size, sp, max_length=256):
    num_samples = len(articles)

    while True:
        for offset in range(0, num_samples, batch_size):
            batch_articles = articles[offset:offset + batch_size]
            batch_summaries = summaries[offset:offset + batch_size]

            # Tokenize and ensure consistent max_length
            tokenized_articles = [sp.encode_as_ids(article)[:max_length] for article in batch_articles]
            tokenized_summaries = [sp.encode_as_ids(summary)[:max_length] for summary in batch_summaries]

            # Pad tokenized sequences to ensure uniform shape
            article_inputs = tf.keras.preprocessing.sequence.pad_sequences(
                tokenized_articles, maxlen=max_length, padding='post', truncating='post'
            )
            summary_inputs = tf.keras.preprocessing.sequence.pad_sequences(
                tokenized_summaries, maxlen=max_length, padding='post', truncating='post'
            )

            # Create masks (1 for non-padded tokens, 0 for padded)
            article_masks = tf.cast(article_inputs != 0, tf.int32)
            summary_masks = tf.cast(summary_inputs != 0, tf.int32)

            # Yield a tuple of inputs and a tuple of outputs
            yield (
                (article_inputs, article_masks),
                (summary_inputs, summary_masks)
            )

# Define output signature for the generator
output_signature = (
    (
        tf.TensorSpec(shape=(None, max_length), dtype=tf.int32),  # article inputs
        tf.TensorSpec(shape=(None, max_length), dtype=tf.int32)   # article masks
    ),
    (
        tf.TensorSpec(shape=(None, max_length), dtype=tf.int32),  # summary inputs
        tf.TensorSpec(shape=(None, max_length), dtype=tf.int32)   # summary masks
    )
)

# Convert generator into a Dataset using output_signature
dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(articles, summaries, batch_size, sp, max_length),
    output_signature=output_signature
)

# Compile and train the model
model = transformer_decoder_model(vocab_size=sp.vocab_size())
model.compile(
    optimizer='adam',
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),

    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]  # Add accuracy metric
)

# Define steps per epoch and validation steps
steps_per_epoch = len(articles) // batch_size

# Train the model using the dataset
history = model.fit(
    dataset,
    steps_per_epoch=steps_per_epoch,
    epochs=30
)

# Display training history
print(history.history)  # This will show loss and accuracy after training


Epoch 1/30


In [21]:
from sklearn.model_selection import train_test_split

# Split your data into training and validation sets
train_articles, val_articles, train_summaries, val_summaries = train_test_split(
    articles, summaries, test_size=0.2, random_state=42
)

# Create a training dataset
train_dataset = create_tf_dataset(train_articles, train_summaries, batch_size=batch_size)

# Create a validation dataset
val_dataset = create_tf_dataset(val_articles, val_summaries, batch_size=batch_size)

# Evaluate the model on the validation dataset
val_steps = len(val_articles) // batch_size
evaluation_results = model.evaluate(val_dataset, steps=val_steps)

# Display evaluation metrics
print("Evaluation results:")
for metric, value in zip(model.metrics_names, evaluation_results):
    print(f"{metric}: {value:.4f}")


ValueError: Exception encountered when calling Functional.call().

[1mInvalid input shape for input Tensor("functional_3_1/Cast:0", shape=(None,), dtype=int32). Expected shape (None, None), but input has incompatible shape (None,)[0m

Arguments received by Functional.call():
  • inputs=('tf.Tensor(shape=(None,), dtype=string)', 'tf.Tensor(shape=(None,), dtype=string)')
  • training=False
  • mask=('None', 'None')

In [5]:
from tensorflow.keras import layers

# Transformer model definition
def transformer_decoder_model(vocab_size, d_model=512, num_heads=4, num_layers=16, dff=2048, dropout_rate=0.1):
    inputs = layers.Input(shape=(None,), dtype=tf.int32)
    masks = layers.Input(shape=(None,), dtype=tf.int32)

    # Embedding layer
    embedding = layers.Embedding(vocab_size, d_model)(inputs)

    # Expand mask dimensions to [batch_size, 1, seq_length]
    expanded_masks = layers.Lambda(lambda x: tf.expand_dims(x, axis=1))(masks)

    # Transformer decoder layers
    for _ in range(num_layers):
        attn_output = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate
        )(embedding, embedding, attention_mask=expanded_masks)

        # Residual connection + normalization
        attn_output = layers.LayerNormalization(epsilon=1e-6)(attn_output + embedding)

        # Feedforward network
        ffn_output = layers.Dense(dff, activation='relu')(attn_output)
        ffn_output = layers.Dense(d_model)(ffn_output)
        embedding = layers.LayerNormalization(epsilon=1e-6)(ffn_output + attn_output)

    outputs = layers.Dense(vocab_size)(embedding)
    return tf.keras.Model(inputs=[inputs, masks], outputs=outputs)


In [51]:
import numpy as np

# Tokenizer function (assuming sp is the SentencePiece model instance)
def tokenize(text, max_length=100):
    tokenized_input = sp.encode(text, out_type=int)

    # Pad the input tokens if shorter than max_length
    input_length = len(tokenized_input)
    if input_length < max_length:
        tokenized_input = np.pad(tokenized_input, (0, max_length - input_length), 'constant', constant_values=0)

    # Create a mask (1 for actual tokens, 0 for padding)
    input_mask = np.where(tokenized_input != 0, 1, 0)

    # Reshape to 2D arrays (batch_size=1, sequence_length)
    return np.array(tokenized_input).reshape(1, -1), np.array(input_mask).reshape(1, -1)

# Convert predicted token indices back to a sentence
def tokens_to_sentence(token_indices):
    # Decode token indices back to text using the tokenizer (assuming sp is the SentencePiece model)
    tokens = sp.decode(token_indices)
    return tokens

# Example input text
input_text = """IAAF launches fight against drugs. The IAAF - athletics' world governing body - has met anti-doping officials, coaches, and athletes to co-ordinate the fight against drugs in sport.

Two task forces have been set up to examine doping and nutrition issues. It was also agreed that a programme to "de-mystify" the issue to athletes, the public, and the media was a priority. "Nothing was decided to change things - it was more to have a forum of the stakeholders allowing them to express themselves," said an IAAF spokesman. "Getting everyone together gave us a lot of food for thought." About 60 people attended Sunday's meeting in Monaco, including IAAF chief Lamine Diack and Namibian athlete Frankie Fredericks, now a member of the Athletes' Commission. "I am very happy to see you all, members of the athletics family, respond positively to the IAAF call to sit together and discuss what more we can do in the fight against doping," said Diack. "We are the leading Federation in this field and it is our duty to keep our sport clean." The two task forces will report back to the IAAF Council, at its April meeting in Qatar.
"""

# Tokenize the input sentence
input_seq, input_mask = tokenize(input_text)

# Predict using the model
predicted_logits = model.predict([input_seq, input_mask])

predicted_probs = tf.nn.softmax(predicted_logits, axis=-1)
print("predicted_probs:", predicted_probs[0])
predicted_token_indices = np.argmax(predicted_probs, axis=-1)

print("Predicted Probabilities argmax:", predicted_token_indices)


# Remove padding (if any) from the predicted token indices
predicted_token_indices = predicted_token_indices[0].tolist()
predicted_token_indices = [idx for idx in predicted_token_indices if idx != 0]  # Remove padding (token 0)

# Convert predicted token indices to sentence
predicted_sentence = tokens_to_sentence(predicted_token_indices)

# Output the results
print("Predicted Token Indices:", predicted_token_indices)
print("Predicted Summary:", predicted_sentence)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 151ms/step
predicted_probs: tf.Tensor(
[[3.0360398e-01 9.4068389e-07 2.4188583e-04 ... 8.4235484e-07
  1.0045437e-06 1.0406445e-06]
 [3.0358395e-01 9.4069367e-07 2.4191383e-04 ... 8.4245261e-07
  1.0045846e-06 1.0407039e-06]
 [3.0357921e-01 9.4070771e-07 2.4191040e-04 ... 8.4240253e-07
  1.0046725e-06 1.0406787e-06]
 ...
 [3.0360258e-01 9.4065274e-07 2.4190571e-04 ... 8.4232357e-07
  1.0045371e-06 1.0406496e-06]
 [3.0353776e-01 9.4089864e-07 2.4192478e-04 ... 8.4254543e-07
  1.0047346e-06 1.0408761e-06]
 [3.0361092e-01 9.4059601e-07 2.4189806e-04 ... 8.4229930e-07
  1.0045035e-06 1.0406118e-06]], shape=(276, 32000), dtype=float32)
Predicted Probabilities argmax: [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0