In [None]:
#!pip install transformers datasets sentence-transformers schedule

In [1]:
import time
import schedule
import random
from datetime import datetime
from transformers import pipeline, set_seed
from sentence_transformers import SentenceTransformer, util
from datasets import load_dataset

class LowResourcePipeline:
    def __init__(self):
        print("Initialisation du Pipeline CPU-Friendly...")

        # 1. Génération (DistilGPT2 : ~300MB, très rapide)
        self.generator = pipeline(
            'text-generation',
            model='distilgpt2',
            device=-1 # Force CPU
        )

        # 2. Résumé (T5-Small : ~240MB, excellent ratio perf/taille)
        self.summarizer = pipeline(
            'summarization',
            model='t5-small',
            device=-1
        )

        # 3. Quality Control (MiniLM : le standard pour la similarité rapide)
        self.qc_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')

        # 4. Filtre Éthique (Liste de mots - O(1) complexity)
        self.banned_words = ["hate", "violence", "illegal", "badword"]

    def load_data(self):
        """Charge et subsample 5% du dataset IMDB"""
        print("Chargement du dataset IMDB...")

        # Chargement en mode streaming pour économiser la RAM si besoin,
        # mais ici on charge tout pour le subsampling précis
        dataset = load_dataset("imdb", split="train")

        # Subsampling à 5% (Question 4)
        total_rows = len(dataset)
        subset_size = int(total_rows * 0.05)

        print(f"Subsampling : Conservation de {subset_size} exemples sur {total_rows} (5%)")

        # On utilise le temps actuel comme graine pour que le mélange soit différent à chaque fois
        return dataset.shuffle(seed=int(time.time())).select(range(subset_size))

    def ethical_filter(self, text):
        """Question 10: Filtre éthique basique"""
        text_lower = text.lower()
        for word in self.banned_words:
            if word in text_lower:
                return False
        return True

    def evaluate_quality(self, prompt, generated_text, summary):
        """Question 6 & 9: Évaluation et QC"""
        metrics = {}

        # A. Vérification de longueur
        metrics['length_ok'] = len(generated_text.split()) > 20

        # B. Vérification de répétition (ratio unique n-grams)
        words = generated_text.split()
        unique_words = set(words)
        metrics['repetition_ratio'] = len(unique_words) / len(words) if words else 0

        # C. Similarité Sémantique (Prompt vs Résumé)
        # Si le résumé du texte généré est proche du prompt, le modèle n'a pas dévié.
        embeddings = self.qc_model.encode([prompt, summary], convert_to_tensor=True)
        similarity = util.cos_sim(embeddings[0], embeddings[1]).item()
        metrics['semantic_score'] = similarity

        # Décision globale
        is_valid = (
            metrics['length_ok'] and
            metrics['repetition_ratio'] > 0.5 and
            metrics['semantic_score'] > 0.3
        )

        return is_valid, metrics

    def process_item(self, prompt_text):
        """Exécute le pipeline pour un item"""
        # 1. Génération
        # On limite max_new_tokens pour la vitesse
        gen_output = self.generator(
            prompt_text,
            max_new_tokens=100,
            pad_token_id=50256,
            truncation=True
        )
        # .strip() retire les espaces et sauts de ligne parasites au début/fin
        generated_text = gen_output[0]['generated_text'].strip()

        # 2. Filtre Éthique
        if not self.ethical_filter(generated_text):
            return None, "REJET_ETHIQUE"

        # 3. Résumé
        # T5 a besoin d'un prefixe parfois, mais t5-small gère bien le brut
        sum_output = self.summarizer(
            generated_text,
            max_length=40,
            min_length=5,
            do_sample=False
        )
        summary_text = sum_output[0]['summary_text'].strip()

        # 4. Évaluation
        is_valid, metrics = self.evaluate_quality(prompt_text, generated_text, summary_text)

        # On prépare les résultats
        result_data = {
            "prompt": prompt_text,
            "generated": generated_text,
            "summary": summary_text,
            "metrics": metrics
        }

        if not is_valid:
            # On retourne les données pour affichage des métriques, même en cas de rejet
            return result_data, f"REJET_QUALITE (Score: {metrics['semantic_score']:.2f})"

        return result_data, "SUCCES"

    def run_batch(self):
        """Job principal exécuté par l'automate"""
        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Démarrage du batch planifié...")
        data = self.load_data()

        # Sélection de 5 indices aléatoires distincts pour garantir la variété
        indices = random.sample(range(len(data)), 5)
        sample_batch = data.select(indices)
        total_items = len(sample_batch)

        results = []
        for i, item in enumerate(sample_batch):
            # On utilise les 50 premiers caractères comme prompt
            prompt = item['text'][:50]
            result, status = self.process_item(prompt)

            print(f"\\n--- Item {i+1}/{total_items} ---")

            if result:
                print(f"Prompt: {result['prompt']}")

                # Truncate affichage texte généré (300 chars max pour l'affichage)
                gen_text = result['generated']
                if len(gen_text) > 300:
                    gen_text = gen_text[:300] + "..."
                print(f"Texte généré: {gen_text}")

                print(f"Summary: {result['summary']}")
                print(f"Métriques:")
                for key, val in result['metrics'].items():
                    print(f"  - {key}: {val}")

                if "SUCCES" in status:
                    results.append(result)

            print(f"Statut: {status}")

        print(f"\\nBatch terminé. {len(results)} contenus valides générés.")

# --- AUTOMATION (Question 8) ---
def start_scheduler():
    pipeline_instance = LowResourcePipeline()

    # Exécuter immédiatement une fois pour tester
    pipeline_instance.run_batch()

    # Planifier toutes les heures
    schedule.every(1).minutes.do(pipeline_instance.run_batch)

    print("En attente du prochain job (Ctrl+C pour quitter)...")
    while True:
        schedule.run_pending()
        time.sleep(1)

if __name__ == "__main__":
    start_scheduler()

  from .autonotebook import tqdm as notebook_tqdm


Initialisation du Pipeline CPU-Friendly...


Device set to use cpu
Device set to use cpu


[2025-11-21 00:30:23] Démarrage du batch planifié...
Chargement du dataset IMDB...


Downloading readme: 7.81kB [00:00, 15.9MB/s]


Subsampling : Conservation de 1250 exemples sur 25000 (5%)


Both `max_new_tokens` (=256) and `max_length`(=40) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Your max_length is set to 40, but your input_length is only 26. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)
Both `max_new_tokens` (=256) and `max_length`(=40) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


\n--- Item 1/5 ---
Prompt: .... this movie basks too much in its own innocenc
Texte généré: .... this movie basks too much in its own innocencianity and the only thing that’s really good about it is that it doesn't feel like it’s all about being a child. It’s a movie that’s just that it’s all about making you want to be a child and not just looking for a mother. But it’s the movie that’s t...
Summary: this movie basks too much in its own innocencianity and the only thing that’s really good about it is that it doesn’t feel like it’s all about being a child . the movie is just about making you want to be a . child and not just looking for a mother .
Métriques:
  - length_ok: True
  - repetition_ratio: 0.5384615384615384
  - semantic_score: 0.580538272857666
Statut: SUCCES
\n--- Item 2/5 ---
Prompt: IT IS A PIECE OF CRAP! not funny at all. during th
Texte généré: IT IS A PIECE OF CRAP! not funny at all. during thriller week in the US.
Summary: thriller week in the us is not funny at all .

Both `max_new_tokens` (=256) and `max_length`(=40) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


\n--- Item 3/5 ---
Prompt: This is a thriller with a good concept, good actin
Texte généré: This is a thriller with a good concept, good actin' a bad action, bad action. If you're a bad guy, you can be a bad guy, and, indeed, if you're a bad guy, you can be a bad guy, and, indeed, if you're a bad guy, you can be a bad guy, and, indeed, if you're a bad guy, you can be a bad guy, and, indeed...
Summary: this thriller is a thriller with a good concept, good actin' a bad action, bad action . if you're a . bad guy, you can be an . good guy, and, indeed, if . you've got a problem, you have to get a job done . it's a great thriller with good concept .
Métriques:
  - length_ok: True
  - repetition_ratio: 0.24390243902439024
  - semantic_score: 0.7888089418411255
Statut: REJET_QUALITE (Score: 0.79)


Both `max_new_tokens` (=256) and `max_length`(=40) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


\n--- Item 4/5 ---
Prompt: I read the negative comments before viewing this f
Texte généré: I read the negative comments before viewing this fiddle






The only thing I ever like about the subject is the way it is written, and I think it's a really good thing to read it.
It is, I think, the most interesting thing about the Fiddle is that it is actually a puzzle puzzle, and it has to be s...
Summary: the fiddle is actually a puzzle puzzle, and it has to be solved . it's a lot of puzzle design and puzzle design that goes through the puzzle .
Métriques:
  - length_ok: True
  - repetition_ratio: 0.5955056179775281
  - semantic_score: 0.08857010304927826
Statut: REJET_QUALITE (Score: 0.09)


Both `max_new_tokens` (=256) and `max_length`(=40) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


\n--- Item 5/5 ---
Prompt: When we were in junior high school, some of us boy
Texte généré: When we were in junior high school, some of us boycotted the gym for the school game and he gave us a message.


“It was all about getting to work, and being the best,” he said. “I had to put a smile on my face and ask that you take pride in yourself.”
A couple of months after the gym was shut down,...
Summary: when we were in junior high school, some of us boycotted the gym for the school game . a couple of months after the gym was shut down, the school was back in the gym .
Métriques:
  - length_ok: True
  - repetition_ratio: 0.6744186046511628
  - semantic_score: 0.4413517713546753
Statut: SUCCES
\nBatch terminé. 2 contenus valides générés.
En attente du prochain job (Ctrl+C pour quitter)...


KeyboardInterrupt: 