In [None]:
# Installer les dépendances (à lancer une seule fois dans Colab)
!pip install flask flask-ngrok pyngrok transformers pillow torch torchvision torchaudio opencv-python moviepy gtts git+https://github.com/openai/whisper.git --quiet

# Import des modules nécessaires
from flask import Flask, request, send_file, jsonify
from pyngrok import ngrok
from threading import Thread
import os
import shutil
import cv2
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, BartTokenizer, BartForConditionalGeneration
import whisper
from moviepy.editor import VideoFileClip, ImageSequenceClip, AudioFileClip
from gtts import gTTS

# Create the Flask app
app = Flask(__name__)

# Use the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the models once at start-up; the functions below use these module-level objects.
# BLIP (processor + model) captions frames, Whisper transcribes audio, BART summarises text.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
# NOTE(review): `device` is not passed to Whisper here, so it uses its own default — confirm it matches.
whisper_model = whisper.load_model("base")
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(device)

def extract_frames(video_path, fps_extract=1):
    """Save frames sampled from a video as JPEG files under ``frames/``.

    Frames are written as ``frames/frame_0000.jpg``, ``frames/frame_0001.jpg``, ...

    Args:
        video_path: Path of the video handed to ``cv2.VideoCapture``.
        fps_extract: Number of frames to keep per second of video (default 1).

    Returns:
        The number of frames written (0 when no frame could be read).
    """
    os.makedirs("frames", exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    try:
        native_fps = cap.get(cv2.CAP_PROP_FPS)
        # The reported FPS can be 0, NaN or below 1; a step of 0 would make the
        # modulo below raise ZeroDivisionError, so fall back to keeping every frame.
        if 0 < native_fps < float("inf") and fps_extract > 0:
            step = max(1, round(native_fps / fps_extract))
        else:
            step = 1
        count, saved = 0, 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if count % step == 0:
                cv2.imwrite(f"frames/frame_{saved:04d}.jpg", frame)
                saved += 1
            count += 1
    finally:
        # Release the capture even if reading or writing a frame fails.
        cap.release()
    return saved

def generate_captions(num_frames):
    """Caption the first ``num_frames`` extracted frames with the BLIP model.

    Reads ``frames/frame_0000.jpg`` onwards and returns one caption string per
    frame, in frame order.
    """
    def describe(frame_index):
        # Caption a single frame: load as RGB, run BLIP, decode the first sequence.
        image = Image.open(f"frames/frame_{frame_index:04d}.jpg").convert("RGB")
        model_inputs = processor(images=image, return_tensors="pt").to(device)
        token_ids = blip_model.generate(**model_inputs)
        return processor.tokenizer.decode(token_ids[0], skip_special_tokens=True)

    return [describe(frame_index) for frame_index in range(num_frames)]

def transcribe_audio(video_path):
    """Transcribe the audio track of a video with the Whisper model.

    The audio is first written to ``audio.wav`` and that file is transcribed.

    Args:
        video_path: Path of the video file to read.

    Returns:
        The transcribed text, or an empty string when the video has no audio track.
    """
    audio_path = "audio.wav"
    clip = VideoFileClip(video_path)
    try:
        if clip.audio is None:
            # Silent video: nothing to transcribe, and calling
            # write_audiofile on None would raise AttributeError.
            return ""
        clip.audio.write_audiofile(audio_path, verbose=False, logger=None)
    finally:
        # Close the clip so the handle on the source video is not leaked.
        clip.close()
    result = whisper_model.transcribe(audio_path)
    return result["text"]

def summarize_text(transcription, captions):
    """Summarise the transcription and the frame captions with BART.

    Both sources are merged into one document, tokenised (truncated to 1024
    tokens) and summarised with beam search; the first generated sequence is
    returned as text.
    """
    document = "".join(
        ("Transcription: ", transcription, "\n", "Visual description: ", " ".join(captions))
    )
    token_ids = bart_tokenizer.encode(
        document, return_tensors="pt", max_length=1024, truncation=True
    )
    generation_options = dict(max_length=150, min_length=40, length_penalty=2.0, num_beams=4)
    summary_ids = bart_model.generate(token_ids.to(device), **generation_options)
    best_sequence = summary_ids[0]
    return bart_tokenizer.decode(best_sequence, skip_special_tokens=True)

def create_summary_video(summary, num_frames=10):
    """Build a slideshow of extracted frames narrated by a spoken version of ``summary``.

    The text is synthesised with gTTS into ``summary_audio.mp3``. Up to
    ``num_frames`` JPEG files from ``frames/`` (sorted by filename) are each
    shown for an equal share of the narration's duration.

    Args:
        summary: Text to convert to speech.
        num_frames: Maximum number of frames to include (default 10).

    Returns:
        The path of the written video, ``video_resumee_synced.mp4``.

    Raises:
        Exception: When ``frames/`` contains no ``.jpg`` file to use.
    """
    # Check the frames first so no speech is synthesised when there is nothing to show.
    frame_files = sorted(
        os.path.join("frames", f) for f in os.listdir("frames") if f.endswith(".jpg")
    )[:num_frames]
    if not frame_files:
        raise Exception("Aucune image trouvée.")

    tts_path = "summary_audio.mp3"
    gTTS(text=summary, lang="en").save(tts_path)

    output_path = "video_resumee_synced.mp4"
    audio_clip = AudioFileClip(tts_path)
    try:
        frame_duration = audio_clip.duration / len(frame_files)
        video_clip = ImageSequenceClip(frame_files, durations=[frame_duration] * len(frame_files))
        try:
            final_clip = video_clip.set_audio(audio_clip)
            final_clip.write_videofile(output_path, fps=1, codec="libx264", verbose=False, logger=None)
        finally:
            video_clip.close()
    finally:
        # Close the audio reader so the temporary mp3 is no longer held open
        # when it is deleted afterwards.
        audio_clip.close()
    return output_path

def cleanup():
    """Delete the ``frames`` directory and the temporary audio files, if present."""
    shutil.rmtree("frames", ignore_errors=True)
    for leftover in ("audio.wav", "summary_audio.mp3"):
        if not os.path.exists(leftover):
            continue
        os.remove(leftover)

def generate_summary(video_path):
    """Run the whole pipeline on one video and return the summary video's path.

    Steps: extract frames, caption them, transcribe the audio, summarise both
    texts, then build the narrated summary video from at most 10 frames.

    Temporary files are removed whether the pipeline succeeds or fails, so a
    failed run does not leave stale frames or audio behind for the next one.
    """
    try:
        num_frames = extract_frames(video_path)
        captions = generate_captions(num_frames)
        transcription = transcribe_audio(video_path)
        summary = summarize_text(transcription, captions)
        return create_summary_video(summary, num_frames=min(num_frames, 10))
    finally:
        cleanup()

# Route to upload a video and receive the summary video in response
@app.route('/upload_video', methods=['POST'])
def upload_video():
    """Accept a multipart upload under the ``file`` field and reply with its summary video.

    Returns a JSON error with status 400 when no usable file is supplied, a JSON
    error with status 500 carrying the exception message when saving or
    summarising fails, and otherwise the generated MP4.
    """
    if 'file' not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files['file']
    if not file.filename:
        return jsonify({"error": "No selected file"}), 400

    # The filename is chosen by the client: keep only its last path component
    # so it cannot point outside the working directory.
    safe_name = os.path.basename(file.filename.replace("\\", "/"))
    if not safe_name:
        return jsonify({"error": "No selected file"}), 400
    video_path = f"uploaded_{safe_name}"

    try:
        # Saving is inside the try so a write failure also yields the JSON error.
        file.save(video_path)
        summary_video_path = generate_summary(video_path)
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    finally:
        # The uploaded source is only needed while the summary is generated.
        if os.path.exists(video_path):
            os.remove(video_path)

    return send_file(summary_video_path, mimetype='video/mp4')

@app.route("/", methods=["GET"])
def index():
    """Return a small JSON payload confirming the API is up and naming the upload endpoint."""
    payload = {"message": "API fonctionne", "endpoint": "/upload_video"}
    return jsonify(payload)

# Start ngrok to expose the API publicly
def start_ngrok():
    """Open an ngrok tunnel to local port 5000 and print its public URL.

    The authtoken is never stored in the notebook. It is read from the
    ``NGROK_AUTH_TOKEN`` or ``NGROK_AUTHTOKEN`` environment variable and passed
    to pyngrok explicitly. When neither is set, no token is configured here and
    ngrok is left to use whatever it already has.
    """
    # NOTE(review): only placing the token in os.environ["NGROK_AUTH_TOKEN"]
    # does not appear to configure pyngrok, hence the explicit call below —
    # confirm against the pyngrok version in use.
    token = os.environ.get("NGROK_AUTH_TOKEN") or os.environ.get("NGROK_AUTHTOKEN")
    if token:
        ngrok.set_auth_token(token)
    public_url = ngrok.connect(5000)
    print("✅ API publique ngrok :", public_url)

def start_flask():
    """Run the Flask development server on port 5000.

    The port is stated explicitly because the ngrok tunnel forwards to port
    5000, so the two must stay in agreement.
    """
    app.run(port=5000)

# Launch ngrok and Flask together: open the tunnel first, then run the Flask
# server in a background thread so this cell returns instead of blocking.
start_ngrok()
Thread(target=start_flask).start()


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Using device: cpu
✅ API publique ngrok : NgrokTunnel: "https://87d3-34-150-210-96.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'


SyntaxError: invalid syntax (<ipython-input-2-9813a36c5caa>, line 1)

In [None]:
# Installer les dépendances (à faire une seule fois)
!pip install flask transformers pillow torch torchvision torchaudio opencv-python moviepy gtts git+https://github.com/openai/whisper.git --quiet

# Imports
from flask import Flask, request, send_file, jsonify
import os
import shutil
import cv2
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, BartTokenizer, BartForConditionalGeneration
import whisper
from moviepy.editor import VideoFileClip, ImageSequenceClip, AudioFileClip
from gtts import gTTS

# Create the Flask app
app = Flask(__name__)

# Use the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the models once; the functions below use these module-level objects.
# BLIP (processor + model) captions frames, Whisper transcribes audio, BART summarises text.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
# NOTE(review): `device` is not passed to Whisper here, so it uses its own default — confirm it matches.
whisper_model = whisper.load_model("base")
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(device)

# Frame extraction (one frame per second of video by default)
def extract_frames(video_path, fps_extract=1):
    """Save frames sampled from a video as JPEG files under ``frames/``.

    Frames are written as ``frames/frame_0000.jpg``, ``frames/frame_0001.jpg``, ...

    Args:
        video_path: Path of the video handed to ``cv2.VideoCapture``.
        fps_extract: Number of frames to keep per second of video (default 1).

    Returns:
        The number of frames written (0 when no frame could be read).
    """
    os.makedirs("frames", exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    try:
        native_fps = cap.get(cv2.CAP_PROP_FPS)
        # The reported FPS can be 0, NaN or below 1; a step of 0 would make the
        # modulo below raise ZeroDivisionError, so fall back to keeping every frame.
        if 0 < native_fps < float("inf") and fps_extract > 0:
            step = max(1, round(native_fps / fps_extract))
        else:
            step = 1
        count, saved = 0, 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if count % step == 0:
                cv2.imwrite(f"frames/frame_{saved:04d}.jpg", frame)
                saved += 1
            count += 1
    finally:
        # Release the capture even if reading or writing a frame fails.
        cap.release()
    return saved

# Generate a caption for each frame
def generate_captions(num_frames):
    """Caption the first ``num_frames`` extracted frames with the BLIP model.

    Reads ``frames/frame_0000.jpg`` onwards and returns one caption string per
    frame, in frame order.
    """
    def describe(frame_index):
        # Caption a single frame: load as RGB, run BLIP, decode the first sequence.
        image = Image.open(f"frames/frame_{frame_index:04d}.jpg").convert("RGB")
        model_inputs = processor(images=image, return_tensors="pt").to(device)
        token_ids = blip_model.generate(**model_inputs)
        return processor.tokenizer.decode(token_ids[0], skip_special_tokens=True)

    return [describe(frame_index) for frame_index in range(num_frames)]

# Transcribe the video's audio with Whisper
def transcribe_audio(video_path):
    """Transcribe the audio track of a video with the Whisper model.

    The audio is first written to ``audio.wav`` and that file is transcribed.

    Args:
        video_path: Path of the video file to read.

    Returns:
        The transcribed text, or an empty string when the video has no audio track.
    """
    audio_path = "audio.wav"
    clip = VideoFileClip(video_path)
    try:
        if clip.audio is None:
            # Silent video: nothing to transcribe, and calling
            # write_audiofile on None would raise AttributeError.
            return ""
        clip.audio.write_audiofile(audio_path, verbose=False, logger=None)
    finally:
        # Close the clip so the handle on the source video is not leaked.
        clip.close()
    result = whisper_model.transcribe(audio_path)
    return result["text"]

# Summarise the text with BART
def summarize_text(transcription, captions):
    """Summarise the transcription and the frame captions with BART.

    Both sources are merged into one document, tokenised (truncated to 1024
    tokens) and summarised with beam search; the first generated sequence is
    returned as text.
    """
    document = "".join(
        ("Transcription: ", transcription, "\n", "Visual description: ", " ".join(captions))
    )
    token_ids = bart_tokenizer.encode(
        document, return_tensors="pt", max_length=1024, truncation=True
    )
    generation_options = dict(max_length=150, min_length=40, length_penalty=2.0, num_beams=4)
    summary_ids = bart_model.generate(token_ids.to(device), **generation_options)
    best_sequence = summary_ids[0]
    return bart_tokenizer.decode(best_sequence, skip_special_tokens=True)

# Build the summary video with a text-to-speech audio track
def create_summary_video(summary, num_frames=10):
    """Build a slideshow of extracted frames narrated by a spoken version of ``summary``.

    The text is synthesised with gTTS into ``summary_audio.mp3``. Up to
    ``num_frames`` JPEG files from ``frames/`` (sorted by filename) are each
    shown for an equal share of the narration's duration.

    Args:
        summary: Text to convert to speech.
        num_frames: Maximum number of frames to include (default 10).

    Returns:
        The path of the written video, ``video_resumee_synced.mp4``.

    Raises:
        Exception: When ``frames/`` contains no ``.jpg`` file to use.
    """
    # Check the frames first so no speech is synthesised when there is nothing to show.
    frame_files = sorted(
        os.path.join("frames", f) for f in os.listdir("frames") if f.endswith(".jpg")
    )[:num_frames]
    if not frame_files:
        raise Exception("No frames found.")

    tts_path = "summary_audio.mp3"
    gTTS(text=summary, lang="en").save(tts_path)

    output_path = "video_resumee_synced.mp4"
    audio_clip = AudioFileClip(tts_path)
    try:
        frame_duration = audio_clip.duration / len(frame_files)
        video_clip = ImageSequenceClip(frame_files, durations=[frame_duration] * len(frame_files))
        try:
            final_clip = video_clip.set_audio(audio_clip)
            final_clip.write_videofile(output_path, fps=1, codec="libx264", verbose=False, logger=None)
        finally:
            video_clip.close()
    finally:
        # Close the audio reader so the temporary mp3 is no longer held open
        # when it is deleted afterwards.
        audio_clip.close()
    return output_path

# Remove temporary files
def cleanup():
    """Delete the ``frames`` directory and the temporary audio files, if present."""
    shutil.rmtree("frames", ignore_errors=True)
    for leftover in ("audio.wav", "summary_audio.mp3"):
        if not os.path.exists(leftover):
            continue
        os.remove(leftover)

# Main video-summary function
def generate_summary(video_path):
    """Run the whole pipeline on one video and return the summary video's path.

    Steps: extract frames, caption them, transcribe the audio, summarise both
    texts, then build the narrated summary video from at most 10 frames.

    Temporary files are removed whether the pipeline succeeds or fails, so a
    failed run does not leave stale frames or audio behind for the next one.
    """
    try:
        num_frames = extract_frames(video_path)
        captions = generate_captions(num_frames)
        transcription = transcribe_audio(video_path)
        summary = summarize_text(transcription, captions)
        return create_summary_video(summary, num_frames=min(num_frames, 10))
    finally:
        cleanup()

# Upload route: video in -> summary video out
@app.route('/upload_video', methods=['POST'])
def upload_video():
    """Accept a multipart upload under the ``file`` field and reply with its summary video.

    Returns a JSON error with status 400 when no usable file is supplied, a JSON
    error with status 500 carrying the exception message when saving or
    summarising fails, and otherwise the generated MP4.
    """
    if 'file' not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files['file']
    if not file.filename:
        return jsonify({"error": "No selected file"}), 400

    # The filename is chosen by the client: keep only its last path component
    # so it cannot point outside the working directory.
    safe_name = os.path.basename(file.filename.replace("\\", "/"))
    if not safe_name:
        return jsonify({"error": "No selected file"}), 400
    video_path = f"uploaded_{safe_name}"

    try:
        # Saving is inside the try so a write failure also yields the JSON error.
        file.save(video_path)
        summary_video_path = generate_summary(video_path)
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    finally:
        # The uploaded source is only needed while the summary is generated.
        if os.path.exists(video_path):
            os.remove(video_path)

    return send_file(summary_video_path, mimetype='video/mp4')

@app.route("/", methods=["GET"])
def index():
    """Return a small JSON payload confirming the API is up and naming the upload endpoint."""
    payload = {"message": "API fonctionne", "endpoint": "/upload_video"}
    return jsonify(payload)

# Start the Flask server on port 5000, listening on all network interfaces.
# NOTE(review): app.run() blocks, so when this runs as a notebook cell no later
# cell can execute until the server is interrupted — confirm this is intended.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Using device: cpu
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


In [None]:
!ngrok authtoken 2rRtD0JOoMbRrKsgLY5gIsC418i_3gtqQqcw6Z4QimS2uS8R5


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
