In [1]:
!pip install moviepy openai-whisper transformers torch




In [25]:
!pip install PyPDF2


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [26]:
import os
import whisper
from transformers import pipeline
from moviepy.editor import VideoFileClip

# ---------- 0) استخراج الصوت من الفيديو ----------
def extract_audio_from_video(video_path, audio_path="temp_audio.wav"):
    video = VideoFileClip(video_path)
    video.audio.write_audiofile(audio_path)
    return audio_path

# ---------- 1) تحويل الصوت لنص (Transcript) ----------
def transcribe_audio(file_path):
    model = whisper.load_model("base")  # يمكن استخدام small أو medium
    result = model.transcribe(file_path)  # يتعرف على اللغة أوتوماتيك
    return result["text"]

# ---------- 2) توليد وصف (Description) ----------
def generate_description(text):
    text = clean_text(text)

    # لو النص قصير جدًا، نرجع النص نفسه
    if len(text.split()) < 50:
        return text

    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
    chunks = split_text(text, max_words=200)
    summaries = []
    for chunk in chunks:
        try:
            summary = summarizer(chunk, max_length=40, min_length=10, do_sample=False)
            summaries.append(summary[0]['summary_text'])
        except Exception as e:
            print("⚠️ خطأ في جزء من النص، تم تخطيه:", e)
    return " ".join(summaries)


# ---------- 3) التعامل مع أي نوع محتوى ----------
def process_content(file_path):
    ext = os.path.splitext(file_path)[1].lower()

    if ext in [".mp4", ".mov", ".avi"]:  # فيديو
        print("🎬 محتوى فيديو - جاري استخراج الصوت ...")
        audio_path = extract_audio_from_video(file_path)
        transcript = transcribe_audio(audio_path)
    elif ext in [".mp3", ".wav", ".m4a"]:  # صوت
        print("🎤 محتوى صوت - جاري التحويل لنص ...")
        transcript = transcribe_audio(file_path)
    elif ext in [".txt", ".pdf"]:  # نص/كتاب
        print("📖 محتوى نصي - جاري قراءة الملف ...")
        if ext == ".txt":
            with open(file_path, "r", encoding="utf-8") as f:
                transcript = f.read()
        else:
            # قراءة PDF (بـ PyPDF2)
            import PyPDF2
            pdf_reader = PyPDF2.PdfReader(file_path)
            transcript = ""
            for page in pdf_reader.pages:
                transcript += page.extract_text() + "\n"
    else:
        raise ValueError("الملف غير مدعوم. استخدم فيديو/صوت/نص.")

    description = generate_description(transcript)
    return transcript, description

# ---------- 4) مثال على الاستخدام ----------
if __name__ == "__main__":
    file_path = "Lecture 1.pdf"  # ضع هنا الملف: فيديو، صوت، نص
    transcript, description = process_content(file_path)

    print("\n--- Transcript ---\n", transcript)
    print("\n--- Description ---\n", description)


📖 محتوى نصي - جاري قراءة الملف ...


Device set to use cpu



--- Transcript ---
 Introduction to Artificial 
Intelligence
 الذكاءالصناعى واألنظمة الخبيرة
First Term 2022 -2023
Dr. Hany El-Ghaish
1
Course
Info
●Course Title: Artificial Intelligence and Expert
Systems
●Course Code: CCE3219
●Grading:
–Term work: 30Points
–Practical/Oral : 30Points
–Final Exam: 90Points
●Lecture: 4hours
●Section: 2hours
●TA: Eng. Sondos
2
Text Book

Lecture 1  
Introduction To AI
4

What is AI?

Thinking humanly
•Cognitive science: the brain as an information  
processing machine
•Requires scientific theories of how the brain works
•How to understand cognition as a  
computational process?
•Introspection: try to think about how we think
•Predict and test behavior of human subjects
•Image the brain, examine neurological data
•The latter two methodologies are the domains  
of cognitive science and cognitive neuroscience
•Turing (1950) "Computing machinery and intelligence"
•The Turing Test
•What capabilities would a computer need to have to pass  
the Turing Test?
•N

In [12]:
!pip install yt-dlp openai-whisper transformers torch



Collecting yt-dlp
  Downloading yt_dlp-2025.9.5-py3-none-any.whl.metadata (177 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/177.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m174.1/177.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.1/177.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Downloading yt_dlp-2025.9.5-py3-none-any.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yt-dlp
Successfully installed yt-dlp-2025.9.5


In [19]:
import yt_dlp
import whisper
from transformers import pipeline
import re

# ---------- 1) تحميل الصوت من YouTube ----------
def download_youtube_audio(url):
    output_file = "youtube_audio"  # بدون امتداد
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': output_file,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'quiet': False
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

    final_file = output_file + ".mp3"
    print(f"✅ تم تحميل الصوت: {final_file}")
    return final_file

# ---------- 2) تحويل الصوت لنص ----------
def transcribe_audio(file_path):
    model = whisper.load_model("base")
    result = model.transcribe(file_path)
    return result["text"]

# ---------- 3) تنظيف النص ----------
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^ء-يA-Za-z0-9 .,!?]', '', text)
    return text.strip()

# ---------- 4) تقسيم النص الطويل ----------
def split_text(text, max_words=200):
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_words):
        chunks.append(" ".join(words[i:i+max_words]))
    return chunks

# ---------- 5) توليد وصف آمن ----------
def generate_description(text):
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
    text = clean_text(text)
    chunks = split_text(text, max_words=200)
    summaries = []
    for chunk in chunks:
        try:
            summary = summarizer(chunk, max_length=40, min_length=10, do_sample=False)
            summaries.append(summary[0]['summary_text'])
        except Exception as e:
            print("⚠️ خطأ في جزء من النص، تم تخطيه:", e)
    return " ".join(summaries)

# ---------- 6) التنفيذ ----------
if __name__ == "__main__":
    youtube_url = "https://www.youtube.com/shorts/9IZOLy-Hz4c"  # أي رابط فيديو أو Shorts
    audio_file = download_youtube_audio(youtube_url)

    print("\n🎤 جاري تحويل الصوت لنص ...")
    transcript = transcribe_audio(audio_file)
    transcript = clean_text(transcript)
    print("\n--- Transcript ---\n", transcript)

    print("\n📝 جاري توليد الوصف ...")
    description = generate_description(transcript)
    print("\n--- Description ---\n", description)


[youtube] Extracting URL: https://www.youtube.com/shorts/9IZOLy-Hz4c
[youtube] 9IZOLy-Hz4c: Downloading webpage
[youtube] 9IZOLy-Hz4c: Downloading tv simply player API JSON
[youtube] 9IZOLy-Hz4c: Downloading tv client config
[youtube] 9IZOLy-Hz4c: Downloading tv player API JSON
[info] 9IZOLy-Hz4c: Downloading 1 format(s): 251
[download] Sleeping 3.00 seconds as required by the site...
[download] Destination: youtube_audio
[download] 100% of  295.71KiB in 00:00:00 at 548.26KiB/s 
[ExtractAudio] Destination: youtube_audio.mp3
Deleting original file youtube_audio (pass -k to keep)
✅ تم تحميل الصوت: youtube_audio.mp3

🎤 جاري تحويل الصوت لنص ...






--- Transcript ---
 Mark my words. AI is far more dangerous than nukes rock paper scissors Shoot I won this is a good beginning with my plan to dominate the human race

📝 جاري توليد الوصف ...


Device set to use cpu
Your max_length is set to 40, but your input_length is only 33. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)



--- Description ---
  AI is far more dangerous than nukes rock paper scissors . Mark my words. Mark my . words. 'I'm going to dominate the human race'
