In [26]:
import os
import torch
from moviepy import VideoFileClip
from transformers import pipeline, AutoTokenizer
import whisper
from ultralytics import YOLO
import cv2
from multiprocessing import Pool

In [27]:
# Choose the compute device: the GPU when PyTorch reports CUDA support, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [28]:
# Extract audio and frames
def extract_audio_and_frames(video_path, frame_rate=1, audio_path="audio.wav"):
    """Write a video's audio track to disk and sample still frames from it.

    Args:
        video_path: Path of the video file to open with MoviePy.
        frame_rate: Sampling interval in whole seconds; one frame is grabbed at
            t = 0, frame_rate, 2 * frame_rate, ... up to (not including)
            ``int(clip.duration)``. Must be a positive integer.
        audio_path: Destination file for the extracted audio track.

    Returns:
        Tuple ``(audio_path, frames)`` where ``frames`` is the list of values
        returned by ``clip.get_frame`` for each sampled timestamp.

    Raises:
        ValueError: If ``frame_rate`` is not positive, or the video has no
            audio track.
    """
    # Reject a bad interval up front, before any file is opened or written.
    if frame_rate <= 0:
        raise ValueError(f"frame_rate must be a positive integer, got {frame_rate!r}")

    clip = VideoFileClip(video_path)
    try:
        # MoviePy exposes no audio object when the container has no audio stream.
        if clip.audio is None:
            raise ValueError(f"No audio track found in {video_path!r}")
        clip.audio.write_audiofile(audio_path)
        frames = [
            clip.get_frame(t)
            for t in range(0, int(clip.duration), frame_rate)
        ]
    finally:
        # Always release the underlying reader, even if extraction fails midway.
        clip.close()
    return audio_path, frames

In [29]:
# Transcribe audio using Whisper
def transcribe_audio(audio_path):
    """Transcribe ``audio_path`` with the Whisper "base" model and return the text.

    The model is loaded on every call and moved to the notebook-level
    ``device`` before transcription.
    """
    whisper_model = whisper.load_model("base")
    whisper_model = whisper_model.to(device)
    transcription = whisper_model.transcribe(audio_path)
    return transcription['text']

In [30]:
# Detect explicit language
def detect_explicit_language(
    text,
    model_name="bhadresh-savani/distilbert-base-uncased-emotion",
    explicit_label="EXPLICIT",
    max_length=512,
):
    """Classify ``text`` chunk by chunk and return the results labelled explicit.

    The text is tokenized once with the classifier's own tokenizer, cut into
    chunks that fit the model's input window, and each chunk is classified
    separately.

    Args:
        text: Transcript to scan. Empty or whitespace-only text yields ``[]``
            without loading any model.
        model_name: Hugging Face checkpoint used for the text-classification
            pipeline.
        explicit_label: Label value that marks a chunk as explicit.
        max_length: Maximum tokens per model input, special tokens included.

    Returns:
        List of classifier result dicts whose ``'label'`` equals
        ``explicit_label``.

    Raises:
        ValueError: If ``max_length`` leaves no room for content tokens.
    """
    if not text or not text.strip():
        return []

    # NOTE(review): the default checkpoint is, by its name, an emotion
    # classifier; confirm it can ever emit the 'EXPLICIT' label, otherwise this
    # function always returns an empty list. Pass model_name/explicit_label to
    # use a purpose-built classifier.
    # Use the notebook-level device rather than a hard-coded GPU index so the
    # function also works on CPU-only machines.
    classifier = pipeline("text-classification", model=model_name, device=device)
    # Measure chunk lengths with the same tokenizer the model will use.
    tokenizer = classifier.tokenizer

    # Reserve room for the special tokens the pipeline adds to every input.
    chunk_size = max_length - tokenizer.num_special_tokens_to_add()
    if chunk_size <= 0:
        raise ValueError(f"max_length={max_length!r} leaves no room for content tokens")

    token_ids = tokenizer(text, add_special_tokens=False, truncation=False)["input_ids"]

    explicit_results = []
    for start in range(0, len(token_ids), chunk_size):
        chunk_text = tokenizer.decode(
            token_ids[start:start + chunk_size], skip_special_tokens=True
        )
        # Decoding and re-tokenizing is not guaranteed to round-trip to the
        # same length, so truncation is kept as a safety net.
        results = classifier(chunk_text, truncation=True, max_length=max_length)
        explicit_results.extend(
            res for res in results if res['label'] == explicit_label
        )
    return explicit_results

In [31]:
# Preprocess and detect explicit visuals in frames
def preprocess_frame(frame, target_size=(640, 640)):
    """Return ``frame`` resized to ``target_size`` using ``cv2.resize``."""
    resized = cv2.resize(frame, target_size)
    return resized

In [32]:
def detect_explicit_frames(frames, model_path="yolov8n.pt", explicit_label="explicit"):
    """Return the indices of frames in which the detector finds an explicit object.

    Args:
        frames: Iterable of image arrays. They are assumed to be in RGB channel
            order, as MoviePy's ``get_frame`` produces them; verify this if
            frames come from another source.
        model_path: YOLO weights to load. The stock default is a placeholder;
            replace it with a model fine-tuned to emit ``explicit_label``.
        explicit_label: Class name that marks a detection as explicit.

    Returns:
        List of integer positions (within ``frames``) that contain at least one
        detection whose class name equals ``explicit_label``.
    """
    frames = list(frames)
    # Skip the model load entirely when there is nothing to scan.
    if not frames:
        return []

    model = YOLO(model_path)  # Replace with fine-tuned model
    explicit_frames = []
    for idx, frame in enumerate(frames):
        frame = preprocess_frame(frame)
        # Ultralytics expects BGR channel order for numpy array inputs.
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        result = model.predict(frame, device=device)[0]
        if result.boxes is None:
            continue
        # Boxes carry numeric class ids; the id -> name mapping is on the result.
        class_names = result.names
        detected_labels = (
            class_names[int(class_id)] for class_id in result.boxes.cls.tolist()
        )
        if explicit_label in detected_labels:
            explicit_frames.append(idx)
    return explicit_frames

In [33]:
# Analyze video and combine results
def analyze_video(video_path):
    """Run the audio and visual explicit-content checks on one video.

    Returns a dict with two keys: ``"explicit_text"`` (the output of
    ``detect_explicit_language`` on the transcription) and
    ``"explicit_frames"`` (the output of ``detect_explicit_frames`` on the
    sampled frames).
    """
    audio_file, sampled_frames = extract_audio_and_frames(video_path)

    # Audio branch: speech-to-text, then text classification.
    transcript = transcribe_audio(audio_file)
    flagged_text = detect_explicit_language(transcript)

    # Visual branch: object detection over the sampled frames.
    flagged_frames = detect_explicit_frames(sampled_frames)

    report = {}
    report["explicit_text"] = flagged_text
    report["explicit_frames"] = flagged_frames
    return report

In [34]:
# Save results
def save_results(results, output_path="explicit_timings.txt"):
    """Write every ``key: value`` pair of ``results`` on its own line.

    ``output_path`` is opened in write mode, so any existing content is
    replaced.
    """
    with open(output_path, "w") as out_file:
        out_file.writelines(
            f"{name}: {entry}\n" for name, entry in results.items()
        )

In [None]:
# Entry point: analyze a single video and write the findings to disk.
# NOTE(review): inside a notebook kernel __name__ is "__main__", so this body
# runs every time the cell is executed — confirm that is intended.
if __name__ == "__main__":
    video_path = "E:\\Movies\\GWLG\\GWLG.mkv"  # Replace with your video file
    # Returns a dict holding the explicit text hits and the explicit frame indices.
    results = analyze_video(video_path)
    # No output_path is passed, so save_results writes to its default file.
    save_results(results)
    print("Analysis complete. Results saved to explicit_timings.txt.")

MoviePy - Writing audio in audio.wav


                                                                          

MoviePy - Done.
