# Demo 4 ML Project 

**#1** Upload a .png/.jpg file as a new dataset in order to run prediction on a single image. You can also upload a .mov/.mp4/.avi file in order to run prediction on a video. <br>
**NB:** For reliable results, your image/video should have the same size as the images in the training dataset, which is **640x640**.

**#2** You need to install the Ultralytics library in order to run the YOLO model(s).

In [None]:
# Install the Ultralytics package; the YOLO cells below import `YOLO` from it.
!pip install ultralytics

# Image mode - YOLO

In [None]:
from ultralytics import YOLO
from PIL import Image

# Load the YOLO weights. NOTE(review): the file name below looks like a
# placeholder — point it at the actual weights file before running.
model = YOLO("/kaggle/input/best-weights-for-model/one-of-the-yolo-models")
# INPUT_FILE_PATH is not defined in this cell; it must be set to the path of
# the uploaded image beforehand.
img = Image.open(INPUT_FILE_PATH)


# predict() is indexed with [0] to keep the result for our single input image.
res = model.predict(img)[0]

# len() of the result is reported as the number of detections.
print("Total detections: " + str(len(res)))

# Render the detections; `res` is rebound from the result object to the
# rendered array.
res = res.plot(line_width=1)
# Reverse the last (channel) axis — presumably BGR -> RGB so that PIL shows
# the right colours; confirm against the Ultralytics plot() documentation.
res = res[:, :, ::-1]

# Convert the array to a PIL image and write it next to the notebook.
res = Image.fromarray(res)
res.save("output.png")

# NB: You can set the confidence threshold by passing `conf=0.5` (or any value
# you like) to `predict`. Ultralytics documents the default as 0.25 — check the
# documentation of your installed version.

# Video mode - YOLO

In [None]:
from ultralytics import YOLO 

# Load the YOLOv8m weights shipped with the dataset.
model = YOLO("/kaggle/input/best-weights-for-model/best yolov8m.pt")
# Run prediction on the video at INPUT_FILE_PATH (must be defined beforehand).
# save/project/name are passed so that the annotated output is written by
# Ultralytics — presumably under runs/detect/exp; confirm in the predict docs.
# `results` is iterated by the averaging cell that follows.
results = model.predict(source = INPUT_FILE_PATH, save = True, project = 'runs/detect', name = 'exp')

In [None]:
# Average of People in Video
# NOTE: despite its name, `total_boxes` counts the entries of `results`
# (presumably one per video frame), while `sum_boxes` accumulates the number
# of detected boxes across all of them.
total_boxes = 0.0
sum_boxes = 0.0

for result in results:
    sum_boxes += len(result.boxes)
    total_boxes += 1

# An empty `results` (e.g. a video that produced no frames) would otherwise
# raise ZeroDivisionError; report an average of 0 in that case.
avg_detection = sum_boxes / total_boxes if total_boxes else 0.0


In [None]:
# Show the average as a whole number. Note that Python's round() sends exact
# halves to the nearest even integer (e.g. 2.5 -> 2).
print(round(avg_detection))

**#3** Now you can make predictions using Faster R-CNN with a ResNet50 backbone. 

In [None]:
# Initialize the model
#
# Builds a torchvision Faster R-CNN (ResNet50-FPN), replaces its box predictor
# with a 2-class head and loads the fine-tuned weights from the dataset.
# NOTE(review): `torch` and `device` are used below but are not defined in
# this cell — they must already exist when the cell runs.

from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor


print("Initializing Faster R-CNN with ResNet50 backbone...")


num_classes = 2 # person + background
        
# NOTE(review): `pretrained` / `pretrained_backbone` appear to be legacy
# arguments in recent torchvision releases (superseded by `weights` /
# `weights_backbone`) — confirm against the installed version.
model = fasterrcnn_resnet50_fpn(
    pretrained = True,
    progress = True,
    pretrained_backbone = True
)
        
# Reuse the input width of the existing classifier layer so the replacement
# head fits the same features, but predicts `num_classes` classes.
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

model.to(device)
# Load the fine-tuned weights, mapped onto `device`. The f prefix on the path
# has no effect here because the string contains no placeholders.
model.load_state_dict(torch.load(f"/kaggle/input/best-weights-for-model/best resnet50.pth", map_location = device))

In [None]:
# Import necessary library

import os
import cv2
import numpy as np
import torchvision
from torchvision import transforms
from PIL import Image
# Progress bar
from tqdm.auto import tqdm

# Image mode - Faster RCNN w/ Resnet 50

In [None]:
# Single Image Prediction and Visualization Functions

def predict_image(model, image_path, device, confidence_threshold=0.5):
    """Run the detector on one image file and keep the confident detections.

    Side effect: the model is switched to evaluation mode.

    Args:
        model: Detection model. It is called with a batched image tensor and
            must return a sequence whose first item is a dict with 'scores',
            'boxes' and 'labels' tensors.
        image_path: Path of the image to load.
        device: Device the input tensor is moved to before inference.
        confidence_threshold: Minimum score (inclusive) a detection needs in
            order to be kept.

    Returns:
        dict with 'boxes', 'scores' and 'labels' (NumPy arrays restricted to
        the kept detections) and 'original_image' (the RGB image as a NumPy
        array).
    """
    # Open inside a context manager so the file handle is released right after
    # decoding instead of whenever the garbage collector gets to it (callers
    # such as predict_video delete the file immediately afterwards).
    # convert() loads the pixel data and returns an independent image.
    with Image.open(image_path) as source_image:
        image = source_image.convert('RGB')
    image_array = np.array(image)

    # Convert to tensor and normalize.
    # NOTE(review): torchvision detection models may also normalise their
    # inputs internally — keep this step only if it matches the preprocessing
    # used when the weights were trained; confirm.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # unsqueeze(0) adds the batch dimension the model is called with.
    image_tensor = transform(image).unsqueeze(0).to(device)

    # Set model to evaluation mode
    model.eval()

    # Inference only: no gradients are needed.
    with torch.no_grad():
        predictions = model(image_tensor)

    # Process predictions (single image -> first entry)
    pred = predictions[0]
    scores = pred['scores'].cpu().numpy()
    boxes = pred['boxes'].cpu().numpy()
    labels = pred['labels'].cpu().numpy()

    # Boolean mask of the detections that reach the confidence threshold
    keep = scores >= confidence_threshold

    return {
        'boxes': boxes[keep],
        'scores': scores[keep],
        'labels': labels[keep],
        'original_image': image_array
    }


def visualize_prediction(predictions, save_path=None, figsize=(12, 8)):
    """Plot one image with its detection boxes and a short summary.

    Args:
        predictions: Dict with 'original_image', 'boxes' and 'scores'; every
            box is unpacked as (x1, y1, x2, y2).
        save_path: When truthy, the figure is also written to this path.
        figsize: Figure size forwarded to ``plt.subplots``.
    """
    picture = predictions['original_image']
    det_boxes = predictions['boxes']
    det_scores = predictions['scores']
    n_found = len(det_boxes)

    _, ax = plt.subplots(1, 1, figsize=figsize)
    ax.imshow(picture)
    ax.set_title(f'People Detection \nDetections: {n_found}',
                 fontsize=14, fontweight='bold')
    ax.axis('off')

    # Boxes alternate between these two colours.
    palette = ['red', 'blue']

    for idx, (det_box, det_score) in enumerate(zip(det_boxes, det_scores)):
        left, top, right, bottom = det_box
        shade = palette[idx % len(palette)]

        # Outline of the detection
        ax.add_patch(patches.Rectangle(
            (left, top), right - left, bottom - top,
            linewidth=1, edgecolor=shade, facecolor='none'
        ))

        # Score tag slightly above the top-left corner
        ax.text(left, top - 10, f'{det_score:.3f}',
                bbox={'facecolor': shade, 'alpha': 0.7},
                color='white', fontsize=6, weight='bold')

    # Summary panel in the top-left corner of the axes
    panel_style = {'boxstyle': "round,pad=0.5", 'facecolor': 'white', 'alpha': 0.8}
    if n_found == 0:
        ax.text(0.02, 0.98, "No detections found",
                transform=ax.transAxes, fontsize=12, color='red',
                bbox=panel_style,
                verticalalignment='top', weight='bold')
    else:
        overview = "\n".join([
            f"Detections: {n_found}",
            f"Confidence range: {det_scores.min():.3f} - {det_scores.max():.3f}",
            f"Avg confidence: {det_scores.mean():.3f}",
        ])
        ax.text(0.02, 0.98, overview,
                transform=ax.transAxes, fontsize=10,
                bbox=panel_style,
                verticalalignment='top')

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Visualization saved to: {save_path}")

    plt.show()


def predict_and_visualize(model, image_path, device, confidence_threshold = 0.5, save_path = None):
    """Detect on a single image, display the result and hand it back.

    Args:
        model, image_path, device, confidence_threshold: Forwarded to
            ``predict_image``.
        save_path: Forwarded to ``visualize_prediction``.

    Returns:
        The dict produced by ``predict_image``.
    """
    detections = predict_image(
        model=model,
        image_path=image_path,
        device=device,
        confidence_threshold=confidence_threshold,
    )
    visualize_prediction(predictions=detections, save_path=save_path)
    return detections

In [None]:
# Run Faster R-CNN on the image at INPUT_FILE_PATH, keep detections scoring
# at least 0.5 and display them. `model`, `device` and INPUT_FILE_PATH must
# already be defined.
predictions = predict_and_visualize(
    model=model,
    image_path=INPUT_FILE_PATH,
    device=device,
    confidence_threshold = 0.5
)

# Video mode - Faster RCNN w/ Resnet 50

In [None]:
# Video Prediction Functions

def _annotate_frame(frame, predictions):
    """Return a copy of ``frame`` with boxes, scores and a counter drawn on it."""
    annotated_frame = frame.copy()
    for box, score in zip(predictions['boxes'], predictions['scores']):
        # Plain Python ints: OpenCV drawing calls expect integer coordinates.
        x1, y1, x2, y2 = (int(v) for v in box)
        cv2.rectangle(annotated_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(annotated_frame, f'Person: {score:.2f}', (x1, y1-10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

    # Add frame info
    cv2.putText(annotated_frame, f'Detections: {len(predictions["boxes"])}',
                (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
    return annotated_frame


def predict_video(model, video_path, device, confidence_threshold = 0.5, 
                 output_path = None, skip_frames = 0, max_frames = None):
    """Run the detector over a video, optionally writing an annotated copy.

    Each selected frame is written to a temporary JPEG and passed to
    ``predict_image``.

    Args:
        model: Detection model, forwarded to ``predict_image``.
        video_path: Path of the input video.
        device: Device, forwarded to ``predict_image``.
        confidence_threshold: Minimum score for a detection to be kept.
        output_path: When given, annotated frames are written to this file
            with the 'mp4v' codec.
        skip_frames: Frame stride — only every ``skip_frames``-th frame is
            processed. Values below 1 (including the default 0) mean
            "process every frame".
        max_frames: Optional cap on the number of processed frames; ``None``
            or 0 means no limit.

    Returns:
        list of dicts, one per processed frame, with 'frame_number' (1-based
        position in the source video), 'timestamp' (seconds; ``None`` when no
        frame rate is reported) and 'predictions' (dict with 'boxes', 'scores'
        and 'labels'). The decoded image is left out of each entry so that
        long videos do not keep every frame in memory.

    Raises:
        OSError: If the video cannot be opened.
    """
    import tempfile

    # A stride below 1 would divide by zero further down; treat it as 1.
    stride = max(1, int(skip_frames or 0))

    frame_predictions = []
    out = None
    cap = cv2.VideoCapture(video_path)

    try:
        if not cap.isOpened():
            raise OSError(f"Could not open video: {video_path}")

        # Keep the frame rate as a float so that e.g. 29.97 fps is not truncated.
        fps = cap.get(cv2.CAP_PROP_FPS)
        has_fps = fps > 0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Setup output video writer if needed
        if output_path:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            # Only every stride-th frame is written, so the rate is divided by
            # the stride to preserve the playback duration. 30 fps is an
            # arbitrary fallback for sources that report no frame rate.
            writer_fps = (fps if has_fps else 30.0) / stride
            out = cv2.VideoWriter(output_path, fourcc, writer_fps, (width, height))
            print(f"Output will be saved to: {output_path}")

        # Expected number of processed frames (None when it is unknown).
        expected = total_frames // stride if total_frames > 0 else None
        if max_frames:
            expected = max_frames if expected is None else min(expected, max_frames)

        frame_count = 0
        processed_frames = 0
        total_detections = 0

        # Both context managers clean up (temp files, progress bar) even when
        # an exception interrupts the loop.
        with tempfile.TemporaryDirectory(prefix="temp_video_frames_") as temp_dir:
            with tqdm(total=expected, desc="Processing frames") as pbar:
                while True:
                    # Check max frames limit
                    if max_frames and processed_frames >= max_frames:
                        break

                    ret, frame = cap.read()
                    if not ret:
                        break

                    frame_count += 1

                    # Skip frames if needed
                    if frame_count % stride != 0:
                        continue

                    # Save frame temporarily so predict_image can load it
                    temp_frame_path = os.path.join(
                        temp_dir, f"temp_frame_{processed_frames:06d}.jpg")
                    cv2.imwrite(temp_frame_path, frame)
                    try:
                        predictions = predict_image(
                            model, temp_frame_path, device, confidence_threshold)
                    finally:
                        # Clean up temporary frame
                        os.remove(temp_frame_path)

                    frame_predictions.append({
                        'frame_number': frame_count,
                        'predictions': {
                            key: predictions[key]
                            for key in ('boxes', 'scores', 'labels')
                        },
                        'timestamp': frame_count / fps if has_fps else None
                    })

                    if out is not None:
                        out.write(_annotate_frame(frame, predictions))

                    total_detections += len(predictions['boxes'])
                    processed_frames += 1
                    pbar.update(1)

                    # Update progress bar description every 30 frames
                    if processed_frames % 30 == 0:
                        avg_detections = total_detections / processed_frames
                        pbar.set_description(
                            f"Processing frames (avg: {avg_detections:.1f} det/frame)")
    finally:
        cap.release()
        if out is not None:
            out.release()

    return frame_predictions

In [None]:
# Run Faster R-CNN on the video at INPUT_FILE_PATH. With skip_frames = 2 only
# every second frame is processed; the annotated frames are written to
# output_with_detections.mp4. `model`, `device` and INPUT_FILE_PATH must
# already be defined.
results = predict_video(
    model = model,
    video_path = INPUT_FILE_PATH,
    device = device,
    confidence_threshold = 0.5,
    output_path = 'output_with_detections.mp4',
    skip_frames = 2  
)