In [None]:
# Print the Python interpreter's version string.
from sys import version
print(version)

##### Working Code

In [2]:
import cv2 as cv
import numpy as np
import threading
import pandas as pd
import torch
import torch.nn as nn
from ultralytics import YOLO
from sklearn.preprocessing import StandardScaler
import socket

# NumPy print options: summarize arrays with more than 3 elements and print
# floats in fixed-point notation instead of scientific notation.
np.set_printoptions(threshold=3, suppress=True)

# Absolute paths to the trained model weights (machine-specific Windows paths).
lstm_model_path = "C:\\Users\\I3D\\Desktop\\TP-RNN\\model\\LSTM.pth"
yolo_model = "C:\\Users\\I3D\\Desktop\\TP-RNN\\model\\yolov8n.pt"
# YOLO detector built from the weights file above; called once per frame in the main loop.
detector = YOLO(yolo_model)

# LSTM Model Definition
class LSTM(nn.Module):
    """Stacked LSTM with a linear head applied to the final time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each LSTM layer's hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values produced by the linear head.
        dropout_prob: Dropout probability used both between LSTM layers and
            on the last-step features before the linear head.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_prob=0.2):
        super().__init__()
        # Sub-modules are created in the order lstm -> fc -> dropout and keep
        # these attribute names, because saved state_dict keys depend on them.
        recurrent_kwargs = {
            "input_size": input_size,
            "hidden_size": hidden_size,
            "num_layers": num_layers,
            "batch_first": True,  # inputs are (batch, time, features)
            "dropout": dropout_prob,
        }
        self.lstm = nn.LSTM(**recurrent_kwargs)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x):
        """Return the head's output for the last time step of each sequence in ``x``."""
        sequence_out, _state = self.lstm(x)
        last_step = sequence_out[:, -1, :]
        return self.fc(self.dropout(last_step))

# Load LSTM Model
def load_lstm_model():
    """Build the LSTM, load its weights from ``lstm_model_path`` and return it in eval mode.

    The architecture (2 inputs, 128 hidden units, 4 layers, 2 outputs,
    dropout 0.3) is hard-coded here and must match the saved checkpoint.
    Weights are mapped onto the CPU regardless of where they were saved.
    """
    network = LSTM(
        input_size=2,
        hidden_size=128,
        num_layers=4,
        output_size=2,
        dropout_prob=0.3,
    )
    cpu = torch.device("cpu")
    weights = torch.load(lstm_model_path, map_location=cpu)
    network.load_state_dict(weights)
    # eval() turns dropout off so inference is deterministic.
    network.eval()
    return network

# Initialize LSTM model and scaler
# The model is loaded once at start-up and reused for every prediction.
lstm_model = load_lstm_model()
# The scaler starts unfitted; the main loop calls fit_transform on each
# person's position window, so it carries no statistics between windows.
# NOTE(review): if the LSTM was trained with a scaler fitted on the training
# set, that fitted scaler should presumably be loaded here instead - confirm.
scaler = StandardScaler()

# Camera setup
# Open capture device 0 through the DirectShow backend (Windows).
cap = cv.VideoCapture(0, cv.CAP_DSHOW)
# cap = cv.VideoCapture('C:\\Users\\harip\\Desktop\\RNNImplement\\Video\\SampleTest.mp4')

# Request 1920x1080 frames. The main loop derives its scale factors from the
# actual frame shape, so it does not rely on this request being honoured.
cap.set(cv.CAP_PROP_FRAME_WIDTH, 1920)
cap.set(cv.CAP_PROP_FRAME_HEIGHT, 1080)

# UDP Connection Setup
# Destination address and port for the per-frame tracking message.
# UDP_IP = "10.168.62.95"
UDP_IP = "192.168.137.166"
# UDP_IP = "10.162.61.154" #To run on pc only
UDP_PORT = 8000
# IPv4 datagram (UDP) socket; used only for sending.
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)

# Threading
# Most recent frame read by the camera thread; None until the first frame arrives.
latest_frame = None
# Guards latest_frame, which is written by the camera thread and read by the main loop.
lock = threading.Lock()
person_history = {}  # Stores past positions for each person
# Last person id handed out; match_people increments it for each new person.
person_id_counter = 0

# Camera frame buffer
def rtsp_cam_buffer(vcap):
    """Keep ``latest_frame`` updated with the newest frame read from ``vcap``.

    Intended to run in a background thread. Returns as soon as the capture
    is no longer open or a read fails; ``latest_frame`` then keeps the last
    frame that was read successfully.
    """
    global latest_frame
    while True:
        if not vcap.isOpened():
            return
        grabbed, image = vcap.read()
        if not grabbed:
            return
        # Publish the frame under the lock shared with the consumer.
        with lock:
            latest_frame = image

# Start the background reader. It is a daemon thread, so it does not keep the
# process alive once the main script finishes.
t1 = threading.Thread(target=rtsp_cam_buffer, args=(cap,), daemon=True)
t1.start()
print('Camera Starting...')

# Function to match people across frames using Nearest Neighbor
def match_people(previous_positions, current_positions):
    """Greedily assign the current detections to the existing tracks.

    Tracks are visited in dictionary order. Each one claims the unclaimed
    detection closest (Euclidean distance) to its last known position, with
    no upper limit on that distance. Tracks visited after the detections run
    out are dropped. Detections left unclaimed start new tracks with ids
    taken from the global ``person_id_counter``.

    Args:
        previous_positions: Mapping of person id -> list of past positions.
        current_positions: List of positions detected in the current frame.
            It is not modified.

    Returns:
        A new mapping of person id -> position list, where each surviving
        track has its matched detection appended.
    """
    global person_id_counter

    unclaimed = list(current_positions)
    tracks = {}

    for track_id, trail in previous_positions.items():
        if not unclaimed:
            continue
        last_seen = np.array(trail[-1])
        distances = [np.linalg.norm(np.array(candidate) - last_seen) for candidate in unclaimed]
        # argmin returns the first minimum, so ties go to the earliest detection.
        closest = unclaimed.pop(int(np.argmin(distances)))
        tracks[track_id] = trail + [closest]

    for fresh in unclaimed:
        person_id_counter += 1
        tracks[person_id_counter] = [fresh]

    return tracks

# Function to apply Exponential Moving Average (EMA) for smoothing
def apply_ema(predictions, alpha=0.95):  # Originally was 0.8
    """Return the exponential moving average of ``predictions``.

    The average is seeded with the first element and every later element is
    folded in as ``alpha * value + (1 - alpha) * running``, so a larger
    ``alpha`` gives more weight to later elements.

    Args:
        predictions: Non-empty sequence of numbers or arrays.
        alpha: Weight given to each new element.

    Raises:
        IndexError: If ``predictions`` is empty.
    """
    running = predictions[0]
    for value in predictions[1:]:
        running = alpha * value + (1 - alpha) * running
    return running

# Main loop: detect people with YOLO, track them across frames, predict where
# each one is heading, draw the result and send it over UDP. Runs until the
# capture closes or 'q' is pressed in the preview window.
displayed_h, displayed_w = 1080, 1920  # preview size; positions are expressed in this space
history_len = 6       # positions per person fed to the LSTM
horizon = 3           # number of steps predicted ahead
alpha = 0.7           # weight of the velocity extrapolation vs. the LSTM prediction
still_threshold = 10  # frame-to-frame movement (pixels) below which a person counts as standing still

try:
    while cap.isOpened():
        with lock:
            if latest_frame is None:
                # The camera thread has not delivered a frame yet.
                continue
            img = latest_frame.copy()

        frame_h, frame_w = img.shape[:2]
        ratio_h, ratio_w = displayed_h / frame_h, displayed_w / frame_w

        results = detector(img)
        detections = results[0].boxes.data.cpu().numpy()
        # Column 5 is the class id; keep class 0 only.
        # NOTE(review): 0 is "person" in the COCO label set - confirm for custom weights.
        person_detections = [d for d in detections if int(d[5]) == 0]

        # Resize BEFORE drawing. The box corners below are already scaled to
        # display size; drawing them on the raw frame and resizing afterwards
        # scaled them twice whenever the camera did not deliver 1920x1080.
        resized_img = cv.resize(img, (displayed_w, displayed_h), interpolation=cv.INTER_LINEAR)

        current_positions = []
        for detection in person_detections:
            x1, y1, x2, y2 = detection[:4].astype(int)
            # A person's position is the bottom-centre of the bounding box.
            x_mid = int((x1 + x2) / 2 * ratio_w)
            y_mid = int(y2 * ratio_h)
            current_positions.append((x_mid, y_mid))

            # Draw bounding box around the detected person
            cv.rectangle(resized_img,
                         (int(x1 * ratio_w), int(y1 * ratio_h)),
                         (int(x2 * ratio_w), int(y2 * ratio_h)),
                         (0, 255, 0), 3)  # Green Box

        # Match people across frames
        person_history = match_people(person_history, current_positions)

        sending_string = f"Detected,{len(person_history)},"

        for pid, positions in person_history.items():
            if len(positions) < history_len:
                continue  # not enough history to predict yet

            # Keep only the newest window. Re-assigning an existing key while
            # iterating is safe because the dict does not change size.
            positions = positions[-history_len:]
            person_history[pid] = positions

            input_array = np.array(positions, dtype=float)
            scaled_input = scaler.fit_transform(input_array)
            current_input = torch.tensor(scaled_input, dtype=torch.float32).unsqueeze(0)

            # Autoregressive rollout: each prediction is appended to the window
            # and the oldest step dropped. Previously the same window was fed
            # every time, so all predictions were identical.
            future_predictions = []
            with torch.no_grad():
                for _ in range(horizon):
                    pred = lstm_model(current_input)  # (1, 2), in scaled space
                    unscaled = scaler.inverse_transform(pred.cpu().numpy().reshape(1, -1))
                    future_predictions.append(unscaled[0])
                    current_input = torch.cat(
                        (current_input[:, 1:, :], pred.unsqueeze(1)), dim=1
                    )

            avg_pred = apply_ema(future_predictions)

            # Constant-velocity extrapolation from the last three displacements.
            # The window always holds history_len positions here, so no length
            # check is needed.
            velocities = np.diff(positions[-4:], axis=0)
            avg_velocity = np.mean(velocities, axis=0)
            pred_x = positions[-1][0] + avg_velocity[0] * horizon
            pred_y = positions[-1][1] + avg_velocity[1] * horizon

            # Blend the velocity extrapolation with the LSTM prediction.
            pred_x = int(alpha * pred_x + (1 - alpha) * avg_pred[0])
            pred_y = int(alpha * pred_y + (1 - alpha) * avg_pred[1])

            # A person who barely moved since the last frame is predicted to stay put.
            dx, dy = np.diff(positions[-2:], axis=0)[-1]
            if np.linalg.norm([dx, dy]) < still_threshold:
                pred_x, pred_y = positions[-1]

            print(f"p{pid}, {pred_x}, {pred_y}, {positions[-1][0]}, {positions[-1][1]}")
            sending_string += f"p{pid}, {pred_x}, {pred_y}, {positions[-1][0]}, {positions[-1][1]}, "

            cv.circle(resized_img, (pred_x, pred_y), 10, (0, 0, 255), -1)  # predicted position (red)
            cv.circle(resized_img, (positions[-1][0], positions[-1][1]), 5, (0, 255, 255), -1)  # current position (yellow)

        MESSAGE = bytes(sending_string, 'ascii')
        sock.sendto(MESSAGE, (UDP_IP, UDP_PORT))

        cv.imshow('Frame', resized_img)

        if cv.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the camera, window and socket even if the loop raised.
    cap.release()
    cv.destroyAllWindows()
    sock.close()


Camera Starting...

0: 384x640 1 person, 5 chairs, 79.0ms
Speed: 7.8ms preprocess, 79.0ms inference, 6.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 4 chairs, 40.2ms
Speed: 2.1ms preprocess, 40.2ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 4 chairs, 36.6ms
Speed: 2.5ms preprocess, 36.6ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 4 chairs, 37.3ms
Speed: 2.6ms preprocess, 37.3ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 5 chairs, 35.0ms
Speed: 2.0ms preprocess, 35.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 5 chairs, 37.3ms
Speed: 1.5ms preprocess, 37.3ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
p1, 365, 299, 365, 299

0: 384x640 1 person, 5 chairs, 37.3ms
Speed: 2.5ms preprocess, 37.3ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)
p1, 36