In [1]:
 ### 1. Mount Google Drive ###

from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
ROOT_DIR = '/content/gdrive/My Drive/lstm_model'

In [3]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.2.19-py3-none-any.whl (757 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m757.9/757.9 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting thop>=0.1.1 (from ultralytics)
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.8.0->ultralytics)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.8.0->ultralytics)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.8.0->ultralytics)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.8.0->ultralytics)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.

In [4]:
import os

from ultralytics import YOLO

yolo_model = YOLO(os.path.join(ROOT_DIR, 'detect/train/weights/best.pt'))

In [5]:
import cv2
import torch
import numpy as np
import torch.nn as nn

In [6]:
# Define LSTM Model for Detection Adjustment
class ActionLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(ActionLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, input_size)  # Match input size for delta output

    def forward(self, x):
        out, _ = self.lstm(x)
        # out = self.fc(out[:, -1, :])  # Take the output of the last time step
        out = self.fc(out)
        return out

# Example LSTM parameters
input_size = 4  # For example, (x1, y1, x2, y2, confidence, class_id)
hidden_size = 512
num_layers = 2

action_lstm = ActionLSTM(input_size, hidden_size, num_layers)

# This function is used to calculate the deltas between two consecutive

In [7]:
def load_video_frames(video_path):
    # This function loads video frames using OpenCV
    cap = cv2.VideoCapture(video_path)
    frames = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # Convert frame from BGR to RGB
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames.append(frame)
    cap.release()
    return frames

def match_detections(d1, d2):
    # Simplest matching based on the closest center point
    if not d1 or not d2:
        return [], []

    matched = []
    for det in d1:
        # Calculate center of each detection in d1
        center_x1 = (det[0] + det[2]) / 2
        center_y1 = (det[1] + det[3]) / 2

        # Find the closest detection in d2
        distances = [(np.hypot(center_x1 - ((d[0]+d[2])/2), center_y1 - ((d[1]+d[3])/2)), i) for i, d in enumerate(d2)]
        distances.sort()
        matched_index = distances[0][1]
        matched.append((det, d2[matched_index]))

    # Unpack matched pairs
    matched_d1, matched_d2 = zip(*matched) if matched else ([], [])
    return matched_d1, matched_d2

def calculate_deltas(d1, d2):
    # Assumes d1 and d2 are tuples of matched detections
    deltas = [np.array(d2[i]) - np.array(d1[i]) for i in range(len(d1))]
    return deltas

def collect_data_for_lstm(video_dataset, yolo_model):
    all_features = []
    all_targets = []

    for video in video_dataset:
        frames = load_video_frames(video)
        sequence_features = []
        sequence_targets = []

        for i in range(len(frames) - 1):
            current_frame = frames[i]
            next_frame = frames[i+1]

            # Run YOLO detection
            current_detections = [det[:4] for det in yolo_model(current_frame)[0].boxes.data.tolist() if det[4] > 0.5]  # Convert to list and slice to (x1, y1, x2, y2, conf, class_id)
            next_detections = [det[:4] for det in yolo_model(next_frame)[0].boxes.data.tolist() if det[4] > 0.5]

            # Match detections from current to next frame
            matched_current, matched_next = match_detections(current_detections, next_detections)

            # Calculate deltas
            deltas = calculate_deltas(matched_current, matched_next)

            sequence_features.extend(matched_current)
            sequence_targets.extend(deltas)

        # Convert lists to tensors and append to overall dataset
        if sequence_features and sequence_targets:
            lstm_inputs = torch.tensor(sequence_features, dtype=torch.float32)
            lstm_targets = torch.tensor(sequence_targets, dtype=torch.float32)
            all_features.append(lstm_inputs)
            all_targets.append(lstm_targets)

    return all_features, all_targets

# Example usage
video1_path = os.path.join(ROOT_DIR, 'test_vid/IMG_1745.mp4')
video2_path = os.path.join(ROOT_DIR, 'test_vid/IMG_1746.mp4')
video3_path = os.path.join(ROOT_DIR, 'test_vid/IMG_1747.mp4')
video4_path = os.path.join(ROOT_DIR, 'test_vid/IMG_1748.mp4')
video5_path = os.path.join(ROOT_DIR, 'test_vid/IMG_1749.mp4')
video_dataset = [video1_path, video2_path, video3_path, video4_path]
all_features, all_targets = collect_data_for_lstm(video_dataset, yolo_model)
# dataset = collect_data_for_lstm(video_dataset)

video_testing = [video5_path]
testing_features, testing_targets = collect_data_for_lstm(video_testing, yolo_model)




  return F.conv2d(input, weight, bias, self.stride,


0: 640x384 6 bottless, 109.9ms
Speed: 13.0ms preprocess, 109.9ms inference, 1970.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 6 bottless, 6.5ms
Speed: 5.6ms preprocess, 6.5ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 6 bottless, 6.7ms
Speed: 2.7ms preprocess, 6.7ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 6 bottless, 6.5ms
Speed: 2.7ms preprocess, 6.5ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 6 bottless, 7.0ms
Speed: 2.6ms preprocess, 7.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 6 bottless, 8.1ms
Speed: 3.1ms preprocess, 8.1ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 6 bottless, 5.8ms
Speed: 2.6ms preprocess, 5.8ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 6 bottless, 5.8ms
Speed: 3.0ms preprocess, 5.8ms inference, 1.0ms postprocess per image at shape (1

  lstm_targets = torch.tensor(sequence_targets, dtype=torch.float32)


[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
0: 640x384 8 bottless, 11.8ms
Speed: 2.9ms preprocess, 11.8ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 bottless, 9.4ms
Speed: 2.8ms preprocess, 9.4ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 bottless, 9.9ms
Speed: 2.9ms preprocess, 9.9ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 bottless, 9.7ms
Speed: 2.9ms preprocess, 9.7ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 bottless, 9.9ms
Speed: 2.9ms preprocess, 9.9ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 bottless, 9.9ms
Speed: 2.7ms preprocess, 9.9ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 bottless, 9.0ms
Speed: 2.7ms preprocess, 9.0ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 bottless, 9.2ms
Speed: 2.3ms pre

In [8]:
from torch.utils.data import Dataset, DataLoader

class DetectionDataset(Dataset):
    def __init__(self, features, targets, look_back=10):
        """
        Args:
            features: A list of tensors containing features for each time step.
            targets: A list of targets corresponding to each feature set.
            look_back: Number of past time steps to use as input features.
        """
        self.features = features
        self.targets = targets
        self.look_back = look_back
        self.data = self.create_dataset(features, targets)

    def create_dataset(self, features, targets):
        data = []
        for i in range(len(features) - self.look_back):
            input_sequence = features[i:i + self.look_back]
            target_sequence = targets[i + 1:i + 1 + self.look_back]
            data.append((input_sequence, target_sequence))
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

def prepare_data(features, targets, look_back):
  dataset = DetectionDataset(features, targets, look_back)
  return DataLoader(dataset, batch_size=32, shuffle=True, num_workers=1)

In [9]:
# Assuming `all_features` and `all_targets` are lists of tensors collected from `collect_data_for_lstm`
all_features = torch.vstack(all_features)  # Assuming all_features is a list of tensors
all_targets = torch.vstack(all_targets)  # Assuming all_targets is a list of tensors
look_back = 10
train_loader = prepare_data(all_features, all_targets, look_back)

all_testing_features = torch.vstack(testing_features)
all_testing_targets = torch.vstack(testing_targets)
look_back = 10
test_loader = prepare_data(all_testing_features, all_testing_targets, look_back)

In [10]:
# Check if CUDA is available
if torch.cuda.is_available():
    print("CUDA is available! Training on GPU.")
else:
    print("CUDA is not available. Training on CPU.")

CUDA is available! Training on GPU.


In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [12]:
def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

In [13]:
class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device

    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        for b in self.dl:
            yield to_device(b, self.device)

    def __len__(self):
        """Number of batches"""
        return len(self.dl)

In [14]:
train_loader = DeviceDataLoader(train_loader, device)
test_loader = DeviceDataLoader(test_loader, device)

In [15]:
action_lstm = action_lstm.to(device)
action_lstm

ActionLSTM(
  (lstm): LSTM(4, 512, num_layers=2, batch_first=True)
  (fc): Linear(in_features=512, out_features=4, bias=True)
)

In [16]:
for data in train_loader:
    action_lstm.train()
    features, targets = data  # features are YOLO outputs, targets are deltas
    print(features.shape)
    break

torch.Size([32, 10, 4])


In [17]:
import torch.optim as optim

# Example pseudo-code for a training loop
loss_fn = nn.MSELoss()
optimizer = optim.Adam(action_lstm.parameters(), lr = 0.0001)

num_epochs = 160
training_loss = []
testing_loss = []

for epoch in range(num_epochs):
    for data in train_loader:
        action_lstm.train()
        features, targets = data  # features are YOLO outputs, targets are deltas
        # print(features.shape)
        # print(targets.shape)
        lstm_outputs = action_lstm(features)  # Predict deltas
        loss = loss_fn(lstm_outputs, targets)  # Calculate loss
        optimizer.zero_grad()
        loss.backward()  # Backpropagate error
        optimizer.step()  # Update weights

    # testing
    if epoch % 10 == 0:
      with torch.no_grad():
        # compute the loss function for the training set
        total_loss = 0
        number_of_batches = 0
        for data in train_loader:
            action_lstm.eval()
            features, targets = data
            lstm_outputs = action_lstm(features)
            loss = loss_fn(lstm_outputs, targets)
            total_loss += loss.item()
            number_of_batches += 1
        # calculate the average loss for the epoch
        training_loss.append(total_loss/number_of_batches)


        # # compute the loss function for the testing set
        total_loss = 0
        number_of_batches = 0
        for data in test_loader:
            action_lstm.eval()
            features, targets = data
            lstm_outputs = action_lstm(features)
            loss = loss_fn(lstm_outputs, targets)
            total_loss += loss.item()
            number_of_batches += 1
        # calculate the average loss for the epoch
        testing_loss.append(total_loss/number_of_batches)

      # print the loss
      print('Epoch: ', epoch, '\tTraining loss: ', '% 4f' % training_loss[-1], '\tTesting loss: ', '% 4f' % testing_loss[-1])
      # print('Epoch: ', epoch, '\tTraining loss: ', '% 4f' % training_loss[-1])

Epoch:  0 	Training loss:   306.735984 	Testing loss:   349.689255
Epoch:  10 	Training loss:   300.991391 	Testing loss:   372.210074
Epoch:  20 	Training loss:   286.405477 	Testing loss:   365.814756
Epoch:  30 	Training loss:   265.590666 	Testing loss:   351.949728
Epoch:  40 	Training loss:   242.984160 	Testing loss:   354.150543
Epoch:  50 	Training loss:   227.664300 	Testing loss:   356.745360
Epoch:  60 	Training loss:   209.885531 	Testing loss:   356.770002
Epoch:  70 	Training loss:   196.131672 	Testing loss:   357.990543
Epoch:  80 	Training loss:   186.640300 	Testing loss:   357.244938
Epoch:  90 	Training loss:   175.690924 	Testing loss:   363.420708
Epoch:  100 	Training loss:   174.734051 	Testing loss:   368.970608
Epoch:  110 	Training loss:   163.450991 	Testing loss:   358.946145
Epoch:  120 	Training loss:   159.671434 	Testing loss:   357.896614
Epoch:  130 	Training loss:   156.556648 	Testing loss:   357.453738
Epoch:  140 	Training loss:   152.660058 	Tes

In [18]:
# Adjust YOLO Detections based on LSTM outputs
def adjust_detections(original_detections, lstm_adjustments):
    """
    Adjust the original YOLO detections based on LSTM outputs.

    Args:
        original_detections (Tensor): The original detections from YOLO, shape [N, 6], where N is the number of detections,
                                      and columns represent (x1, y1, x2, y2, confidence, class_id).
        lstm_adjustments (Tensor): Adjustments from LSTM, shape [N, 5], columns represent deltas for (x1, y1, x2, y2, confidence).

    Returns:
        Tensor: Adjusted detections.
    """
    # Ensure lstm_adjustments are scaled to the same range as original_detections
    adjusted_detections = [d.clone() for d in original_detections]  # Clone to avoid modifying the original data

    for i, adj in enumerate(lstm_adjustments):
        adjusted_detections[i][:, :4] += adj  # Apply deltas
        adjusted_detections[i][:, 0:4] = torch.clamp(adjusted_detections[i][:, 0:4], 0, 1)  # Normalize coordinates
        adjusted_detections[i][:, 4] = torch.sigmoid(adjusted_detections[i][:, 4])  # Normalize confidence scores

    return adjusted_detections

In [19]:
def enhance_detection_with_lstm(frames, yolo, lstm_model, device_v):
    lstm_model.eval()  # Ensure the LSTM model is in evaluation mode
    adjusted_detections = []

    for i, frame in enumerate(frames):
        # Run YOLO detection
        raw_detections = yolo(frame)[0].boxes.data.tolist()

        # Filter detections with confidence > 0.5 and extract coordinates
        filtered_detections = [d[:4] for d in raw_detections if d[4] > 0.5]

        # Check if there are any detections to process
        if not filtered_detections:
            adjusted_detections.append([])
            continue

        # Prepare LSTM inputs (feature vectors) and targets (deltas)
        features = torch.tensor(filtered_detections, dtype=torch.float32).to(device_v)  # Extract necessary features

        lstm_outputs = lstm_model(features.unsqueeze(0))  # Add batch dimension if needed
        lstm_outputs = lstm_outputs.squeeze(0)  # Remove batch dimension if needed

        # Adjust YOLO detections based on LSTM outputs
        # adjusted_detections = adjust_detections(detections, lstm_outputs)
        # Adjust original detections
        # Ensure to convert lstm_outputs to the same format as features before adjustment
        adjusted = features + lstm_outputs.detach()

        # Ensure the coordinates remain within the image boundaries and pack them back into the original format
        adjusted[:, :2] = torch.clamp(adjusted[:, :2], 0)  # Clamp x1, y1
        adjusted[:, 2:] = torch.clamp(adjusted[:, 2:], 0)  # Clamp x2, y2

        adjusted_detections.append(adjusted.cpu().numpy().tolist())  # Convert back to list if needed

    return adjusted_detections

In [20]:
action_lstm

ActionLSTM(
  (lstm): LSTM(4, 512, num_layers=2, batch_first=True)
  (fc): Linear(in_features=512, out_features=4, bias=True)
)

In [26]:
video1_path_test = os.path.join(ROOT_DIR, 'test_vid/IMG_1748.mp4')
frames_test = load_video_frames(video1_path_test)

outputs = enhance_detection_with_lstm(frames_test, yolo_model, action_lstm, device)


0: 640x384 8 bottless, 11.9ms
Speed: 5.2ms preprocess, 11.9ms inference, 2.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 bottless, 9.8ms
Speed: 2.8ms preprocess, 9.8ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 bottless, 10.0ms
Speed: 3.0ms preprocess, 10.0ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 bottless, 11.0ms
Speed: 3.1ms preprocess, 11.0ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 bottless, 7.8ms
Speed: 3.4ms preprocess, 7.8ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 bottless, 5.9ms
Speed: 3.4ms preprocess, 5.9ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 bottless, 6.2ms
Speed: 3.2ms preprocess, 6.2ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 bottless, 6.4ms
Speed: 3.2ms preprocess, 6.4ms inference, 1.1ms postprocess per image at shape (1,

In [27]:
# test_tung = np.array(outputs[10])
# # Remove the first dimension
# # test_tung_squeezed = np.squeeze(test_tung, axis=0)

# # Print the new shape to confirm
# print(test_tung.shape)
print(outputs)

[[[701.9638061523438, 675.1643676757812, 873.256591796875, 837.3531494140625], [698.6807250976562, 866.3424072265625, 869.1261596679688, 1019.9292602539062], [336.3475341796875, 854.9853515625, 488.3656005859375, 1022.3793334960938], [320.3450927734375, 1237.15234375, 466.757080078125, 1417.6300048828125], [512.22119140625, 1038.3817138671875, 671.4802856445312, 1209.961669921875], [130.3226776123047, 1034.6304931640625, 288.5371398925781, 1193.3968505859375], [846.47705078125, 1080.90234375, 1015.858642578125, 1252.9000244140625], [120.5511474609375, 1432.47802734375, 274.84564208984375, 1646.092041015625]], [[702.4774780273438, 674.4447021484375, 873.5514526367188, 837.4768676757812], [698.528564453125, 865.99462890625, 868.4950561523438, 1019.9002075195312], [316.8232116699219, 1233.0552978515625, 463.9985046386719, 1415.72607421875], [347.5185852050781, 869.4318237304688, 501.5956726074219, 1035.8984375], [507.1210632324219, 1038.0589599609375, 666.69140625, 1210.2568359375], [698.

In [28]:
for output_frame in outputs:
  for result in output_frame:
    x1, y1, x2, y2 = result
    print(x1, y1, x2, y2)
    break
  break

701.9638061523438 675.1643676757812 873.256591796875 837.3531494140625


In [29]:
video_path = os.path.join(ROOT_DIR, 'test_vid/IMG_1748.mp4')
video_path_out = os.path.join(ROOT_DIR, 'test_vid/IMG_1748_23_05.mp4')

cap = cv2.VideoCapture(video_path)

# Get video properties
fps = int(cap.get(cv2.CAP_PROP_FPS))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Create VideoWriter object
out = cv2.VideoWriter(video_path_out, cv2.VideoWriter_fourcc(*'MP4V'), fps, (width, height))

# Initialize a frame counter
frame_count = 0

# Process video
while cap.isOpened():
    ret, frame = cap.read()
    if ret:
        # Get detections for the current frame
        if frame_count < len(outputs):
            current_detections = outputs[frame_count]

            # Draw rectangles based on current frame detections
            for result in current_detections:
                x1, y1, x2, y2 = result
                cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 4)

        # Write the frame with rectangles to the output video
        out.write(frame)

        # Increment frame counter
        frame_count += 1
    else:
        break

# Release everything when job is finished
cap.release()
out.release()
cv2.destroyAllWindows()
print("Video processing complete and the output saved.")

Video processing complete and the output saved.


In [30]:
video_path = os.path.join(ROOT_DIR, 'test_vid/IMG_1748.mp4')
video_path_out = os.path.join(ROOT_DIR, 'test_vid/IMG_1748_yolo.mp4')

cap = cv2.VideoCapture(video_path)

ret, frame = cap.read()
H, W, _ = frame.shape
# out = cv2.VideoWriter(video_path_out, cv2.VideoWriter_fourcc('X','2','6','4'), int(cap.get(cv2.CAP_PROP_FPS)), (W,H))
out = cv2.VideoWriter(video_path_out, cv2.VideoWriter_fourcc(*'MP4V'), int(cap.get(cv2.CAP_PROP_FPS)), (W, H))

threshold = 0.5

while ret:

    results = yolo_model(frame)[0]

    for result in results.boxes.data.tolist():
        x1, y1, x2, y2, score, class_id = result

        if score > threshold:
            cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 4)
            cv2.putText(frame, results.names[int(class_id)].upper(), (int(x1), int(y1 - 10)),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 255, 0), 3, cv2.LINE_AA)

    out.write(frame)
    ret, frame = cap.read()

cap.release()
out.release()
cv2.destroyAllWindows()


0: 640x384 8 bottless, 11.2ms
Speed: 4.0ms preprocess, 11.2ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 bottless, 10.2ms
Speed: 3.7ms preprocess, 10.2ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 bottless, 7.2ms
Speed: 4.7ms preprocess, 7.2ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 bottless, 9.5ms
Speed: 2.8ms preprocess, 9.5ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 bottless, 9.2ms
Speed: 2.8ms preprocess, 9.2ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 bottless, 6.8ms
Speed: 2.3ms preprocess, 6.8ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 bottless, 9.9ms
Speed: 2.8ms preprocess, 9.9ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 bottless, 7.3ms
Speed: 4.7ms preprocess, 7.3ms inference, 1.3ms postprocess per image at shape (1, 3

In [31]:
torch.save(action_lstm.state_dict(), '/content/gdrive/My Drive/lstm_model/lstm_yolo.pth')