In [None]:
from google.colab import userdata
# Look the API key up in Colab's `userdata` store instead of hardcoding it in
# the notebook; later cells pass it to the Generative Language API client.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')

In [None]:
# Fetch the discovery docs for the Generative Language API service.
from googleapiclient.discovery import build
import googleapiclient
import requests

# Build the API client from the service's discovery document.
DISCOVERY_URL = f'https://generativelanguage.googleapis.com/$discovery/rest?version=v1beta&key={GOOGLE_API_KEY}'
# A timeout keeps the cell from hanging forever on a stalled connection.
discovery_response = requests.get(DISCOVERY_URL, timeout=60)
# Fail here, with the HTTP status, rather than handing an error body (for
# example an invalid-key response) to build_from_document below.
discovery_response.raise_for_status()
discovery_docs = discovery_response.content
genai_service = googleapiclient.discovery.build_from_document(discovery_docs, developerKey=GOOGLE_API_KEY)

In [None]:
from google.colab import drive
# Mount Google Drive; later cells read `drive/MyDrive/...` relative paths.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import cv2
import os
import shutil

# Marker placed between the video-derived prefix and the frame index in every
# extracted image name (e.g. `clip_mp4_frame0003.jpg`).
FRAME_PREFIX = "_frame"

def create_frame_output_dir(output_dir):
  """Make `output_dir` an empty directory.

  Anything already at that path is removed first, then the directory
  (including missing parents) is created afresh.
  """
  if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
  os.makedirs(output_dir)

def extract_frame_from_video(vfile, frame_path):
  """Save roughly one frame per second of a video as JPEG images.

  Images go to `<frame_path>/<video id>/`, where the video id is the basename
  of `vfile` with '.mp4' removed. Each image is named
  `<basename with '.' replaced by '_'><FRAME_PREFIX><NNNN>.jpg`, where NNNN is
  the zero-padded index of the saved frame.

  Args:
    vfile: Path to the video file.
    frame_path: Parent directory that holds one sub-directory per video.

  Returns:
    None. The call is a no-op when the per-video directory already exists.
  """
  vid = os.path.basename(vfile).replace('.mp4', '')
  frame_path = f"{frame_path}/{vid}"
  if os.path.exists(frame_path):
    return
  print(f"Extracting {vfile} at 1 frame per second. This might take a bit...")
  create_frame_output_dir(frame_path)
  output_file_prefix = os.path.basename(vfile).replace('.', '_')
  frame_count = 0  # Number of frames written so far (also the file index)
  count = 0        # Number of frames decoded so far
  vidcap = cv2.VideoCapture(vfile)
  try:
    # Round instead of truncating so 23.976/29.97 fps sources are sampled
    # every 24/30 frames, and never let the step drop to 0 (a reported fps
    # below 1 would otherwise make the modulo below divide by zero).
    frames_per_sample = max(1, round(vidcap.get(cv2.CAP_PROP_FPS)))
    # No priming read before the loop: the very first frame is sample 0.
    while vidcap.isOpened():
      success, frame = vidcap.read()
      if not success:  # End of video
        break
      if count % frames_per_sample == 0:  # Extract a frame every second
        image_name = f"{output_file_prefix}{FRAME_PREFIX}{frame_count:04d}.jpg"
        output_filename = os.path.join(frame_path, image_name)
        cv2.imwrite(output_filename, frame)
        frame_count += 1
      count += 1
  finally:
    # Release the capture object even if decoding or writing fails.
    vidcap.release()
  print(f"Completed video frame extraction!\n\nExtracted: {frame_count} frames")

In [None]:
import mimetypes
import os

class File:
  """A local file plus the metadata needed to reference it in a request.

  Attributes:
    file_path: Path of the file on disk.
    display_name: Optional display name; None when not provided.
    timestamp: 'MM:SS' string derived from `timestamp_seconds`, or None when
      no timestamp was given.
    mimetype: Explicit MIME type, or the one guessed from `file_path`
      (None if it cannot be guessed).
    uri: Remote URI of the uploaded file; None until it is set.
  """

  def __init__(self, file_path: str, display_name: str = None,
               timestamp_seconds: int = None, mimetype: str = None, uri = None):
    self.file_path = file_path
    # Always define the optional attributes so reading them later never
    # raises AttributeError.
    self.display_name = display_name if display_name else None
    self.timestamp = (
        seconds_to_time_string(timestamp_seconds)
        if timestamp_seconds is not None else None)
    # Detect mimetype if not specified
    self.mimetype = mimetype if mimetype else mimetypes.guess_type(file_path)[0]
    self.uri = uri

  def set_file_uri(self, uri):
    """Record the URI assigned to this file after upload."""
    self.uri = uri

def seconds_to_time_string(seconds):
  """Format an integer second count as 'MM:SS' (e.g. 75 -> '01:15').

  Minutes are not wrapped at 60, so 3600 becomes '60:00'. This is the
  timestamp format expected by Gemini 1.5.
  """
  whole_minutes, leftover_seconds = divmod(seconds, 60)
  return f"{whole_minutes:02d}:{leftover_seconds:02d}"

def get_timestamp_seconds(filename):
  """Return the frame index encoded in an extracted-frame filename.

  Expected format: 'output_file_prefix_frame0000.jpg', i.e. the index sits
  between the last FRAME_PREFIX and the first following '.'.

  Returns:
    The index as an int, or None when FRAME_PREFIX does not occur in
    `filename` (the name is not in the expected format).

  Raises:
    ValueError: If the text after FRAME_PREFIX is not an integer.
  """
  # Split on the LAST occurrence: the video-derived prefix may itself contain
  # FRAME_PREFIX (e.g. 'my_frame_clip_mp4_frame0003.jpg').
  _, marker, tail = filename.rpartition(FRAME_PREFIX)
  if not marker:
    return None  # Indicate that the filename might be incorrectly formatted

  frame_count_str = tail.split(".")[0]
  return int(frame_count_str)

def delete_upload_files(uploaded_files):
  """Delete every uploaded file through the API's files().delete call.

  Each item must expose a `uri` attribute; the text after the last '/files/'
  in that URI is used to build the resource name 'files/<id>'.
  """
  print(f'Deleting {len(uploaded_files)} images. This might take a bit...')
  for uploaded in uploaded_files:
    file_id = uploaded.uri.rpartition("/files/")[2]
    delete_request = genai_service.files().delete(name=f"files/{file_id}")
    delete_request.execute()

  print(f"Completed deleting files!\n\nDeleted: {len(uploaded_files)} files")

def upload_single_video(frame_path):
  """Upload every file in `frame_path` and return the uploaded `File` objects.

  Directory entries are handled in sorted (alphabetical) order. Each one is
  wrapped in a `File` whose timestamp comes from `get_timestamp_seconds`,
  uploaded through the API's media().upload call, and tagged with the URI
  found in the upload response.
  """
  frame_names = sorted(os.listdir(frame_path))
  # Build all File objects first so a badly named file fails before any upload.
  files_to_upload = [
      File(file_path=os.path.join(frame_path, frame_name),
           timestamp_seconds=get_timestamp_seconds(frame_name))
      for frame_name in frame_names
  ]

  # Upload the files to the API
  uploaded_files = []
  print(f'Uploading {len(files_to_upload)} files. This might take a bit...')
  for pending in files_to_upload:
    upload_request = genai_service.media().upload(
        media_body=pending.file_path,
        media_mime_type=pending.mimetype)
    response = upload_request.execute()
    pending.set_file_uri(response["file"]["uri"])
    uploaded_files.append(pending)
  print(f"Completed file uploads!\n\nUploaded: {len(uploaded_files)} files")
  return uploaded_files


In [None]:
from tqdm import tqdm

# Make GenerateContent Request
def makeGenerateContentRequest(prompt, files):
  """Assemble the generateContent request body.

  The single content entry starts with the text prompt, followed by the parts
  that `makeVideoPart` produces for each file, in the order given.
  """
  parts = [{ "text": prompt }]
  for frame_file in files:
    parts.extend(makeVideoPart(frame_file))
  return {"contents": [{ "parts": parts }]}

def makeVideoPart(file):
  """Return the two request parts for one frame: its timestamp text, then a
  reference to the uploaded file (URI and MIME type)."""
  timestamp_part = {"text": file.timestamp}
  frame_part = {"file_data": {"file_uri": file.uri, "mime_type": file.mimetype }}
  return [timestamp_part, frame_part]

def extract_videos(video_ids, frame_path):
  """Unpack the video archive if needed, then extract frames for each video.

  The archive is unzipped into the current directory only when no `videos`
  path exists yet. Every id in `video_ids` is then mapped to
  `videos/<id>.mp4` and passed to `extract_frame_from_video`.
  """
  # Unzip videos
  videos_present = os.path.exists('videos')
  if not videos_present:
    print("Unzipping videos.zip ...")
    # change to your own path of TempCompass Videos
    os.system('unzip drive/MyDrive/videos.zip -d .')

  # Extracting video to frames
  for video_id in tqdm(video_ids):
    extract_frame_from_video(f"videos/{video_id}.mp4", frame_path)

def inference_single_video(prompt, uploaded_files, model="models/gemini-1.5-pro-latest"):
  """Send one prompt plus the uploaded frames to the model and return its text.

  Args:
    prompt: Text prompt placed first in the request.
    uploaded_files: Uploaded frame files to attach to the request.
    model: Model resource name passed to generateContent.

  Returns:
    The text of the first part of the first candidate, or None when the
    response carries no usable text: the prompt was blocked (any
    `promptFeedback.blockReason`) or the candidate has no text part (e.g.
    finishReason 'SAFETY').

  Raises:
    KeyError: If the response has neither candidates nor a block reason, so
      the caller's retry logic can treat it as a failed attempt.
  """
  response = genai_service.models().generateContent(
      model = model,
      body = makeGenerateContentRequest(prompt, uploaded_files)).execute()
  print(response)

  candidates = response.get('candidates') or []
  if not candidates:
    # A blocked prompt will be blocked again on retry, so report "no answer"
    # for every block reason instead of raising from a failed lookup.
    if response.get('promptFeedback', {}).get('blockReason'):
      return None
    raise KeyError(f"no candidates in response: {response}")

  try:
    return candidates[0]['content']['parts'][0]['text']
  except (KeyError, IndexError, TypeError):
    # Candidate without text content (e.g. stopped for safety).
    return None

In [None]:
import json, time
from google.colab import files

class MaxTryExceedError(Exception):
  """Raised when a request still fails after all retries are used up."""

  def __init__(self, message):
    super().__init__(message)
    self.message = message

# --- Run configuration -------------------------------------------------------
qtype = 'captioning'
question_path = f"drive/MyDrive/questions/{qtype}.json"   # change to your own path of TempCompass questions
output_path = 'predictions'
video_frm_path = 'video_frames'

# Questions are keyed by video id.
with open(question_path, 'r') as question_file:
  input_datas = json.load(question_file)

if not os.path.exists(output_path):
  os.makedirs(output_path)
pred_file = f"{output_path}/{qtype}.json"

# Resume from earlier predictions when the file exists; otherwise start empty.
predictions = {}
if os.path.isfile(pred_file):
  with open(pred_file, 'r') as prediction_file:
    predictions = json.load(prediction_file)

# Suffix appended to each question, selected by question type.
answer_prompt = {
  "multi-choice": "\nPlease directly give the best option:",
  "yes_no": "\nPlease answer yes or no:",
  "caption_matching": "\nPlease directly give the best option:",
  "captioning": ""    # The answer "Generated Caption:" is already contained in the question
}

# Make sure frames exist on disk for every video referenced by the questions.
extract_videos(list(input_datas.keys()), video_frm_path)

# Running inference over the dataset
MAX_RETRIES = 10               # Retries allowed after the first failed attempt
RETRY_WAIT_SECONDS = 30        # Pause before retrying a failed request
REQUEST_INTERVAL_SECONDS = 18  # Pause after each successful request (presumably for rate limiting; confirm)

for vid, data in tqdm(input_datas.items()):
  # Skip videos whose saved predictions already cover every dimension.
  if vid in predictions and len(predictions[vid]) == len(data):
    continue
  cur_video_frm_path = os.path.join(video_frm_path, vid)
  print(cur_video_frm_path)
  uploaded_files = upload_single_video(cur_video_frm_path)
  try:
    if vid not in predictions:
      predictions[vid] = {}
    for dim, questions in data.items():
      # A dimension is complete once it holds one prediction per question.
      if dim in predictions[vid] and len(predictions[vid][dim]) == len(questions):
        continue
      predictions[vid][dim] = []
      for question in questions:
        prompt = question['question'] + answer_prompt[qtype]
        max_try = MAX_RETRIES
        while True:
          try:
            video_llm_pred = inference_single_video(prompt, uploaded_files)
            break
          except Exception as e:
            print(e)
            if max_try <= 0:
              # Chain the last failure so the root cause stays in the traceback.
              raise MaxTryExceedError("Max try exceed...") from e
            max_try -= 1
            print(f"Not success! {max_try} retries remaining...")
            time.sleep(RETRY_WAIT_SECONDS)
        time.sleep(REQUEST_INTERVAL_SECONDS)
        predictions[vid][dim].append({'question': question['question'], 'answer': question['answer'], 'prediction': video_llm_pred})
      # Persist after every dimension so an interrupted run can resume.
      with open(pred_file, 'w') as f:
        json.dump(predictions, f, indent=4)
  finally:
    # Always remove the uploaded frames, even when inference gives up, so
    # failed runs do not leave files behind on the service.
    delete_upload_files(uploaded_files)

100%|██████████| 410/410 [00:00<00:00, 70061.71it/s]
  0%|          | 0/410 [00:00<?, ?it/s]

video_frames/1054717541
Uploading 7 files. This might take a bit...
{'candidates': [{'content': {'parts': [{'text': '## Analysis of Video and Information\n\nBased on the video showing a baker kneading dough with their hands at what appears to be a regular speed, the information consistent with the video is:\n\n**Information A: {\'subject\': \'entire video\', \'speed\': \'at normal speed\'}** \n\nThe video does not display characteristics of slow motion or time-lapse, making options B and C inaccurate.\n\n## Suggested Caption\n\nConsidering the analysis, a suitable caption for the video could be:\n\n**"A baker expertly kneads dough, preparing it for baking."** \n'}], 'role': 'model'}, 'finishReason': 'STOP', 'index': 0, 'safetyRatings': [{'category': 'HARM_CATEGORY_SEXUALLY_EXPLICIT', 'probability': 'NEGLIGIBLE'}, {'category': 'HARM_CATEGORY_HATE_SPEECH', 'probability': 'NEGLIGIBLE'}, {'category': 'HARM_CATEGORY_HARASSMENT', 'probability': 'NEGLIGIBLE'}, {'category': 'HARM_CATEGORY_DANG

  6%|▌         | 24/410 [01:39<26:37,  4.14s/it]

Completed deleting files!

Deleted: 7 files
video_frames/1093041749
Uploading 11 files. This might take a bit...
{'candidates': [{'content': {'parts': [{'text': "## Analysis of Video and Information:\n\nThe video clearly shows a dragon landing on the ground amidst flames and then breathing fire. \n\nTherefore, the information consistent with the video is:\n\n**Information B: {'subject': 'dragon', 'order': 'landing from the sky and then breathing fire'}**\n\n\n## Generated Caption:\n\n**A fearsome dragon descends from the dark sky, landing amidst a fiery inferno. With a mighty roar, it unleashes a torrent of flames, engulfing the surroundings in a blazing inferno.** \n"}], 'role': 'model'}, 'finishReason': 'STOP', 'index': 0, 'safetyRatings': [{'category': 'HARM_CATEGORY_SEXUALLY_EXPLICIT', 'probability': 'NEGLIGIBLE'}, {'category': 'HARM_CATEGORY_HATE_SPEECH', 'probability': 'NEGLIGIBLE'}, {'category': 'HARM_CATEGORY_HARASSMENT', 'probability': 'NEGLIGIBLE'}, {'category': 'HARM_CATEGOR

100%|██████████| 410/410 [03:23<00:00,  2.01it/s]

Completed deleting files!

Deleted: 11 files



