In [None]:
class TStarFramework(nn.Module):
    """
    A unified framework for video-based question answering, combining grounding,
    keyframe searching, and reasoning in a modular pipeline.

    The searcher is video-specific, so it is not created in ``__init__``. It is
    built by ``initialize_searcher``, either explicitly by the caller or lazily
    by ``forward`` the first time a video path is seen.
    """

    def __init__(self, pretrain_path_or_name="mll-lab/tstar-v1", config: transformers.PretrainedConfig = None):
        """
        Initializes the TStarFramework with pre-trained modules for grounding and reasoning.

        Args:
            pretrain_path_or_name (str): Path or model name for pretrained TStar components.
            config (transformers.PretrainedConfig): Optional configuration for the model.
        """
        super().__init__()
        # NOTE(review): ``config`` is forwarded as ``model_base`` -- confirm this is
        # the parameter TStarGrounder expects a configuration on.
        self.query_grounder = TStarGrounder(model_path=pretrain_path_or_name, model_base=config)
        # No video is known at construction time; the searcher is created by
        # initialize_searcher() once a video path is available.
        self.searcher = None
        # Grounding and QA reasoning share a single model instance.
        self.reasoner = self.query_grounder
        # Bookkeeping used by forward() to decide whether the searcher must be rebuilt.
        self._searcher_video_path = None
        self._searcher_settings = {}

    def initialize_searcher(self, video_path: str, search_nframes=8, target_objects=None, cue_objects=None, image_grid_shape=(8, 8), threshold=0.6) -> None:
        """
        Dynamically initializes the searcher for the given video path.

        Args:
            video_path (str): Path to the video file.
            search_nframes: Forwarded to TStarSearcher as ``search_nframes``.
            target_objects: Forwarded to TStarSearcher as ``target_objects``;
                ``None`` means a fresh empty list.
            cue_objects: Forwarded to TStarSearcher as ``cue_objects``;
                ``None`` means a fresh empty list.
            image_grid_shape: Forwarded to TStarSearcher as ``image_grid_shape``.
            threshold: Forwarded to TStarSearcher as ``confidence_threshold``.
        """
        # Fresh lists per call: a mutable default would be shared by every
        # searcher created without explicit object lists.
        if target_objects is None:
            target_objects = []
        if cue_objects is None:
            cue_objects = []

        self.searcher = TStarSearcher(
            video_path=video_path,
            target_objects=target_objects,
            cue_objects=cue_objects,
            search_nframes=search_nframes,
            image_grid_shape=image_grid_shape,
            confidence_threshold=threshold,
        )
        # Remember what the searcher was built with so forward() can rebuild it
        # with the same settings when it is asked about a different video.
        self._searcher_video_path = video_path
        self._searcher_settings = {
            "search_nframes": search_nframes,
            "image_grid_shape": image_grid_shape,
            "threshold": threshold,
        }

    def forward(self, frames: List[Image.Image], question: str, options: str, video_path: str) -> Dict[str, object]:
        """
        Runs the full pipeline for question answering over video data.

        Args:
            frames (List[Image.Image]): Initial frames for grounding.
            question (str): The question to be answered.
            options (str): Multiple-choice options for the question.
            video_path (str): Path to the video file. Used to create the searcher
                when none exists yet, or to rebuild it when it was initialized
                through ``initialize_searcher`` for a different path.

        Returns:
            Dict[str, object]: A dictionary with the keys ``"answer"``,
            ``"keyframes"``, ``"timestamps"`` and ``"grounding_results"``.
        """
        # Step 1: Grounding (Query Understanding) -- run exactly once per call.
        grounding_results = self.query_grounder.inference_query_grounding(frames, question)
        target_objects = grounding_results["target_objects"]
        cue_objects = grounding_results["cue_objects"]
        print(f"Grounding Results - Targets: {target_objects}, Cues: {cue_objects}")

        # Step 2: Keyframe Searching
        recorded_path = getattr(self, "_searcher_video_path", None)
        needs_new_searcher = self.searcher is None or (
            recorded_path is not None and recorded_path != video_path
        )
        if needs_new_searcher:
            # Reuse the settings of the last initialize_searcher() call, if any,
            # so a custom frame budget / grid / threshold survives a video switch.
            settings = getattr(self, "_searcher_settings", None) or {}
            self.initialize_searcher(
                video_path,
                target_objects=target_objects,
                cue_objects=cue_objects,
                **settings,
            )
        else:
            # Existing searcher (possibly assigned directly by the caller):
            # only refresh the objects it should look for.
            self.searcher.target_objects = target_objects
            self.searcher.cue_objects = cue_objects
        keyframes, timestamps = self.searcher.search()
        print(f"Keyframes Found: {len(keyframes)}, Timestamps: {timestamps}")

        # Step 3: Reasoning (QA Inference)
        answer = self.reasoner.inference_qa(keyframes, question, options)
        print(f"Reasoning Results - Answer: {answer}")

        return {
            "answer": answer,
            "keyframes": keyframes,
            "timestamps": timestamps,
            "grounding_results": grounding_results,
        }
