In [1]:
from data_fetcher import clone_repo, collect_python_files
from split_generator import SplitGenerator

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


  from .autonotebook import tqdm as notebook_tqdm


# 1. Fetching Data

In [5]:
REPO_URL = "https://github.com/mradovic38/football_analysis"

# Clone the repository
clone_repo(REPO_URL, clone_dir="repo")

# Collect all Python files from the cloned repository
collect_python_files("repo", target_dir="code_examples")

Cloning repository from https://github.com/mradovic38/football_analysis into repo...
Repository cloned successfully.
Collecting Python files from repo into code_examples...
Copied: repo\main.py -> code_examples\main.py
Copied: repo\yolo_inf.py -> code_examples\yolo_inf.py
Copied: repo\annotation\abstract_annotator.py -> code_examples\abstract_annotator.py
Copied: repo\annotation\abstract_video_processor.py -> code_examples\abstract_video_processor.py
Copied: repo\annotation\football_video_processor.py -> code_examples\football_video_processor.py
Copied: repo\annotation\frame_number_annotator.py -> code_examples\frame_number_annotator.py
Copied: repo\annotation\keypoints_annotator.py -> code_examples\keypoints_annotator.py
Copied: repo\annotation\object_annotator.py -> code_examples\object_annotator.py
Copied: repo\annotation\projection_annotator.py -> code_examples\projection_annotator.py
Copied: repo\ball_to_player_assignment\ball_to_player_assigner.py -> code_examples\ball_to_player_

# 2. Creating Data Examples

In [13]:
sg = SplitGenerator('code_examples', max_chars=256)

sg.generate('dataset/data.csv')

Generated 1 examples for file: code_examples\abstract_tracker.py
Generated 1 examples for file: code_examples\abstract_video_processor.py
Generated 9 examples for file: code_examples\ball_to_player_assigner.py
Generated 5 examples for file: code_examples\bbox_utils.py
Generated 17 examples for file: code_examples\club_assigner.py
Generated 3 examples for file: code_examples\color_utils.py
Generated 12 examples for file: code_examples\football_video_processor.py
Generated 3 examples for file: code_examples\frame_number_annotator.py
Generated 6 examples for file: code_examples\homography.py
Generated 3 examples for file: code_examples\keypoints_annotator.py
Generated 5 examples for file: code_examples\keypoints_tracker.py
Generated 4 examples for file: code_examples\main.py
Generated 5 examples for file: code_examples\object_annotator.py
Generated 2 examples for file: code_examples\object_position_mapper.py
Generated 10 examples for file: code_examples\object_tracker.py
Generated 3 examp

# 3. Loading Data

In [14]:
df = pd.read_csv('dataset/data.csv', delimiter='|').fillna('')

df.head()

Unnamed: 0,fname,prefix,middle,suffix
0,code_examples\ball_to_player_assigner.py,"from utils import point_distance, get_bbox_cen...",from .possession_tracking import PossessionTra...,"from typing import Dict, Tuple, Any\n\nclass B..."
1,code_examples\club_assigner.py,"def predict(self, extracted_color: Tuple[i...","""""""\n Predict the club for a gi...",\n Args:\n
2,code_examples\bbox_utils.py,"def point_coord_diff(p1: Tuple[float, float], ...",Calculate the coordinate differences betwe...,"Args:\n p1 (Tuple[float, float]): T..."
3,code_examples\projection_annotator.py,Parameters:\n frame (np.nda...,"pos (tuple): The (x, y) position o...",shape (str): The shape of the outl...
4,code_examples\club_assigner.py,"return (int(jersey_color_bgr[2]), int(...","\n def save_player_image(self, img: np.ndar...","""""""\n"


In [17]:
df['suffix'][6]

'    A video processor for football footage that tracks objects and keypoints,\n'

In [18]:
# Load the Tiny Starcoder model and tokenizer
model_name = "bigcode/tiny_starcoder_py"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Ensure pad_token_id is set to a valid token (e.g., eos_token_id)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Set the model to evaluation mode
model.eval()

# Move model to device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to generate predictions for the middle part
def get_completion(prefix, suffix, max_new_tokens=50):
    # Prepare the input text
    input_text = f"<fim_prefix>{prefix}<fim_suffix>{suffix}<fim_middle>"
    
    # Tokenize the input
    inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
    
    # Generate the completion
    with torch.no_grad():
        outputs = model.generate(
            inputs, 
            max_length=256
        )
    
    # Decode and return the generated text
    generated_text = tokenizer.decode(outputs[0])
    
    # Extract the completion (text between prefix and suffix)
    completion = generated_text.split("<fim_middle>")[1].split(suffix)[0]
    
    return completion

# Generate predictions for each row in the DataFrame
preds = df[:5].apply(lambda row: get_completion(row['prefix'], row['suffix']), axis=1)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [5]:
input_text = "<fim_prefix>def print_one_two_three():\n    print('one')\n    <fim_suffix>\n    print('three')<fim_middle>"
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_length=128)
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


<fim_prefix>def print_one_two_three():
    print('one')
    <fim_suffix>
    print('three')<fim_middle>print('two')
    print('three')<|endoftext|>


In [13]:
for i in range(5):
    print('\nPREFIX:')
    print(df['prefix'][i])
    print('*'*50)
    print('\nMIDDLE TRUE:')
    print(df['middle'][i])
    print('*'*50)
    print('\nSUFFIX:')
    print(df['suffix'][i])
    print('_'*100)


PREFIX:
from abc import ABC, abstractmethod
from typing import Any

class AbstractWriter(ABC):
    """An abstract base class for writing data to a file."""

    @abstractmethod
    def write(self, filename: str, data: Any) -> None:
**************************************************

MIDDLE TRUE:
        """Save data to a file.

        Args:
            filename (str): The name of the file to save the data.
            data (Any): The data to be saved.
        """
        pass
        
    @abstractmethod
    def _make_serializable(self, obj: Any) -> Any:
**************************************************

SUFFIX:
        """Convert objects to a serializable format.

        Args:
            obj (Any): The object to convert.

        Returns:
            Any: A serializable representation of the object.
        """
        pass
____________________________________________________________________________________________________

PREFIX:
from .abstract_annotator import AbstractAnnotato

In [12]:
res = df.drop(columns=['fname']).copy()

res['mid_pred'] = preds

for i in range(5):
    print('PREFIX:')
    print(res['prefix'][i])
    print('*'*50)
    print('MIDDLE PREDICTION:')
    print(res['mid_pred'][i])
    print('*'*50)
    print('MIDDLE TRUE:')
    print(res['middle'][i])
    print('*'*50)
    print('SUFFIX:')
    print(res['suffix'][i])
    print('_'*100)

PREFIX:
                    color = track_info.get('club_color', (255, 255, 255))

**************************************************
MIDDLE PREDICTION:
                            shape = 'circle' if class_name == 'goalkeeper' else 'square'
                            self._draw_outline(frame, proj_pos, shape=shape, is_dark=is_dark_color)

                            if track_info.get('has_ball', False):
<|endoftext|>
**************************************************
MIDDLE TRUE:
                    color = rgb_bgr_converter(color)

**************************************************
SUFFIX:
                    is_dark_color = is_color_dark(color)

                    if class_name in ['player', 'goalkeeper']:
                        shape = 'square' if class_name == 'goalkeeper' else 'circle'
                        self._draw_outline(frame, proj_pos, shape=shape, is_dark=is_dark_color)

                        if track_info.get('has_ball', False):

__________________________________