In [17]:
from data_fetcher import clone_repo, collect_python_files
from split_generator import SplitGenerator
from manual_reviewer import ManualReviewer

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics import accuracy_score
from sacrebleu import corpus_bleu
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import numpy as np
from scipy.stats import pearsonr, spearmanr

# 1. Fetching Data

In [2]:
REPO_URL = "https://github.com/mradovic38/football_analysis"

# Clone the repository
clone_repo(REPO_URL, clone_dir="repo")

# Collect all Python files from the cloned repository
collect_python_files("repo", target_dir="code_examples")

Cloning repository from https://github.com/mradovic38/football_analysis into repo...
Repository cloned successfully.
Collecting Python files from repo into code_examples...
Copied: repo\main.py -> code_examples\main.py
Copied: repo\yolo_inf.py -> code_examples\yolo_inf.py
Copied: repo\annotation\abstract_annotator.py -> code_examples\abstract_annotator.py
Copied: repo\annotation\abstract_video_processor.py -> code_examples\abstract_video_processor.py
Copied: repo\annotation\football_video_processor.py -> code_examples\football_video_processor.py
Copied: repo\annotation\frame_number_annotator.py -> code_examples\frame_number_annotator.py
Copied: repo\annotation\keypoints_annotator.py -> code_examples\keypoints_annotator.py
Copied: repo\annotation\object_annotator.py -> code_examples\object_annotator.py
Copied: repo\annotation\projection_annotator.py -> code_examples\projection_annotator.py
Copied: repo\ball_to_player_assignment\ball_to_player_assigner.py -> code_examples\ball_to_player_

# 2. Loading the Model

In [2]:
# Load the Tiny Starcoder model and tokenizer
MODEL_NAME = "bigcode/tiny_starcoder_py"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# 3. Creating Data Examples

In [3]:
PREFIX_LENGTH = 200
MIDDLE_LEGTH = 40
SUFFIX_LENGTH = 200

In [4]:
sg = SplitGenerator(tokenizer=tokenizer, directory='code_examples', 
                    middle_length=MIDDLE_LEGTH, prefix_length=PREFIX_LENGTH, suffix_length=SUFFIX_LENGTH)

sg.generate('dataset/data.csv', random_seed=42)

Generated 5 examples for file: code_examples\ball_to_player_assigner.py
Generated 1 examples for file: code_examples\bbox_utils.py
Generated 5 examples for file: code_examples\club_assigner.py
Generated 9 examples for file: code_examples\football_video_processor.py
Generated 2 examples for file: code_examples\homography.py
Generated 2 examples for file: code_examples\keypoints_tracker.py
Generated 2 examples for file: code_examples\main.py
Generated 5 examples for file: code_examples\object_annotator.py
Generated 1 examples for file: code_examples\object_position_mapper.py
Generated 2 examples for file: code_examples\object_tracker.py
Generated 4 examples for file: code_examples\projection_annotator.py
Generated 2 examples for file: code_examples\speed_estimator.py
Generated 2 examples for file: code_examples\tracks_json_writer.py
Generated 4 examples for file: code_examples\video_utils.py


# 3. Loading Data

In [5]:
df = pd.read_csv('dataset/data.csv', delimiter='|').fillna('')

df.head()

Unnamed: 0,fname,prefix,middle,suffix
0,code_examples\tracks_json_writer.py,from .abstract_writer import AbstractWriter\n\...,_dir\n self.obj_path = os.path.join(sel...,"join(self.save_dir, f'{keypoints_fname}.json')..."
1,code_examples\club_assigner.py,os.listdir(self.output_dir) if name.startswit...,"55, 255])\n\n # Create the mask\n ...",of masked pixels\n total_pixels = imag...
2,code_examples\ball_to_player_assigner.py,ball_speed\n self.speed_check_frames = ...,"bool: True if the ball movement is valid, Fal...","_pos, last_frame = self.ball_history[-1]\n\n ..."
3,code_examples\football_video_processor.py,()[-1]\n possession_club1 = possession[...,_color)\n\n # Draw club 1's possession ...,"width, bar_y + bar_height), club1_color, -1)\n..."
4,code_examples\football_video_processor.py,"np.ndarray:\n """"""\n Combines th...",".resize(projection_frame, (new_w_proj, new_h_p...","frame = np.zeros((canvas_height, canvas_width,..."


# 4. Making Predictions

In [6]:
# Ensure pad_token_id is set to a valid token (e.g., eos_token_id)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Set the model to evaluation mode
model.eval()

# Move model to device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to generate predictions for the middle part
def get_completion(prefix: str, suffix: str) -> str:
    # Prepare the input text
    input_text = f"<fim_prefix>{prefix}<fim_suffix>{suffix}<fim_middle>"
    
    # Tokenize the input, ensuring it returns tensors
    inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(device)
    
    # Generate the completion
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"], 
            max_length=PREFIX_LENGTH + MIDDLE_LEGTH + SUFFIX_LENGTH, 
            pad_token_id=tokenizer.pad_token_id
        )
    
    # Decode the output and extract the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
    
    # Extract the completion (text between prefix and suffix)
    middle_start = generated_text.find("<fim_middle>") + len("<fim_middle>")
    middle_end = generated_text.find(suffix, middle_start)
    completion = generated_text[middle_start:middle_end].replace('<|endoftext|>', '')
    
    return completion

# Generate predictions for each row in the DataFrame
preds = df.apply(lambda row: get_completion(row['prefix'], row['suffix']), axis=1)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


# 5. Tiny Starcoder Evaluation

## 5.1 Evaluating Example by Example

Manually going through each of the examples one-by-one and assigning scores for each prediction (0 to 1)

In [7]:
df['mid_pred'] = preds

reviewer = ManualReviewer()

df = reviewer.review(df, output_path='reviewed_data.csv')


--- Review Example ---
Prefix:
from .abstract_writer import AbstractWriter

import os
import json
import numpy as np
from typing import Any, List


class TracksJsonWriter(AbstractWriter):
    """
    A class to write tracking data to JSON files.

    This class handles writing both object tracks and keypoint tracks to separate JSON files.
    It ensures that existing data can be appended without losing previous entries.
    """

    def __init__(self, save_dir: str = '', object_fname: str = 'object_tracks', 
                 keypoints_fname: str = 'keypoint_tracks') -> None:
        """
        Initializes the TracksJsonWriter.

        Args:
            save_dir (str): Directory to save JSON files.
            object_fname (str): Filename for object tracks (without extension).
            keypoints_fname (str): Filename for keypoint tracks (without extension).
        """
        super().__init__()
        self.save_dir = save
*********************************************************

## 5.2 Evaluating using Different Evaluation Metrics

In [8]:
res = pd.read_csv('reviewed_data.csv', delimiter='|').fillna('')

In [12]:
# Preprocess function for cleaning the text
def preprocess(text):
    return text.strip().lower()

res['middle'] = res['middle'].apply(preprocess)
res['mid_pred'] = res['mid_pred'].apply(preprocess)


# Calculate Exact Match
def exact_match(row):
    return row['middle'] == row['mid_pred']

def calculate_bleu(reference, hypothesis):
    # Apply smoothing to avoid 0 BLEU score
    smoothing_function = SmoothingFunction().method1

    # Calculate BLEU score with smoothing
    bleu_score = sentence_bleu(reference, hypothesis, smoothing_function=smoothing_function)

    return bleu_score

res['exact_match'] = res.apply(exact_match, axis=1)

# Calculate chrF score
res['chrf'] = res.apply(lambda row: corpus_bleu([row['mid_pred']], [[row['middle']]]).score, axis=1)

# Calculate BLEU score
res['bleu'] = res.apply(lambda row: calculate_bleu([row['middle'].split()], row['mid_pred'].split()), axis=1)

# Calculate ROUGE score
def calculate_rouge(row):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(row['middle'], row['mid_pred'])
    return scores['rouge1'].fmeasure, scores['rouge2'].fmeasure, scores['rougeL'].fmeasure

res[['rouge1', 'rouge2', 'rougeL']] = res.apply(calculate_rouge, axis=1, result_type='expand')

# Combine results
results = {
    'exact_match': np.mean(res['exact_match']),
    'chrf': np.mean(res['chrf']),
    'bleu': np.mean(res['bleu']),
    'rouge1': np.mean(res['rouge1']),
    'rouge2': np.mean(res['rouge2']),
    'rougeL': np.mean(res['rougeL'])
}

In [13]:
print(f"Exact Match Score: {results['exact_match']:.2f}")
print(f"Character-Level F Score: {results['chrf']:.2f}")
print(f"BLEU: {results['bleu']:.2f}")
print(f"Rouge-1: {results['rouge1']:.2f}")
print(f"Rouge-2: {results['rouge2']:.2f}")
print(f"Rouge-L: {results['rougeL']:.2f}")

Exact Match Score: 0.00
Character-Level F Score: 43.74
BLEU: 0.20
Rouge-1: 0.55
Rouge-2: 0.40
Rouge-L: 0.52


In [15]:
res.head()

Unnamed: 0,fname,prefix,middle,suffix,mid_pred,label,comment,exact_match,chrf,bleu,rouge,rouge1,rouge2,rougeL
0,code_examples\tracks_json_writer.py,from .abstract_writer import AbstractWriter\n\...,_dir\n self.obj_path = os.path.join(sel...,"join(self.save_dir, f'{keypoints_fname}.json')...",_dir\n self.object_fname = object_fname...,2,"Some parts make sense, but mostly incorrect.",False,11.39151,0.027776,"{'rouge1': (0.35294117647058826, 0.31578947368...",0.333333,0.117647,0.277778
1,code_examples\club_assigner.py,os.listdir(self.output_dir) if name.startswit...,"55, 255])\n\n # create the mask\n ...",of masked pixels\n total_pixels = imag...,", 25])\n\n # define the mask\n m...",0,"Almost entirely correct. Some parts differ, bu...",False,71.654203,0.493098,"{'rouge1': (0.7333333333333333, 0.647058823529...",0.6875,0.666667,0.6875
2,code_examples\ball_to_player_assigner.py,ball_speed\n self.speed_check_frames = ...,"bool: true if the ball movement is valid, fals...","_pos, last_frame = self.ball_history[-1]\n\n ...","bool: true if the ball is moving, false otherw...",1,"The comment is entirely correct, but the follo...",False,42.69759,0.273548,"{'rouge1': (0.6818181818181818, 0.6, 0.6382978...",0.638298,0.355556,0.595745
3,code_examples\football_video_processor.py,()[-1]\n possession_club1 = possession[...,_color)\n\n # draw club 1's possession ...,"width, bar_y + bar_height), club1_color, -1)\n...",_color)\n neutral_color = rgb_bgr_conve...,2,"Some parts make sense, but for the main part o...",False,36.248252,0.039525,"{'rouge1': (0.4375, 0.4117647058823529, 0.4242...",0.424242,0.258065,0.363636
4,code_examples\football_video_processor.py,"np.ndarray:\n """"""\n Combines th...",".resize(projection_frame, (new_w_proj, new_h_p...","frame = np.zeros((canvas_height, canvas_width,...",".resize(projection_frame, (new_w_proj, new_h_p...",0,Almost entirely correct.,False,60.026336,0.142166,"{'rouge1': (0.4782608695652174, 0.6875, 0.5641...",0.564103,0.432432,0.512821


## 5.3 Choosing the best metrics

In [37]:
# Ensure `manual_label` and `exact_match` are in the DataFrame
correlations = {}

# Function to compute correlations safely
def compute_correlations(column_name):
    if res[column_name].nunique() > 1:  # Check if there are more than one unique value
        pearson_corr = pearsonr(res['label'], res[column_name])[0]
        spearman_corr = spearmanr(res['label'], res[column_name])[0]
        return pearson_corr, spearman_corr
    else:
        return None, None

# Calculate correlations for each metric
metrics = ['exact_match', 'chrf', 'bleu', 'rouge1', 'rouge2', 'rougeL']

for metric in metrics:
    pearson_corr, spearman_corr = compute_correlations(metric)
    correlations[metric] = {'Pearson': pearson_corr, 'Spearman': spearman_corr}

# Print the results
print("Correlations with Manual Labels:")
for metric, scores in correlations.items():
    s1, s2 = 'None', 'None'
    if scores['Pearson'] and scores['Spearman']:
        s1 = f"{scores['Pearson']:.4f}"
        s2 = f"{scores['Spearman']:.4f}"
         
    print(f"{metric}: Pearson = {s1}, Spearman = {s2}")


Correlations with Manual Labels:
exact_match: Pearson = None, Spearman = None
chrf: Pearson = -0.6995, Spearman = -0.7090
bleu: Pearson = -0.7074, Spearman = -0.7609
rouge1: Pearson = -0.6045, Spearman = -0.6178
rouge2: Pearson = -0.6586, Spearman = -0.6479
rougeL: Pearson = -0.6698, Spearman = -0.6521


Most metrics display negative correlation. This is because the manually created labels are assigned in opposite way - 0 is correct, while 2 is incorrect.

1. **Avoid Using Exact Match**: Since the exact match did not provide meaningful correlations, it should not be used in evaluation.

2. **Use BLEU and chrF**: The negative correlations for both the chrF score and the BLEU score indicate that these metrics are inversely related to the quality of the predictions. As these scores increase, the likelihood of being labeled as correct also increases, making them strong candidates for evaluating model performance.

3. **The ROUGE metrics** (ROUGE-1, ROUGE-2, and ROUGE-L) show similar trends, with negative correlations. These scores also effectively reflect the model's ability to generate relevant and coherent code completions, particularly in terms of n-gram overlap.

# Final thoughts - Which metrics to choose?

1. **Focus on chrF and BLEU scores** for evaluating model performance, as they demonstrate a robust inverse relationship with manual correctness labels.

2. **Consider ROUGE scores as supplementary metrics**, especially for understanding the quality of generated outputs in relation to reference completions.

3. **Do not use exact match**, as it may not capture the complexity of code generation tasks where variations can still be valid.