# AI Code Completion

Import necessary libraries

In [2]:
import random
import pandas as pd


Import own modules

In [2]:
from src.scrape.scrape_files import scrape_files

In [3]:
src_path = ''
max_examples = 50

Scrape files and split code into prefix, middle, and suffix

In [4]:
examples = scrape_files(src_path, ('.ts', '.tsx'))

In [5]:
random.shuffle(examples)
selected_examples = examples[:max_examples]

Debugging

In [6]:
for i, (prefix, middle, suffix) in enumerate(selected_examples):
    print(f"Example {i + 1}:/n")
    print(f"Prefix:\n{prefix}\n")
    print(f"Middle:\n{middle}\n")
    print(f"Suffix:\n{suffix}\n")
    print("=" * 40)

Example 1:/n
Prefix:
import { useTranslation } from 'react-i18next'
import { toast } from 'react-toastify'

import { useAuth } from '@/auth'
import { useCheckoutMutation } from '@/shared/api/paymentsQuery'
import { useGetProductsQuery } from '@/shared/api/productsQuery'
import Skeleton from '@/shared/components/skeleton/Skeleton'
import LoadingWrapper from '@/shared/components/wrapper/LoadingWrapper'
import { PageWrapper } from '@/shared/components/wrapper/PageWrapper'
import { IProduct } from '@/shared/model/products'

import { ProductCard } from './components/ProductCard'
import { ProductsWrapper } from './productsElements'
import useDimBreakpoints from '@/shared/utils/useDimBreakpoints'

const Products = () => {
  const { t } = useTranslation()
  const auth = useAuth()
  const { md } = useDimBreakpoints()
  const { data: products, isLoading } = useGetProductsQuery('pl')
  const { mutateAsync: checkoutMutation } = useCheckoutMutation()

  const handleCheckout = (productId: string) =>

Dataframe of selected examples

In [7]:
selected_examples_df = pd.DataFrame(selected_examples, columns=['prefix', 'middle', 'suffix'])

Save selected examples to a CSV file

In [8]:
selected_examples_df.to_csv('../data/selected_examples.csv', index=False)

Create a DataFrame for the length of all examples

In [9]:
examples_data = {
    'prefix_length': [len(prefix) for prefix, _, _ in examples],
    'middle_length': [len(middle) for _, middle, _ in examples],
    'suffix_length': [len(suffix) for _, _, suffix in examples]
}
examples_df = pd.DataFrame(examples_data)

Create a DataFrame for the length of selected examples

In [10]:
selected_examples_data = {
    'prefix_length': [len(prefix) for prefix, _, _ in selected_examples],
    'middle_length': [len(middle) for _, middle, _ in selected_examples],
    'suffix_length': [len(suffix) for _, _, suffix in selected_examples]
}
selected_examples_df = pd.DataFrame(selected_examples_data)

In [11]:
print("All Examples Summary:")
print(examples_df.describe())

All Examples Summary:
       prefix_length  middle_length  suffix_length
count     864.000000     864.000000     864.000000
mean     1363.070602      34.489583     499.173611
std       434.954526      14.766546     386.921759
min       592.000000      20.000000       0.000000
25%      1054.000000      24.000000     203.000000
50%      1283.000000      29.000000     412.000000
75%      1615.750000      40.000000     725.000000
max      2791.000000     100.000000    1925.000000


In [12]:
print("Selected Examples Summary:")
print(selected_examples_df.describe())

Selected Examples Summary:
       prefix_length  middle_length  suffix_length
count      50.000000      50.000000      50.000000
mean     1404.500000      32.560000     411.600000
std       448.779263      10.924901     393.978245
min       592.000000      21.000000       0.000000
25%      1082.000000      25.000000     123.750000
50%      1353.500000      30.000000     252.000000
75%      1705.250000      37.500000     631.500000
max      2429.000000      66.000000    1925.000000


Load model and tokenizer

In [13]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
checkpoint = "bigcode/tiny_starcoder_py"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

cuda


In [14]:
dataset = pd.read_csv('../data/selected_examples.csv')

Run the model on the selected examples

In [15]:
from tqdm import tqdm

tokenizer.pad_token = tokenizer.eos_token

completions = []
for i, row in tqdm(dataset.iterrows(), total=len(dataset), desc="Processing examples"):
    prefix = row['prefix']
    suffix = row['suffix']
    input_text = f"<fim_prefix>{prefix}<fim_suffix>{suffix}<fim_middle>"
    inputs = tokenizer.encode_plus(input_text, return_tensors="pt", padding=True).to(device)
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=16,
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.05,
    )
    completion = tokenizer.decode(outputs[0])
    fim_middle_loc = completion.find("<fim_middle>")
    completion = completion[fim_middle_loc + len("<fim_middle>"):].strip()
    completions.append(completion.strip().split('\n')[0])

Processing examples: 100%|██████████| 50/50 [00:16<00:00,  3.02it/s]


In [16]:
completed_code = dataset.copy()
completed_code['completion'] = completions

In [17]:
completed_code.to_csv('../data/completed_code.csv', index=False)

Label code completions

In [23]:
completed_code = pd.read_csv('../data/completed_code.csv')

In [24]:
labeled_code = completed_code.copy()
labeled_code['label'] = 0
for i, row in labeled_code.iterrows():
    middle = row['middle'].strip()
    completion = row['completion']
    if middle == completion:
        labeled_code.at[i, 'label'] = 1

In [25]:
labeled_code.to_csv('../data/labeled_code.csv', index=False)
labeled_code.to_csv('../data/manually_labeled_code.csv', index=False)

Rest of the labeling is done manually in the CSV file.

The code completion might not be equal to the middle part, but it can still be correct in some cases.

In [3]:
labeled_code = pd.read_csv('../data/labeled_code.csv')
manually_labeled_code = pd.read_csv('../data/manually_labeled_code.csv')

In [4]:
labeled_code['label'].value_counts()

label
0    46
1     4
Name: count, dtype: int64

In [5]:
manually_labeled_code['label'].value_counts()

label
0    32
1    18
Name: count, dtype: int64

In [14]:
from sklearn.metrics import accuracy_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.chrf_score import sentence_chrf
from rouge import Rouge

In [8]:
middles = labeled_code['middle'].str.strip().tolist()
completions = labeled_code['completion'].tolist()

In [9]:
exact_matches = [1 if middle == completion else 0 for middle, completion in zip(middles, completions)]
exact_match_score = accuracy_score(middles, completions)

In [10]:
# chrF
chrf_scores = [sentence_chrf([middle], completion) for middle, completion in zip(middles, completions)]
average_chrf_score = sum(chrf_scores) / len(chrf_scores)

In [15]:
# BLEU
smoothing_function = SmoothingFunction().method1
bleu_scores = [sentence_bleu([middle.split()], completion.split(), smoothing_function=smoothing_function) for middle, completion in zip(middles, completions)]
average_bleu_score = sum(bleu_scores) / len(bleu_scores)

In [17]:
# ROUGE-L
rouge = Rouge()
rouge_scores = [rouge.get_scores(completion, middle)[0]['rouge-l']['f'] for middle, completion in zip(middles, completions)]
average_rouge_l_score = sum(rouge_scores) / len(rouge_scores)

In [18]:
print(f"Exact Match Score: {exact_match_score:.4f}")
print(f"Average chrF Score: {average_chrf_score:.4f}")
print(f"Average BLEU Score: {average_bleu_score:.4f}")
print(f"Average ROUGE-L Score: {average_rouge_l_score:.4f}")

Exact Match Score: 0.0800
Average chrF Score: 0.1905
Average BLEU Score: 0.0270
Average ROUGE-L Score: 0.1062
