In [1]:
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from training import metrics

# Evaluate Best Model Against Graded Student Submissions

#### Load the data and put it in the same form as the training data

In [2]:
# Read the graded docstring/code pairs, using the first CSV column as the
# index, then drop the 'file_name' and 'id' columns.
df = (
    pd.read_csv('../data/graded_docstring_code_pairs.csv', index_col=0)
    .drop(columns=['file_name', 'id'])
)
df.head()

Unnamed: 0,function,docstring,label
0,public void act(List<Animal> newkingfishers)\n...,This is what the kingfisher does most of the t...,2
1,public void spreadDisease()\n {\n Field...,Spread the disease.,2
2,private void incrementAge()\n {\n ag...,Increase the age. This could result in the kin...,2
3,private void incrementHunger()\n {\n ...,Make this kingfisher more hungry. This could r...,2
4,protected Location findFood()\n {\n ...,Look for salmons adjacent to the current locat...,2


In [8]:
def format_str(string):
    """Return ``string`` with every line break replaced by one space.

    CRLF pairs are substituted first, so a Windows-style line ending turns
    into a single space rather than two; any remaining lone CR or LF
    characters are then replaced as well.
    """
    without_crlf = string.replace('\r\n', ' ')
    without_cr = without_crlf.replace('\r', ' ')
    return without_cr.replace('\n', ' ')

def concat_nl_and_code(data):
    """Build the single-line text for one docstring/code pair.

    ``data`` is indexed with the keys ``'docstring'`` and ``'function'``.
    The docstring comes first, followed by the ``<CODESPLIT>`` marker and
    then the function source; the joined string is passed through
    ``format_str`` so that it contains no line breaks.
    """
    joined = data['docstring'] + '<CODESPLIT>' + data['function']
    return format_str(joined)

# Add a 'text' column holding, for every row, the docstring and the function
# source joined by concat_nl_and_code.
df['text'] = df.apply(concat_nl_and_code, axis=1)
df.head()

Unnamed: 0,function,docstring,label,text
0,public void act(List<Animal> newkingfishers)\n...,This is what the kingfisher does most of the t...,2,This is what the kingfisher does most of the t...
1,public void spreadDisease()\n {\n Field...,Spread the disease.,2,Spread the disease.<CODESPLIT>public void spre...
2,private void incrementAge()\n {\n ag...,Increase the age. This could result in the kin...,2,Increase the age. This could result in the kin...
3,private void incrementHunger()\n {\n ...,Make this kingfisher more hungry. This could r...,2,Make this kingfisher more hungry. This could r...
4,protected Location findFood()\n {\n ...,Look for salmons adjacent to the current locat...,2,Look for salmons adjacent to the current locat...


#### Load the model and set up evaluation

In [None]:
# TODO: point MODEL_CHECKPOINT at the fine-tuned "best model" (a local
# checkpoint directory or a hub model id). The value below is only a
# placeholder; from_pretrained() requires this argument and raises a
# TypeError when called with no arguments.
MODEL_CHECKPOINT = 'path/to/best_model_checkpoint'
EVAL_BATCH_SIZE = 32  # number of texts tokenized and scored per forward pass

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)
model.eval()  # inference mode: disables dropout

texts = df['text'].tolist()

# Tokenize and score in batches. Padding is applied per batch (padding a
# single string is a no-op), and return_tensors='pt' yields tensors that can
# be unpacked straight into the model call.
logit_batches = []
with torch.no_grad():  # no gradients are needed for evaluation
    for start in range(0, len(texts), EVAL_BATCH_SIZE):
        batch = tokenizer(
            texts[start:start + EVAL_BATCH_SIZE],
            truncation=True,
            padding=True,
            return_tensors='pt',
        ).to(model.device)
        logit_batches.append(model(**batch).logits.cpu())

# One row of raw class scores (logits) per row of df, in the same order.
model_prediction = torch.cat(logit_batches).numpy()

# NOTE(review): assumes metrics.compute_metrics takes a (logits, labels)
# tuple as in the Hugging Face Trainer convention -- confirm against
# training/metrics.py.
results = metrics.compute_metrics((model_prediction, list(df['label'])))
results