In [30]:
import pandas as pd
from training.metrics import compute_metrics
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, DataCollatorWithPadding
from datasets import Dataset

# Evaluate Best Model Against Graded Student Submissions

#### Load the data and put it in the same format used for training

In [4]:
# Read the graded docstring/code pairs (first CSV column becomes the index)
# and drop the 'file_name' and 'id' columns in one chained expression.
df = (
    pd.read_csv('../data/graded_docstring_code_pairs.csv', index_col=0)
    .drop(columns=['file_name', 'id'])
)
df.head()

Unnamed: 0,function,docstring,label
0,public void act(List<Animal> newkingfishers)\n...,This is what the kingfisher does most of the t...,2
1,public void spreadDisease()\n {\n Field...,Spread the disease.,2
2,private void incrementAge()\n {\n ag...,Increase the age. This could result in the kin...,2
3,private void incrementHunger()\n {\n ...,Make this kingfisher more hungry. This could r...,2
4,protected Location findFood()\n {\n ...,Look for salmons adjacent to the current locat...,2


In [5]:
def format_str(string):
    """Return `string` with every line break replaced by a single space.

    A carriage-return + line-feed pair is handled first so it collapses to
    one space; any remaining lone carriage return or line feed then becomes
    one space each.
    """
    return string.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')

def concat_nl_and_code(data):
    """Build the single-line text for one docstring/function pair.

    `data` is indexed with the keys 'docstring' and 'function'. The two values
    are joined with the '<CODESPLIT>' marker (docstring first) and the joined
    string is passed through `format_str` before being returned.
    """
    docstring, code = data['docstring'], data['function']
    combined = docstring + '<CODESPLIT>' + code
    return format_str(combined)

# Build the combined docstring/code string for every row; each row is handed
# straight to the helper, so no wrapping lambda is needed.
df['text'] = df.apply(concat_nl_and_code, axis=1)
df.head()

Unnamed: 0,function,docstring,label,text
0,public void act(List<Animal> newkingfishers)\n...,This is what the kingfisher does most of the t...,2,This is what the kingfisher does most of the t...
1,public void spreadDisease()\n {\n Field...,Spread the disease.,2,Spread the disease.<CODESPLIT>public void spre...
2,private void incrementAge()\n {\n ag...,Increase the age. This could result in the kin...,2,Increase the age. This could result in the kin...
3,private void incrementHunger()\n {\n ...,Make this kingfisher more hungry. This could r...,2,Make this kingfisher more hungry. This could r...
4,protected Location findFood()\n {\n ...,Look for salmons adjacent to the current locat...,2,Look for salmons adjacent to the current locat...


#### Load model and set up evaluation

In [35]:
# Wrap the prepared frame in a Hugging Face dataset.
ds = Dataset.from_pandas(df)

# Tokenizer and classifier are both loaded from the same saved directory.
tokenizer = AutoTokenizer.from_pretrained('../models/best')
model = AutoModelForSequenceClassification.from_pretrained('../models/best')

model.eval()

# Tokenize in batches and leave padding to the collator below.
# The previous per-example call passed padding=True, which cannot add padding
# to a single sequence, so the encodings are unchanged; batching just lets the
# tokenizer process many rows per call instead of one.
data_tokens = ds.map(
    lambda batch: tokenizer(batch['text'], truncation=True),
    batched=True,
)

# Pads each evaluation batch at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

model_predictions = trainer.predict(data_tokens)
print(model_predictions.metrics)

# Re-key the metrics as "test/<name>", keeping everything after the first
# underscore of each original key.
# NOTE(review): assumes every metric key contains an underscore (e.g. a
# "test_" prefix); a key without one would raise IndexError here.
eval_results_formatted = {
    "test/" + key.split('_', 1)[1]: value
    for key, value in model_predictions.metrics.items()
}

print("Test Results:")
print(eval_results_formatted)

Map:   0%|          | 0/5413 [00:00<?, ? examples/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 