In [307]:
import numpy as np
import pandas as pd
import os
from tqdm.notebook import tqdm

import function_parser
from function_parser.language_data import LANGUAGE_METADATA
from function_parser.process import DataProcessor
from tree_sitter import Language


# Process human graded data for evaluation
### Load data

Take the two batches from each of the two graders and concatenate them into a single frame.

In [23]:
# Gradebook exports: two graders (01, 02), two batches each (…_01, …_02).
GRADE_FILES = [
    '../data/student_grades/student_grades_01_01.csv',
    '../data/student_grades/student_grades_01_02.csv',
    '../data/student_grades/student_grades_02_01.csv',
    '../data/student_grades/student_grades_02_02.csv',
]

# Export column name -> short name used in the rest of the notebook.
# Renaming by name (not by position) matters: `usecols` returns columns in
# file order, so a positional `df.columns = [...]` could silently swap them.
GRADE_COLUMNS = {'Name': 'id', '5: Documentation (100.0 pts)': 'grade'}

grades_df = (
    pd.concat([pd.read_csv(path, usecols=list(GRADE_COLUMNS)) for path in GRADE_FILES])
    .rename(columns=GRADE_COLUMNS)[['id', 'grade']]
    # Rows with a missing id or grade cannot be used for evaluation.
    .dropna()
)

grades_df.head()

Unnamed: 0,id,grade
0,637,67.0
1,164,73.0
2,214,75.0
3,94,85.0
4,617,0.0


### Average grades between the two graders

In [281]:
# Collapse to one row per submission id; mean() averages the rows that share an id.
# NOTE(review): the saved output below shows a `label` column that is only created in a
# later cell, so this cell was last executed out of order — re-run the notebook top to bottom.
grades_df = grades_df.groupby('id').mean()
grades_df

Unnamed: 0_level_0,grade,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
12,61.0,1.5
17,71.0,2.5
28,65.0,2.0
39,65.0,2.0
73,70.0,2.5
83,75.0,3.0
94,80.0,3.0
120,64.0,2.0
127,71.0,2.5
128,69.0,2.5


### Convert to relevance classes

In [282]:
def apply_bounds(grade):
    """Map a numeric grade to a relevance class from 0 to 3.

    Classes: 3 for grades of 70 and above, 2 for [60, 70), 1 for [50, 60),
    and 0 for everything else (including values such as NaN that fail
    every comparison).
    """
    # Thresholds are checked from highest to lowest; the first one met wins.
    for lower_bound, relevance in ((70, 3), (60, 2), (50, 1)):
        if grade >= lower_bound:
            return relevance
    return 0


# Attach the relevance class derived from each averaged grade.
grades_df = grades_df.assign(label=grades_df['grade'].map(apply_bounds))
grades_df.head()

Unnamed: 0_level_0,grade,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
12,61.0,2
17,71.0,3
28,65.0,2
39,65.0,2
73,70.0,3


# Get Code
### Read code from files into Dataframe

In [232]:
# Root directory that is walked for student submissions.
CODE_PATH = "../data/student_assignments"

# Empty per-file table: directory, file name and raw source text of each file.
code_df = pd.DataFrame(columns=['dir', 'file_name', 'src'])
code_df.columns

Index(['dir', 'file_name', 'src'], dtype='object')

In [233]:
# Collect one record per Java file and build the frame once at the end.
# Growing the frame with pd.concat inside the loop is quadratic, and it also
# appended duplicate rows whenever this cell was re-run.
java_records = []

for root, _, files in os.walk(CODE_PATH):
    # Keep at most the first five '/'-separated components of the walked directory.
    # NOTE(review): for files nested deeper than that, `dir` + `file_name` no longer
    # points at the file that was actually read — confirm the truncation is intended.
    sub_path = "/".join(root.split('/')[:5])

    for file in files:
        if file.endswith(".java"):
            # ISO-8859-1 maps every byte to a character, so decoding cannot fail.
            with open(os.path.join(root, file), "r", encoding="ISO-8859-1") as f:
                java_records.append({'dir': sub_path, 'file_name': file, 'src': f.read()})

code_df = pd.DataFrame(java_records, columns=['dir', 'file_name', 'src'])

In [234]:
code_df.head()

Unnamed: 0,dir,file_name,src
0,../data/student_assignments/18~19_Submission_28,Kingfisher.java,import java.util.List;\nimport java.util.Itera...
1,../data/student_assignments/18~19_Submission_28,Rabbit.java,import java.util.List;\nimport java.util.Rando...
2,../data/student_assignments/18~19_Submission_28,LandAnimal.java,import java.util.List;\nimport java.util.Linke...
3,../data/student_assignments/18~19_Submission_28,Snow.java,import java.awt.Color;\nimport java.util.Rando...
4,../data/student_assignments/18~19_Submission_28,Randomizer.java,import java.util.Random;\n\n/**\n * Provide co...


Adapted from: https://github.com/ncoop57/function_parser

If you get an `OSError`, run the `build_grammars` command that ships with the package.

In [301]:
def get_submission_number(path):
    """Return the integer submission number encoded in ``path``.

    The first '/'-separated component containing 'Submission' is split on
    '_' and its third field is converted to int, e.g.
    '.../18~19_Submission_28/Foo.java' -> 28.

    Raises IndexError if no component contains 'Submission' or the
    component has fewer than three '_'-separated fields, and ValueError if
    the third field is not an integer.
    """
    matching_parts = list(filter(lambda part: 'Submission' in part, path.split('/')))
    submission_dir = matching_parts[0]
    fields = submission_dir.split('_')
    return int(fields[2])

def get_code_pairs(row):
    """Parse one source file and return its definitions as a DataFrame.

    ``row`` must provide 'dir' and 'file_name'; the two are joined into the
    path handed to the module-level ``processor``. Whatever
    ``process_single_file`` returns is wrapped in a DataFrame, and an 'id'
    column holding the submission number taken from the path is added.
    """
    source_path = os.path.join(row['dir'], row['file_name'])

    pairs = pd.DataFrame(processor.process_single_file(source_path))
    pairs['id'] = get_submission_number(source_path)

    return pairs

In [302]:
language = 'java'

# Point the shared parser at the Java grammar from the compiled grammar bundle
# located inside the function_parser package directory.
DataProcessor.PARSER.set_language(
    Language(os.path.join(function_parser.__path__[0], "tree-sitter-languages.so"), language)
)

processor = DataProcessor(
    language=language, language_parser=LANGUAGE_METADATA[language]["language_parser"]
)

output_columns = ['function', 'docstring', 'file_name', 'id']

# Collect the per-file frames and concatenate once after the loop. Concatenating
# inside the loop is quadratic, and starting from an empty frame without an `id`
# column turned the integer ids into floats (28.0).
code_pair_frames = []

for _, row in tqdm(code_df.iterrows(), total=code_df.shape[0]):
    code_pair_df = get_code_pairs(row)

    # Skip frames that lack the needed columns (e.g. no definitions were returned).
    if {'function', 'docstring'}.issubset(code_pair_df.columns):
        # Use the name of the file that was parsed. The previous expression,
        # `identifier + 'java'`, appended 'java' to the parser's per-definition
        # identifier, which is not the source file name.
        code_pair_df['file_name'] = row['file_name']
        code_pair_frames.append(code_pair_df[output_columns])

# pd.concat raises on an empty list, so fall back to an empty frame with the same schema.
code_pairs_df = (
    pd.concat(code_pair_frames)
    if code_pair_frames
    else pd.DataFrame(columns=output_columns)
)

code_pairs_df.head()

  0%|          | 0/915 [00:00<?, ?it/s]

Unnamed: 0,function,docstring,file_name,id
0,public void act(List<Animal> newkingfishers)\n...,This is what the kingfisher does most of the t...,Kingfisher.java,28.0
1,public void spreadDisease()\n {\n Field...,Spread the disease.,Kingfisher.java,28.0
2,private void incrementAge()\n {\n ag...,Increase the age. This could result in the kin...,Kingfisher.java,28.0
3,private void incrementHunger()\n {\n ...,Make this kingfisher more hungry. This could r...,Kingfisher.java,28.0
4,protected Location findFood()\n {\n ...,Look for salmons adjacent to the current locat...,Kingfisher.java,28.0


#### Merge with labels
All documentation/code pairs from a project get the same label, because documentation was graded per project.

In [310]:
# Look up each pair's project label by submission id, treat empty strings as
# missing, and drop every row that has any missing value.
code_pairs_df = (
    code_pairs_df
    .assign(label=code_pairs_df['id'].map(grades_df['label']))
    .replace('', np.nan)
    .dropna()
)
code_pairs_df.head()

Unnamed: 0,function,docstring,file_name,id,label
0,public void act(List<Animal> newkingfishers)\n...,This is what the kingfisher does most of the t...,Kingfisher.java,28.0,2
1,public void spreadDisease()\n {\n Field...,Spread the disease.,Kingfisher.java,28.0,2
2,private void incrementAge()\n {\n ag...,Increase the age. This could result in the kin...,Kingfisher.java,28.0,2
3,private void incrementHunger()\n {\n ...,Make this kingfisher more hungry. This could r...,Kingfisher.java,28.0,2
4,protected Location findFood()\n {\n ...,Look for salmons adjacent to the current locat...,Kingfisher.java,28.0,2


In [313]:
# Number of docstring/code pairs per relevance label.
code_pairs_df.label.value_counts()

label
3    3846
2    1343
0     112
1     112
Name: count, dtype: int64

In [315]:
# Persist the labelled pairs.
# NOTE(review): the index is written as an unnamed first column; it comes from a concat
# without ignore_index, so values repeat across files — consider index=False if unused.
code_pairs_df.to_csv('../data/graded_docstring_code_pairs.csv')