In [108]:
import function_parser
import os

import pandas as pd

from function_parser.language_data import LANGUAGE_METADATA
from function_parser.process import DataProcessor
from tree_sitter import Language

# Build Dataset of Docstring and Code Pairs
### Build Grammars

In [109]:
# Shell out to the `build_grammars` command.
# NOTE(review): presumably this is installed with function_parser and produces the
# tree-sitter-languages.so that the assignment-processing cell loads — confirm.
!build_grammars

## Process Assignments

In [110]:
# Point the shared tree-sitter parser at the Java grammar and create a
# DataProcessor that extracts function definitions from single source files.
language = 'java'
DataProcessor.PARSER.set_language(Language(os.path.join(function_parser.__path__[0], "tree-sitter-languages.so"), language))
processor = DataProcessor(language=language, language_parser=LANGUAGE_METADATA[language]['language_parser'])

ASSIGNMENTS_PATH = 'data/anonymised_assignments'

# One DataFrame per parsed .java file; they are concatenated once after the walk.
pairs = []

for root, _, files in os.walk(ASSIGNMENTS_PATH):
    # The assignment number is the text after the last underscore in the
    # directory path, up to the next path separator. os.walk joins directories
    # with os.sep, so normalise to '/' first; otherwise nested directories on
    # Windows would leave a trailing "\subdir" in the number.
    assignment_number = root.replace(os.sep, '/').split('_')[-1].split('/')[0]

    for file in files:
        if not file.endswith('.java'):
            continue

        path = os.path.join(root, file)
        definitions = processor.process_single_file(path)

        def_df = pd.DataFrame(definitions)
        if def_df.empty:
            # Nothing was extracted from this file. An empty frame would add no
            # rows, and concatenating empty frames is deprecated in recent pandas.
            continue

        def_df['assignment_number'] = assignment_number
        def_df['path'] = path

        pairs.append(def_df)

if not pairs:
    # pd.concat([]) raises ValueError("No objects to concatenate"); keep the
    # exception type but say what actually went wrong.
    raise ValueError(
        f"No function definitions were extracted from .java files under {ASSIGNMENTS_PATH!r}"
    )

assignment_df = pd.concat(pairs)

In [111]:
# Tidy the combined frame in one pass: replace the per-file indices with a
# fresh running index, cast the assignment number to an integer, and drop
# columns that contain missing values.
assignment_df = (
    assignment_df
    .reset_index(drop=True)
    .astype({'assignment_number': 'int'})
    .dropna(axis='columns')
)
assignment_df.head()

Unnamed: 0,nwo,sha,path,language,identifier,parameters,argument_list,return_statement,docstring,docstring_summary,docstring_tokens,function,function_tokens,url,assignment_number
0,,,data/anonymised_assignments/20~21/20~21_Submis...,java,Randomizer.,,,,Provide a random generator.\n@return A random ...,Provide a random generator.,"[Provide, a, random, generator, .]",public static Random getRandom()\n {\n ...,"[public, static, Random, getRandom, (, ), {, i...",https://github.com//blob//#L30-L38,290
1,,,data/anonymised_assignments/20~21/20~21_Submis...,java,Randomizer.,,,,Reset the randomization.\nThis will have no ef...,Reset the randomization.\nThis will have no ef...,"[Reset, the, randomization, ., This, will, hav...",public static void reset()\n {\n if(...,"[public, static, void, reset, (, ), {, if, (, ...",https://github.com//blob//#L45-L50,290
2,,,data/anonymised_assignments/20~21/20~21_Submis...,java,Counter.,,,,@return The short description of this type.,,[],public String getName()\n {\n return...,"[public, String, getName, (, ), {, return, nam...",https://github.com//blob//#L31-L34,290
3,,,data/anonymised_assignments/20~21/20~21_Submis...,java,Counter.,,,,@return The current count for this type.,,[],public int getCount()\n {\n return c...,"[public, int, getCount, (, ), {, return, count...",https://github.com//blob//#L39-L42,290
4,,,data/anonymised_assignments/20~21/20~21_Submis...,java,Counter.,,,,Increment the current count by one.,Increment the current count by one.,"[Increment, the, current, count, by, one, .]",public void increment()\n {\n count+...,"[public, void, increment, (, ), {, count, ++, ...",https://github.com//blob//#L47-L50,290


### Process Grades

In [112]:
# Letter-grade scale in the order A++ ... F. A grade's position in this list
# becomes its categorical code; any value not listed here becomes missing.
grade_scale = ['A++', 'A+', 'A', 'A-',
               'B+', 'B', 'B-',
               'C+', 'C', 'C-',
               'D+', 'D', 'D-',
               'F']

# Load the grades, type the key and grade columns, keep only the
# 'Documentation' skill rows, order by assignment number (highest first),
# and discard rows without a usable grade.
grades_df = (
    pd.read_csv('data/grades.csv', index_col=0)
    .assign(
        assignment_number=lambda frame: frame['assignment_number'].astype(int),
        grade=lambda frame: pd.Categorical(frame['grade'], grade_scale),
    )
    .loc[lambda frame: frame['skill'] == 'Documentation']
    .sort_values(by='assignment_number', ascending=False)
    .dropna(subset=['grade'])
)
grades_df

Unnamed: 0,assignment_number,comments,skill,participant_id,batch,grade
9,686,,Documentation,25,2,A+
9,686,Good use of comments. Each function block comm...,Documentation,27,2,A-
9,686,,Documentation,26,2,A
9,686,"The code documentation is good overall, howeve...",Documentation,28,2,B-
13,685,,Documentation,23,2,C-
...,...,...,...,...,...,...
7,6,,Documentation,25,1,A-
9,2,Your documentation is thorough and well-organi...,Documentation,15,1,A
9,2,,Documentation,14,1,A+
9,2,Overall good comments and documentation across...,Documentation,16,1,B


In [113]:
# Lookup from categorical code (position in the category list) to grade label.
grade_code_dict = dict(enumerate(grades_df['grade'].cat.categories))
grade_code_dict

{0: 'A++',
 1: 'A+',
 2: 'A',
 3: 'A-',
 4: 'B+',
 5: 'B',
 6: 'B-',
 7: 'C+',
 8: 'C',
 9: 'C-',
 10: 'D+',
 11: 'D',
 12: 'D-',
 13: 'F'}

In [114]:
# Keep each grade's integer category code next to its label.
grades_df['grade_code'] = grades_df['grade'].cat.codes

# Average the codes per assignment, round to a whole code, then translate that
# code back into a grade label.
# NOTE(review): Series.round presumably follows NumPy's round-half-to-even, so a
# mean of 2.5 becomes 2 while 3.5 becomes 4 — confirm that is the intended tie rule.
mean_grades_df = (
    grades_df
    .groupby('assignment_number')['grade_code']
    .mean()
    .reset_index()
    .assign(grade_code=lambda frame: frame['grade_code'].round())
    .assign(grade=lambda frame: frame['grade_code'].map(lambda code: grade_code_dict[code]))
)
mean_grades_df

Unnamed: 0,assignment_number,grade_code,grade
0,2,2.0,A
1,6,2.0,A
2,8,3.0,A-
3,9,4.0,B+
4,10,2.0,A
...,...,...,...
274,682,6.0,B-
275,683,2.0,A
276,684,3.0,A-
277,685,3.0,A-


In [115]:
# Columns written to the exported dataset, in output order.
export_columns = [
    'assignment_number', 'grade_code', 'grade', 'path',
    'docstring', 'docstring_summary', 'docstring_tokens',
    'function', 'function_tokens', 'line_numbers',
]

# Attach each assignment's rounded mean grade to every function extracted from
# it. The line range is the last '/'-separated piece of the url with its first
# character removed (e.g. "#L30-L38" -> "L30-L38").
df = (
    mean_grades_df
    .merge(assignment_df, how='inner', on='assignment_number')
    .assign(line_numbers=lambda frame: frame['url'].map(lambda url: url.rpartition('/')[2][1:]))
    .loc[:, export_columns]
)
df.to_csv('data/docstring_code_grades.csv')
df.head()

Unnamed: 0,assignment_number,grade_code,grade,path,docstring,docstring_summary,docstring_tokens,function,function_tokens,line_numbers
0,2,2.0,A,data/anonymised_assignments/18~19/18~19_Submis...,This is what the rabbit does most of the time ...,This is what the rabbit does most of the time ...,"[This, is, what, the, rabbit, does, most, of, ...",public void act(List<Animal> newRabbits)\n ...,"[public, void, act, (, List, <, Animal, >, new...",L60-L81
1,2,2.0,A,data/anonymised_assignments/18~19/18~19_Submis...,Increase the age.\nThis could result in the ra...,Increase the age.\nThis could result in the ra...,"[Increase, the, age, ., This, could, result, i...",private void incrementAge()\n {\n ag...,"[private, void, incrementAge, (, ), {, age, ++...",L88-L97
2,2,2.0,A,data/anonymised_assignments/18~19/18~19_Submis...,Make this rabbit more hungry. This could resul...,Make this rabbit more hungry. This could resul...,"[Make, this, rabbit, more, hungry, ., This, co...",private void incrementHunger()\n {\n ...,"[private, void, incrementHunger, (, ), {, food...",L102-L108
3,2,2.0,A,data/anonymised_assignments/18~19/18~19_Submis...,Look for rabbits adjacent to the current locat...,Look for rabbits adjacent to the current locat...,"[Look, for, rabbits, adjacent, to, the, curren...",private Location findFood()\n {\n Fi...,"[private, Location, findFood, (, ), {, Field, ...",L115-L134
4,2,2.0,A,data/anonymised_assignments/18~19/18~19_Submis...,Check whether or not this rabbit is to give bi...,Check whether or not this rabbit is to give bi...,"[Check, whether, or, not, this, rabbit, is, to...",private void giveBirth(List<Animal> newRabbits...,"[private, void, giveBirth, (, List, <, Animal,...",L141-L156
