In [1]:
import os
import re
import requests
import pprint
import pandas as pd
from solidity_parser import parser

In [2]:
if not os.path.isdir('/tmp'):
    os.mkdir('/tmp')

In [3]:
ERRORMSG = 'error: could not parse'

# Construct governance surface of a Solidity smart contract
- [x] Parse structure of the smart contract, extracting all functions/modifiers/events, their parameters, and other relevant properties (e.g., visibility)
- [x] Get comments corresponding to function/parameter definitions to contextualize the structure
- [ ] Select subset of functions/parameters relevant to governance, preserving their structural relationships

TODO:
- [ ] Don't add inline comment for a parameter if it's the same as it's object's description
- [ ] Clearly separate parsing from Airtable-specific coding (e.g., in get_parameter_type(param)); ideally, have flag which turns coding on/off for all cases

## Parse contract (functions)
Use the `solidity_parser` library to parse the contract as an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree) (AST). This gets us the full structure of the contract, including functions, their parameters, and which functions call which other functions. 

Since this library does not preserve comments, add location information to the AST nodes so that later we can go back to the text and extract the relevant function/parameter descriptions.

Note that it's also possible to create abject from the OrderedList using `parser.objectify`, but this turns out not to actually be that useful, since only some of the information in the AST is preserved.

In [25]:
def get_parameter_type(param):
    """Get parameter dtype.
    
    If type is any kind of array or user-defined value, returns simply 'array' or 'userdefined'."""
    
    typeDict = param['typeName']
    typeType = typeDict.get('type')
    if 'TypeName' in typeType and not 'Elementary' in typeType:
        paramType = typeType[:-8].lower()
    else:
        paramType = typeDict.get('name', typeDict.get('namePath', ERRORMSG))
    
    if paramType.startswith('bytes'):
        paramType = 'bytes'
    if paramType.startswith('uint'):
        paramType = 'uint'
        
    return paramType

In [5]:
def get_parameter_initialValue(param):
    """Get parameter initialValue"""
    
    value = param.get('initialValue')
    if value is not None:
        value = value.get('value', ERRORMSG)
    
    return value

In [6]:
def get_object_modifiers(obj):
    """Get object modifiers"""
    
    modifiers = obj.get('modifiers', [])
    modifiers = [m.get('name', ERRORMSG) for m in modifiers]
    
    return modifiers

In [7]:
def get_object_values(obj):
    """Get object options ("members" as defined in enum objects only)"""
    
    values = []
    
    if obj['type'] == 'EnumDefinition':
        members = obj.get('members', [])
        values = [m.get('name', ERRORMSG) for m in members]
        
    return values

In [8]:
def extract_objects_and_parameters(contracts):
    """Collect information on contract objects (contracts/functions/modifiers/events) and their parameters
    
    Returns two DataFrames, one for objects, the other for their parameters.
    Currently grabs state variables declarations and event/modifier/function definitions.
    """
    
    df_objects = pd.DataFrame()
    df_parameters = pd.DataFrame()
    # Iterate through contracts (there may be multiple in a solidity file)
    for c in contracts:
        contractName = c['name']
        baseContracts = [b['baseName']['namePath'] for b in c.get('baseContracts', [])]
    
        # Append item and parameters for contract itself
        objectDict = {'contract': contractName,
                      'object_name': contractName,
                      'type': 'ContractDefinition',
                      'line_numbers': (c['loc']['start']['line'], c['loc']['end']['line']),
                      'inheritance': baseContracts,
                      'description': ''}
        df_objects = df_objects.append(pd.Series(objectDict), ignore_index=True)

        # Iterate through relevant subnodes in contract
        for item in c.get('subNodes', []):
            itemType = item['type']
            
            if itemType in ['StateVariableDeclaration']:
                # Append contract state variables to parameters DataFrame
                for param in item.get('variables', {}):
                    paramDict = {'parameter_name': param['name'],
                                 'object_name': contractName,
                                 'contract': contractName,
                                 'type': get_parameter_type(param),
                                 'line_number': param['loc']['start']['line'],
                                 'initial_value': get_parameter_initialValue(item),
                                 'visibility': param.get('visibility'),
                                 'description': ''}
                    df_parameters = df_parameters.append(pd.Series(paramDict), ignore_index=True)

            if itemType in ['EventDefinition', 'ModifierDefinition', 'FunctionDefinition', 'StructDefinition', 'EnumDefinition']:
                itemName = item['name']

                # Append function/event/modifier definition to objects DataFrame
                objectDict = {'contract': contractName,
                              'object_name': itemName,
                              'type': itemType,
                              'modifiers': get_object_modifiers(item),
                              'visibility': item.get('visibility'), 
                              'values': get_object_values(item),
                              'line_numbers': (item['loc']['start']['line'], item['loc']['end']['line']),
                              'description': ''}
                df_objects = df_objects.append(pd.Series(objectDict), ignore_index=True)

                # Append each parameter to DataFrame
                paramObj = item.get('parameters', (item.get('members', {})))
                if isinstance(paramObj, dict):
                    values = paramObj.get('parameters', [])
                elif isinstance(paramObj, list) and itemType == 'StructDefinition':
                    values = paramObj
                else:
                    values = []
                for param in values:
                    paramDict = {'parameter_name': param['name'],
                                 'object_name': itemName,
                                 'contract': contractName,
                                 'type': get_parameter_type(param),
                                 'line_number': param['loc']['start']['line'],
                                 'initial_value': get_parameter_initialValue(item),
                                 'visibility': param.get('visibility'),
                                 'description': ''}
                    df_parameters = df_parameters.append(pd.Series(paramDict), ignore_index=True)

    return df_objects, df_parameters

## Parse comments (functions)
Extract two kinds of comments:
- Docstrings, or any other set of comments right before a function
- Inline comments on parameter definitions

In [9]:
def clean_object_comment_lines(lines):
    """Clean list of strings of comment block prior to object declaration"""
    
    # Try to get comment block, if there is one
    linesStr = '\n'.join(lines)
    pattern_commentBlock = re.compile(r'/\*\*.+?\*/', re.DOTALL)
    match = re.search(pattern_commentBlock, linesStr)
    if match:
        lines_new = match[0].split('\n')
        lines_new = [s.strip().strip('*') for s in lines if s]
    else:
        # Otherwise, parse individual line comments
        lines_new = [s.strip() for s in lines if s]
        lines_new = [s.split('//')[-1] for s in lines if s.startswith('//')]
                                                          
    lines_new = [s.strip() for s in lines_new if s]

    return lines_new

In [10]:
def clean_parameter_comment_lines(lines):
    """Clean list of strings of comment block prior to/at parameter declaration"""
    
    lines_new = [s.strip() for s in lines if s]
    if len(lines_new) > 0:
        prevLines = [s for s in lines_new[:-1] if s.startswith('//')]
        tmp = lines_new[-1].split('//')
        inLine = [tmp[-1]] if len(tmp) > 1 else ['']
        lines_new = prevLines + inLine
        lines_new = [s.split('//')[-1] for s in lines_new if len(lines_new) > 1]
        lines_new = [s.strip('/* ').strip() for s in lines_new]
        lines_new = [s for s in lines_new if s]

    return lines_new

In [11]:
def parse_object_description(lines_raw):
    """Clean and parse list of comment strings before an object definition.
    May be a block comment or individual line comments."""
    
    lines = clean_object_comment_lines(lines_raw)
    
    commentDict = {}
    description = ''
    
    # Add full (cleaned) comment
    commentDict['full_comment'] = '\n'.join(lines)
    commentDict['description'] = ''

    # Parse notice line(s); keep just the first one
    noticeLines = [s.split('@notice')[-1].strip() for s in lines if s.startswith('@notice')]
    try: 
        s = noticeLines[0]
        commentDict['notice'] = s
        description = s
    except IndexError:
        commentDict['notice'] = None    
    
    # Parse dev line(s); keep just the first one
    devLines = [s.split('@dev')[-1].strip() for s in lines if s.startswith('@dev')]
    try: 
        s = devLines[0]
        commentDict['dev'] = s
        if description == '':
            description = s        
    except IndexError:
        commentDict['dev'] = None


    # Save first line if no @dev or @notice, and if's probably actually useful
    if (commentDict['dev'] is None) and (commentDict['notice'] is None) and len(lines) > 0:
        first_line = lines[0]
        if (not first_line.startswith('pragma')) and (not len(first_line) == 1):
            commentDict['first_line'] = first_line
            if description == '':
                description == first_line
    else:
        commentDict['first_line'] = None
    
    commentDict['description'] = description
    
    # Parse parameter lines; create list of dict of parameter:description pairs (empty list if none found)
    paramLines = [s.split('@param')[-1].strip().split(' ', 1) for s in lines if s.startswith('@param')]
    commentDict['param'] = [{'parameter': p[0], 'description': p[1]} for p in paramLines]

    return commentDict

In [12]:
def parse_parameter_description(lines_raw, parameterName):
    """Clean and parse comment relating to parameter, either inline or right before the paramter"""

    lines = clean_parameter_comment_lines(lines_raw)
    
    commentDict = {}
    description = ''
    
    # Add full (cleaned) comment
    commentDict['full_comment'] = '\n'.join(lines)

    # Try to get notice first
    noticeLines = [s.split('@notice')[-1].strip() for s in lines if s.startswith('@notice')]
    if len(noticeLines) > 0:
        description = noticeLines[-1]

    # Parse parameter lines; create dict of parameter:description pairs
    paramLines = [s.split('@param')[-1].strip().split(' ', 1) for s in lines if '@param' in s]
    paramDict = {p[0]: p[1] for p in paramLines}
    description = paramDict.get(parameterName, description)

    # If above two methods failed, just grab the last line, if any
    if description == '' and len(lines) > 0:
        description = lines[-1]

    commentDict['description'] = description
        
    return commentDict

In [13]:
def add_docstring_comments(lines, df_objects, df_parameters):
    """Parse comments and add them to the relevant rows in the object and parameter DataFrames"""

    df_o = df_objects.copy(deep=True)
    df_p = df_parameters.copy(deep=True)

    df_o['full_comment'] = ''
    df_o['dev'] = ''
    df_o['notice'] = ''
    df_o['first_line'] = ''

    prevObjectLoc = (0,0)
    for i, row in df_o.iterrows():
        # Get, clean, and parse object comment lines
        commentEnd = row['line_numbers'][0] - 1
        if prevObjectLoc[1] <= commentEnd:
            commentStart = prevObjectLoc[1]
        else:
            commentStart = prevObjectLoc[0]
        commentLines = lines[commentStart:commentEnd]
        commentDict = parse_object_description(commentLines)

        # Add object descriptions to objects
        for key, value in commentDict.items():
            if key in df_o.columns:
                df_o.iat[i, df_o.columns.get_loc(key)] = value

        # Add parameter descriptions to parameters
        for item in commentDict['param']:
            index = df_p.loc[(df_p['object_name']==row['object_name']) &
                             (df_p['parameter_name']==item['parameter'])].index[0]
            df_p.iat[index, df_p.columns.get_loc('description')] = item['description']

        prevObjectLoc = row['line_numbers']

    return df_o, df_p

In [14]:
def add_inline_comments(lines, df_parameters):
    """Parse comments and add them to the relevant rows in the parameter DataFrame"""

    df_p = df_parameters.copy(deep=True)
    df_p['full_comment'] = ''

    commentStart = 0
    for i, row in df_p.iterrows():   
        # Grab and parse comment lines
        commentEnd = int(row['line_number'])
        commentLines = lines[min(commentStart, commentEnd - 2):commentEnd + 1]
        commentDict = parse_parameter_description(commentLines, row['parameter_name'])
        
        # Add to dict (but don't overwrite previously found value)
        for key, value in commentDict.items():
            if key in df_p.columns:
                currentValue = df_p.iat[i, df_p.columns.get_loc(key)]
                if not currentValue:
                    df_p.iat[i, df_p.columns.get_loc(key)] = value

        commentStart = commentEnd

    return df_p

## Run example: Compound Governor Bravo

In [30]:
PROJECT = 'Aragon Court'
urls = ['https://raw.githubusercontent.com/aragon/aragon-court/master/contracts/court/AragonCourt.sol',
        'https://raw.githubusercontent.com/aragon/aragon-court/master/contracts/court/controller/Controller.sol',
        'https://raw.githubusercontent.com/aragon/aragon-court/master/contracts/arbitration/IArbitrator.sol',
        'https://raw.githubusercontent.com/aragon/aragon-court/master/contracts/disputes/IDisputeManager.sol'
       ] 

fpath = 'tmp/solidity.txt'

In [31]:
df_objects_all = pd.DataFrame()
df_parameters_all = pd.DataFrame()
for url in urls:
    print(url.split('/')[-1].split('.sol')[0])
    # Get content of Gnosis Safe contract and save to temporary file
    content = requests.get(url).text
    with open(fpath, 'w') as f:
        f.write(content)
    lines = content.split('\n')

    # Get file structure as OrderedList and split into contracts
    sourceUnit = parser.parse_file(fpath, loc=True)
    
    # Save to file
    savename = 'tmp/' + url.split('/')[-1].split('.sol')[0] + '_parsed.txt'
    with open(savename, 'w') as f:
        pprint.pprint(sourceUnit, stream=f)    
    
    # Get object and parameter DataFrames (selecting from solidity_parser AST)
    contracts = [c for c in sourceUnit['children'] if c.get('type') == 'ContractDefinition'] 
    df_objects, df_parameters = extract_objects_and_parameters(contracts)

    # Add comments to the DataFrames
    df_objects, df_parameters = add_docstring_comments(lines, df_objects, df_parameters)
    df_parameters = add_inline_comments(lines, df_parameters)
    
    df_objects['url'] = url
    
    df_objects_all = df_objects_all.append(df_objects)
    df_parameters_all = df_parameters_all.append(df_parameters)
    
df_objects_all['project'] = PROJECT

AragonCourt
Controller
IArbitrator
IDisputeManager


In [32]:
df_objects_all.drop(columns=['line_numbers', 'full_comment']).to_csv('tmp/contract_objects.csv')
df_objects_all.drop(columns=['line_numbers', 'full_comment'])

Unnamed: 0,contract,object_name,type,inheritance,description,modifiers,values,visibility,dev,notice,first_line,url,project
0,AragonCourt,AragonCourt,ContractDefinition,"[Controller, IArbitrator]",,,,,,,,https://raw.githubusercontent.com/aragon/arago...,Aragon Court
1,AragonCourt,constructor,FunctionDefinition,,Constructor function,[Controller],[],public,Constructor function,,,https://raw.githubusercontent.com/aragon/arago...,Aragon Court
2,AragonCourt,createDispute,FunctionDefinition,,Create a dispute with `_possibleRulings` possi...,[],[],external,,Create a dispute with `_possibleRulings` possi...,,https://raw.githubusercontent.com/aragon/arago...,Aragon Court
3,AragonCourt,closeEvidencePeriod,FunctionDefinition,,Close the evidence period of dispute #`_disput...,[],[],external,,Close the evidence period of dispute #`_disput...,,https://raw.githubusercontent.com/aragon/arago...,Aragon Court
4,AragonCourt,executeRuling,FunctionDefinition,,Execute the Arbitrable associated to dispute #...,[],[],external,,Execute the Arbitrable associated to dispute #...,,https://raw.githubusercontent.com/aragon/arago...,Aragon Court
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,IDisputeManager,getDispute,FunctionDefinition,,Tell information of a certain dispute,[],[],external,Tell information of a certain dispute,,,https://raw.githubusercontent.com/aragon/arago...,Aragon Court
14,IDisputeManager,getRound,FunctionDefinition,,Tell information of a certain adjudication round,[],[],external,Tell information of a certain adjudication round,,,https://raw.githubusercontent.com/aragon/arago...,Aragon Court
15,IDisputeManager,getAppeal,FunctionDefinition,,Tell appeal-related information of a certain a...,[],[],external,Tell appeal-related information of a certain a...,,,https://raw.githubusercontent.com/aragon/arago...,Aragon Court
16,IDisputeManager,getNextRoundDetails,FunctionDefinition,,Tell information related to the next round due...,[],[],external,Tell information related to the next round due...,,,https://raw.githubusercontent.com/aragon/arago...,Aragon Court


In [33]:
df_parameters_all.drop(columns=['line_number', 'full_comment']).to_csv('tmp/contract_parameters.csv')
df_parameters_all.drop(columns=['line_number', 'full_comment'])[df_parameters_all['description'] != ""]

Unnamed: 0,parameter_name,object_name,contract,type,initial_value,visibility,description
2,ARBITRABLE_INTERFACE_ID,AragonCourt,AragonCourt,bytes,error: could not parse,private,Arbitrable interface ID based on ERC-165
3,_termParams,constructor,AragonCourt,array,,,Array containing:
4,_governors,constructor,AragonCourt,array,,,Array containing:
5,_feeToken,constructor,AragonCourt,userdefined,,,Address of the token contract that is used to ...
6,_fees,constructor,AragonCourt,array,,,Array containing:
...,...,...,...,...,...,...,...
26,_disputeId,getNextRoundDetails,IDisputeManager,uint,,,Identification number of the dispute being que...
27,_roundId,getNextRoundDetails,IDisputeManager,uint,,,Identification number of the round requesting ...
28,_disputeId,getJuror,IDisputeManager,uint,,,Identification number of the dispute being que...
29,_roundId,getJuror,IDisputeManager,uint,,,Identification number of the round being queried


In [34]:
print(df_parameters_all['type'].unique())

['string' 'bytes' 'array' 'userdefined' 'uint' 'address'
 'error: could not parse']
