In [1]:
import os
import requests
import pprint
import pandas as pd
from solidity_parser import parser

In [2]:
if not os.path.isdir('/tmp'):
    os.mkdir('/tmp')

# Construct governance surface of a Solidity smart contract
- [x] Parse structure of the smart contract, extracting all functions/modifiers/events, their parameters, and other relevant properties (e.g., visibility)
- [x] Get comments corresponding to function/parameter definitions to contextualize the structure
- [ ] Select subset of functions/parameters relevant to governance, preserving their structural relationships

TODO:
- [ ] Don't add inline comment for a parameter if it's the same as it's object's description

## Parse contract (functions)
Use the `solidity_parser` library to parse the contract as an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree) (AST). This gets us the full structure of the contract, including functions, their parameters, and which functions call which other functions. 

Since this library does not preserve comments, add location information to the AST nodes so that later we can go back to the text and extract the relevant function/parameter descriptions.

Note that it's also possible to create abject from the OrderedList using `parser.objectify`, but this turns out not to actually be that useful, since only some of the information in the AST is preserved.

In [3]:
def get_parameter_type(param):
    """Get parameter dtype"""
    
    typeDict = param['typeName']
    paramType = typeDict.get('name', typeDict.get('namePath', None))
    
    return paramType

In [4]:
def get_parameter_initialValue(param):
    """Get parameter initialValue"""
    
    value = param.get('initialValue')
    if value is not None:
        value = str(value)
    
    return value

In [5]:
def extract_objects_and_parameters(contracts):
    """Collect information on contract objects (contracts/functions/modifiers/events) and their parameters
    
    Returns two DataFrames, one for objects, the other for their parameters.
    Currently grabs state variables declarations and event/modifier/function definitions.
    """
    
    df_objects = pd.DataFrame()
    df_parameters = pd.DataFrame()
    # Iterate through contracts (there may be multiple in a solidity file)
    for c in contracts:
        contractName = c['name']
        baseContracts = [b['baseName']['namePath'] for b in c.get('baseContracts', [])]
    
        # Append item and parameters for contract itself
        objectDict = {'contract': contractName,
                      'object_name': contractName,
                      'type': 'ContractDefinition',
                      'line_numbers': (c['loc']['start']['line'], c['loc']['end']['line']),
                      'inheritance': baseContracts,
                      'description': ''}
        df_objects = df_objects.append(pd.Series(objectDict), ignore_index=True)

        # Iterate through relevant subnodes in contract
        for item in c.get('subNodes', []):
            itemType = item['type']
            
            if itemType in ['StateVariableDeclaration']:
                # Append contract state variables to parameters DataFrame
                for param in item.get('variables', {}):
                    paramDict = {'parameter_name': param['name'],
                                 'object_name': contractName,
                                 'contract': contractName,
                                 'type': get_parameter_type(param),
                                 'line_number': param['loc']['start']['line'],
                                 'initial_value': get_parameter_initialValue(item),
                                 'visibility': param.get('visibility'),
                                 'description': ''}
                    df_parameters = df_parameters.append(pd.Series(paramDict), ignore_index=True)

            if itemType in ['EventDefinition', 'ModifierDefinition', 'FunctionDefinition']:
                itemName = item['name']

                # Append function/event/modifier definition to objects DataFrame
                objectDict = {'contract': contractName,
                              'object_name': itemName,
                              'type': itemType,
                              'modifiers': item.get('modifiers'), # only applies to functions
                              'visibility': item.get('visibility'), 
                              'line_numbers': (item['loc']['start']['line'], item['loc']['end']['line']),
                              'description': ''}
                df_objects = df_objects.append(pd.Series(objectDict), ignore_index=True)

                # Append each parameter to DataFrame
                for param in item.get('parameters', {}).get('parameters', []):
                    paramDict = {'parameter_name': param['name'],
                                 'object_name': itemName,
                                 'contract': contractName,
                                 'type': get_parameter_type(param),
                                 'line_number': param['loc']['start']['line'],
                                 'initial_value': get_parameter_initialValue(item),
                                 'visibility': param.get('visibility'),
                                 'description': ''}
                    df_parameters = df_parameters.append(pd.Series(paramDict), ignore_index=True)

    return df_objects, df_parameters

## Parse comments (functions)
Extract two kinds of comments:
- Docstrings, or any other set of comments right before a function
- Inline comments on parameter definitions

In [6]:
def clean_object_comment_lines(lines):
    """Clean list of strings of comment block prior to object declaration"""
    
    lines_new = [s.strip() for s in lines if s]
    lines_new = [s.split('//')[-1] for s in lines]
    lines_new = [s.strip('/* ').strip() for s in lines_new]
    lines_new = [s for s in lines_new if s]

    return lines_new

In [7]:
def clean_parameter_comment_lines(lines):
    """Clean list of strings of comment block prior to/at parameter declaration"""
    
    lines_new = [s.strip() for s in lines if s]
    if len(lines_new) > 0:
        prevLines = [s for s in lines_new[:-1] if s.startswith('//')]
        tmp = lines_new[-1].split('//')
        inLine = [tmp[-1]] if len(tmp) > 1 else ['']
        lines_new = prevLines + inLine
        lines_new = [s.split('//')[-1] for s in lines_new if len(lines_new) > 1]
        lines_new = [s.strip('/* ').strip() for s in lines_new]
        lines_new = [s for s in lines_new if s]

    return lines_new

In [8]:
def parse_object_description(lines_raw):
    """Clean and parse list of comment strings before an object definition.
    May be a block comment or individual line comments."""
    
    lines = clean_object_comment_lines(lines_raw)
    
    commentDict = {}
    
    # Add full (cleaned) comment
    commentDict['full_comment'] = '\n'.join(lines)

    # Parse dev line(s); keep just the first one
    devLines = [s.split('@dev')[-1].strip() for s in lines if s.startswith('@dev')]
    try: 
        commentDict['dev'] = devLines[0]
    except IndexError:
        commentDict['dev'] = None

    # Parse notice line(s); keep just the first one
    noticeLines = [s.split('@notice')[-1].strip() for s in lines if s.startswith('@notice')]
    try: 
        commentDict['notice'] = noticeLines[0]
    except IndexError:
        commentDict['notice'] = None

    # Save first line if no @dev or @notice, and if's probably actually useful
    if (commentDict['dev'] is None) and (commentDict['notice'] is None) and len(lines) > 0:
        first_line = lines[0]
        if (not first_line.startswith('pragma')) and (not len(first_line) == 1):
            commentDict['first_line'] = first_line
    else:
        commentDict['first_line'] = None
        
    # Parse parameter lines; create list of dict of parameter:description pairs (empty list if none found)
    paramLines = [s.split('@param')[-1].strip().split(' ', 1) for s in lines if s.startswith('@param')]
    commentDict['param'] = [{'parameter': p[0], 'description': p[1]} for p in paramLines]

    return commentDict

In [9]:
def parse_parameter_description(lines_raw, parameterName):
    """Clean and parse comment relating to parameter, either inline or right before the paramter"""

    lines = clean_parameter_comment_lines(lines_raw)
    
    commentDict = {}
    description = ''
    
    # Add full (cleaned) comment
    commentDict['full_comment'] = '\n'.join(lines)

    # Try to get notice first
    noticeLines = [s.split('@notice')[-1].strip() for s in lines if s.startswith('@notice')]
    if len(noticeLines) > 0:
        description = noticeLines[-1]

    # Parse parameter lines; create dict of parameter:description pairs
    paramLines = [s.split('@param')[-1].strip().split(' ', 1) for s in lines if '@param' in s]
    paramDict = {p[0]: p[1] for p in paramLines}
    description = paramDict.get(parameterName, description)

    # If above two methods failed, just grab the last line, if any
    if description == '' and len(lines) > 0:
        description = lines[-1]

    commentDict['description'] = description
        
    return commentDict

In [10]:
def add_docstring_comments(lines, df_objects, df_parameters):
    """Parse comments and add them to the relevant rows in the object and parameter DataFrames"""

    df_o = df_objects.copy(deep=True)
    df_p = df_parameters.copy(deep=True)

    df_o['full_comment'] = ''
    df_o['dev'] = ''
    df_o['notice'] = ''
    df_o['first_line'] = ''

    commentStart = 0
    for i, row in df_o.iterrows():
        # Get, clean, and parse object comment lines
        commentEnd = row['line_numbers'][0] - 1
        commentLines = clean_object_comment_lines(lines[commentStart:commentEnd])
        commentDict = parse_object_description(commentLines)

        # Add object descriptions to objects
        for key, value in commentDict.items():
            if key in df_o.columns:
                df_o.iat[i, df_o.columns.get_loc(key)] = value

        # Add parameter descriptions to parameters
        for item in commentDict['param']:
            index = df_p.loc[(df_p['object_name']==row['object_name']) &
                             (df_p['parameter_name']==item['parameter'])].index[0]
            df_p.iat[index, df_p.columns.get_loc('description')] = item['description']

        commentStart = row['line_numbers'][1]

    return df_o, df_p

In [11]:
def add_inline_comments(lines, df_parameters):
    """Parse comments and add them to the relevant rows in the parameter DataFrame"""

    df_p = df_parameters.copy(deep=True)
    df_p['full_comment'] = ''

    commentStart = 0
    for i, row in df_p.iterrows():   
        # Grab and parse comment lines
        commentEnd = int(row['line_number'])
        commentLines = lines[min(commentStart, commentEnd - 2):commentEnd + 1]
        commentDict = parse_parameter_description(commentLines, row['parameter_name'])
        
        # Add to dict (but don't overwrite previously found value)
        for key, value in commentDict.items():
            if key in df_p.columns:
                currentValue = df_p.iat[i, df_p.columns.get_loc(key)]
                if not currentValue:
                    df_p.iat[i, df_p.columns.get_loc(key)] = value

        commentStart = commentEnd

    return df_p

## Run example: Compound Governor Bravo

In [12]:
PROJECT = 'Governor Bravo'
urls = ['https://raw.githubusercontent.com/notchia/metagov/main/contracts/GovernorBravoDelegator.sol',
        'https://raw.githubusercontent.com/notchia/metagov/main/contracts/GovernorBravoInterfaces.sol'] 

fpath = '/tmp/solidity.txt'

In [13]:
df_objects_all = pd.DataFrame()
df_parameters_all = pd.DataFrame()
for url in urls:
    try:
        # Get content of Gnosis Safe contract and save to temporary file
        content = requests.get(url).text
        with open(fpath, 'w') as f:
            f.write(content)
        lines = content.split('\n')

        # Get contract structure as OrderedList
        sourceUnit = parser.parse_file(fpath, loc=True)

        contracts = [c for c in sourceUnit['children'] if c['type'] == 'ContractDefinition']

        # Save to file
        savename = url.split('/')[-1].split('.sol')[0]
        with open(savename + '.txt', 'w') as f:
            pprint.pprint(sourceUnit, stream=f)

        # Get object and parameter DataFrames (selecting from solidity_parser AST)
        df_objects, df_parameters = extract_objects_and_parameters(contracts)

        # Add comments to the DataFrames
        df_objects, df_parameters = add_docstring_comments(lines, df_objects, df_parameters)
        df_parameters = add_inline_comments(lines, df_parameters)

        df_objects_all = df_objects_all.append(df_objects)
        df_parameters_all = df_parameters_all.append(df_parameters)
    except Exception as e:
        print(e)

In [14]:
df_objects_all.to_csv('tmp/contract_objects.csv')
df_objects_all.drop(columns=['line_numbers', 'full_comment', 'description'])

Unnamed: 0,contract,object_name,type,inheritance,modifiers,visibility,dev,notice,first_line
0,GovernorBravoDelegator,GovernorBravoDelegator,ContractDefinition,"[GovernorBravoDelegatorStorage, GovernorBravoE...",,,,,
1,GovernorBravoDelegator,constructor,FunctionDefinition,,[],public,,,
2,GovernorBravoDelegator,_setImplementation,FunctionDefinition,,[],public,,Called by the admin to update the implementati...,
3,GovernorBravoDelegator,delegateTo,FunctionDefinition,,[],internal,It returns to the external caller whatever the...,Internal method to delegate execution to anoth...,
4,GovernorBravoDelegator,"function()externalpayable{(boolsuccess,)=imple...",FunctionDefinition,,[],external,Delegates execution to an implementation contr...,,
0,GovernorBravoEvents,GovernorBravoEvents,ContractDefinition,[],,,,,
1,GovernorBravoEvents,ProposalCreated,EventDefinition,,,,,,
2,GovernorBravoEvents,VoteCast,EventDefinition,,,,,An event emitted when a vote has been cast on ...,
3,GovernorBravoEvents,ProposalCanceled,EventDefinition,,,,,An event emitted when a proposal has been canc...,
4,GovernorBravoEvents,ProposalQueued,EventDefinition,,,,,An event emitted when a proposal has been queu...,


In [16]:
df_parameters_all.to_csv('tmp/contract_parameters.csv')
df_parameters_all.drop(columns=['line_number', 'type', 'initial_value'])

Unnamed: 0,parameter_name,object_name,contract,visibility,description,full_comment
0,timelock_,constructor,GovernorBravoDelegator,,,
1,comp_,constructor,GovernorBravoDelegator,,,
2,admin_,constructor,GovernorBravoDelegator,,,
3,implementation_,constructor,GovernorBravoDelegator,,,
4,votingPeriod_,constructor,GovernorBravoDelegator,,,
...,...,...,...,...,...,...
55,signature,executeTransaction,TimelockInterface,,,
56,data,executeTransaction,TimelockInterface,,,
57,eta,executeTransaction,TimelockInterface,,,
58,account,getPriorVotes,CompInterface,,,
