In [1]:
import os
import re
import requests
import pprint
import pandas as pd
from solidity_parser import parser

In [2]:
if not os.path.isdir('/tmp'):
    os.mkdir('/tmp')

In [3]:
ERRORMSG = 'error: could not parse'
IGNORE_CONTRACTS = ['SafeMath']

# Construct governance surface of a Solidity smart contract
- [x] Parse structure of the smart contract, extracting all functions/modifiers/events, their parameters, and other relevant properties (e.g., visibility)
- [x] Get comments corresponding to function/parameter definitions to contextualize the structure
- [ ] Select subset of functions/parameters relevant to governance, preserving their structural relationships

TODO:
- [ ] Don't add inline comment for a parameter if it's the same as it's object's description (e.g., Governor Bravo)
- [x] Fix inline comment finding (e.g., Moloch)
- [x] Clearly separate parsing from Airtable-specific coding (e.g., in get_parameter_type(param)); ideally, have flag which turns coding on/off for all cases

## Parse contract (functions)
Use the `solidity_parser` library to parse the contract as an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree) (AST). This gets us the full structure of the contract, including functions, their parameters, and which functions call which other functions. 

Since this library does not preserve comments, add location information to the AST nodes so that later we can go back to the text and extract the relevant function/parameter descriptions.

Note that it's also possible to create abject from the OrderedList using `parser.objectify`, but this turns out not to actually be that useful, since only some of the information in the AST is preserved.

In [4]:
class ContractObject():
    SUPPORTED_OBJECTS = ['ContractDefinition', 'EventDefinition', 'ModifierDefinition', 'FunctionDefinition', 'StructDefinition', 'EnumDefinition']
    
    def __init__(self, ast_item, contractName):
        """Initialize object given portion of AST tree"""
        
        self.contract = contractName
        self.type = ast_item['type']
        
        assert self.type in self.SUPPORTED_OBJECTS, f"{self.type} type in {contractName} is not supported by ContractObject"
        
        if self.type == 'ContractDefinition':
            self.objectName = contractName
            self.inheritance = [b['baseName']['namePath'] for b in ast_item.get('baseContracts', [])]
            self.modifiers = ''
            self.values = ''
            self.visibility = ''
        else:
            name = ast_item['name']
            if name.startswith('function()'):
                # Nameless delegator functions are not parsed correctly by solidity_parser
                name = '(none)'
            self.objectName = name
            self.inheritance = ''
            self.modifiers = self.get_object_modifiers(ast_item)
            self.values = self.get_object_values(ast_item)
            self.visibility = ast_item.get('visibility', '')
            
        self.lineNumbers = (ast_item['loc']['start']['line'], ast_item['loc']['end']['line'])
                
        self.description = ''
        
    def as_Series(self):
        """Return variables as pd.Series"""
        
        objDict = {'object_name': self.objectName, 
                   'contract': self.contract, 
                   'type': self.type, 
                   'inheritance': self.inheritance, 
                   'modifiers': self.modifiers, 
                   'values': self.values, 
                   'visibility': self.visibility, 
                   'line_numbers': self.lineNumbers, 
                   'description': self.description
                  }
        return pd.Series(objDict)
    
    def get_object_modifiers(self, obj):
        """Get object modifiers"""

        modifiers = obj.get('modifiers', [])
        modifiers = [m.get('name', ERRORMSG) for m in modifiers]

        return modifiers

    def get_object_values(self, obj):
        """Get object options ("members" as defined in enum objects only)"""

        values = []

        if obj['type'] == 'EnumDefinition':
            members = obj.get('members', [])
            values = [m.get('name', ERRORMSG) for m in members]

        return values

In [5]:
class ContractParameter():
    def __init__(self, ast_item, parentObject):
        """Initialize parameter given portion of AST tree"""
        
        self.parameterName = ast_item['name']
        self.parentObject = parentObject

        self.lineNumber = ast_item['loc']['start']['line']
        self.visibility = ast_item.get('visibility', '')        
        
        self.type = self.get_parameter_type(ast_item)
        self.typeCategory = self.get_parameter_type_category(ast_item)
        self.initialValue = self.get_parameter_initialValue(ast_item)
        
        self.description = ''
        
    def as_Series(self):
        """Return variables as pd.Series"""
        
        paramDict = {'parameter_name': self.parameterName, 
                     'object_name': self.parentObject.objectName, 
                     'contract': self.parentObject.contract, 
                     'type': self.type, 
                     'type_category': self.typeCategory, 
                     'line_number': self.lineNumber, 
                     'initial_value': self.initialValue, 
                     'visibility': self.visibility, 
                     'description': self.description
                    }
        return pd.Series(paramDict)
    
    def get_parameter_type(self, param):
        """Get parameter data type"""

        typeDict = param['typeName']
        typeType = typeDict.get('type')

        paramType = typeDict.get('name', typeDict.get('namePath', ERRORMSG))

        return paramType
    
    def get_parameter_initialValue(self, param):
        """Get parameter initialValue"""

        value = param.get('initialValue')
        if value is not None:
            value = value.get('value', str(value))

        return value
    
    def get_parameter_type_category(self, param):
        """Get category of parameter dtype
        
        If ElementaryTypeName: returns the type stripped of any specific size indication (e.g., 'uint8' --> 'uint')
        If Mapping: returns 'map'
        If ArrayTypeName or UserDefinedTypeName: returns 'array' or 'userdefined'
        """

        typeDict = param['typeName']
        typeType = typeDict.get('type')
        if 'TypeName' in typeType and not 'Elementary' in typeType:
            paramCategory = typeType[:-8].lower()
        elif typeType == 'Mapping':
            paramCategory = 'map'
        else:
            paramCategory = typeDict.get('name', typeDict.get('namePath', ERRORMSG))

        # Strip digits from (end of) string (to remove size spcification from bytes, int, uint8)
        paramCategory = re.sub(r"\d+", "", paramCategory)

        return paramCategory

In [6]:
def extract_objects_and_parameters(sourceUnit):
    """Collect information on contract objects and their parameters
    
    Input: solidity-parser parsed AST for the contract file
    
    Returns two DataFrames:
      - df_objects contains contracts, function, event, modifier, struct, and enum definitions
      - df_parameters contains state variables, function arguments, struct values, and other
        parameters needed to define or call the above
        
    Each contract/parameter is first defined using the ContractObject or ContractParameter class
    to pull the relevant information from the AST node, then exported to a Series for storage in
    the corresponding DataFrame.
    """
    
    # Get list of relevant contract nodes defined in Solidity file
    contracts = [c for c in sourceUnit['children'] if c.get('type') == 'ContractDefinition']
    contracts = [c for c in contracts if c['name'] not in IGNORE_CONTRACTS]
    
    df_objects = pd.DataFrame()
    df_parameters = pd.DataFrame()
    
    # Iterate through contracts to extract objects and their parameters
    for c in contracts:
        contractName = c['name']
    
        # Append object for the contract itself
        contract = ContractObject(c, contractName)
        df_objects = df_objects.append(contract.as_Series(), ignore_index=True)

        # Iterate through relevant subnodes in contract
        for item in c.get('subNodes', []):
            itemType = item['type']
            
            if itemType == 'StateVariableDeclaration':
                # Append contract state variables to parameters DataFrame
                for param in item.get('variables', {}):
                    stateVar = ContractParameter(param, contract)
                    df_parameters = df_parameters.append(stateVar.as_Series(), ignore_index=True)
            else:
                try: 
                    # Append function/event/modifier definition to objects DataFrame
                    contractObj = ContractObject(item, contractName)
                    df_objects = df_objects.append(contractObj.as_Series(), ignore_index=True)

                    # Append each parameter to DataFrame
                    paramObj = item.get('parameters', (item.get('members', {})))
                    if isinstance(paramObj, dict):
                        values = paramObj.get('parameters', [])
                    elif isinstance(paramObj, list) and itemType == 'StructDefinition':
                        values = paramObj
                    else:
                        values = []
                    for param in values:
                        contractParam = ContractParameter(param, contractObj)
                        df_parameters = df_parameters.append(contractParam.as_Series(), ignore_index=True)
                except AssertionError as e:
                    # If unsupported object type is encountered
                    print(e)

    return df_objects, df_parameters

## Parse comments (functions)
Extract two kinds of comments:
- Docstrings, or any other set of comments right before a function
- Inline comments on parameter definitions

In [7]:
def clean_object_comment_lines(lines):
    """Clean list of strings of comment block prior to object declaration"""
    
    lines = [s.strip() for s in lines if s.strip()]
    linesStr = '\n'.join(lines)
    
    # Try to get comment block right before the object, if there is one
    pattern_commentBlock = re.compile(r'/\*\*(.+?)\*/$', re.DOTALL)
    match = re.search(pattern_commentBlock, linesStr)
    if match:
        lines_new = match.group(1).split('\n')
        lines_new = [re.sub('^\s*\*\s*', '', s).strip() for s in lines_new if s]
    else:
        # Otherwise, get contiguous block of individual line comments right before object
        lines_new = []
        i = len(lines) - 1
        endFlag = False
        while i >= 0 and not endFlag:
            if lines[i].startswith('//'):
                lines_new.append(re.sub(r'//+', '', lines[i]).strip())
            else:
                endFlag = True
            i -= 1
        lines_new = lines_new[::-1]
    
    return lines_new

In [8]:
def clean_parameter_comment_lines(lines):
    """Clean list of strings of comment block prior to/at parameter declaration"""
    
    prevLines = clean_object_comment_lines(lines[:-1])
    tmp = re.split(r'//+', lines[-1])
    inLine = [tmp[-1]] if len(tmp) > 1 else ['']
    
    lines_new = prevLines + inLine
    lines_new = [s.strip() for s in lines_new if s.strip()]
    
    return lines_new

In [9]:
def parse_object_description(lines_raw):
    """Clean and parse list of comment strings before an object definition.
    May be a block comment or individual line comments."""
    
    lines = clean_object_comment_lines(lines_raw)  
    
    commentDict = {}
    
    # Don't bother with the rest if no description was found
    if len(lines) == 0:
        return commentDict

    # Add full (cleaned) comment
    commentDict['full_comment'] = '\n'.join(lines)  
    
    # Add tag values, if NatSpec is used
    splitLines = re.split(r'@([a-z]+)', ' '.join(lines))[1:] 
    if len(splitLines) > 0:
        values = zip(splitLines[::2], splitLines[1::2])
        for (tag, value) in values:
            prevValue = commentDict.get(tag, '')
            if not prevValue:
                commentDict[tag] = value.strip()
            else:
                commentDict[tag] = prevValue + '\n' + value.strip()
    
    # Split paramters (if any) into a dictionary
    params = commentDict.get('param', '')
    if params:
        paramLines = [s.split(' ', 1) for s in params.split('\n')]
        commentDict['param'] = {p[0]: p[1] for p in paramLines}
    
    # Control logic for choosing main description
    if 'title' in commentDict.keys():
        description = commentDict['title']
    elif 'notice' in commentDict.keys():
        description = commentDict['notice']
    elif 'dev' in commentDict.keys():
        description = commentDict['dev']
    else:
        description = lines[0]
        
    commentDict['description'] = description
    
    return commentDict

In [10]:
def parse_parameter_description(lines_raw, parameterName):
    """Clean and parse comment relating to parameter, either inline or right before the paramter"""

    lines = clean_parameter_comment_lines(lines_raw)
    
    commentDict = {}
    description = ''

    # Try to get notice first
    noticeLines = [s.split('@notice')[-1].strip() for s in lines if s.startswith('@notice')]
    if len(noticeLines) > 0:
        description = noticeLines[-1]

    # Parse parameter lines; create dict of parameter:description pairs
    paramLines = [s.split('@param')[-1].strip().split(' ', 1) for s in lines if '@param' in s]
    paramDict = {p[0]: p[1] for p in paramLines}
    description = paramDict.get(parameterName, description)

    # If above two methods failed, just grab the first line, if any
    if description == '' and len(lines) > 0:
        description = lines[0]

    # Add description
    commentDict['description'] = description

    # Add full (cleaned) comment
    commentDict['full_comment'] = '\n'.join(lines)   
    
    return commentDict

In [11]:
def add_docstring_comments(lines, df_objects, df_parameters):
    """Parse comments and add them to the relevant rows in the object and parameter DataFrames"""

    df_o = df_objects.copy(deep=True)
    df_p = df_parameters.copy(deep=True)

    # Define tags to keep
    NATSPEC_TAGS = ['title', 'notice', 'dev', 'param', 'return']
    for tag in NATSPEC_TAGS:
        df_o[tag] = ''
    df_o['description'] = ''
    df_o['full_comment'] = ''
        
    prevObjectLoc = (0,0)
    for i, row in df_o.iterrows():
        # Get, clean, and parse object comment lines
        commentEnd = row['line_numbers'][0] - 1
        if prevObjectLoc[1] <= commentEnd:
            commentStart = prevObjectLoc[1]
        else:
            commentStart = prevObjectLoc[0]
        commentLines = lines[commentStart:commentEnd]
        commentDict = parse_object_description(commentLines)

        # Add object descriptions to objects
        for key, value in commentDict.items():
            if key in df_o.columns:
                if key == 'param':
                    value = list(value.keys())
                df_o.iat[i, df_o.columns.get_loc(key)] = value               

        # Add parameter descriptions to parameters
        for paramName, paramDescription in commentDict.get('param', {}).items():
            index = df_p.loc[(df_p['object_name']==row['object_name']) &
                             (df_p['parameter_name']==paramName)].index[0]
            df_p.iat[index, df_p.columns.get_loc('description')] = paramDescription

        prevObjectLoc = row['line_numbers']

    return df_o, df_p

In [12]:
def add_inline_comments(lines, df_parameters):
    """Parse comments and add them to the relevant rows in the parameter DataFrame"""

    df_p = df_parameters.copy(deep=True)
    df_p['full_comment'] = ''

    commentStart = 0
    for i, row in df_p.iterrows():   
        # Grab and parse comment lines
        commentEnd = int(row['line_number'])
        commentLines = lines[min(commentStart, commentEnd - 2):commentEnd]
        commentDict = parse_parameter_description(commentLines, row['parameter_name'])
        
        # Add to dict (but don't overwrite previously found value)
        for key, value in commentDict.items():
            if key in df_p.columns:
                currentValue = df_p.iat[i, df_p.columns.get_loc(key)]
                if not currentValue:
                    df_p.iat[i, df_p.columns.get_loc(key)] = value

        commentStart = commentEnd

    return df_p

In [13]:
def remove_duplicate_comments_in_parameters(df_o, df_parameters):
    """Remove description and/or full comment for a parameter if it is 
    the same as its parent object's description"""
    
    df_p = df_parameters.copy(deep=True)
    
    for i, row in df_parameters.iterrows():
        # Get parent object's comments
        index = df_o.loc[(df_o['object_name']==row['object_name']) &
                         (df_o['contract']==row['contract'])].index[0]
        object_fullComment = df_o.iat[index, df_o.columns.get_loc('full_comment')]
        object_description = df_o.iat[index, df_o.columns.get_loc('description')]
        
        # Delete parameter's comment(s) if duplicate of parent object's
        if (row['full_comment'] == object_fullComment) or ('@param' in object_fullComment):
            df_p.iat[i, df_p.columns.get_loc('full_comment')] = ''
        if row['description'] == object_description:
            df_p.iat[i, df_p.columns.get_loc('description')] = ''
        
    return df_p

## Run example: Compound Governor Bravo

In [14]:
PROJECT = 'Governor Bravo'
urls = ['https://raw.githubusercontent.com/notchia/metagov/main/data/contracts/Compound/GovernorBravoDelegate.sol',
        'https://raw.githubusercontent.com/notchia/metagov/main/data/contracts/Compound/GovernorBravoDelegator.sol',
        'https://raw.githubusercontent.com/notchia/metagov/main/data/contracts/Compound/GovernorBravoInterfaces%20(delegate).sol', 
        'https://raw.githubusercontent.com/notchia/metagov/main/data/contracts/Compound/GovernorBravoInterfaces%20(delegator).sol', 
        'https://raw.githubusercontent.com/notchia/metagov/main/data/contracts/Compound/Timelock.sol', 
        'https://raw.githubusercontent.com/notchia/metagov/main/data/contracts/Compound/Comp.sol', 
       ] 

fpath = 'tmp/solidity.txt'

In [15]:
df_objects_all = pd.DataFrame()
df_parameters_all = pd.DataFrame()
for url in urls:
    contractFilename = url.split('/')[-1].split('.sol')[0]
    print(f"Parsing {contractFilename}...")
    # Get content of Gnosis Safe contract and save to temporary file
    content = requests.get(url).text
    with open(fpath, 'w') as f:
        f.write(content)
    lines = content.split('\n')

    # Get file structure as OrderedList and split into contracts
    sourceUnit = parser.parse_file(fpath, loc=True)
    
    # Save to file
    savename = f"tmp/parsed_{contractFilename}.txt"
    with open(savename, 'w') as f:
        pprint.pprint(sourceUnit, stream=f)    
    
    # Get object and parameter DataFrames (selecting from solidity_parser AST)
    df_objects, df_parameters = extract_objects_and_parameters(sourceUnit)
    
    # Add comments to the DataFrames
    df_objects, df_parameters = add_docstring_comments(lines, df_objects, df_parameters)
    df_parameters = add_inline_comments(lines, df_parameters)
    df_parameters = remove_duplicate_comments_in_parameters(df_objects, df_parameters)
    
    # Add other identifying info
    df_objects['url'] = url
    
    # Save to full dfs
    df_objects_all = df_objects_all.append(df_objects)
    df_parameters_all = df_parameters_all.append(df_parameters)
    
df_objects_all['project'] = PROJECT

Parsing GovernorBravoDelegate...
Parsing GovernorBravoDelegator...
Parsing GovernorBravoInterfaces%20(delegate)...
Parsing GovernorBravoInterfaces%20(delegator)...
Parsing Timelock...
UsingForDeclaration type in Timelock is not supported by ContractObject
Parsing Comp...


In [16]:
df_objects_all.drop(columns=['line_numbers']).to_csv('tmp/contract_objects.csv')
df_objects_all.drop(columns=['line_numbers', 'full_comment'])[df_objects_all['description'] != ""]

Unnamed: 0,object_name,contract,type,inheritance,modifiers,values,visibility,description,title,notice,dev,param,return,url,project
1,initialize,GovernorBravoDelegate,FunctionDefinition,,[],[],public,Used to initialize the contract during delegat...,,Used to initialize the contract during delegat...,,"[timelock_, comp_, votingPeriod_, votingDelay_...",,https://raw.githubusercontent.com/notchia/meta...,Governor Bravo
2,propose,GovernorBravoDelegate,FunctionDefinition,,[],[],public,Function used to propose a new proposal. Sende...,,Function used to propose a new proposal. Sende...,,"[targets, values, signatures, calldatas, descr...",Proposal id of new proposal,https://raw.githubusercontent.com/notchia/meta...,Governor Bravo
3,queue,GovernorBravoDelegate,FunctionDefinition,,[],[],external,Queues a proposal of state succeeded,,Queues a proposal of state succeeded,,[proposalId],,https://raw.githubusercontent.com/notchia/meta...,Governor Bravo
5,execute,GovernorBravoDelegate,FunctionDefinition,,[],[],external,Executes a queued proposal if eta has passed,,Executes a queued proposal if eta has passed,,[proposalId],,https://raw.githubusercontent.com/notchia/meta...,Governor Bravo
6,cancel,GovernorBravoDelegate,FunctionDefinition,,[],[],external,Cancels a proposal only if sender is the propo...,,Cancels a proposal only if sender is the propo...,,[proposalId],,https://raw.githubusercontent.com/notchia/meta...,Governor Bravo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11,transferFrom,Comp,FunctionDefinition,,[],[],external,Transfer `amount` tokens from `src` to `dst`,,Transfer `amount` tokens from `src` to `dst`,,"[src, dst, rawAmount]",Whether or not the transfer succeeded,https://raw.githubusercontent.com/notchia/meta...,Governor Bravo
12,delegate,Comp,FunctionDefinition,,[],[],public,Delegate votes from `msg.sender` to `delegatee`,,Delegate votes from `msg.sender` to `delegatee`,,[delegatee],,https://raw.githubusercontent.com/notchia/meta...,Governor Bravo
13,delegateBySig,Comp,FunctionDefinition,,[],[],public,Delegates votes from signatory to `delegatee`,,Delegates votes from signatory to `delegatee`,,"[delegatee, nonce, expiry, v, r, s]",,https://raw.githubusercontent.com/notchia/meta...,Governor Bravo
14,getCurrentVotes,Comp,FunctionDefinition,,[],[],external,Gets the current votes balance for `account`,,Gets the current votes balance for `account`,,[account],The number of current votes for `account`,https://raw.githubusercontent.com/notchia/meta...,Governor Bravo


In [17]:
df_parameters_all.drop(columns=['line_number']).to_csv('tmp/contract_parameters.csv')
df_parameters_all.drop(columns=['line_number', 'full_comment'])[df_parameters_all['description'] != ""]

Unnamed: 0,parameter_name,object_name,contract,type,type_category,initial_value,visibility,description
0,name,GovernorBravoDelegate,GovernorBravoDelegate,string,string,,public,The name of this contract
1,MIN_PROPOSAL_THRESHOLD,GovernorBravoDelegate,GovernorBravoDelegate,uint,uint,,public,The minimum setable proposal threshold
2,MAX_PROPOSAL_THRESHOLD,GovernorBravoDelegate,GovernorBravoDelegate,uint,uint,,public,The maximum setable proposal threshold
3,MIN_VOTING_PERIOD,GovernorBravoDelegate,GovernorBravoDelegate,uint,uint,,public,The minimum setable voting period
4,MAX_VOTING_PERIOD,GovernorBravoDelegate,GovernorBravoDelegate,uint,uint,,public,The max setable voting period
...,...,...,...,...,...,...,...,...
42,r,delegateBySig,Comp,bytes32,bytes,,,Half of the ECDSA signature pair
43,s,delegateBySig,Comp,bytes32,bytes,,,Half of the ECDSA signature pair
44,account,getCurrentVotes,Comp,address,address,,,The address to get votes balance
45,account,getPriorVotes,Comp,address,address,,,The address of the account to check


In [18]:
print(sorted(df_parameters_all['type_category'].unique()))
print(sorted(df_parameters_all['type'].unique()))

['address', 'array', 'bool', 'bytes', 'map', 'string', 'uint', 'userdefined']
['CompInterface', 'TimelockInterface', 'address', 'bool', 'bytes', 'bytes32', 'error: could not parse', 'string', 'uint', 'uint256', 'uint32', 'uint8', 'uint96']
