In [1]:
import os
import re
import requests
import pprint
import pandas as pd
from solidity_parser import parser

In [2]:
if not os.path.isdir('/tmp'):
    os.mkdir('/tmp')

In [3]:
ERRORMSG = 'error: could not parse'

# Construct governance surface of a Solidity smart contract
- [x] Parse structure of the smart contract, extracting all functions/modifiers/events, their parameters, and other relevant properties (e.g., visibility)
- [x] Get comments corresponding to function/parameter definitions to contextualize the structure
- [ ] Select subset of functions/parameters relevant to governance, preserving their structural relationships

TODO:
- [ ] Don't add inline comment for a parameter if it's the same as it's object's description (e.g., Governor Bravo)
- [ ] Fix inline comment finding (e.g., Moloch)
- [x] Clearly separate parsing from Airtable-specific coding (e.g., in get_parameter_type(param)); ideally, have flag which turns coding on/off for all cases

## Parse contract (functions)
Use the `solidity_parser` library to parse the contract as an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree) (AST). This gets us the full structure of the contract, including functions, their parameters, and which functions call which other functions. 

Since this library does not preserve comments, add location information to the AST nodes so that later we can go back to the text and extract the relevant function/parameter descriptions.

Note that it's also possible to create abject from the OrderedList using `parser.objectify`, but this turns out not to actually be that useful, since only some of the information in the AST is preserved.

In [4]:
class ContractObject():
    SUPPORTED_OBJECTS = ['ContractDefinition', 'EventDefinition', 'ModifierDefinition', 'FunctionDefinition', 'StructDefinition', 'EnumDefinition']
    
    def __init__(self, ast_item, contractName):
        """Initialize object given portion of AST tree"""
        
        self.contract = contractName
        self.type = ast_item['type']
        
        assert self.type in self.SUPPORTED_OBJECTS, f"{self.type} type in {contractName} is not supported by ContractObject"
        
        if self.type == 'ContractDefinition':
            self.objectName = contractName
            self.inheritance = [b['baseName']['namePath'] for b in ast_item.get('baseContracts', [])]
            self.modifiers = ''
            self.values = ''
            self.visibility = ''
        else:
            name = ast_item['name']
            if name.startswith('function()'):
                # Nameless delegator functions are not parsed correctly by solidity_parser
                name = '(none)'
            self.objectName = name
            self.inheritance = ''
            self.modifiers = self.get_object_modifiers(ast_item)
            self.values = self.get_object_values(ast_item)
            self.visibility = ast_item.get('visibility', '')
            
        self.lineNumbers = (ast_item['loc']['start']['line'], ast_item['loc']['end']['line'])
                
        self.description = ''
        
    def as_Series(self):
        """Return variables as pd.Series"""
        
        objDict = {'object_name': self.objectName, 
                   'contract': self.contract, 
                   'type': self.type, 
                   'inheritance': self.inheritance, 
                   'modifiers': self.modifiers, 
                   'values': self.values, 
                   'visibility': self.visibility, 
                   'line_numbers': self.lineNumbers, 
                   'description': self.description
                  }
        return pd.Series(objDict)
    
    def get_object_modifiers(self, obj):
        """Get object modifiers"""

        modifiers = obj.get('modifiers', [])
        modifiers = [m.get('name', ERRORMSG) for m in modifiers]

        return modifiers

    def get_object_values(self, obj):
        """Get object options ("members" as defined in enum objects only)"""

        values = []

        if obj['type'] == 'EnumDefinition':
            members = obj.get('members', [])
            values = [m.get('name', ERRORMSG) for m in members]

        return values

In [5]:
class ContractParameter():
    def __init__(self, ast_item, parentObject):
        """Initialize parameter given portion of AST tree"""
        
        self.parameterName = ast_item['name']
        self.parentObject = parentObject

        self.lineNumber = ast_item['loc']['start']['line']
        self.visibility = ast_item.get('visibility', '')        
        
        self.type = self.get_parameter_type(ast_item)
        self.typeCategory = self.get_parameter_type_category(ast_item)
        self.initialValue = self.get_parameter_initialValue(ast_item)
        
        self.description = ''
        
    def as_Series(self):
        """Return variables as pd.Series"""
        
        paramDict = {'parameter_name': self.parameterName, 
                     'object_name': self.parentObject.objectName, 
                     'contract': self.parentObject.contract, 
                     'type': self.type, 
                     'type_category': self.typeCategory, 
                     'line_number': self.lineNumber, 
                     'initial_value': self.initialValue, 
                     'visibility': self.visibility, 
                     'description': self.description
                    }
        return pd.Series(paramDict)
    
    def get_parameter_type(self, param):
        """Get parameter data type"""

        typeDict = param['typeName']
        typeType = typeDict.get('type')

        paramType = typeDict.get('name', typeDict.get('namePath', ERRORMSG))

        return paramType
    
    def get_parameter_initialValue(self, param):
        """Get parameter initialValue"""

        value = param.get('initialValue')
        if value is not None:
            value = value.get('value', str(value))

        return value
    
    def get_parameter_type_category(self, param):
        """Get category of parameter dtype
        
        If ElementaryTypeName: returns the type stripped of any specific size indication (e.g., 'uint8' --> 'uint')
        If Mapping: returns 'map'
        If ArrayTypeName or UserDefinedTypeName: returns 'array' or 'userdefined'
        """

        typeDict = param['typeName']
        typeType = typeDict.get('type')
        if 'TypeName' in typeType and not 'Elementary' in typeType:
            paramCategory = typeType[:-8].lower()
        elif typeType == 'Mapping':
            paramCategory = 'map'
        else:
            paramCategory = typeDict.get('name', typeDict.get('namePath', ERRORMSG))

        # Strip digits from (end of) string (to remove size spcification from bytes, int, uint8)
        paramCategory = re.sub(r"\d+", "", paramCategory)

        return paramCategory

In [6]:
def extract_objects_and_parameters(sourceUnit):
    """Collect information on contract objects and their parameters
    
    Input: solidity-parser parsed AST for the contract file
    
    Returns two DataFrames:
      - df_objects contains contracts, function, event, modifier, struct, and enum definitions
      - df_parameters contains state variables, function arguments, struct values, and other
        parameters needed to define or call the above
        
    Each contract/parameter is first defined using the ContractObject or ContractParameter class
    to pull the relevant information from the AST node, then exported to a Series for storage in
    the corresponding DataFrame.
    """
    
    # Get list of contract nodes defined in Solidity file
    contracts = [c for c in sourceUnit['children'] if c.get('type') == 'ContractDefinition']
    
    df_objects = pd.DataFrame()
    df_parameters = pd.DataFrame()
    
    # Iterate through contracts to extract objects and their parameters
    for c in contracts:
        contractName = c['name']
    
        # Append object for the contract itself
        contract = ContractObject(c, contractName)
        df_objects = df_objects.append(contract.as_Series(), ignore_index=True)

        # Iterate through relevant subnodes in contract
        for item in c.get('subNodes', []):
            itemType = item['type']
            
            if itemType == 'StateVariableDeclaration':
                # Append contract state variables to parameters DataFrame
                for param in item.get('variables', {}):
                    stateVar = ContractParameter(param, contract)
                    df_parameters = df_parameters.append(stateVar.as_Series(), ignore_index=True)
            else:
                try: 
                    # Append function/event/modifier definition to objects DataFrame
                    contractObj = ContractObject(item, contractName)
                    df_objects = df_objects.append(contractObj.as_Series(), ignore_index=True)

                    # Append each parameter to DataFrame
                    paramObj = item.get('parameters', (item.get('members', {})))
                    if isinstance(paramObj, dict):
                        values = paramObj.get('parameters', [])
                    elif isinstance(paramObj, list) and itemType == 'StructDefinition':
                        values = paramObj
                    else:
                        values = []
                    for param in values:
                        contractParam = ContractParameter(param, contractObj)
                        df_parameters = df_parameters.append(contractParam.as_Series(), ignore_index=True)
                except AssertionError as e:
                    # If unsupported object type is encountered
                    print(e)

    return df_objects, df_parameters

## Parse comments (functions)
Extract two kinds of comments:
- Docstrings, or any other set of comments right before a function
- Inline comments on parameter definitions

In [7]:
def clean_object_comment_lines(lines):
    """Clean list of strings of comment block prior to object declaration"""
    
    # Try to get comment block, if there is one
    linesStr = '\n'.join(lines)
    pattern_commentBlock = re.compile(r'/\*\*.+?\*/', re.DOTALL)
    match = re.search(pattern_commentBlock, linesStr)
    if match:
        lines_new = match[0].split('\n')
        lines_new = [s.strip().strip('*') for s in lines if s]
    else:
        # Otherwise, parse individual line comments
        lines_new = [s.strip() for s in lines if s]
        lines_new = [s.split('//')[-1] for s in lines if s.startswith('//')]
                                                          
    lines_new = [s.strip() for s in lines_new if s]

    return lines_new

In [8]:
def clean_parameter_comment_lines(lines):
    """Clean list of strings of comment block prior to/at parameter declaration"""
    
    lines_new = [s.strip() for s in lines if s]
    if len(lines_new) > 0:
        prevLines = [s for s in lines_new[:-1] if s.startswith('//')]
        tmp = lines_new[-1].split('//')
        inLine = [tmp[-1]] if len(tmp) > 1 else ['']
        lines_new = prevLines + inLine
        lines_new = [s.split('//')[-1] for s in lines_new if len(lines_new) > 1]
        lines_new = [s.strip('/* ').strip() for s in lines_new]
        lines_new = [s for s in lines_new if s]

    return lines_new

In [9]:
def parse_object_description(lines_raw):
    """Clean and parse list of comment strings before an object definition.
    May be a block comment or individual line comments."""
    
    lines = clean_object_comment_lines(lines_raw)
    
    commentDict = {}
    description = ''
    
    # Add full (cleaned) comment
    commentDict['full_comment'] = '\n'.join(lines)
    commentDict['description'] = ''

    # Parse notice line(s); keep just the first one
    noticeLines = [s.split('@notice')[-1].strip() for s in lines if s.startswith('@notice')]
    try: 
        s = noticeLines[0]
        commentDict['notice'] = s
        description = s
    except IndexError:
        commentDict['notice'] = None    
    
    # Parse dev line(s); keep just the first one
    devLines = [s.split('@dev')[-1].strip() for s in lines if s.startswith('@dev')]
    try: 
        s = devLines[0]
        commentDict['dev'] = s
        if description == '':
            description = s        
    except IndexError:
        commentDict['dev'] = None


    # Save first line if no @dev or @notice, and if's probably actually useful
    if (commentDict['dev'] is None) and (commentDict['notice'] is None) and len(lines) > 0:
        first_line = lines[0]
        if (not first_line.startswith('pragma')) and (not len(first_line) == 1):
            commentDict['first_line'] = first_line
            if description == '':
                description == first_line
    else:
        commentDict['first_line'] = None
    
    commentDict['description'] = description
    
    # Parse parameter lines; create list of dict of parameter:description pairs (empty list if none found)
    paramLines = [s.split('@param')[-1].strip().split(' ', 1) for s in lines if s.startswith('@param')]
    commentDict['param'] = [{'parameter': p[0], 'description': p[1]} for p in paramLines]

    return commentDict

In [10]:
def parse_parameter_description(lines_raw, parameterName):
    """Clean and parse comment relating to parameter, either inline or right before the paramter"""

    lines = clean_parameter_comment_lines(lines_raw)
    
    commentDict = {}
    description = ''
    
    # Add full (cleaned) comment
    commentDict['full_comment'] = '\n'.join(lines)

    # Try to get notice first
    noticeLines = [s.split('@notice')[-1].strip() for s in lines if s.startswith('@notice')]
    if len(noticeLines) > 0:
        description = noticeLines[-1]

    # Parse parameter lines; create dict of parameter:description pairs
    paramLines = [s.split('@param')[-1].strip().split(' ', 1) for s in lines if '@param' in s]
    paramDict = {p[0]: p[1] for p in paramLines}
    description = paramDict.get(parameterName, description)

    # If above two methods failed, just grab the last line, if any
    if description == '' and len(lines) > 0:
        description = lines[-1]

    commentDict['description'] = description
        
    return commentDict

In [11]:
def add_docstring_comments(lines, df_objects, df_parameters):
    """Parse comments and add them to the relevant rows in the object and parameter DataFrames"""

    df_o = df_objects.copy(deep=True)
    df_p = df_parameters.copy(deep=True)

    df_o['full_comment'] = ''
    df_o['dev'] = ''
    df_o['notice'] = ''
    df_o['first_line'] = ''

    prevObjectLoc = (0,0)
    for i, row in df_o.iterrows():
        # Get, clean, and parse object comment lines
        commentEnd = row['line_numbers'][0] - 1
        if prevObjectLoc[1] <= commentEnd:
            commentStart = prevObjectLoc[1]
        else:
            commentStart = prevObjectLoc[0]
        commentLines = lines[commentStart:commentEnd]
        commentDict = parse_object_description(commentLines)

        # Add object descriptions to objects
        for key, value in commentDict.items():
            if key in df_o.columns:
                df_o.iat[i, df_o.columns.get_loc(key)] = value

        # Add parameter descriptions to parameters
        for item in commentDict['param']:
            index = df_p.loc[(df_p['object_name']==row['object_name']) &
                             (df_p['parameter_name']==item['parameter'])].index[0]
            df_p.iat[index, df_p.columns.get_loc('description')] = item['description']

        prevObjectLoc = row['line_numbers']

    return df_o, df_p

In [12]:
def add_inline_comments(lines, df_parameters):
    """Parse comments and add them to the relevant rows in the parameter DataFrame"""

    df_p = df_parameters.copy(deep=True)
    df_p['full_comment'] = ''

    commentStart = 0
    for i, row in df_p.iterrows():   
        # Grab and parse comment lines
        commentEnd = int(row['line_number'])
        commentLines = lines[min(commentStart, commentEnd - 2):commentEnd + 1]
        commentDict = parse_parameter_description(commentLines, row['parameter_name'])
        
        # Add to dict (but don't overwrite previously found value)
        for key, value in commentDict.items():
            if key in df_p.columns:
                currentValue = df_p.iat[i, df_p.columns.get_loc(key)]
                if not currentValue:
                    df_p.iat[i, df_p.columns.get_loc(key)] = value

        commentStart = commentEnd

    return df_p

## Run example: Compound Governor Bravo

In [13]:
PROJECT = 'Governor Bravo'
urls = ['https://raw.githubusercontent.com/notchia/metagov/main/data/contracts/Compound/GovernorBravoDelegate.sol',
        'https://raw.githubusercontent.com/notchia/metagov/main/data/contracts/Compound/GovernorBravoDelegator.sol',
        'https://raw.githubusercontent.com/notchia/metagov/main/data/contracts/Compound/GovernorBravoInterfaces%20(delegate).sol', 
        'https://raw.githubusercontent.com/notchia/metagov/main/data/contracts/Compound/GovernorBravoInterfaces%20(delegator).sol', 
        'https://raw.githubusercontent.com/notchia/metagov/main/data/contracts/Compound/Timelock.sol', 
        'https://raw.githubusercontent.com/notchia/metagov/main/data/contracts/Compound/Comp.sol', 
       ] 

fpath = 'tmp/solidity.txt'

In [14]:
df_objects_all = pd.DataFrame()
df_parameters_all = pd.DataFrame()
for url in urls:
    contractFilename = url.split('/')[-1].split('.sol')[0]
    print(f"Parsing {contractFilename}...")
    # Get content of Gnosis Safe contract and save to temporary file
    content = requests.get(url).text
    with open(fpath, 'w') as f:
        f.write(content)
    lines = content.split('\n')

    # Get file structure as OrderedList and split into contracts
    sourceUnit = parser.parse_file(fpath, loc=True)
    
    # Save to file
    savename = f"tmp/parsed_{contractFilename}.txt"
    with open(savename, 'w') as f:
        pprint.pprint(sourceUnit, stream=f)    
    
    # Get object and parameter DataFrames (selecting from solidity_parser AST)
    df_objects, df_parameters = extract_objects_and_parameters(sourceUnit)
    
    # Add comments to the DataFrames
    df_objects, df_parameters = add_docstring_comments(lines, df_objects, df_parameters)
    df_parameters = add_inline_comments(lines, df_parameters)
    
    df_objects['url'] = url
    
    df_objects_all = df_objects_all.append(df_objects)
    df_parameters_all = df_parameters_all.append(df_parameters)
    
df_objects_all['project'] = PROJECT

Parsing GovernorBravoDelegate...
Parsing GovernorBravoDelegator...
Parsing GovernorBravoInterfaces%20(delegate)...
Parsing GovernorBravoInterfaces%20(delegator)...
Parsing Timelock...
UsingForDeclaration type in Timelock is not supported by ContractObject
Parsing Comp...


In [15]:
df_objects_all.drop(columns=['line_numbers', 'full_comment']).to_csv('tmp/contract_objects.csv')
df_objects_all.drop(columns=['line_numbers'])

Unnamed: 0,object_name,contract,type,inheritance,modifiers,values,visibility,description,full_comment,dev,notice,first_line,url,project
0,GovernorBravoDelegate,GovernorBravoDelegate,ContractDefinition,"[GovernorBravoDelegateStorageV2, GovernorBravo...",,,,,,,,,https://raw.githubusercontent.com/notchia/meta...,Governor Bravo
1,initialize,GovernorBravoDelegate,FunctionDefinition,,[],[],public,Used to initialize the contract during delegat...,/// @notice The name of this contract\nstring ...,,Used to initialize the contract during delegat...,,https://raw.githubusercontent.com/notchia/meta...,Governor Bravo
2,propose,GovernorBravoDelegate,FunctionDefinition,,[],[],public,Function used to propose a new proposal. Sende...,/\n@notice Function used to propose a new prop...,,Function used to propose a new proposal. Sende...,,https://raw.githubusercontent.com/notchia/meta...,Governor Bravo
3,queue,GovernorBravoDelegate,FunctionDefinition,,[],[],external,Queues a proposal of state succeeded,/\n@notice Queues a proposal of state succeede...,,Queues a proposal of state succeeded,,https://raw.githubusercontent.com/notchia/meta...,Governor Bravo
4,queueOrRevertInternal,GovernorBravoDelegate,FunctionDefinition,,[],[],internal,,,,,,https://raw.githubusercontent.com/notchia/meta...,Governor Bravo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20,safe32,Comp,FunctionDefinition,,[],[],internal,,,,,,https://raw.githubusercontent.com/notchia/meta...,Governor Bravo
21,safe96,Comp,FunctionDefinition,,[],[],internal,,,,,,https://raw.githubusercontent.com/notchia/meta...,Governor Bravo
22,add96,Comp,FunctionDefinition,,[],[],internal,,,,,,https://raw.githubusercontent.com/notchia/meta...,Governor Bravo
23,sub96,Comp,FunctionDefinition,,[],[],internal,,,,,,https://raw.githubusercontent.com/notchia/meta...,Governor Bravo


In [16]:
df_parameters_all.drop(columns=['line_number', 'full_comment']).to_csv('tmp/contract_parameters.csv')
df_parameters_all.drop(columns=['line_number', 'full_comment'])[df_parameters_all['description'] != ""]

Unnamed: 0,parameter_name,object_name,contract,type,type_category,initial_value,visibility,description
0,name,GovernorBravoDelegate,GovernorBravoDelegate,string,string,,public,The name of this contract
1,MIN_PROPOSAL_THRESHOLD,GovernorBravoDelegate,GovernorBravoDelegate,uint,uint,,public,The minimum setable proposal threshold
2,MAX_PROPOSAL_THRESHOLD,GovernorBravoDelegate,GovernorBravoDelegate,uint,uint,,public,The maximum setable proposal threshold
3,MIN_VOTING_PERIOD,GovernorBravoDelegate,GovernorBravoDelegate,uint,uint,,public,The minimum setable voting period
4,MAX_VOTING_PERIOD,GovernorBravoDelegate,GovernorBravoDelegate,uint,uint,,public,The max setable voting period
...,...,...,...,...,...,...,...,...
43,s,delegateBySig,Comp,bytes32,bytes,,,Half of the ECDSA signature pair
44,account,getCurrentVotes,Comp,address,address,,,The address to get votes balance
45,account,getPriorVotes,Comp,address,address,,,The address of the account to check
46,blockNumber,getPriorVotes,Comp,uint,uint,,,The block number to get the vote balance at


In [17]:
print(sorted(df_parameters_all['type_category'].unique()))
print(sorted(df_parameters_all['type'].unique()))

['address', 'array', 'bool', 'bytes', 'map', 'string', 'uint', 'userdefined']
['CompInterface', 'TimelockInterface', 'address', 'bool', 'bytes', 'bytes32', 'error: could not parse', 'string', 'uint', 'uint256', 'uint32', 'uint8', 'uint96']
