<a href="https://colab.research.google.com/github/mille-s/FORGe_count-rules/blob/main/FORGe_count_rules.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upload a zipped folder of rules as /content/rule.zip, then run this cell to unzip it
! unzip /content/rule.zip

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import codecs
import re
import itertools
import glob
import os

# RE for number of LUs in lexicons: ^\s*"[^"]+_[A-Z]{2}_[0-9]+"

# Folder that holds the grammar files, and the encoding used to read them.
folderPath = '/content/rule'
encoding = 'utf-8'

# Report file. It is opened in append mode further down, so a copy left over
# from a previous run is removed first.
pathOut = '/content/out.txt'
if os.path.exists(pathOut):
  os.remove(pathOut)

# Every path in the folder whose name contains a dot, in sorted order.
list_filepaths = sorted(glob.glob(os.path.join(folderPath, '*.*')))

#V4Design D5.3
language_IDs = ['CA', 'DE', 'EL', 'EN', 'ES', 'FR', 'IT', 'PL']
print("WARNING! Check that the list of languages is complete: "+str(language_IDs))

# List of every non-empty combination of language IDs, joined with '_'
# (e.g. 'CA', 'CA_DE', ..., 'CA_DE_EL_EN_ES_FR_IT_PL'), shortest combinations
# first and in the order of language_IDs within each size.
language_IDs_combinations = [
    '_'.join(combo)
    for size in range(1, len(language_IDs) + 1)
    for combo in itertools.combinations(language_IDs, size)
]

# One accumulator per category; each receives one language prefix per rule.
(rule_all_count,
 rule_con_count,
 rule_agg_count,
 rule_dsynt_count,
 rule_ssynt_count,
 rule_lin_count,
 rule_sent_count) = ([] for _ in range(7))

def make_stats(list_prefixes_rules, list_prefixes_rules_meta, fo, level):
    """Write rule-count statistics for one set of rules to ``fo``.

    Five lines are produced: the total number of rules, then the number and
    percentage of language-independent rules (entries equal to ``'GEN'``) and
    of language-specific rules (every other entry).

    Args:
        list_prefixes_rules: list with one language prefix per rule; the
            prefix ``'GEN'`` marks a language-independent rule.
        list_prefixes_rules_meta: accepted for compatibility with existing
            callers; not used in the report.
        fo: writable text stream that receives the report lines.
        level: label of the rule set; when it is ``'all'`` the report is
            also printed to standard output.
    """
    num_all = len(list_prefixes_rules)
    num_lang_ind = list_prefixes_rules.count('GEN')
    num_lang_spec = num_all - num_lang_ind

    def percentage(part):
        # An empty rule list would otherwise raise ZeroDivisionError;
        # report 0.0% for it instead.
        if num_all == 0:
            return 0.0
        return round(100*float(part)/float(num_all), 2)

    # Build the report once so the printed and the written versions cannot
    # drift apart.
    report = [
        '# Rules: ' + str(num_all),
        '# Language-independent rules: ' + str(num_lang_ind),
        '% Language-independent rules: ' + str(percentage(num_lang_ind)) + '%',
        '# Language-specific rules: ' + str(num_lang_spec),
        '% Language-specific rules: ' + str(percentage(num_lang_spec)) + '%',
    ]
    if level == 'all':
        for line in report:
            print(line)
    for line in report:
        fo.write(line + '\n')
    
def fill_lists_rule_count(filename, prefix, list_all, list_con, list_agg, list_dsynt, list_ssynt, list_lin, list_sent):
    """Record ``prefix`` in the overall list and in at most one category list.

    ``prefix`` is always appended to ``list_all``. The category is chosen by
    searching ``filename`` for the patterns below in order; only the list of
    the first matching pattern receives ``prefix``. A file name that matches
    none of them is counted in ``list_all`` only.
    """
    list_all.append(prefix)

    # Order matters: earlier entries take precedence over later ones.
    routing = (
        ('Con_Sem', list_con),
        ('Sem[^_]*_Sem', list_con),
        ('Con_Agg[1-9]', list_agg),
        ('SemComm_DSynt', list_dsynt),
        ('DSynt_SSynt', list_ssynt),
        ('SSynt_PostProc', list_ssynt),
        ('SSynt_Agg', list_ssynt),
        ('SSynt_DMorph_linearize', list_lin),
        ('S[mM]orph', list_sent),
    )
    for pattern, bucket in routing:
        if re.search(pattern, filename) is not None:
            bucket.append(prefix)
            return

#print('2 Arguments needed: pathInputFolder encoding')
# Scan every grammar file (file name starting with a digit), find its rules and
# record the language prefix of each rule, both per grammar and in the global
# per-category lists.
ngrammars = 0

# Patterns are compiled once and written as raw strings so that the regex
# escapes (\s, \[, \ufeff, ...) are interpreted by the re module and not by the
# Python string parser.
leftside_open_re = re.compile(r'^\s*leftside\s*=\s*\[\r*\n')
# First line inside the leftside block: anything but a closing bracket or an
# empty line means the block has content.
leftside_content_re = re.compile(r'^[^\]\n]')
# Rule header line found two lines above "leftside"; \ufeff is the BOM.
rule_header_re = re.compile(r'^\ufeff*\s*[a-zA-Z]+<=>[a-zA-Z]+\s[^\n]+\r*\n')
grammar_name_starts = tuple('1234567890')

# The report stream stays open after this loop: the summary section that
# follows writes to it and closes it.
fo = codecs.open(pathOut, 'a', 'utf-8')
for filepath in list_filepaths:
    filename = os.path.basename(filepath)
    if not filename.startswith(grammar_name_starts):
        continue
    ngrammars += 1
    fo.write('--------------\n')
    fo.write(filename+'\n')
    fo.write('--------------\n')
    # Read the whole grammar and release the file handle right away.
    with codecs.open(filepath, 'r', encoding) as fd:
        lines = fd.readlines()
    rule_grammar_count = []
    rule_meta_count = 0
    # "leftside" can only happen on line 3 earliest (index 2), because the rule
    # header is read from two lines above it. The last line is excluded
    # because the line following "leftside" is inspected as well.
    for x in range(2, len(lines) - 1):
        if not leftside_open_re.search(lines[x]):
            continue
        if not leftside_content_re.search(lines[x+1]):
            # leftside block whose first line is empty or already closes it
            rule_meta_count += 1
            continue
        header = lines[x-2]
        if not rule_header_re.search(header):
            continue
        # Language combinations named in the header. The combinations consist
        # of letters and underscores only, so a substring test is equivalent
        # to a regex search.
        found_prefixes = [combo for combo in language_IDs_combinations
                          if combo + '_' in header]
        # Keep the longest prefix only (the first one wins on ties); a rule
        # without any language prefix is language-independent.
        rule_prefix = max(found_prefixes, key=len) if found_prefixes else 'GEN'
        rule_grammar_count.append(rule_prefix)
        fill_lists_rule_count(filename, rule_prefix, rule_all_count, rule_con_count, rule_agg_count, rule_dsynt_count, rule_ssynt_count, rule_lin_count, rule_sent_count)
    if rule_grammar_count:
        make_stats(rule_grammar_count, rule_meta_count, fo, str(filename))

# Final summary: one section for all rules, then one section per category.
# Each section is a separator, a heading, a separator, and either the
# statistics or the "no rules" notice.
dot = '--------------\n'
noRule = 'No rules found!\n'
allRules = '\n\nAll rulesets ('+str(ngrammars)+' grammars)'

# (heading written to the report, collected prefixes, level given to make_stats)
report_sections = [
    (allRules+'\n', rule_all_count, 'all'),
    ('Con rulesets\n', rule_con_count, 'con'),
    ('Agg rulesets\n', rule_agg_count, 'agg'),
    ('DSynt rulesets\n', rule_dsynt_count, 'dsynt'),
    ('SSynt rulesets\n', rule_ssynt_count, 'ssynt'),
    ('Lin rulesets\n', rule_lin_count, 'lin'),
    ('Sent rulesets\n', rule_sent_count, 'sent'),
]

try:
    print(allRules+'--------------')
    for heading, collected_prefixes, level in report_sections:
        fo.write(dot)
        fo.write(heading)
        fo.write(dot)
        if collected_prefixes:
            make_stats(collected_prefixes, '', fo, level)
        else:
            # The notice goes to the report for every section, including the
            # "All rulesets" one, so no section is left empty in the file.
            print(noRule)
            fo.write(noRule)
finally:
    # Close the report even if writing one of the sections fails.
    fo.close()

    #fo = codecs.open(os.path.join(outputFolder, filename+'.txt'), 'w', 'utf-8')
    #fo.close()