# Grammar Learner 0.4: generalization `2018-05-25`

**New features**: 
- generalization of word categories in the disjunct-based sparse word space, using the Jaccard index as the similarity measure;
- generalization of Link Grammar rules for disjunct-based rules, using the Jaccard index as the similarity measure;
- updated Grammar Learner parameters, passed via `**kwargs`;
- category tree saved as `cat_tree.txt`;
- `parse_ability` and `parse_quality` metrics.

Static html of this notebook is shared via  
[http://88.99.210.144/data/clustering_2018/html/Grammar-Learner-04-Generalization.html](http://88.99.210.144/data/clustering_2018/html/Grammar-Learner-04-Generalization.html)  
Data: [http://88.99.210.144/data/clustering_2018/Grammar-Learner-04-Generalization/](http://88.99.210.144/data/clustering_2018/Grammar-Learner-04-Generalization/)

## Basic settings

In [1]:
import os, sys, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
# Make the parent of the notebook's working directory importable, so that the
# `src.*` packages imported below can be resolved.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Additionally expose <module_path>/src itself, but only when it exists.
src_path = module_path + '/src'
if os.path.exists(src_path) and src_path not in sys.path:
    sys.path.append(src_path)

# Link Grammar Python bindings. Their location is machine-specific, so it can be
# overridden with the LINKGRAMMAR_PATH environment variable; the default is the
# path this notebook was originally run with. A non-existent path is skipped.
lg_path = os.environ.get(
    'LINKGRAMMAR_PATH',
    '/home/oleg/miniconda3/envs/ull4/lib/python3.6/site-packages/linkgrammar')
if os.path.exists(lg_path) and lg_path not in sys.path:
    sys.path.append(lg_path)
from src.utl.utl import UTC
from src.utl.read_files import check_dir
from src.utl.turtle import html_table
from src.grammar_learner.poc04 import learn_grammar, params, parse_metrics, run_learn_grammar
prefix = '' # unused option
# Scratch directory under module_path; the same path string is also passed to
# the learner as kwargs['tmpath'].
tmpath = module_path + '/tmp/'
# NOTE(review): check_dir presumably makes sure the directory exists; the meaning
# of the positional arguments True and 'none' is not visible here - confirm in
# src/utl/read_files.py.
check_dir(tmpath, True, 'none')
# Record the run timestamp and the resolved module_path in the cell output.
print(UTC(), ':: module_path =', module_path)

2018-05-25 18:30:19 UTC :: module_path = /home/oleg/language-learning


## Grammar Learner parameters

In [2]:
# New Grammar Learner parameters: input_parses, output_categories, output_grammar, **kwargs
# The options below are shared by every run in this notebook; later cells
# override individual entries in place before calling the learner.
kwargs = dict(
    # --- input parses / link extraction ---
    parse_mode='given',                     # 'given' (default) / 'explosive' (next)
    left_wall='LEFT-WALL',                  # '', 'none' - don't use / 'LEFT-WALL' - replace ###LEFT-WALL###
    period=True,                            # use period in links learning: True/False
    context=2,                              # 1: connectors / 2, 3...: disjuncts
    window='mst',                           # 'mst' / reserved options for «explosive» parsing
    weighting='ppmi',                       # 'ppmi' / future options
    group=True,                             # group items after link parsing
    distance=False,                         # reserved options for «explosive» parsing
    # --- word space ---
    word_space='discrete',                  # 'vectors' / 'discrete' - no dimensionality reduction
    dim_max=100,                            # max vector space dimensionality
    sv_min=0.1,                             # minimal singular value (fraction of the max value)
    dim_reduction='none',                   # 'svm' / 'none' (discrete word_space, group)
    # --- clustering ---
    clustering='group',                     # 'kmeans' / 'group'~'identical_entries' / future options
    cluster_range=(2,48,1),                 # min, max, step
    cluster_criteria='silhouette',          #
    cluster_level=0.9,                      # level = 0, 1, 0.-0.99..: 0 - max number of clusters
    # --- generalization of word categories ---
    categories_generalization='off',        # 'off' / 'cosine' - cosine similarity, 'jaccard'
    categories_merge=0.8,                   # merge categories with similarity > this 'merge' criteria
    categories_aggregation=0.3,             # aggregate categories with similarity > this criteria
    # --- grammar rules and their generalization ---
    grammar_rules=2,                        # 1: 'connectors' / 2 - 'disjuncts' / 0 - 'words' (TODO?)
    rules_generalization='off',             # 'off' / 'cosine' - cosine similarity, 'jaccard'
    rules_merge=0.8,                        # merge rules with similarity > this 'merge' criteria
    rules_aggregation=0.3,                  # aggregate rules similarity > this criteria
    # --- working directory and verbosity ---
    tmpath=module_path + '/tmp/',
    verbose='min',                          # display intermediate results: 'none', 'min', 'mid', 'max'
    # --- additional (optional) parameters for parse_metrics (parse_ability & parse_quality) ---
    test_corpus=module_path + '/data/POC-Turtle/poc-turtle-corpus.txt',
    reference_path=module_path + '/data/POC-Turtle/poc-turtle-parses-expected.txt',
    template_path='poc-turtle',
    linkage_limit=1
)

# Integration test: POC-Turtle 

## Baseline: MST_fixed disjuncts-ILE-disjuncts, no generalization

In [3]:
%%capture
corpus, dataset = 'POC-Turtle', 'MST_fixed_manually'
# One output directory per day: only the first 10 characters of the UTC stamp are used.
out_dir = '{}/output/Grammar-Learner-04-{}'.format(module_path, str(UTC())[:10])
# Baseline run: both generalization options are set to the empty string.
kwargs.update(categories_generalization='', rules_generalization='')
# Step by step: resolve the file locations, learn the grammar, then score it.
input_parses, output_categories, output_grammar = params(
    corpus, dataset, module_path, out_dir, **kwargs)
response = learn_grammar(input_parses, output_categories, output_grammar, **kwargs)
pa, pq, lg_parse_path = parse_metrics(response['grammar_file'], **kwargs)

In [4]:
# Report PA, PQ and their product (rounded to a whole percent) on one line.
print('Parse ability (PA), parse quality(PQ), PA*PQ:',
      '%, '.join(map(str, (pa, pq, int(round(pa*pq/100,0))))) + '%;')
print('Category tree "cat_tree.txt" file:')
# The tree file is tab-separated: one table row per line.
with open(response['tree_file'],'r') as f:
    x = f.read().splitlines()
display(html_table([row.split('\t') for row in x]))

Parse ability (PA), parse quality(PQ), PA*PQ: 100%, 100%, 100%;
Category tree "cat_tree.txt" file:


0,1,2,3,4,5
C01,0,1,0.0,.,0
C02,0,2,0.0,LEFT-WALL,0
C03,0,3,0.0,bird extremity fish,0 0 0
C04,0,4,0.0,eagle herring parrot tuna,0 0 0 0
C05,0,5,0.0,feather scale,0 0
C06,0,6,0.0,fin wing,0 0
C07,0,7,0.0,has,0
C08,0,8,0.0,isa,0


In [5]:
# Show every key/value pair returned by learn_grammar as a two-column table.
display(html_table([list(entry) for entry in response.items()]))

0,1
datime,2018-05-25 18:30:19 UTC
learn_grammar,80511
input files,['/home/oleg/language-learning/data/POC-Turtle/MST_fixed_manually/poc-turtle-parses-expected.txt']
parsed_links,60
unique_links,29
unique_words,15
word-link_pairs,44
category_learner,80525
categories_file,/home/oleg/language-learning/output/Grammar-Learner-04-2018-05-25/POC-Turtle/MST_fixed_manually/disjuncts-ILE-disjuncts/LEFT-WALL_period/no_generalization/8_categories.txt
rule_list,8


## Generalization of word categories

In [6]:
%%capture
# Jaccard-based generalization of word categories only; rule generalization stays ''.
kwargs.update(categories_generalization='jaccard', rules_generalization='')
# All-in-one test function - /src/grammar_learner/poc04.py:
re22 = run_learn_grammar(corpus, dataset, module_path, out_dir, **kwargs)

In [7]:
def display_tree(response):
    """Print the parse metrics of a run and render its category tree file.

    `response` is a mapping that must provide the keys 'parse_ability',
    'parse_quality', 'parse_quability' and 'tree_file'; the latter is the
    path of a tab-separated text file, shown as an HTML table.
    """
    metric_keys = ('parse_ability', 'parse_quality', 'parse_quability')
    metrics = '%, '.join(str(response[key]) for key in metric_keys) + '%;'
    print('Parse ability (PA), parse quality(PQ), PA*PQ:', metrics)
    print('Category tree "cat_tree.txt" file:')
    with open(response['tree_file'],'r') as tree_file:
        rows = [line.split('\t') for line in tree_file.read().splitlines()]
    display(html_table(rows))
display_tree(re22)

Parse ability (PA), parse quality(PQ), PA*PQ: 100%, 100%, 100%;
Category tree "cat_tree.txt" file:


0,1,2,3,4,5
C01,0,1,0.0,.,0
C02,0,2,0.0,LEFT-WALL,0
C03,0,3,0.0,bird extremity fish,0 0 0
C04,0,9,0.0,eagle herring parrot tuna feather scale fin wing,0 0 0 0 0 0 0 0
,9,4,0.0,eagle herring parrot tuna,0 0 0 0
,9,5,0.0,feather scale,0 0
,9,6,0.0,fin wing,0 0
C05,0,7,0.0,has,0
C06,0,8,0.0,isa,0


_Primary clusters 4, 5 and 6 were aggregated into a new cluster 9 (C04)._

## 2-step generalization

In [8]:
%%capture
# Two-step generalization: Jaccard generalization of categories (aggregation
# threshold raised to 0.6) followed by Jaccard generalization of rules (0.3).
kwargs.update(
    categories_generalization='jaccard',
    categories_aggregation=0.6,
    rules_generalization='jaccard',
    rules_aggregation=0.3)
re23 = run_learn_grammar(corpus, dataset, module_path, out_dir, **kwargs)

In [9]:
display_tree(re23)

Parse ability (PA), parse quality(PQ), PA*PQ: 100%, 100%, 100%;
Category tree "cat_tree.txt" file:


0,1,2,3,4,5
C01,0,1,0.0,.,0
C02,0,2,0.0,LEFT-WALL,0
C03,0,3,0.0,bird extremity fish,0 0 0
C04,0,8,0.0,eagle fin herring parrot tuna wing feather scale,0 0 0 0 0 0 0 0
,8,4,0.0,eagle fin herring parrot tuna wing,0 0 0 0 0 0
,8,5,0.0,feather scale,0 0
C05,0,6,0.0,has,0
C06,0,7,0.0,isa,0


_Secondary clusters 4 and 5 were aggregated into a new cluster 8 (C04) during rule generalization.  
Secondary cluster 4 had itself been formed from 2 category clusters during category generalization,  
so the full cluster hierarchy is not preserved in the tree :(_

## Connectors-DRK-disjuncts, Generalize rules

In [10]:
%%capture
# Connector-based setup: no LEFT-WALL, no period, context 1 (connectors),
# vector word space with dimensionality reduction and k-means clustering.
# Category generalization is switched off; rules are generalized with Jaccard.
# NOTE(review): no random seed is set anywhere in the visible cells, so the
# k-means result may differ between runs - confirm whether the learner seeds it.
kwargs.update(
    left_wall='',
    period=False,
    context=1,
    word_space='vectors',
    dim_reduction='svm',
    clustering='kmeans',
    categories_generalization='off',
    rules_generalization='jaccard',
    rules_aggregation=0.3,
    verbose='mid')
re24 = run_learn_grammar(corpus, dataset, module_path, out_dir, **kwargs)

In [11]:
display_tree(re24)

Parse ability (PA), parse quality(PQ), PA*PQ: 100%, 100%, 100%;
Category tree "cat_tree.txt" file:


0,1,2,3,4,5
C01,0,1,0.0,has isa,0 0
C02,0,5,0.0,bird extremity fish feather fin scale wing eagle herring parrot tuna,0 0 0 0 0 0 0 0 0 0 0
,5,2,0.0,bird extremity fish,0 0 0
,5,3,0.0,feather fin scale wing,0 0 0 0
,5,4,0.0,eagle herring parrot tuna,0 0 0 0


# POC-English-NoAmb

In [12]:
%%capture
corpus = 'POC-English-NoAmb'
# Point the parse metrics at the POC-English-NoAmb test corpus and gold parses.
# Every other option is whatever the previously executed cells left in kwargs.
# NOTE(review): 'template_path' is still 'poc-turtle' and dataset is unchanged -
# confirm that this is intended for the English corpora.
kwargs.update(
    test_corpus=module_path + '/data/POC-English-NoAmb/poc_english_noamb_corpus.txt',
    reference_path=module_path + '/data/POC-English-NoAmb/poc-english_noAmb-parses-gold.txt')
re31 = run_learn_grammar(corpus, dataset, module_path, out_dir, **kwargs)

In [13]:
display_tree(re31)

Parse ability (PA), parse quality(PQ), PA*PQ: 98%, 97%, 95%;
Category tree "cat_tree.txt" file:


0,1,2,3,4,5
C01,0,1,0.0,a is liked likes was,0 0 0 0 0
C02,0,7,0.0,child food human parent dad daughter mom son,0 0 0 0 0 0 0 0
,7,2,0.0,child food human parent,0 0 0 0
,7,5,0.0,dad daughter mom son,0 0 0 0
C03,0,7,0.0,now before not cake sausage,0 0 0 0 0
,7,3,0.0,now,0
,7,4,0.0,before not,0 0
,7,6,0.0,cake sausage,0 0


# POC-English-Amb

In [14]:
%%capture
corpus = 'POC-English-Amb'
# Point the parse metrics at the POC-English-Amb test corpus and gold parses.
# Every other option is whatever the previously executed cells left in kwargs.
kwargs.update(
    test_corpus=module_path + '/data/POC-English-Amb/poc_english.txt',
    reference_path=module_path + '/data/POC-English-Amb/poc-english_ex-parses-gold.txt')
re41 = run_learn_grammar(corpus, dataset, module_path, out_dir, **kwargs)

In [15]:
display_tree(re41)

Parse ability (PA), parse quality(PQ), PA*PQ: 37%, 33%, 12%;
Category tree "cat_tree.txt" file:


0,1,2,3,4,5
C01,0,32,0.0,dad mom daughter son,0 0 0 0
,32,1,0.0,dad mom,0 0
,32,2,0.0,daughter son,0 0
C02,0,3,0.0,cake sausage,0 0
C03,0,32,0.0,before not now,0 0 0
,32,4,0.0,before not,0 0
,32,5,0.0,now,0
C04,0,6,0.0,das,0
C05,0,7,0.0,to,0
C06,0,32,0.0,child food human parent tool,0 0 0 0 0
