## Linguistic Case Study

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import boxplot
import itertools
import glob
from scipy import stats
import sys
sys.path.insert(0,'../..')
import g4l.display
from g4l import SmallestMaximizerCriterion
from g4l.estimators.ctm_scanner import CTMScanner
import g4l.tree.generation
from g4l.estimators.prune import Prune
from g4l.evaluation.bootstrap import Bootstrap
from g4l.evaluation.t_test import TTest
from g4l.data import Sample


In [7]:
# The cache path is kept for reference but immediately overridden, so
# `cache_dir` is None for the rest of the notebook (it is later passed as
# `read_cache_dir`; presumably None disables cache reads -- TODO confirm).
cache_dir = '../example1/cache'
cache_dir = None
# Load the two text samples; the second argument is the list of symbols
# 0..4 handed to `Sample` (presumably the alphabet -- verify against g4l).
X_bp = Sample('../example1/folha.txt', [0, 1, 2, 3, 4])
X_ep = Sample('../example1/publico.txt', [0, 1, 2, 3, 4])

In [8]:
# Scanner that will be driven by the criterion below, configured with a
# penalty interval of (0.1, 400) and epsilon 0.01.
ctm_scan = CTMScanner(epsilon=0.01, penalty_interval=(0.1, 400))

# Smallest-maximizer criterion built on top of the scanner; trees are
# initialised with the incremental strategy and limited to depth 4.
smc = SmallestMaximizerCriterion(
    ctm_scan,
    max_depth=4,
    read_cache_dir=cache_dir,
    tree_initialization_method=g4l.tree.generation.incremental_strategy,
)

In [9]:
# Bootstrap resamples of the BP sample at two sizes (30% and 90% of the
# data length), compared afterwards through a t-test with alpha = 0.01.
num_resamples = 200
bootstrap = Bootstrap(X_bp, partition_string='4')
bp_length = len(X_bp.data)
# NOTE(review): `size` is a float product here; confirm `resample` accepts it.
small_resamples_bp = bootstrap.resample(num_resamples, size=bp_length * 0.3)
large_resamples_bp = bootstrap.resample(num_resamples, size=bp_length * 0.9)
t_test = TTest(
    small_resamples_bp,
    large_resamples_bp,
    alpha=0.01,
)

In [10]:
# Fit the criterion on the Brazilian Portuguese (BP) sample, using the
# t-test built from the BP resamples and three worker processes.
BP = smc.fit(
    X_bp,
    t_test,
    processors=3,
)

KeyError: 'transition_probs'

In [None]:
# Bootstrap resamples of the EP sample at two sizes (30% and 90% of the
# data length), compared afterwards through a t-test with alpha = 0.01.
num_resamples = 200
bootstrap_ep = Bootstrap(X_ep, partition_string='4')
# Draw from the EP bootstrap object, not the BP one built in an earlier cell.
small_resamples_ep = bootstrap_ep.resample(num_resamples, size=len(X_ep.data) * 0.3)
large_resamples_ep = bootstrap_ep.resample(num_resamples, size=len(X_ep.data) * 0.9)
# The test used when fitting EP must compare the EP resamples; `t_test` is
# rebound here because the EP fitting cell reads that name.
t_test = TTest(small_resamples_ep, large_resamples_ep, alpha=0.01)

In [None]:
# Fit the criterion on the European Portuguese (EP) sample, using the
# current `t_test` and three worker processes.
EP = smc.fit(
    X_ep,
    t_test,
    processors=3,
)

In [None]:
# Show the selected BP tree both as text and as a drawing.
print("Best tree for Brazilian Portuguese:")
bp_tree_str = BP.best_tree().to_str()  # render once, reuse for both outputs
print(bp_tree_str)
# Kept as the cell's final expression so the notebook displays its result.
g4l.display.draw_tree(bp_tree_str)

In [None]:
# Show the selected EP tree both as text and as a drawing.
print("Best tree for European Portuguese:")
ep_tree_str = EP.best_tree().to_str()  # render once, reuse for both outputs
print(ep_tree_str)
# Kept as the cell's final expression so the notebook displays its result.
g4l.display.draw_tree(ep_tree_str)

In [None]:
# For every champion tree, record its log-likelihood and its number of
# leaves; these become the y and x coordinates of the curves plotted later.
ll_bp = [tree.log_likelihood() for tree in BP.champion_trees]
num_contexts_bp = [len(tree.leaves()) for tree in BP.champion_trees]

ll_ep = [tree.log_likelihood() for tree in EP.champion_trees]
num_contexts_ep = [len(tree.leaves()) for tree in EP.champion_trees]

In [None]:
# Plot log-likelihood against number of leaves for both languages on a
# single figure, one labelled line per language.
plt.figure(figsize=(14, 6))
for contexts, log_liks, language in (
    (num_contexts_bp, ll_bp, "BP"),
    (num_contexts_ep, ll_ep, "EP"),
):
    plt.plot(contexts, log_liks, marker='o', linewidth=1, label=language)
plt.title("BP/EP Log-likelihood functions")
plt.grid()
plt.legend()
plt.show()