# POS Entropy in different syntactic positions

In [1]:
import sys
sys.path.append('../')

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats
import glob
import collections

import src.ud_corpus
import src.syntax_entropy

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Build one POS corpus per language variety from the files matching
# *.conllu in the corresponding Universal Dependencies treebank directory.
ud_data_classical = src.ud_corpus.POSCorpus.create_from_ud(
  glob.glob('../data/UD_Classical_Chinese-Kyoto/*.conllu'),
  split_chars=False,
)
ud_data_modern = src.ud_corpus.POSCorpus.create_from_ud(
  glob.glob('../data/UD_Chinese-GSD/*.conllu'),
  split_chars=False,
)

## Find most common particles

In [3]:
# Universal Dependencies POS tags whose tokens are counted as "particles" in
# this notebook.  'AUX' used to be listed twice; the repeat had no effect on
# membership tests, so it is dropped.
PARTICLE_POS = ['AUX', 'PART', 'CCONJ', 'SCONJ', 'ADP']

def get_most_common_particles(ud):
  """Count how often each particle word form occurs in a corpus.

  Args:
    ud: Object exposing `sentences`, an iterable of sentences; each sentence
      is an iterable of tokens that support `tok['pos']` and `tok['word']`.

  Returns:
    A `collections.Counter` mapping word form to the number of tokens whose
    POS tag is listed in `PARTICLE_POS`.  The counter is empty when no token
    matches (including when the corpus has no sentences).
  """
  # Build the lookup set once per call, so later changes to PARTICLE_POS are
  # still honoured and each token costs a single hash probe.
  particle_tags = frozenset(PARTICLE_POS)
  return collections.Counter(
    tok['word']
    for sentence in ud.sentences
    for tok in sentence
    if tok['pos'] in particle_tags
  )

In [4]:
# Ten most frequent particle word forms in the Classical Chinese corpus.
# The trailing expression is what the notebook displays as the cell output.
ctr = get_most_common_particles(ud_data_classical)
ctr.most_common(10)

[('也', 2545),
 ('之', 1962),
 ('而', 1737),
 ('者', 1284),
 ('于', 972),
 ('矣', 557),
 ('所', 503),
 ('可', 476),
 ('乎', 471),
 ('为', 403)]

In [5]:
# Same ranking for the modern Chinese corpus; note that `ctr` is rebound here,
# so it no longer holds the classical counts after this cell runs.
ctr = get_most_common_particles(ud_data_modern)
ctr.most_common(10)

[('的', 5637),
 ('在', 1060),
 ('是', 884),
 ('和', 843),
 ('了', 808),
 ('为', 685),
 ('与', 485),
 ('于', 399),
 ('中', 383),
 ('会', 337)]

## Split into segments without punctuation

In [6]:
# Split the sentences of each corpus into segments. Per the section heading
# these are stretches without punctuation; the splitting rule itself lives in
# src.syntax_entropy and is not visible here.
segments_classical = src.syntax_entropy.split_into_segments(ud_data_classical.sentences)
segments_modern = src.syntax_entropy.split_into_segments(ud_data_modern.sentences)

## POS distribution in various sentence positions

In [7]:
# Classical corpus, segment-start position. The recorded output shows a
# Counter over NOUN / VERB / OTHER followed by one number (presumably the
# entropy of that distribution -- confirm in src.syntax_entropy).
src.syntax_entropy.display_entropy(src.syntax_entropy.get_start_distribution(segments_classical))

Counter({'OTHER': 6990, 'VERB': 4479, 'NOUN': 3646})
1.0600933000574104


In [8]:
# Modern corpus, segment-start position. The recorded output shows a Counter
# over NOUN / VERB / OTHER followed by one number (presumably the entropy of
# that distribution -- confirm in src.syntax_entropy).
src.syntax_entropy.display_entropy(src.syntax_entropy.get_start_distribution(segments_modern))

Counter({'OTHER': 10589, 'NOUN': 3427, 'VERB': 2309})
0.8851155125050056


In [9]:
# Classical corpus, segment-end position. The recorded output shows a Counter
# over NOUN / VERB / OTHER followed by one number (presumably the entropy of
# that distribution -- confirm in src.syntax_entropy).
src.syntax_entropy.display_entropy(src.syntax_entropy.get_end_distribution(segments_classical))

Counter({'VERB': 5799, 'OTHER': 5218, 'NOUN': 4098})
1.0885771364998045


In [10]:
# Modern corpus, segment-end position. The recorded output shows a Counter
# over NOUN / VERB / OTHER followed by one number (presumably the entropy of
# that distribution -- confirm in src.syntax_entropy).
src.syntax_entropy.display_entropy(src.syntax_entropy.get_end_distribution(segments_modern))

Counter({'NOUN': 8582, 'OTHER': 5312, 'VERB': 2431})
0.9869532257558223


In [11]:
# For each of the five most frequent classical particles, show the POS
# distribution of the position before it and then of the position after it.
for ch, _ in get_most_common_particles(ud_data_classical).most_common(5):
  for label, get_distribution in (
    ('Before', src.syntax_entropy.get_before_distribution),
    ('After', src.syntax_entropy.get_after_distribution),
  ):
    print(label, ch)
    src.syntax_entropy.display_entropy(get_distribution(segments_classical, ch))
  # Blank line between particles keeps the report readable.
  print()

Before 也
Counter({'NOUN': 1067, 'VERB': 977, 'OTHER': 501})
1.0519315141208563
After 也
Counter({'VERB': 154, 'OTHER': 148, 'NOUN': 18})
0.8704934678846507

Before 之
Counter({'VERB': 1577, 'NOUN': 1560, 'OTHER': 468})
0.9891940573445255
After 之
Counter({'NOUN': 1274, 'VERB': 885, 'OTHER': 719})
1.0698816618511504

Before 而
Counter({'VERB': 775, 'NOUN': 537, 'OTHER': 196})
0.9749982370895953
After 而
Counter({'VERB': 944, 'OTHER': 538, 'NOUN': 251})
0.9738956752144691

Before 者
Counter({'NOUN': 554, 'VERB': 516, 'OTHER': 214})
1.027658229840905
After 者
Counter({'OTHER': 598, 'VERB': 349, 'NOUN': 138})
0.9554657910779525

Before 于
Counter({'VERB': 722, 'NOUN': 148, 'OTHER': 120})
0.7701205229957517
After 于
Counter({'NOUN': 615, 'OTHER': 350, 'VERB': 102})
0.9076376236305117



In [12]:
# For each of the five most frequent modern particles, show the POS
# distribution of the position before it and then of the position after it.
for ch, _ in get_most_common_particles(ud_data_modern).most_common(5):
  for label, get_distribution in (
    ('Before', src.syntax_entropy.get_before_distribution),
    ('After', src.syntax_entropy.get_after_distribution),
  ):
    print(label, ch)
    src.syntax_entropy.display_entropy(get_distribution(segments_modern, ch))
  # Blank line between particles keeps the report readable.
  print()

Before 的
Counter({'OTHER': 2448, 'NOUN': 2113, 'VERB': 964})
1.0328970058721147
After 的
Counter({'NOUN': 3893, 'OTHER': 1259, 'VERB': 272})
0.7271302344613362

Before 在
Counter({'OTHER': 629, 'NOUN': 308, 'VERB': 274})
1.024696050100764
After 在
Counter({'OTHER': 938, 'NOUN': 543, 'VERB': 149})
0.9028654701947718

Before 是
Counter({'OTHER': 495, 'NOUN': 398, 'VERB': 49})
0.8558999824509612
After 是
Counter({'OTHER': 759, 'NOUN': 211, 'VERB': 203})
0.8938226372705123

Before 和
Counter({'NOUN': 444, 'OTHER': 315, 'VERB': 29})
0.8113025746045924
After 和
Counter({'NOUN': 389, 'OTHER': 378, 'VERB': 60})
0.902952162562862

Before 了
Counter({'VERB': 788, 'OTHER': 16, 'NOUN': 6})
0.14064354250696148
After 了
Counter({'OTHER': 480, 'NOUN': 204, 'VERB': 58})
0.8360075946109897

