# POS Entropy in different syntactic positions

In [1]:
import sys
sys.path.append('../')

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats
import glob
import collections

import src.ud_corpus

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Build one POSCorpus per treebank from every .conllu file found in its
# data directory: UD_Classical_Chinese-Kyoto and UD_Chinese-GSD.
ud_data_classical = src.ud_corpus.POSCorpus.create_from_ud(
  glob.glob('../data/UD_Classical_Chinese-Kyoto/*.conllu')
)
ud_data_modern = src.ud_corpus.POSCorpus.create_from_ud(
  glob.glob('../data/UD_Chinese-GSD/*.conllu')
)

## Find most common particles

In [3]:
# POS tags whose tokens are counted as "particles" in this notebook.
# Kept as a list (each tag listed exactly once) so other cells can keep
# using it with `in`, iteration or indexing.
PARTICLE_POS = ['AUX', 'PART', 'CCONJ', 'SCONJ', 'ADP']

def get_most_common_particles(ud):
  """Count the surface forms of all particle tokens in a corpus.

  Args:
    ud: Corpus object exposing a `sentences` iterable; each sentence is an
      iterable of token mappings with at least the keys 'pos' and 'char'.

  Returns:
    A `collections.Counter` mapping each token's 'char' value to the number
    of times it occurs with a POS tag listed in `PARTICLE_POS`. Tokens with
    any other tag are ignored; an empty corpus yields an empty Counter.
  """
  # Counter consumes the generator directly, so no manual increment loop is
  # needed; keys are inserted in order of first occurrence, as before.
  return collections.Counter(
    tok['char']
    for sentence in ud.sentences
    for tok in sentence
    if tok['pos'] in PARTICLE_POS
  )

In [4]:
# Tally particle forms in the Classical Chinese corpus; the bare expression on
# the last line makes the notebook display the ten most frequent ones.
ctr = get_most_common_particles(ud_data_classical)
ctr.most_common(10)

[('也', 2545),
 ('之', 1962),
 ('而', 1737),
 ('者', 1284),
 ('于', 972),
 ('矣', 557),
 ('所', 503),
 ('可', 476),
 ('乎', 471),
 ('为', 403)]

In [5]:
# Same tally for the modern Chinese corpus (note: rebinds `ctr`); the bare
# expression on the last line displays the ten most frequent particle forms.
ctr = get_most_common_particles(ud_data_modern)
ctr.most_common(10)

[('的', 5646),
 ('是', 1086),
 ('在', 1064),
 ('了', 870),
 ('为', 862),
 ('和', 844),
 ('于', 568),
 ('与', 486),
 ('以', 439),
 ('之', 404)]