### Corpus Building

In [15]:
from textbook import build_corpus

# Input: CSV file holding the textbook texts
csv_file = "./data/教科書課文.csv"

# Output: name of the folder the corpus is written to
folder = "textbook_corpus"

# Build the corpus from the CSV file into `folder`
build_corpus(csv_file, folder)

### Corpus Reading and Concordancer Initialization

In [16]:
from hgct import PlainTextReader, Concordancer, CompoAnalysis, Dispersion

# Read the plain-text corpus stored in the "textbook_corpus/" directory
reader = PlainTextReader(dir_path="textbook_corpus/")
corpus = reader.corpus

# Concordancer over that corpus; the CQL searches below go through `c`
c = Concordancer(corpus)

Indexing corpus for text retrival...


100%|██████████| 8/8 [00:00<00:00, 12758.34it/s]


Indexing corpus for concordance search...


100%|██████████| 8/8 [00:00<00:00, 405.56it/s]


### Utility Function for CQL Search

In [17]:
def get_first_n(cql, n=10, left=5, right=5, concordancer=None):
    """Return the first `n` results of a CQL search as a list.

    Args:
        cql: CQL query string handed to `cql_search`.
        n: Maximum number of results to collect. `None` or a negative
            value collects every result (same as the previous behaviour).
        left: Forwarded to `cql_search` as `left`
            (presumably the left context size -- confirm against hgct docs).
        right: Forwarded to `cql_search` as `right`
            (presumably the right context size -- confirm against hgct docs).
        concordancer: Object exposing `cql_search(cql, left=..., right=...)`.
            When omitted, the notebook-level concordancer `c` is used.

    Returns:
        list: At most `n` search results, in the order they were produced.
    """
    # Only touch the global `c` when no explicit concordancer was supplied.
    searcher = c if concordancer is None else concordancer

    out = []
    if n == 0:
        return out

    for r in searcher.cql_search(cql, left=left, right=right):
        out.append(r)
        # Stop as soon as the quota is met so that no extra match is
        # pulled from the search before breaking.
        if len(out) == n:
            break
    return out

### Search by Character

In [18]:
# Query: the character 窗 followed by one character matching the class [一-窗]
cql = """
[char="窗"] [char="[一-窗]"] 
"""
top_n = 5
results = get_first_n(cql, n=top_n)

# The label is derived from the query itself so the two cannot drift apart.
display(f"Top {top_n} results for the search query '{cql.strip()}':", results)

# Inspect the first hit only when there is one; indexing an empty result
# list would raise IndexError.
if results:
    result_1 = results[0]
    display("Relevant information of the matching result:", result_1.data)
else:
    display("No matching result to inspect for this query.")

'Top 5 results for the search query \'[char="窗"] [char="[一-窗]"]\':'

[<Concord 門、客廳和{窗戶}旁貼上年畫>,
 <Concord 水匠來，把{窗子}用甎頭堵上>,
 <Concord 子，就指著{窗子}說：「這兩>,
 <Concord 大門上、紙{窗旁}，幾乎都貼>,
 <Concord 要把這兩個{窗子}堵起來。」>]

'Relevant information of the matching result:'

{'left': '門、客廳和',
 'keyword': '窗戶',
 'right': '旁貼上年畫',
 'position': (3, 11, 0, 25),
 'meta': {'id': '4S/90-1-有趣的年畫.txt',
  'time': {'label': '教科書課文 - 4下', 'ord': 4, 'year': ['90', '88', '69']},
  'text': {'lesson': '1', 'title': '有趣的年畫', 'year': '90'}},
 'captureGroups': {}}

### Search by Character Components

In [19]:
# Radicals available for searching
radicals = c.chr_radicals
display("List of all Kangxi radicals present in the characters of the corpus:", radicals)

# Query: any character whose radical is 穴
cql = """
[radical="穴"]
"""
radical_hits = get_first_n(cql, n=5)
display("Top 5 results for characters containing the radical '穴':", radical_hits)


# Ideographic Description Characters (IDCs) and the names used to refer to them
idcs = c.chr_idcs
display(idcs)

# Query: characters with component 木 and the IDC named "horz2"
cql = """
[compo="木" & idc="horz2"]
"""
compo_hits = get_first_n(cql, n=5)
display("Top 5 results for characters with '木' component and 'horz2' ideographic description:", compo_hits)

'List of all Kangxi radicals present in the characters of the corpus:'

{'',
 '一',
 '丨',
 '丶',
 '丿',
 '乙',
 '亅',
 '二',
 '亠',
 '人',
 '儿',
 '入',
 '八',
 '冂',
 '冖',
 '冫',
 '几',
 '凵',
 '刀',
 '力',
 '勹',
 '匕',
 '匚',
 '匸',
 '十',
 '卜',
 '卩',
 '厂',
 '厶',
 '又',
 '口',
 '囗',
 '土',
 '士',
 '夊',
 '夕',
 '大',
 '女',
 '子',
 '宀',
 '寸',
 '小',
 '尢',
 '尸',
 '屮',
 '山',
 '巛',
 '工',
 '己',
 '巾',
 '干',
 '幺',
 '广',
 '廴',
 '廾',
 '弋',
 '弓',
 '彡',
 '彳',
 '心',
 '戈',
 '戶',
 '手',
 '支',
 '攴',
 '文',
 '斗',
 '斤',
 '方',
 '无',
 '日',
 '曰',
 '月',
 '木',
 '欠',
 '止',
 '歹',
 '殳',
 '毋',
 '比',
 '毛',
 '氏',
 '气',
 '水',
 '火',
 '爪',
 '父',
 '爻',
 '爿',
 '片',
 '牙',
 '牛',
 '犬',
 '玄',
 '玉',
 '瓜',
 '瓦',
 '甘',
 '生',
 '用',
 '田',
 '疋',
 '疒',
 '癶',
 '白',
 '皮',
 '皿',
 '目',
 '矛',
 '矢',
 '石',
 '示',
 '禸',
 '禾',
 '穴',
 '立',
 '竹',
 '米',
 '糸',
 '缶',
 '网',
 '羊',
 '羽',
 '老',
 '而',
 '耒',
 '耳',
 '聿',
 '肉',
 '臣',
 '自',
 '至',
 '臼',
 '舌',
 '舛',
 '舟',
 '艮',
 '色',
 '艸',
 '虍',
 '虫',
 '血',
 '行',
 '衣',
 '襾',
 '見',
 '角',
 '言',
 '谷',
 '豆',
 '豕',
 '豸',
 '貝',
 '赤',
 '走',
 '足',
 '身',
 '車',
 '车',
 '辛',
 '辰',
 '辵',
 '邑',
 '酉',
 '釆',
 '里',
 '金',

"Top 5 results for characters containing the radical '穴':"

[<Concord 在車行裡研{究}飛行的機器>,
 <Concord ，跑去問個{究}竟。只見郭>,
 <Concord 了科學家研{究}發明的熱情>,
 <Concord 奇心，想探{究}天文地理的>,
 <Concord 上第一架研{究}地震的儀器>]

{'horz2': '⿰',
 'vert2': '⿱',
 'horz3': '⿲',
 'vert3': '⿳',
 'encl': '⿴',
 'surN': '⿵',
 'surU': '⿶',
 'curC': '⿷',
 'surT': '⿸',
 'sur7': '⿹',
 'surL': '⿺',
 'over': '⿻'}

"Top 5 results for characters with '木' component and 'horz2' ideographic description:"

[<Concord 握細緻的小{楷}筆,一筆一>,
 <Concord ，潔白的菜{梗}，翠綠的菜>,
 <Concord 詞（二）浮{槎}真個到天邊>,
 <Concord 仙境。「浮{槎}」是漂浮的>,
 <Concord 有人乘著浮{槎}在大海裡漂>]

### Search by Radical Semantic Type

In [20]:
# Query: two adjacent tokens, each tagged with the semantic type 植物 (plant).
# NOTE(review): the label below says "characters", but the query matches
# two-character sequences.
cql = """
[semtag="植物"] [semtag="植物"]
"""
plant_hits = get_first_n(cql, n=5)
display("Top 5 characters tagged with the semantic type '植物' (plant):", plant_hits)

"Top 5 characters tagged with the semantic type '植物' (plant):"

[<Concord 握細緻的小{楷筆},一筆一畫>,
 <Concord 年的歷史，{梁柱}雕刻很細緻>,
 <Concord 在二十世紀{萌芽}的新科技，>,
 <Concord 印象深刻。{花蓮}秀姑巒溪的>,
 <Concord 遠到宜蘭、{花蓮}，深入原住>]

### Search by Phonetic Properties

In [21]:
# Phonetic attributes that can be used in CQL queries, grouped by system
phonetic_attrs = c.cql_attrs["CharPhonetic"]
display("List of available phonetic properties in the corpus:", phonetic_attrs)

# Mandarin (sys="moe"): phon ㄨㄥ with tone 1
cql = """
[phon="ㄨㄥ" & tone="1" & sys="moe"]
"""
mandarin_hits = get_first_n(cql, n=5)
display("Top 5 results for Mandarin phonetic property 'ㄨㄥ' with tone 1:", mandarin_hits)

# Middle Chinese (sys="廣韻"): rhyme 東 with the 平 (level) tone
cql = """
[韻母="東" & 聲調="平" & sys="廣韻"]
"""
middle_chinese_hits = get_first_n(cql, n=5)
display("Top 5 results for Middle Chinese phonetic property rhyme '東' with level tone (平):", middle_chinese_hits)

'List of available phonetic properties in the corpus:'

{'moe': ['phon', 'tone', 'tp', 'sys="moe"'],
 '廣韻': ['攝', '聲調', '韻母', '聲母', '開合', '等第', '反切', '拼音', 'IPA', 'sys="廣韻"']}

"Top 5 results for Mandarin phonetic property 'ㄨㄥ' with tone 1:"

[<Concord 前有一個富{翁}，很迷信。>,
 <Concord 起來。」富{翁}聽了，就叫>,
 <Concord 。他就對富{翁}說：「這棵>,
 <Concord 棵樹。」富{翁}聽了，覺得>,
 <Concord 有一天，富{翁}的朋友來，>]

"Top 5 results for Middle Chinese phonetic property rhyme '東' with level tone (平):"

[<Concord 我要學給有{蟲}的樹治病，>,
 <Concord ，只剩竹節{蟲}與枯葉蝶，>,
 <Concord 驚嘆。竹節{蟲}與枯葉蝶就>,
 <Concord 眼前，竹節{蟲}和竹子的細>,
 <Concord 比賽，竹節{蟲}與枯葉蝶勝>]

### Component Analysis Initialization

In [22]:
# Import every hgct name this cell uses, so it does not rely on an earlier
# cell having imported Concordancer and Dispersion.
from hgct import CompoAnalysis, Concordancer, Dispersion, PlainTextReader

# Corpus directory shared by the three analysis tools below
corpus_dir = "textbook_corpus/"

# Component analysis, built on a reader created with auto_load=False
CA = CompoAnalysis(PlainTextReader(corpus_dir, auto_load=False))

# Concordancer for text searches and n-gram statistics
CC = Concordancer(PlainTextReader(corpus_dir).corpus)

# Dispersion analysis tool
DP = Dispersion(PlainTextReader(corpus_dir).corpus)

Indexing corpus for text retrival...


100%|██████████| 8/8 [00:00<00:00, 49636.73it/s]


Indexing corpus for concordance search...


100%|██████████| 8/8 [00:00<00:00, 480.16it/s]


Indexing corpus for text retrival...


100%|██████████| 8/8 [00:00<00:00, 178481.02it/s]


Indexing corpus for concordance search...


100%|██████████| 8/8 [00:00<00:00, 653.87it/s]


### Frequency Distribution by Character and Components

In [23]:
# Jupyter echoes only the last expression of a cell, so every intermediate
# result is passed to display() explicitly; otherwise it would be computed
# and silently discarded.
# All calls use subcorp_idx=5 (described in the original notes as the
# 5th grade fall textbook).

# Frequency distribution of characters
display(
    "Four most frequent characters:",
    CA.freq_distr(tp="chr", subcorp_idx=5).most_common(4),
)

# Frequency distribution of IDCs, counting each character type only once
display(
    "IDC frequency distribution (character types):",
    CA.freq_distr(tp="idc", use_chr_types=True, subcorp_idx=5),
)

# Frequency distribution of radicals
display(
    "Four most frequent radicals:",
    CA.freq_distr(tp="rad", subcorp_idx=5).most_common(4),
)

# Frequency distribution of characters with the radical "水"
display(
    "Four most frequent characters with the radical '水':",
    CA.freq_distr(tp=None, radical="水", subcorp_idx=5).most_common(4),
)

# Frequency distribution of characters with the component "土" and IDC "vert2"
# (left as the cell's final expression, so it is echoed as the cell output)
CA.freq_distr(tp=None, compo="土", idc="vert2", subcorp_idx=5)

Counter({'王': 30,
         '去': 27,
         '走': 10,
         '堅': 5,
         '幸': 5,
         '墓': 5,
         '至': 4,
         '壁': 4,
         '堂': 3,
         '主': 2,
         '基': 2,
         '赤': 1,
         '堡': 1})

### Dispersion Analysis

In [24]:
import pandas as pd

# Compare the dispersion of '的' (function word) and '花' (content word)
# within the subcorpus selected by `subcorp_idx`.
subcorp_idx = 0

rows = []
for ch in "的花":
    stats, raw = DP.char_dispersion(char=ch, subcorp_idx=subcorp_idx, return_raw=True)
    # Range expressed as a percentage of raw['n'], formatted to two decimals
    range_pct = 100 * stats["Range"] / raw["n"]
    row = {"char": ch, "Range(%)": f"{range_pct:.2f}"}
    # Append every dispersion measure returned for this character
    row.update(stats)
    rows.append(row)

# One row per character, one column per measure
df_disp = pd.DataFrame(rows)
df_disp

Unnamed: 0,char,Range(%),Range,DP,DPnorm,KLdivergence,JuillandD,RosengrenS
0,的,100.0,9,0.179573,0.18746,0.12684,0.840689,0.955655
1,花,33.33,3,0.686952,0.717124,1.899763,0.384915,0.290457


### Ngram and Collocation Analysis

In [25]:
# Jupyter echoes only the last expression of a cell, so intermediate results
# are passed to display() explicitly instead of being discarded.

# Frequency distribution of 2-grams for subcorp_idx=5
# (described in the original notes as the 5th grade fall textbook)
display(
    "Four most frequent 2-grams:",
    CC.freq_distr_ngrams(n=2, subcorp_idx=5).most_common(4),
)

# Bigram associations sorted by the G-squared statistic
bi_asso = CC.bigram_associations(subcorp_idx=5, sort_by="Gsq")

# Show the first entry only when there is one; indexing an empty result
# would raise IndexError.
if bi_asso:
    display("First bigram association entry:", bi_asso[0])

# Dataframe of the first 5 bigrams and their association metrics.
# Slice before building the dicts so only five rows are constructed.
d = pd.DataFrame([{'bigram': x[0], **x[1]} for x in bi_asso[:5]])
d

Counting 2-grams...


100%|██████████| 8/8 [00:00<00:00, 179.65it/s]


Unnamed: 0,bigram,MI,Xsq,Gsq,Dice,DeltaP21,DeltaP12,FisherExact,RawCount
0,母親,8.254205,7934.804175,329.234999,0.928571,0.962636,0.896443,3.652867e-72,26
1,時候,7.674781,3871.332825,211.585884,0.59375,0.422222,0.997167,1.488648e-46,19
2,企鵝,9.466194,9195.0,196.579722,1.0,1.0,1.0,1.869793e-42,13
3,南極,8.435315,5531.921866,195.019226,0.761905,0.940196,0.639891,2.816916e-43,16
4,小昌,7.235897,2700.565276,186.452284,0.455696,0.295082,0.995314,3.869527e-41,18
