### Corpus Building

In [33]:
from textbook import build_corpus

# Path to the CSV file containing text data
csv_file = './data/教科書課文.csv'

# Name of the output folder
folder = "textbook_corpus"

# Building the corpus
build_corpus(csv_file, folder)

### Corpus Reading and Concordancer Initialization

In [34]:
from hgct import PlainTextReader, Concordancer, CompoAnalysis, Dispersion

# Initialize a PlainTextReader with the directory containing the corpus
corpus = PlainTextReader(
    dir_path = "textbook_corpus/"
    ).corpus

# Initialize a Concordancer with the corpus for performing text searches
c = Concordancer(corpus)

Indexing corpus for text retrival...


100%|██████████| 8/8 [00:00<00:00, 170327.07it/s]


Indexing corpus for concordance search...


100%|██████████| 8/8 [00:00<00:00, 655.09it/s]


### Utility Function for CQL Search

In [35]:
def get_first_n(cql, n=10, left=5, right=5):
    out = []
    for i, r in enumerate(c.cql_search(cql, left=left, right=right)):
        if i == n: break
        out.append(r)
    return out

### Search by Character

In [36]:
cql = """
[char="窗"] [char="[一-窗]"] 
"""
results = get_first_n(cql, n=5)
print("Top 5 results for the search query '[char=\"窗\"] [char=\"[一-窗]\"]':", results)  

result_1 = results[0]
print("Relevant information of the matching result:", result_1.data)  

Top 5 results for the search query '[char="窗"] [char="[一-窗]"]': [<Concord 門、客廳和{窗戶}旁貼上年畫>, <Concord 水匠來，把{窗子}用甎頭堵上>, <Concord 子，就指著{窗子}說：「這兩>, <Concord 大門上、紙{窗旁}，幾乎都貼>, <Concord 要把這兩個{窗子}堵起來。」>]
Relevant information of the matching result: {'left': '門、客廳和', 'keyword': '窗戶', 'right': '旁貼上年畫', 'position': (3, 11, 0, 25), 'meta': {'id': '4S/90-1-有趣的年畫.txt', 'time': {'label': '教科書課文 - 4下', 'ord': 4, 'year': ['90', '88', '69']}, 'text': {'lesson': '1', 'title': '有趣的年畫', 'year': '90'}}, 'captureGroups': {}}


### Search by Character Components

In [37]:
# Display radicals
print("List of all Kangxi radicals present in the characters of the corpus:", c.chr_radicals)

# Search by radical
cql = """
[radical="穴"]
"""
print("Top 5 results for characters containing the radical '穴':", get_first_n(cql, 5))


# Display Ideographic Description Characters (IDCs)
print(c.chr_idcs)

# Search using ideographic description and components
cql = '''
[compo="木" & idc="horz2"]
'''
print("Top 5 results for characters with '木' component and 'horz2' ideographic description:", get_first_n(cql, 5))

List of all Kangxi radicals present in the characters of the corpus: {'一', '自', '艸', '', '卩', '夕', '玄', '羽', '釆', '戶', '齊', '車', '衣', '勹', '黍', '匕', '氏', '子', '而', '龜', '刀', '革', '麥', '里', '臣', '疒', '鹿', '人', '舟', '行', '生', '禸', '毋', '米', '至', '凵', '方', '身', '爪', '尢', '丿', '气', '手', '牛', '土', '鼠', '牙', '赤', '厶', '高', '皮', '穴', '片', '糸', '示', '爿', '巛', '瓦', '鼎', '門', '羊', '矢', '尸', '乙', '斗', '馬', '艮', '厂', '十', '角', '邑', '聿', '斤', '山', '玉', '皿', '支', '言', '心', '龍', '髟', '父', '干', '匸', '魚', '缶', '豸', '豆', '比', '疋', '木', '酉', '風', '广', '長', '見', '骨', '雨', '黑', '冫', '甘', '弓', '血', '夊', '丶', '辰', '毛', '齒', '廾', '小', '爻', '黃', '石', '靑', '足', '虫', '肉', '口', '宀', '曰', '舛', '二', '鳥', '丨', '水', '隹', '冖', '寸', '无', '冂', '食', '目', '大', '欠', '犬', '火', '豕', '禾', '飛', '辵', '立', '田', '貝', '亅', '面', '囗', '廴', '麻', '戈', '首', '屮', '月', '车', '非', '耒', '力', '鼓', '网', '彳', '士', '入', '耳', '文', '香', '舌', '匚', '止', '竹', '用', '辛', '老', '谷', '殳', '鬼', '阜', '虍', '儿', '卜', '頁', '瓜', '又', '韋', '工', '八', '色', '女', '

### Search by Radical Semantic Type

In [38]:
cql = '''
[semtag="植物"] [semtag="植物"]
'''
print("Top 5 characters tagged with the semantic type '植物' (plant):", get_first_n(cql, 5))

Top 5 characters tagged with the semantic type '植物' (plant): [<Concord 的土地，一{朵朵}野菇像一把>, <Concord 這樣的：一{朵芙}蓉頭上戴，>, <Concord 見,只見一{朵朵}梅花般的腳>, <Concord ,只見一朵{朵梅}花般的腳掌>, <Concord 大紅絲被繡{梅花}。媽媽從早>]


### Search by Phonetic Properties

In [39]:
# Display available phonetic properties
print("List of available phonetic properties in the corpus:", c.cql_attrs['CharPhonetic'])

# Search by phonetic properties for Mandarin
cql = '''
[phon="ㄨㄥ" & tone="1" & sys="moe"]
'''
print("Top 5 results for Mandarin phonetic property 'ㄨㄥ' with tone 1:", get_first_n(cql, 5))

# Search by phonetic properties for Middle Chinese
cql = '''
[韻母="東" & 聲調="平" & sys="廣韻"]
'''
print("Top 5 results for Middle Chinese phonetic property rhyme '東' with level tone (平):", get_first_n(cql, 5))

List of available phonetic properties in the corpus: {'moe': ['phon', 'tone', 'tp', 'sys="moe"'], '廣韻': ['攝', '聲調', '韻母', '聲母', '開合', '等第', '反切', '拼音', 'IPA', 'sys="廣韻"']}
Top 5 results for Mandarin phonetic property 'ㄨㄥ' with tone 1: [<Concord 前有一個富{翁}，很迷信。>, <Concord 起來。」富{翁}聽了，就叫>, <Concord 。他就對富{翁}說：「這棵>, <Concord 棵樹。」富{翁}聽了，覺得>, <Concord 有一天，富{翁}的朋友來，>]
Top 5 results for Middle Chinese phonetic property rhyme '東' with level tone (平): [<Concord 熄滅，反而{烘}烘騰起，越>, <Concord 滅，反而烘{烘}騰起，越搧>, <Concord 快，卻已經{烘}托出船隻的>, <Concord 灣特有的萍{蓬}草，開起嬌>, <Concord 在網際網路{蓬}勃發展的過>]


### Component Analysis Initialization

In [40]:
from hgct import CompoAnalysis, PlainTextReader 

# Initialize CompoAnalysis without automatically loading data
CA = CompoAnalysis(PlainTextReader("textbook_corpus/", auto_load=False))

# Initialize Concordancer for text searches
CC = Concordancer(PlainTextReader("textbook_corpus/").corpus)

# Initialize Dispersion analysis tool
DP = Dispersion(PlainTextReader("textbook_corpus/").corpus)

Indexing corpus for text retrival...


100%|██████████| 8/8 [00:00<00:00, 140395.11it/s]


Indexing corpus for concordance search...


100%|██████████| 8/8 [00:00<00:00, 659.52it/s]


Indexing corpus for text retrival...


100%|██████████| 8/8 [00:00<00:00, 145257.28it/s]


Indexing corpus for concordance search...


100%|██████████| 8/8 [00:00<00:00, 672.95it/s]


### Frequency Distribution by Character and Components

In [41]:
# Frequency distribution for characters in 5th grade fall textbook
CA.freq_distr(tp="chr", subcorp_idx=5).most_common(4)

# Frequency distribution for IDCs, counting each type of character only once
CA.freq_distr(tp="idc", use_chr_types=True, subcorp_idx=5)

# Frequency distribution for radicals in 5th grade fall textbook
CA.freq_distr(tp="rad", subcorp_idx=5).most_common(4)

# Frequency distribution for characters containing the radical "水"
CA.freq_distr(tp=None, radical="水", subcorp_idx=5).most_common(4)

# Frequency distribution for characters containing the IDC component "土" with vertical arrangement
CA.freq_distr(tp=None, compo="土", idc="vert2", subcorp_idx=5)

Counter({'王': 30,
         '去': 27,
         '走': 10,
         '堅': 5,
         '幸': 5,
         '墓': 5,
         '至': 4,
         '壁': 4,
         '堂': 3,
         '主': 2,
         '基': 2,
         '赤': 1,
         '堡': 1})

### Dispersion Analysis

In [42]:
import pandas as pd  

# Analyzing dispersion of '的' (function word) and '花' (content word) in the corpus
subcorp_idx = 0
df_disp = []
for ch in '的花':
    stats, raw = DP.char_dispersion(
        char=ch, subcorp_idx=subcorp_idx, return_raw=True
    )
    d = {
        'char': ch,
        'Range(%)': '{:.2f}'.format(100 * stats['Range'] / raw['n']),
        **stats
    }
    df_disp.append(d)
df_disp = pd.DataFrame(df_disp)
df_disp

Unnamed: 0,char,Range(%),Range,DP,DPnorm,KLdivergence,JuillandD,RosengrenS
0,的,100.0,9,0.179573,0.18746,0.12684,0.840689,0.955655
1,花,33.33,3,0.686952,0.717124,1.899763,0.384915,0.290457


### Ngram and Collocation Analysis

In [43]:
# Frequency distribution of 2-gram ngrams in 5th grade fall textbook
CC.freq_distr_ngrams(n=2, subcorp_idx=5).most_common(4)

# Bigram associations sorted by G-squared statistic
bi_asso = CC.bigram_associations(subcorp_idx=5, sort_by="Gsq")
bi_asso[0]

# Dataframe of top 5 bigrams and their associated metrics
d = pd.DataFrame([{'bigram': x[0], **x[1]} for x in bi_asso][:5])
d

Counting 2-grams...


100%|██████████| 8/8 [00:00<00:00, 178.68it/s]


Unnamed: 0,bigram,MI,Xsq,Gsq,Dice,DeltaP21,DeltaP12,FisherExact,RawCount
0,母親,8.254205,7934.804175,329.234999,0.928571,0.962636,0.896443,3.652867e-72,26
1,時候,7.674781,3871.332825,211.585884,0.59375,0.422222,0.997167,1.488648e-46,19
2,企鵝,9.466194,9195.0,196.579722,1.0,1.0,1.0,1.869793e-42,13
3,南極,8.435315,5531.921866,195.019226,0.761905,0.940196,0.639891,2.816916e-43,16
4,小昌,7.235897,2700.565276,186.452284,0.455696,0.295082,0.995314,3.869527e-41,18
