-
Notifications
You must be signed in to change notification settings - Fork 0
/
load_dict.py
87 lines (67 loc) · 3.19 KB
/
load_dict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#A parser for the CC-Cedict. Convert the Chinese-English dictionary into a list of python dictionaries with "traditional","simplified", "pinyin", and "english" keys.
#Make sure that the cedict_ts.u8 file is in the same folder as this file, and that the name matches the file name on line 13.
#Before starting, open the CEDICT text file and delete the copyright information at the top. Otherwise the program will try to parse it and you will get an error message.
#Characters that are commonly used as surnames have two entries in CC-CEDICT. This program will remove the surname entry if there is another entry for the character. If you want to include the surnames, simply delete lines 59 and 60.
#This code was written by Franki Allegra in February 2020.
#open CEDICT file
from os.path import dirname, join, abspath
curr_dir = dirname(abspath(__file__))
file_path = join(curr_dir, 'cedict_ts.u8')
with open(file_path, encoding="utf-8") as file:
text = file.read()
lines = text.split('\n')
dict_lines = list(lines)
#define functions
def parse_line(line):
parsed = {}
if line == '':
dict_lines.remove(line)
return 0
line = line.rstrip('/')
line = line.split('/')
if len(line) <= 1:
return 0
# english = ";".join(line[1:])
english = line[1:]
char_and_pinyin = line[0].split('[')
characters = char_and_pinyin[0]
characters = characters.split()
traditional = characters[0]
simplified = characters[1]
# if simplified == '贡献':
# print(english)
pinyin = char_and_pinyin[1]
pinyin = pinyin.rstrip()
pinyin = pinyin.rstrip("]")
parsed['traditional'] = traditional
parsed['simplified'] = simplified
parsed['pinyin'] = pinyin
parsed['english'] = english
list_of_dicts.append(parsed)
def remove_surnames():
for x in range(len(list_of_dicts)-1, -1, -1):
if "surname " in list_of_dicts[x]['english']:
if list_of_dicts[x]['traditional'] == list_of_dicts[x+1]['traditional']:
list_of_dicts.pop(x)
def load_dict():
#make each line into a dictionary
print("Parsing dictionary . . .")
for line in dict_lines:
parse_line(line)
#remove entries for surnames from the data (optional):
print("Removing Surnames . . .")
remove_surnames()
# Turn list into dict of dicts
new_dict = {}
for entry in list_of_dicts:
new_dict[entry['simplified']] = entry
return new_dict
#If you want to save to a database as JSON objects, create a class Word in the Models file of your Django project:
# print("Saving to database (this may take a few minutes) . . .")
# for one_dict in list_of_dicts:
# new_word = Word(traditional = one_dict["traditional"], simplified = one_dict["simplified"], english = one_dict["english"], pinyin = one_dict["pinyin"], hsk = one_dict["hsk"])
# new_word.save()
print('Done!')
list_of_dicts = []
# parsed_dict = load_dict()
# print(type(parsed_dict["贡献"]['english']))