## Jieba-Traditional Chinese

In [1]:
import jieba

# Segment a Traditional Chinese sentence with jieba.cut (no mode argument
# given) and print the resulting tokens separated by "|".
sentence = "在非洲，每六十秒，就有一分鐘過去"
seg_list = jieba.cut(sentence)
print("|".join(seg_list))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\User\AppData\Local\Temp\jieba.cache
Loading model cost 1.231 seconds.
Prefix dict has been built successfully.


在|非洲|，|每|六十|秒|，|就|有|一分|鐘過|去


In [2]:
import jieba

# jieba.cut -> segmentation mode, selected through `cut_all`:
#   Full mode     (cut_all=True):  lists every possible word.
#   Accurate mode (cut_all=False): the default; lists the best-fitting words.
sentence = "交通大學在新竹大學路上"
for label, cut_all in (("Full Mode", True), ("Default Mode", False)):
    seg_list = jieba.cut(sentence, cut_all=cut_all)
    print(label + ": " + "/ ".join(seg_list))

# jieba.cut_for_search -> finer-grained segmentation, suited to building an
# inverted index for a search engine.
search_sentence = "小明碩士畢業于台灣大學電機所，後在日本京都大學深造"
seg_list = jieba.cut_for_search(search_sentence)  # search-engine mode
print(", ".join(seg_list))

Full Mode: 交通/ 大/ 學/ 在/ 新竹/ 大/ 學/ 路上
Default Mode: 交通/ 大學/ 在/ 新竹/ 大學/ 路上
小明, 碩士, 畢業于, 台灣, 大學, 電機, 所, ，, 後, 在, 日本, 京都, 大學, 深造


## Traditional and Simplified Chinese conversion
https://pypi.org/project/OpenCC/  
'''
s2t.json Simplified Chinese to Traditional Chinese 簡體到繁體  
t2s.json Traditional Chinese to Simplified Chinese 繁體到簡體  
s2tw.json Simplified Chinese to Traditional Chinese (Taiwan Standard) 簡體到臺灣正體  
tw2s.json Traditional Chinese (Taiwan Standard) to Simplified Chinese 臺灣正體到簡體  
s2hk.json Simplified Chinese to Traditional Chinese (Hong Kong variant) 簡體到香港繁體  
hk2s.json Traditional Chinese (Hong Kong variant) to Simplified Chinese 香港繁體到簡體  
s2twp.json Simplified Chinese to Traditional Chinese (Taiwan Standard) with Taiwanese idiom 簡體到繁體（臺灣正體標準）並轉換爲臺灣常用詞彙  
tw2sp.json Traditional Chinese (Taiwan Standard) to Simplified Chinese with Mainland Chinese idiom 繁體（臺灣正體標準）到簡體並轉換爲中國大陸常用詞彙  
t2tw.json Traditional Chinese (OpenCC Standard) to Taiwan Standard 繁體（OpenCC標準）到臺灣正體  
hk2t.json Traditional Chinese (Hong Kong variant) to Traditional Chinese 香港繁體到繁體（OpenCC標準）  
t2hk.json Traditional Chinese (OpenCC Standard) to Hong Kong variant 繁體（OpenCC標準）到香港繁體  
t2jp.json Traditional Chinese Characters (Kyūjitai) to New Japanese Kanji (Shinjitai) 繁體（OpenCC標準，舊字體）到日文新字體  
jp2t.json New Japanese Kanji (Shinjitai) to Traditional Chinese Characters (Kyūjitai) 日文新字體到繁體（OpenCC標準，舊字體）  
tw2t.json Traditional Chinese (Taiwan standard) to Traditional Chinese 臺灣正體到繁體（OpenCC標準）  

'''

In [3]:
!pip install opencc

Collecting opencc
  Downloading OpenCC-1.1.1-py2.py3-none-win_amd64.whl (726 kB)
     -------------------------------------- 726.1/726.1 kB 1.4 MB/s eta 0:00:00
Installing collected packages: opencc
Successfully installed opencc-1.1.1


In [4]:
import opencc

# s2tw.json: Simplified Chinese -> Traditional Chinese (Taiwan Standard).
converter = opencc.OpenCC('s2tw.json')
traditional_text = converter.convert('汉字')
traditional_text

'漢字'

In [5]:
import opencc

# tw2sp.json: Traditional Chinese (Taiwan Standard) -> Simplified Chinese,
# also converting vocabulary to Mainland Chinese idiom.
converter = opencc.OpenCC('tw2sp.json')
simplified_text = converter.convert('臺灣滑鼠')
simplified_text

'台湾鼠标'

## 中文斷字

In [6]:
import jieba

# jieba.tokenize yields one tuple per token; the three fields are printed as
# the word followed by its start and end positions.
result = jieba.tokenize('永和服裝飾品有限公司')
for word, start, end in result:
    print("word %s\t\t start: %d \t\t end:%d" % (word, start, end))

word 永和		 start: 0 		 end:2
word 服裝飾品		 start: 2 		 end:6
word 有限公司		 start: 6 		 end:10


In [7]:
import jieba

# jieba.tokenize yields one tuple per token; the three fields are printed as
# the word followed by its start and end positions.
result = jieba.tokenize('全台灣大停電')
for word, start, end in result:
    print("word %s\t\t start: %d \t\t end:%d" % (word, start, end))

word 全台灣		 start: 0 		 end:3
word 大		 start: 3 		 end:4
word 停電		 start: 4 		 end:6


## 語音辨識

In [1]:
!pip install SpeechRecognition
!pip install pyaudio

Collecting SpeechRecognition
  Downloading SpeechRecognition-3.10.0-py2.py3-none-any.whl (32.8 MB)
     ---------------------------------------- 32.8/32.8 MB 5.9 MB/s eta 0:00:00
Installing collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.10.0
Collecting pyaudio
  Downloading PyAudio-0.2.13-cp310-cp310-win_amd64.whl (164 kB)
     ------------------------------------ 164.1/164.1 kB 545.5 kB/s eta 0:00:00
Installing collected packages: pyaudio
Successfully installed pyaudio-0.2.13


In [2]:
import jieba
import speech_recognition as sr

print('請說話......')

# Record one utterance from the microphone.
recognizer = sr.Recognizer()
microphone = sr.Microphone()
with microphone as source:
    recognizer.adjust_for_ambient_noise(source)
    audio = recognizer.listen(source)

# Speech recognition.
# pip install SpeechRecognition
# pip install pyaudio
# Reset `text` first: otherwise a failed recognition would either raise
# NameError below or silently segment a stale value left by an earlier cell.
text = None
try:
    text = recognizer.recognize_google(audio, language='zh-tw')
    print(text)
except sr.UnknownValueError:
    # The audio could not be understood; report it instead of hiding it.
    print('無法辨識語音，略過分詞。')
except sr.RequestError as err:
    # The recognition service could not be reached or rejected the request.
    print(f'無法連線到語音辨識服務：{err}')

# jieba segmentation.
# pip install jieba

# Add a custom word to the dictionary.
jieba.add_word('三天三夜')
if text is not None:
    # Segment only when recognition actually produced a transcript.
    seg_list = jieba.cut(text)
    print("/".join(seg_list))

請說話......


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\User\AppData\Local\Temp\jieba.cache


哈囉


Loading model cost 0.781 seconds.
Prefix dict has been built successfully.


哈/囉


In [7]:
import jieba
import speech_recognition as sr

print('請說話......')

# Record one utterance from the microphone.
recognizer = sr.Recognizer()
microphone = sr.Microphone()
with microphone as source:
    recognizer.adjust_for_ambient_noise(source)
    audio = recognizer.listen(source)

# Speech recognition.
# Reset `text` first: otherwise a failed recognition would either raise
# NameError below or silently segment a stale value left by an earlier cell.
text = None
try:
    text = recognizer.recognize_google(audio, language='zh-tw')
    print(text)
except sr.UnknownValueError:
    # The audio could not be understood; report it instead of hiding it.
    print('無法辨識語音，略過分詞。')
except sr.RequestError as err:
    # The recognition service could not be reached or rejected the request.
    print(f'無法連線到語音辨識服務：{err}')

# jieba segmentation.

# Add a custom word to the dictionary.
jieba.add_word('三天三夜')
if text is not None:
    # Segment only when recognition actually produced a transcript.
    seg_list = jieba.cut(text)
    print("/".join(seg_list))

請說話......
要開始寫論文了
要/開始/寫/論文/了


In [11]:
import jieba
import speech_recognition as sr

print('請說話......')

# Record one utterance from the microphone.
recognizer = sr.Recognizer()
microphone = sr.Microphone()
with microphone as source:
    recognizer.adjust_for_ambient_noise(source)
    audio = recognizer.listen(source)

# Speech recognition.
# Reset `text` first: otherwise a failed recognition would either raise
# NameError below or silently segment a stale value left by an earlier cell.
text = None
try:
    text = recognizer.recognize_google(audio, language='zh-tw')
    print(text)
except sr.UnknownValueError:
    # The audio could not be understood; report it instead of hiding it.
    print('無法辨識語音，略過分詞。')
except sr.RequestError as err:
    # The recognition service could not be reached or rejected the request.
    print(f'無法連線到語音辨識服務：{err}')

# jieba segmentation.

# Add a custom word to the dictionary.
jieba.add_word('三天三夜')
if text is not None:
    # Segment only when recognition actually produced a transcript.
    seg_list = jieba.cut(text)
    print("/".join(seg_list))

請說話......
要去找k u k u
要/去/找/k/ /u/ /k/ /u


In [20]:
import jieba
import speech_recognition as sr

print('請說話......')

# Record one utterance from the microphone.
recognizer = sr.Recognizer()
microphone = sr.Microphone()
with microphone as source:
    recognizer.adjust_for_ambient_noise(source)
    audio = recognizer.listen(source)

# Speech recognition.
# Reset `text` first: otherwise a failed recognition would either raise
# NameError below or silently segment a stale value left by an earlier cell.
text = None
try:
    text = recognizer.recognize_google(audio, language='zh-tw')
    print(text)
except sr.UnknownValueError:
    # The audio could not be understood; report it instead of hiding it.
    print('無法辨識語音，略過分詞。')
except sr.RequestError as err:
    # The recognition service could not be reached or rejected the request.
    print(f'無法連線到語音辨識服務：{err}')

# jieba segmentation.

# Add a custom word to the dictionary.
jieba.add_word('三天三夜')
if text is not None:
    # Segment only when recognition actually produced a transcript.
    seg_list = jieba.cut(text)
    print("/".join(seg_list))

請說話......
找葉家妤
找葉家/妤


In [39]:
import jieba
import speech_recognition as sr

print('請說話......')

# Record one utterance from the microphone.
recognizer = sr.Recognizer()
microphone = sr.Microphone()
with microphone as source:
    recognizer.adjust_for_ambient_noise(source)
    audio = recognizer.listen(source)

# Speech recognition.
# pip install SpeechRecognition
# pip install pyaudio
# Reset `text` first: otherwise a failed recognition would either raise
# NameError below or silently segment a stale value left by an earlier cell.
text = None
try:
    text = recognizer.recognize_google(audio, language='zh-tw')
    print(text)
except sr.UnknownValueError:
    # The audio could not be understood; report it instead of hiding it.
    print('無法辨識語音，略過分詞。')
except sr.RequestError as err:
    # The recognition service could not be reached or rejected the request.
    print(f'無法連線到語音辨識服務：{err}')

# jieba segmentation.
# pip install jieba

# Add a custom word to the dictionary.
jieba.add_word('三天三夜')
if text is not None:
    # Segment only when recognition actually produced a transcript.
    seg_list = jieba.cut(text)
    print("/".join(seg_list))

請說話......
Lucy在忙什麼
Lucy/在/忙/什麼
