# 第6章: 英語テキストの処理
> 英語のテキスト（nlp.txt）に対して，以下の処理を実行せよ．

In [2]:
import re

from graphviz import Digraph

### 50. 文区切り
> (. or ; or : or ? or !) → 空白文字 → 英大文字というパターンを文の区切りと見なし，入力された文書を1行1文の形式で出力せよ．

In [3]:
def cut_into_lines(filename):
    """Yield the sentences of a text file one at a time.

    A sentence boundary is a single whitespace character that is preceded
    by one of ``. ; : ? !`` and followed by an uppercase ASCII letter.
    Both conditions are zero-width assertions, so only the whitespace
    itself is consumed by the split:

    * ``(?<=[.;:?!])`` is a look-behind; e.g. ``(?<=Isaac )Asimov``
      matches ``'Asimov'`` in ``'Isaac Asimov'``.
    * ``(?=[A-Z])`` is a look-ahead; e.g. ``Isaac (?=Asimov)`` matches
      ``'Isaac '`` in ``'Isaac Asimov'``.

    Every physical line is right-stripped before it is split, so a blank
    line in the file is yielded as an empty string.
    """
    boundary = re.compile(r'(?<=[.;:?!])\s(?=[A-Z])')
    with open(filename) as f:
        for raw_line in f:
            yield from boundary.split(raw_line.rstrip())

# Print the document with one sentence per output line.
print('\n'.join(cut_into_lines('nlp.txt')))

FileNotFoundError: [Errno 2] No such file or directory: 'nlp.txt'

### 51. 単語の切り出し
> 空白を単語の区切りとみなし，50の出力を入力として受け取り，1行1単語の形式で出力せよ．ただし，文の終端では空行を出力せよ．

In [None]:
def line_to_words(line):
    """Yield the words of one sentence, followed by an empty string.

    The sentence is split on whitespace and surrounding punctuation
    (``, . ; : ? ! " ( )``) is stripped from every token.  After the last
    word an empty string is yielded to mark the end of the sentence.
    A ``return ''`` inside a generator would not do this: the value is
    attached to ``StopIteration`` and never reaches a ``for`` loop.

    A line without any tokens yields nothing at all, so blank input lines
    do not produce spurious end-of-sentence markers.
    """
    tokens = line.split()
    for token in tokens:
        yield token.strip(r',.;:?!"()')
    if tokens:
        # End-of-sentence marker (printed as an empty line by callers).
        yield ''

# Flatten the per-sentence word streams of nlp.txt into a single list.
words = []
for sentence in cut_into_lines('nlp.txt'):
    words.extend(line_to_words(sentence))
words[:20]

### 52. ステミング
> 51の出力を入力として受け取り，Porterのステミングアルゴリズムを適用し，単語と語幹をタブ区切り形式で出力せよ． Pythonでは，Porterのステミングアルゴリズムの実装としてstemmingモジュールを利用するとよい．

In [None]:
# Last year's version, which used the `stemming` package (kept for reference).

# from stemming import porter

# for w in l[:20]:
#     print('\t'.join((w, porter.stem(w))))

In [None]:
from nltk.stem.porter import PorterStemmer

# Print "word<TAB>stem" for the first 20 entries of `words`.
ps = PorterStemmer()
for word in words[:20]:
    print(word, ps.stem(word), sep='\t')

### 53. Tokenization
> Stanford Core NLPを用い，入力テキストの解析結果をXML形式で得よ．また，このXMLファイルを読み込み，入力テキストを1行1単語の形式で出力せよ．

In [None]:
# %%bash
# Ran on my Mac last year.
# java -cp "/usr/local/stanford-corenlp-full-2017-06-09/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -props Props.properties

In [None]:
import xml.etree.ElementTree as ET

# Load the CoreNLP analysis result; `root` is the document's root element.
tree = ET.parse('nlp.txt.xml')
root = tree.getroot()
# Graph used only to inspect the tag hierarchy of the XML.
# NOTE(review): strict=True presumably collapses repeated tag pairs into a
# single edge -- confirm against the graphviz documentation.
G = Digraph(strict=True)


def make_xml_tree(parent):
    """Record the tag structure below ``parent`` in the module-level graph ``G``.

    For every direct child an edge ``parent.tag -> child.tag`` is added,
    then the child is processed recursively in the same way.
    """
    for child in parent:
        G.edge(parent.tag, child.tag)
        make_xml_tree(child)


make_xml_tree(root)
G

In [None]:
# Collect the text of every <word> element and show the first 15, one per line.
words = []
for word_node in root.iter('word'):
    words.append(word_node.text)
print(*words[:15], sep='\n')

### 54. 品詞タグ付け
> Stanford Core NLPの解析結果XMLを読み込み，単語，レンマ，品詞をタブ区切り形式で出力せよ．

In [None]:
# Print "word<TAB>lemma<TAB>POS" for the leading tokens of the document.
# The loop stops after the token at index 16, i.e. 17 rows are printed.
for index, token in enumerate(root.iter('token')):
    columns = [token.findtext(tag) for tag in ('word', 'lemma', 'POS')]
    print('\t'.join(columns))
    if index >= 16:
        break

### 55. 固有表現抽出
> 入力文中の人名をすべて抜き出せ．

In [None]:
from itertools import groupby

# Group consecutive tokens by whether they are tagged PERSON, so that a
# multi-token name such as "Alan Turing" is printed once rather than as
# "Alan" and "Turing".  groupby also copes with a name at the very end of
# the token stream, where advancing the iterator by hand with next() would
# raise StopIteration and lose the name.
for is_person, run in groupby(root.iter('token'),
                              key=lambda tok: tok.findtext('NER') == 'PERSON'):
    if is_person:
        print(' '.join(tok.findtext('word') for tok in run))

### 56. 共参照解析
> Stanford Core NLPの共参照解析の結果に基づき，文中の参照表現（mention）を代表参照表現（representative mention）に置換せよ．ただし，置換するときは，「代表参照表現（参照表現）」のように，元の参照表現が分かるように配慮せよ．

In [None]:
# Rebuild every sentence of nlp.txt.xml as a space-joined string of its words.
# The first child of each <sentence> element is used as its token container.
nlp_text = [
    ' '.join(token.findtext('word') for token in sentence[0])
    for sentence in root.iterfind('document/sentences/sentence')
]

In [None]:
coreferences = root.find('document/coreference')

# Working token lists of the sentences that contain at least one mention,
# keyed by sentence id.  Every replacement for a sentence is applied to the
# same list, so several mentions in one sentence do not overwrite each other.
edited_tokens = {}
for coref in (coreferences if coreferences is not None else ()):
    # Each chain has one representative mention; all others are rewritten
    # as "「representative (original mention)」".
    represent = coref.find('mention[@representative="true"]')
    represent_text = represent.findtext('text')
    for mention in coref.iterfind('mention'):
        if mention.get('representative', 'false') != 'false':
            continue
        sentence_id = int(mention.findtext('sentence'))
        start_id = int(mention.findtext('start'))
        end_id = int(mention.findtext('end'))
        if sentence_id not in edited_tokens:
            tokens = root.find('document/sentences/sentence[@id="{}"]/tokens'.format(sentence_id))
            edited_tokens[sentence_id] = [token.findtext('word') for token in tokens]
        token_list = edited_tokens[sentence_id]
        # CoreNLP mention indices are 1-based; `start` is inclusive and `end`
        # is one past the last token.  In 0-based terms the mention therefore
        # covers token_list[start_id - 1 : end_id - 1].
        first, last = start_id - 1, end_id - 2
        # Decorate the boundary tokens instead of splicing strings: word
        # spacing stays intact and nested or repeated mentions compose.
        token_list[first] = f'「{represent_text} ({token_list[first]}'
        token_list[last] = f'{token_list[last]})」'

# Sentence ids are 1-based, nlp_text is 0-based.
for sentence_id, token_list in edited_tokens.items():
    nlp_text[sentence_id - 1] = ' '.join(token_list)

In [None]:
# Show the first ten sentences, one per line.
print('\n'.join(nlp_text[:10]))

### 57. 係り受け解析
> Stanford Core NLPの係り受け解析の結果（collapsed-dependencies）を有向グラフとして可視化せよ．可視化には，係り受け木をDOT言語に変換し，Graphvizを用いるとよい．また，Pythonから有向グラフを直接的に可視化するには，pydotを使うとよい．

In [None]:
# 0-based position of the sentence whose dependencies are visualised.
TEXT_INDEX = 5
print(nlp_text[TEXT_INDEX])
dependencies = root.findall(".//dependencies[@type='collapsed-dependencies']")[TEXT_INDEX]

# Nodes are keyed by the token's idx attribute and labelled with its text,
# so a word that occurs twice in the sentence gets two separate nodes.
G = Digraph()
for dep in dependencies:
    governor, dependent = dep[0], dep[1]
    for node in (governor, dependent):
        G.node(node.get('idx'), node.text)
    G.edge(governor.get('idx'), dependent.get('idx'))

G

### 58. タプルの抽出
> Stanford Core NLPの係り受け解析の結果（collapsed-dependencies）に基づき，「主語 述語 目的語」の組をタブ区切り形式で出力せよ．ただし，主語，述語，目的語の定義は以下を参考にせよ．
- 述語: nsubj関係とdobj関係の子（dependent）を持つ単語
- 主語: 述語からnsubj関係にある子（dependent）
- 目的語: 述語からdobj関係にある子（dependent）

In [None]:
from collections import defaultdict

target_types = ['nsubj', 'dobj']
# Map each governor, identified by (idx, text), to the list of
# (relation, dependent text) pairs it heads.  idx is part of the key so the
# same word form at two positions in the sentence stays distinct.
d = defaultdict(list)
for deps in dependencies:
    dtype = deps.get('type')
    if dtype not in target_types:
        continue
    governor, dependent = deps[0], deps[1]
    d[(governor.get('idx'), governor.text)].append((dtype, dependent.text))

# Print "subject<TAB>predicate<TAB>object".
for (_, verb_text), children in d.items():
    nsubj_list = [text for relation, text in children if relation == 'nsubj']
    dobj_list = [text for relation, text in children if relation == 'dobj']
    # A predicate is a word with BOTH an nsubj and a dobj child; governors
    # with only one of the two would give a row with an empty column.
    if nsubj_list and dobj_list:
        print(' '.join(nsubj_list), verb_text, ' '.join(dobj_list), sep='\t')

### 59. S式の解析
> Stanford Core NLPの句構造解析の結果（S式）を読み込み，文中のすべての名詞句（NP）を表示せよ．入れ子になっている名詞句もすべて表示すること．

In [None]:
import regex

# Print the words of every noun phrase (NP), nested ones included, for each
# sentence's parse tree given as an S-expression.
for parse_elem in root.iterfind("document/sentences/sentence/parse"):
    s_expression = parse_elem.text
    # Each match runs from a "(NP" to the end of the line; overlapped=True
    # lets every "(NP" start its own match, even inside a longer one.
    for np_suffix in regex.findall(r"\(NP.*", s_expression, overlapped=True):
        stack = []
        words = []
        for char in np_suffix:
            if char != ')':
                stack.append(char)
                continue
            # Closing bracket: unwind the stack down to the matching "(".
            chars = []
            while not chars or chars[-1] != '(':
                chars.append(stack.pop())
            if not stack:
                # The bracket that opened this NP has just been closed.
                print(' '.join(words))
                break
            # chars holds "(TAG word" reversed; a bracket that closes a
            # phrase rather than a leaf leaves an empty second field.
            word = ''.join(chars[::-1]).split(' ')[1]
            if word:
                words.append(word)