In [1]:
import spacy
import json
from tqdm import tqdm
from collections import Counter
import pathlib
import glob
import numpy as np
import pandas as pd

In [2]:
# Load the spaCy "en_core_web_sm" pipeline once; later cells reuse this `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Load one source file (a JSON array of strings) as a sample for the exploratory cells below.
# Decode explicitly as UTF-8 instead of relying on the platform's default encoding.
with open('../../data/wordlist-sources/1_2020_2.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
data

["Julia, we're considering you for the regional sales director position. You'd still work here at the head office, but it'd mean having to travel more.",
 "Ah . . . that might be a problem. My kids are still young, and you know that I'm a single mother, so I can't be gone too much.",
 "Right. We realize that, but there may be a way to cut down on the trips. We've been trying out video conferencing, and so far, so good.",
 "Then I'd certainly be interested.",
 'By transferring the woman to a regional office.',
 "By getting someone to help with the woman's children.",
 'By offering the woman a different position.',
 "Gary, it's me. I'm afraid I have to work late again.",
 "Again? That's the third time this week! When do you think you'll get home?",
 "Probably around 9. Listen, Clive's baseball practice ends at 8:30. Can you pick him up?",
 "Well, I'm supposed to be giving your mom a ride to the airport.",
 "Ah, I forgot. Well, I guess he'll just have to wait for a bit until I get there."

In [4]:
def extract_words(text):
    """Return the set of lower-cased lemmas of the content words in ``text``.

    The text is run through the module-level ``nlp`` pipeline. Tokens that are
    not purely alphabetic, or that are flagged as stop words, are skipped.
    """
    return {
        tok.lemma_.lower()
        for tok in nlp(text)
        if tok.is_alpha and not tok.is_stop
    }

In [5]:
NOUN_ARTICLES = {"a", "an", "the"}  # dropped when they sit directly before an S/O placeholder
BE_VERBS = {"am", "is", "are", "was", "were", "'m", "'re", "'s", "be"}  # collapsed to "be" right after S/O

_SUBJECT_DEPS = {"nsubj", "nsubjpass"}
_OBJECT_DEPS = {"obj", "dobj", "iobj"}
_PLACEHOLDERS = {"S", "O"}


def to_varient(doc):
    """Replace subjects and objects in ``doc`` with the placeholders "S" and "O".

    Args:
        doc: An indexable sequence of tokens exposing ``dep_``, ``lemma_`` and
            ``text`` (e.g. a spaCy ``Doc``).

    Returns:
        A list of strings, one entry per kept token:
          * tokens whose dependency is nsubj/nsubjpass become "S";
          * tokens whose dependency is obj/dobj/iobj become "O";
          * an article (a/an/the) immediately before such a token is removed;
          * a form of "be" immediately after a placeholder becomes "be";
          * every other token is kept as its surface text.
    """
    new_doc = []

    for i, token in enumerate(doc):
        prev_inserted = new_doc[-1] if new_doc else None

        if token.dep_ in _SUBJECT_DEPS:
            placeholder = "S"
        elif token.dep_ in _OBJECT_DEPS:
            placeholder = "O"
        else:
            placeholder = None

        if placeholder is not None:
            prev_token = doc[i - 1] if i > 0 else None
            # Remove the preceding article, but only if it is still present in
            # the output as plain text. If the article token was itself
            # replaced by a placeholder (either "S" or "O"), popping would
            # delete that placeholder instead of an article.
            if (
                prev_token is not None
                and prev_token.lemma_.lower() in NOUN_ARTICLES
                and prev_inserted not in _PLACEHOLDERS
            ):
                new_doc.pop()
            new_doc.append(placeholder)
        elif prev_inserted in _PLACEHOLDERS and token.lemma_.lower() in BE_VERBS:
            # Normalise the copula that directly follows a placeholder.
            new_doc.append("be")
        else:
            new_doc.append(token.text)
    return new_doc

# Count every 2-, 3- and 4-gram of the placeholder-normalised token sequence
# of each sample text; n-grams are keyed by their space-joined tokens.
vocabs = Counter()
for text in tqdm(data):
    words = to_varient(nlp(text))
    for n in (2, 3, 4):
        # Slide a window of width n over the sequence (no windows if it is shorter than n).
        vocabs.update(
            ' '.join(words[start:start + n])
            for start in range(len(words) - n + 1)
        )
vocabs

100%|██████████| 199/199 [00:01<00:00, 146.71it/s]


Counter({'S be': 122,
         'O .': 63,
         ', S': 62,
         'S had': 51,
         '. S': 42,
         'in the': 34,
         'S of': 31,
         'O of': 31,
         'O to': 28,
         'of the': 25,
         'had been': 22,
         'to the': 21,
         'S had been': 21,
         'O ,': 20,
         'O for': 19,
         ', S be': 19,
         ', and': 19,
         'that S': 16,
         'from the': 15,
         'O in': 15,
         'by the': 15,
         'O and': 13,
         'S in': 13,
         '. S be': 12,
         'O S': 12,
         '. The': 12,
         "Germany 's": 12,
         'S have': 11,
         "S 'll": 11,
         'to be': 11,
         'O from': 11,
         'as a': 11,
         'S to': 11,
         '. "': 11,
         ', but': 10,
         'on the': 10,
         'S can': 10,
         'S S': 10,
         'S ,': 10,
         'of Versailles': 10,
         'its O': 10,
         'for the': 9,
         'but S': 9,
         ', but S': 9,
         'and S': 9,

In [6]:
# Group the source texts by grade. The grade is the part of the file name
# before the first underscore (e.g. "1_2020_2.json" -> "1").
wordlist_by_grade: dict[str, list[str]] = {}

# sorted(): glob returns paths in arbitrary order; sorting keeps re-runs reproducible.
for path in tqdm(sorted(glob.glob('../../data/wordlist-sources/*.json'))):
    # Use a cell-local name so the sample `data` loaded earlier is not overwritten.
    with open(path, 'r', encoding='utf-8') as f:
        grade_texts: list[str] = json.load(f)
    grade = pathlib.Path(path).stem.split('_')[0]
    wordlist_by_grade.setdefault(grade, []).extend(grade_texts)
wordlist_by_grade

100%|██████████| 105/105 [00:00<00:00, 23144.94it/s]


{'1': ['What are the two people talking about?',
  "Julie's poor work performance.",
  'A project Rachel is working on.',
  'Personal criticisms being shared accidentally.',
  "Problems with the office's online forum.",
  'What is one thing we learn about the play?',
  'Some casting decisions have led to disagreements.',
  "It is the director's first production.",
  'The technical staff is having serious problems.',
  'The opening day will probably be delayed.',
  'What does the man say the woman should have done?',
  'Rearranged her work schedule.',
  'Checked the budget figures herself.',
  'Reported the reason for the delay.',
  'Taken the blame for the delay.',
  'What do we learn from the conversation?',
  'Oceans have become a source of new medicines.',
  'Testing medicines on marine animals is becoming common.',
  'New medicines are likely to help with space exploration.',
  'The approval process for a new medicine has been shortened.',
  'What does the man think about his daugh

In [7]:
import count
import importlib
importlib.reload(count)  # re-import the local `count` module so edits to it take effect

# Collocation counts per grade, collecting the union vocabulary as we go.
counts: dict[str, count.CountResult] = {}
unigram_vocab = set()
phrase_vocab = set()
for grade, texts in tqdm(wordlist_by_grade.items()):
    result = count.count_collocations(
        texts,
        nlp=nlp,
        n_process=4,
        include_edges=False,
    )
    counts[grade] = result
    unigram_vocab.update(result.unigram)
    phrase_vocab.update(result.phrase)

# A fixed, sorted vocabulary gives every grade's count vector the same index layout.
all_unigrams = sorted(unigram_vocab)
all_phrases = sorted(phrase_vocab)

# grade -> (unigram count vector, phrase count vector); terms a grade never saw count as 0.
counts_array = {
    grade: (
        np.array([result.unigram.get(term, 0) for term in all_unigrams], dtype=int),
        np.array([result.phrase.get(term, 0) for term in all_phrases], dtype=int),
    )
    for grade, result in counts.items()
}

counts_array

100%|██████████| 7/7 [00:22<00:00,  3.25s/it]


{'1': (array([1413,    4,    0, ...,    3,    0,    0], shape=(6349,)),
  array([0, 0, 2, ..., 0, 0, 0], shape=(10193,))),
 '5': (array([404,   0,   0, ...,   0,   4,   0], shape=(6349,)),
  array([0, 0, 0, ..., 0, 0, 0], shape=(10193,))),
 '2': (array([1420,    0,    0, ...,    0,    2,    6], shape=(6349,)),
  array([0, 0, 0, ..., 0, 0, 2], shape=(10193,))),
 '4': (array([1009,    0,    0, ...,    0,   11,    0], shape=(6349,)),
  array([0, 0, 0, ..., 1, 0, 0], shape=(10193,))),
 'pre1': (array([930,   1,   1, ...,   0,   0,   0], shape=(6349,)),
  array([0, 0, 0, ..., 0, 0, 0], shape=(10193,))),
 '3': (array([1410,    0,    0, ...,    0,    9,    0], shape=(6349,)),
  array([1, 0, 0, ..., 0, 1, 0], shape=(10193,))),
 'pre2': (array([1262,    0,    0, ...,    0,    5,    0], shape=(6349,)),
  array([0, 1, 0, ..., 0, 0, 0], shape=(10193,)))}

In [8]:
# Convert counts into scores.
#
# For each grade, every term is scored as
#     log P(term | this grade) - log P(term | all lower grades pooled)
# with add-A smoothing on both sides, so a higher score means the term is
# relatively more frequent in this grade than in the lower ones.
scores_by_grade: dict[str, tuple[np.ndarray, np.ndarray]] = {}  # grade -> (unigram scores, phrase scores)

# Ordered from highest to lowest: every entry after a grade is treated as one of its lower grades.
grades = [
    '1',
    'pre1',
    '2',
    'pre2',
    '3',
    '4',
    '5',
]

# NOTE(review): these two thresholds are defined but not referenced anywhere in the visible notebook.
MIN_UNIGRAM_CH = 3
MIN_PHRASE_CH = 2

A = 1  # additive smoothing constant


def _smoothed_log_prob(term_counts, total, vocab_size, alpha):
    """Return log((count + alpha) / (total + alpha * vocab_size)) element-wise."""
    return np.log((term_counts + alpha) / (total + alpha * vocab_size))


unigram_V = len(all_unigrams)
phrase_V = len(all_phrases)

for grade, (unigram_cH, phrase_cH) in counts_array.items():
    # Pool the counts of all lower grades.
    lower_grades = grades[grades.index(grade) + 1:]

    unigram_cL = np.zeros(unigram_V, dtype=int)
    phrase_cL = np.zeros(phrase_V, dtype=int)
    unigram_nL = 0  # total unigram occurrences in the lower grades
    phrase_nL = 0  # total phrase occurrences in the lower grades
    for lg in lower_grades:
        lower_unigram_counts, lower_phrase_counts = counts_array[lg]
        unigram_cL += lower_unigram_counts
        phrase_cL += lower_phrase_counts
        unigram_nL += counts[lg].unigram_n
        phrase_nL += counts[lg].phrase_n
    # For the lowest grade `lower_grades` is empty, so the pooled side reduces
    # to the uniform smoothed probability 1 / V.

    unigram_nH = counts[grade].unigram_n
    phrase_nH = counts[grade].phrase_n

    # Score = log-probability in this grade minus log-probability in the lower grades.
    unigram_scores = (
        _smoothed_log_prob(unigram_cH, unigram_nH, unigram_V, A)
        - _smoothed_log_prob(unigram_cL, unigram_nL, unigram_V, A)
    )
    phrase_scores = (
        _smoothed_log_prob(phrase_cH, phrase_nH, phrase_V, A)
        - _smoothed_log_prob(phrase_cL, phrase_nL, phrase_V, A)
    )

    scores_by_grade[grade] = (
        unigram_scores,
        phrase_scores,
    )

In [9]:
# Put unigrams and phrases on a common scale by replacing each score with its
# normalised rank within its own kind.

merged_words = [*all_unigrams, *all_phrases]

merged_ranks_by_grade: dict[str, np.ndarray[float]] = {}

n_unigrams = len(all_unigrams)
n_phrases = len(all_phrases)

for grade, (unigram_scores, phrase_scores) in scores_by_grade.items():
    normed_by_kind = []
    for kind_scores, size in ((unigram_scores, n_unigrams), (phrase_scores, n_phrases)):
        # Rank 1 goes to the highest score, rank `size` to the lowest.
        descending = np.argsort(kind_scores)[::-1]
        position = np.empty(size, dtype=int)
        position[descending] = np.arange(1, size + 1)
        # Map rank 1 -> (size - 0.5) / size and rank `size` -> 0.5 / size.
        normed_by_kind.append((size - position + 0.5) / size)

    # Same layout as `merged_words`: unigrams first, then phrases.
    merged_ranks_by_grade[grade] = np.concatenate(normed_by_kind)

merged_ranks_by_grade


{'1': array([0.1985352 , 0.77468893, 0.39588912, ..., 0.45516531, 0.45526342,
        0.27092122], shape=(16542,)),
 '5': array([0.99960624, 0.64017956, 0.64002205, ..., 0.33115864, 0.33125674,
        0.33135485], shape=(16542,)),
 '2': array([0.13962829, 0.63151677, 0.62411403, ..., 0.23256156, 0.19606593,
        0.91803198], shape=(16542,)),
 '4': array([0.91423846, 0.33194204, 0.32690187, ..., 0.93598548, 0.64039046,
        0.32674384], shape=(16542,)),
 'pre1': array([0.12954796, 0.81753032, 0.77909907, ..., 0.3846267 , 0.38472481,
        0.22814677], shape=(16542,)),
 '3': array([0.07426366, 0.36092298, 0.35950543, ..., 0.03634847, 0.9063573 ,
        0.7668498 ], shape=(16542,)),
 'pre2': array([0.08875413, 0.63498189, 0.25602457, ..., 0.13004022, 0.11630531,
        0.41337192], shape=(16542,))}

In [10]:
# Sort each grade's vocabulary by normalised rank, highest first.
vocablist_by_grade: dict[str, list[(str, float)]] = {}
for grade, merged_scores in merged_ranks_by_grade.items():
    descending = np.argsort(merged_scores)[::-1]
    # Pair every term with its score as a plain Python float.
    vocablist_by_grade[grade] = list(zip(
        (merged_words[i] for i in descending),
        merged_scores[descending].tolist(),
    ))

In [11]:
# Finally, save everything to CSV.
# Columns: vocab, type (unigram or phrase), score for grade 1, pre1, ..., 5,
#          count for grade 1, ..., 5, rank for grade 1, ..., 5
import os

n_unigrams = len(all_unigrams)

rows = []
for i, vocab in enumerate(merged_words):
    # `merged_words` lists unigrams first, then phrases; translate the merged
    # index into (which array of the per-grade pair, index inside that array).
    is_unigram = i < n_unigrams
    kind_index = 0 if is_unigram else 1
    local_index = i if is_unigram else i - n_unigrams

    row = {
        'vocab': vocab,
        'type': 'unigram' if is_unigram else 'phrase',
    }
    # Values are written straight into `row`; the cell deliberately avoids
    # rebinding `counts`, which an earlier cell uses for the per-grade count results.
    for grade in grades:
        row[f'score_{grade}'] = scores_by_grade[grade][kind_index][local_index].item()
    for grade in grades:
        row[f'count_{grade}'] = counts_array[grade][kind_index][local_index].item()
    for grade in grades:
        row[f'rank_{grade}'] = merged_ranks_by_grade[grade][i].item()
    rows.append(row)

df = pd.DataFrame(rows)
target_path = '../../data/wordlist-scores/vocablist_by_grade.csv'
os.makedirs(os.path.dirname(target_path), exist_ok=True)
df.to_csv(target_path, index=False)