In [1]:
import pandas as pd
from natto import MeCab
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from __future__ import unicode_literals

import re
import unicodedata


def unicode_normalize(cls, s):
    """Apply NFKC normalization only to runs of characters listed in ``cls``.

    Args:
        cls: Body of a regex character class (without the surrounding
            brackets) selecting the characters to normalize.
        s: Text to process.

    Returns:
        ``s`` with every maximal run of ``cls`` characters replaced by its
        NFKC form; any fullwidth hyphen-minus (－) still present afterwards
        is turned into an ASCII hyphen.
    """
    run_pattern = re.compile("([{}]+)".format(cls))

    def to_nfkc(match):
        # Only the matched run is normalized; text outside ``cls`` is kept.
        return unicodedata.normalize("NFKC", match.group(1))

    normalized = run_pattern.sub(to_nfkc, s)
    return re.sub("－", "-", normalized)


def remove_extra_spaces(s):
    """Collapse space runs and drop single spaces that touch Japanese text.

    Runs of ASCII and ideographic spaces are first reduced to one ASCII
    space. A remaining space is then removed when the characters on both
    sides belong to the CJK/kana/fullwidth ranges below, or when one side
    belongs to those ranges and the other to Basic Latin.

    Args:
        s: Text to clean.

    Returns:
        The text with the unwanted spaces removed.
    """
    s = re.sub("[ 　]+", " ", s)
    basic_latin = "\u0000-\u007F"
    blocks = (
        "\u4E00-\u9FFF"  # CJK UNIFIED IDEOGRAPHS
        "\u3040-\u309F"  # HIRAGANA
        "\u30A0-\u30FF"  # KATAKANA
        "\u3000-\u303F"  # CJK SYMBOLS AND PUNCTUATION
        "\uFF00-\uFFEF"  # HALFWIDTH AND FULLWIDTH FORMS
    )

    for left_cls, right_cls in (
        (blocks, blocks),
        (blocks, basic_latin),
        (basic_latin, blocks),
    ):
        gap = re.compile("([{}]) ([{}])".format(left_cls, right_cls))
        # One pass cannot reuse a character as both the right side of one
        # match and the left side of the next, so repeat until nothing changes.
        while True:
            s, replaced = gap.subn(r"\1\2", s)
            if not replaced:
                break
    return s


def normalize_neologd(s):
    """Normalize a Japanese text string before tokenization.

    Steps, in order: strip the ends; NFKC-normalize fullwidth alphanumerics
    and halfwidth kana; unify hyphen variants to "-" and long-vowel marks to
    "ー"; delete tildes; map the listed narrow punctuation to fullwidth
    forms; remove extra spaces; NFKC-normalize that punctuation again
    (except ＝, ・, 「, 」); finally turn ’ and ” into ASCII quotes.

    Args:
        s: Raw text.

    Returns:
        The normalized text.
    """
    narrow_marks = "!\"#$%&'()*+,-./:;<=>?@[¥]^_`{|}~｡､･｢｣"
    wide_marks = "！”＃＄％＆’（）＊＋，－．／：；＜＝＞？＠［￥］＾＿｀｛｜｝〜。、・「」"
    to_wide = {ord(src): ord(dst) for src, dst in zip(narrow_marks, wide_marks)}

    text = unicode_normalize("０-９Ａ-Ｚａ-ｚ｡-ﾟ", s.strip())

    for pattern, replacement in (
        ("[˗֊‐‑‒–⁃⁻₋−]+", "-"),  # normalize hyphens
        ("[﹣－ｰ—―─━ー]+", "ー"),  # normalize choonpus
        ("[~∼∾〜〰～]", ""),  # remove tildes
    ):
        text = re.sub(pattern, replacement, text)

    text = text.translate(to_wide)
    text = remove_extra_spaces(text)
    # keep ＝,・,「,」 (they are absent from the class below)
    text = unicode_normalize(
        "！”＃＄％＆’（）＊＋，－．／：；＜＞？＠［￥］＾＿｀｛｜｝〜", text
    )

    for pattern, replacement in (("[’]", "'"), ("[”]", '"')):
        text = re.sub(pattern, replacement, text)
    return text

In [3]:
# Switch the kernel's working directory so the relative paths used by later
# cells ("public/index.json", "content/...") resolve.
# NOTE(review): this is an absolute, user-specific path; it must be edited
# before the notebook can run on another machine.
%cd "/Users/ottan/Desktop/ottanxyz"

/Users/ottan/Desktop/ottanxyz


In [4]:
# Shared MeCab tagger used by extract(). The "-d" argument points it at a
# mecab-ipadic-neologd dictionary directory under /opt/homebrew.
# NOTE(review): the dictionary path is machine-specific.
m = MeCab("-d /opt/homebrew/lib/mecab/dic/mecab-ipadic-neologd/")
# Alternative kept for reference (presumably uses MeCab's default dictionary -- confirm):
# m = MeCab()


def extract(text):
    """Return the general and proper nouns found in ``text``, space-joined.

    The text is parsed with the module-level MeCab tagger ``m``. A node's
    surface form is kept when its ``stat`` is 0 and its comma-separated
    feature string starts with 名詞 followed by 一般 or 固有名詞.

    Args:
        text: Text to tokenize.

    Returns:
        A single string with the kept surface forms separated by spaces
        (empty when nothing qualifies).
    """
    wanted_subtypes = ("一般", "固有名詞")
    nouns = []
    for node in m.parse(text, as_nodes=True):
        # Skip every node whose status is not 0 (presumably MeCab's
        # "normal node" status -- confirm against the natto documentation).
        if node.stat != 0:
            continue
        pos = node.feature.split(",")
        if pos[0] == "名詞" and pos[1] in wanted_subtypes:
            nouns.append(node.surface)
    return " ".join(nouns)

In [5]:
# Load the post index from public/index.json (relative to the current working
# directory). The preview below shows its path, text and title columns.
df = pd.read_json("public/index.json")
df.head()

Unnamed: 0,path,text,title
0,posts/2022/03/apple-silicon-x86_64-docker/inde...,Appleシリコン搭載Macで手軽にDocker開発環境をLimaで構築するで、Appleシ...,Appleシリコン搭載Macで手軽にx86_64開発環境をLimaで構築する
1,posts/2022/02/how-to-docker-lima-arm-mac/index.md,Limaは、GitHubで「unofficial &ldquo;containerd for...,Appleシリコン搭載Macで手軽にDocker開発環境をLimaで構築する
2,posts/2022/01/how-to-install-windows11-arm-mac...,Apple Silicon搭載のM1 Mac（MacBook Air 2020のエントリーモ...,UTMでM1 MacにWindows 11 Insider Previewをインストールする方法
3,posts/2021/11/1password-to-icloud-keychain/ind...,macOS Montereyより、OS標準のキーチェーンで、パスワードやワンタイムパスワード...,1PasswordからiCloudキーチェーンへ移行する方法
4,posts/2021/09/1006246663/index.md,一昔前までのiPhoneの機種変と言えば、母艦と言われるiTunesへの『完全なバックアップ...,古いiPhoneから新しいiPhoneへ移行する方法の比較


In [6]:
# "tags" starts as an empty-string placeholder; a later cell overwrites it
# row by row.
df["tags"] = ""
# Title and body joined by one space, then passed through normalize_neologd.
df["title_text"] = (df["title"] + " " + df["text"]).map(normalize_neologd)

In [7]:
# One space-joined noun string per row of df["title_text"], in row order.
# A comprehension keeps the loop variable out of the notebook namespace, so no
# trailing `del` is needed (the old `del doc` raised NameError on an empty
# frame, because the loop never bound `doc`).
docs = [extract(title_text) for title_text in df["title_text"].values]

In [16]:
# Fit TF-IDF on the space-joined noun strings in `docs`; stop_words='english'
# asks scikit-learn to drop its built-in English stop-word list.
# NOTE(review): token_pattern is left at its default, which in scikit-learn
# keeps only tokens of two or more word characters, so single-character nouns
# never become features -- confirm this is intended for Japanese text.
vectorizer = TfidfVectorizer(stop_words='english', min_df=0.0001)
X = vectorizer.fit_transform(docs)
# Convert the fitted matrix to a dense array so it can be wrapped in a
# DataFrame with one column per vocabulary term.
values = X.toarray()
feature_names = vectorizer.get_feature_names_out()
df_result = pd.DataFrame(values, columns=feature_names)

In [17]:
# Preview the first rows of the per-document TF-IDF table.
df_result.head()

Unnamed: 0,0m,0t,0件,0円,0秒,10,100,1000m,1000万,100s,...,鬼滅の刃,魅力,魅力的,魑魅魍魎,魔法,魔法の言葉,黄色,黎明期,黒字,黒縁
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.022753,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.027095,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Maximum number of highest-scoring terms kept as tags for each document.
TOP_N_TAGS = 5

# Assumes df and df_result share the same default 0..n-1 index, so `idx`
# addresses the same document in both frames -- confirm if df is re-indexed.
for idx, row in df_result.iterrows():
    # Only terms that actually occur in the document (TF-IDF > 0) may become
    # tags. Without this filter, a document with fewer than TOP_N_TAGS scored
    # terms would be padded with arbitrary zero-score vocabulary entries.
    scored_terms = row[row > 0]
    df.at[idx, "tags"] = (
        scored_terms.sort_values(ascending=False).head(TOP_N_TAGS).index.tolist()
    )

In [19]:
# Display the frame to inspect the tags assigned to each row.
df

Unnamed: 0,path,text,title,tags,title_text
0,posts/2022/03/apple-silicon-x86_64-docker/inde...,Appleシリコン搭載Macで手軽にDocker開発環境をLimaで構築するで、Appleシ...,Appleシリコン搭載Macで手軽にx86_64開発環境をLimaで構築する,"[x86, mysql, シリコン, docker, 仮想マシン]",Appleシリコン搭載Macで手軽にx86_64開発環境をLimaで構築するAppleシリコ...
1,posts/2022/02/how-to-docker-lima-arm-mac/index.md,Limaは、GitHubで「unofficial &ldquo;containerd for...,Appleシリコン搭載Macで手軽にDocker開発環境をLimaで構築する,"[仮想マシン, docker, en, require, cli]",Appleシリコン搭載Macで手軽にDocker開発環境をLimaで構築するLimaは、Gi...
2,posts/2022/01/how-to-install-windows11-arm-mac...,Apple Silicon搭載のM1 Mac（MacBook Air 2020のエントリーモ...,UTMでM1 MacにWindows 11 Insider Previewをインストールする方法,"[utm, arm, qemu, 仮想マシン, preview]",UTMでM1 MacにWindows 11 Insider Previewをインストールする...
3,posts/2021/11/1password-to-icloud-keychain/ind...,macOS Montereyより、OS標準のキーチェーンで、パスワードやワンタイムパスワード...,1PasswordからiCloudキーチェーンへ移行する方法,"[キーチェーン, エクスポート, icloud, パスワード, インポート]",1PasswordからiCloudキーチェーンへ移行する方法macOS Montereyより...
4,posts/2021/09/1006246663/index.md,一昔前までのiPhoneの機種変と言えば、母艦と言われるiTunesへの『完全なバックアップ...,古いiPhoneから新しいiPhoneへ移行する方法の比較,"[icloud, iphone, 母艦, 容量, データ]",古いiPhoneから新しいiPhoneへ移行する方法の比較一昔前までのiPhoneの機種変と...
...,...,...,...,...,...
474,posts/2014/09/quick-look-plugin/index.md,Quick Lookは、macOSを代表する便利な機能です。フォルダーやファイルを選...,Macを購入したら最初に導入しておきたいQuick Lookのプラグイン,"[look, homebrew, プラグイン, インストール, 手動]",Macを購入したら最初に導入しておきたいQuick LookのプラグインQuick Look...
475,posts/2014/09/mac-preference/index.md,@おったんです。システム環境設定は、Macを快適に利用するために欠かせないもの。とく...,Macを購入したら最初に設定しておきたいシステム環境設定,"[ウインドウ, キー, macbook, アプリケーション, ことえり]",Macを購入したら最初に設定しておきたいシステム環境設定@おったんです。システム環境設定は、...
476,posts/2014/09/mac-shutdown-slowly/index.md,@おったんです。Macのシャットダウンが遅くなりイライラしてしまったことはありません...,Macのシャットダウンが遅くなってしまった場合に見直したい項目,"[シャットダウン, ディスク, システム, sudo, write]",Macのシャットダウンが遅くなってしまった場合に見直したい項目@おったんです。Macのシャッ...
477,pages/privacy-policy.md,プライバシーポリシー 広告の配信について 当サイトは第三者配信の広告サービス「Google ...,,"[cookie, サイト, 権利, クッキー, google]",プライバシーポリシー広告の配信について当サイトは第三者配信の広告サービス「Google Ad...


In [20]:
import frontmatter

In [21]:
# Write each row's tag list into the front matter of its source file under
# content/, rewriting the file in place.
for _, row in df.iterrows():
    post_path = "content/" + row["path"]
    post = frontmatter.load(post_path)
    post["tags"] = row["tags"]
    # Serialize BEFORE opening the file for writing: mode "w" truncates it
    # immediately, so a failure inside dumps() would otherwise leave an
    # emptied post on disk.
    content = frontmatter.dumps(post)
    # Explicit UTF-8 so the Japanese text is written the same way regardless
    # of the platform's default locale encoding.
    with open(post_path, "w", encoding="utf-8") as f:
        f.write(content)