In [1]:
import MeCab

In [2]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
anime = pd.read_csv("../../data/input/anime.csv")
sample_submission = pd.read_csv("../../data/input/sample_submission.csv")
test = pd.read_csv("../../data/input/test.csv")
train = pd.read_csv("../../data/input/train.csv")

In [4]:
text = [
        "genres",
        "japanese_name",
        "aired",
        "producers",
        "licensors",
        "studios",
    ]

In [5]:
anime_text_only = anime[text]

In [6]:
anime["concated"] = anime_text_only.apply(lambda row: " ".join(row), axis=1)
text.append("concated")

In [7]:
wakati = MeCab.Tagger("/opt/homebrew/lib/mecab/dic/mecab-ipadic-neologd")

def mecab_tokenizer(s: str):
    parse_result = wakati.parse(s)
    return [
        result.split("\t")[0]
        for result in parse_result.split("\n")
        if result not in ["EOS", ""]
    ]

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation

In [9]:
for cat in text:
    if cat == "genres":
        continue
    print(cat)
    tfidf = TfidfVectorizer(tokenizer=mecab_tokenizer)
    svd = TruncatedSVD(n_components=50, random_state=42)

    feature_tfidf = tfidf.fit_transform(anime[cat])
    print(feature_tfidf.shape)
    feature_svd = svd.fit_transform(feature_tfidf)
    columns = [f"{cat}_{i}" for i in range(50)]

    anime_embs = pd.DataFrame()
    embs_df = pd.DataFrame(feature_svd, columns=columns)
    anime_embs["anime_id"] = anime["anime_id"]
    anime_embs = pd.concat([anime_embs, embs_df], axis=1)

    anime_embs.to_csv(f"../../data/input/created_features/anime_{cat}_svd50.csv", index=False)

japanese_name
(2000, 3591)




aired
(2000, 91)




producers
(2000, 746)




licensors
(2000, 79)




studios
(2000, 271)




concated
(2000, 4479)




In [10]:
for cat in text:
    if cat != "genres":
        continue
    print(cat)
    tfidf = TfidfVectorizer(tokenizer=mecab_tokenizer)

    feature_tfidf = tfidf.fit_transform(anime[cat])
    print(feature_tfidf.shape)
    feature_tfidf = feature_tfidf.toarray()
    columns = tfidf.get_feature_names_out()

    # ゴミをリネーム
    columns[0] = "comma"
    columns[1] = "hyphen"

    # ゴミを消す
    #feature_tfidf = feature_tfidf[:, 2:]
    #columns = columns[2:]

    anime_embs = pd.DataFrame()
    embs_df = pd.DataFrame(feature_tfidf, columns=columns)
    anime_embs["anime_id"] = anime["anime_id"]
    anime_embs = pd.concat([anime_embs, embs_df], axis=1)

    anime_embs.to_csv(f"../../data/input/created_features/anime_{cat}.csv", index=False)

genres
(2000, 48)


