In [1]:
# Use the train/test/valid splits created in exercise 50.
TRAIN_FILE_PATH = "../第6章/news+aggregator/train.txt"
TEST_FILE_PATH = "../第6章/news+aggregator/test.txt"
VALID_FILE_PATH = "../第6章/news+aggregator/valid.txt"

import polars as pl

# Each split is a headerless, tab-separated file; its two columns are named
# "sentence" and "category" on load. The splits are read in the order
# train, test, valid.
df_train, df_test, df_valid = (
    pl.read_csv(
        split_path,
        separator="\t",
        has_header=False,
        new_columns=["sentence", "category"],
    )
    for split_path in (TRAIN_FILE_PATH, TEST_FILE_PATH, VALID_FILE_PATH)
)


In [2]:
import re
from gensim.models.keyedvectors import KeyedVectors
# Load the pretrained word vectors; binary=True selects the binary word2vec
# file format, and the gzip-compressed archive is passed as-is.
model_path = "../第7章/GoogleNews-vectors-negative300.bin.gz"
model = KeyedVectors.load_word2vec_format(fname=model_path, binary=True)

In [5]:
import numpy as np

class Vector_of_sentence:
    """Turn a (sentence, category) table into feature and label matrices.

    A sentence is represented by the mean of the word vectors of its
    in-vocabulary words; a category code is represented by a one-hot row.

    Parameters
    ----------
    dataframe:
        Table whose ``"sentence"`` and ``"category"`` columns are reachable
        with ``dataframe[column_name]`` and are iterable.
    model:
        Word-vector lookup exposing ``get_vector(word)`` that raises
        ``KeyError`` for an unknown word. If it has a ``vector_size``
        attribute, that value is used as the feature dimension.
    """

    # Feature dimension used when the model does not advertise `vector_size`.
    _DEFAULT_DIM = 300

    # One-hot rows for the four category codes found in the "category" column.
    _CATEGORY_MAP = {
        "b": [1, 0, 0, 0],
        "t": [0, 1, 0, 0],
        "e": [0, 0, 1, 0],
        "m": [0, 0, 0, 1],
    }

    # The annotation is quoted so that defining the class does not require
    # `pl` to be resolvable at definition time.
    def __init__(self, dataframe: "pl.DataFrame", model):
        self.__dataframe = dataframe
        self.__model = model

    def __vector_size(self) -> int:
        """Return the dimension of the word vectors served by the model."""
        return getattr(self.__model, "vector_size", self._DEFAULT_DIM)

    def __clean_and_split_sentence(self, sentence: str):
        """Drop every character that is not an ASCII letter or whitespace,
        then split the remainder on whitespace."""
        cleaned_sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)
        return cleaned_sentence.split()

    def __get_vector_of_sentence(self, sentence: str):
        """Return the mean word vector of ``sentence``.

        Words unknown to the model are skipped. When no word of the sentence
        is known, a zero vector is returned so that the caller still gets
        exactly one row per sentence (keeping X aligned with Y) instead of
        the whole session being terminated.
        """
        total = np.zeros(self.__vector_size())
        known_words = 0
        for word in self.__clean_and_split_sentence(sentence):
            try:
                total += self.__model.get_vector(word)
            except KeyError:
                # Out-of-vocabulary word: it contributes nothing to the mean.
                # Any other exception is a real error and must surface.
                continue
            known_words += 1
        if known_words == 0:
            return total
        return total / known_words

    def __create_X(self, sentences):
        """Stack one mean word vector per sentence into an (n, dim) matrix."""
        vectors = [self.__get_vector_of_sentence(sentence) for sentence in sentences]
        if not vectors:
            # np.stack rejects an empty list; return an empty matrix instead.
            return np.zeros((0, self.__vector_size()))
        return np.stack(vectors, axis=0)

    def __create_Y(self, categories):
        """Stack one one-hot row per category code into an (n, 4) matrix.

        Raises
        ------
        KeyError
            If a category code is not one of ``"b"``, ``"t"``, ``"e"``, ``"m"``.
        """
        # Encode the labels that were passed in rather than re-reading the
        # dataframe, so the argument is actually honoured.
        rows = [np.array(self._CATEGORY_MAP[category]) for category in categories]
        if not rows:
            return np.zeros((0, len(self._CATEGORY_MAP)), dtype=int)
        return np.stack(rows, axis=0)

    def get_X_Y_matrix(self):
        """Return ``(X, Y)``: the feature matrix and the one-hot label matrix
        built from the dataframe's "sentence" and "category" columns."""
        X = self.__create_X(self.__dataframe["sentence"])
        Y = self.__create_Y(self.__dataframe["category"])
        return X, Y

In [6]:
import os

# Folder receiving the .npy files. np.save does not create missing
# directories, so make sure it exists before writing.
OUTPUT_DIR = "./matrix"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Build the feature matrix X and one-hot label matrix Y for every split.
X_train, Y_train = Vector_of_sentence(df_train, model).get_X_Y_matrix()
X_test, Y_test = Vector_of_sentence(df_test, model).get_X_Y_matrix()
X_valid, Y_valid = Vector_of_sentence(df_valid, model).get_X_Y_matrix()

# np.save appends the ".npy" extension to each file name.
for stem, matrix in (
    ("x_train", X_train),
    ("y_train", Y_train),
    ("x_test", X_test),
    ("y_test", Y_test),
    ("x_valid", X_valid),
    ("y_valid", Y_valid),
):
    np.save(f"{OUTPUT_DIR}/{stem}", matrix)

print(X_train.shape)
print(Y_train)

(10671, 300)
[[1 0 0 0]
 [0 1 0 0]
 [1 0 0 0]
 ...
 [1 0 0 0]
 [0 0 1 0]
 [0 0 1 0]]
