<a href="https://colab.research.google.com/github/mipypf/practical-mi-guide/blob/develop/chapter4/src/text_vectorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# テキストを数値形式のベクトルデータに変換

## Google colabを使用の場合、ランタイムのタブから「ランタイムのタイプを変更」→ハードウェアアクセラレータと進み、T4 GPUを選択 ※「Transformer 系列のモデルを用いたベクトル化」でGPUを使用するため

In [1]:
# 以下のコマンドでGPUが使用可能かを確認
! nvidia-smi

Wed Jan 29 01:41:13 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P8              14W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## ライブラリをインポート

In [15]:
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer

## Bag of Words

In [3]:
# サンプルテキスト
sentences = [
    "Tell us about Materials Informatics. Materials Informatics is a hot topic in materials development."
]

# Bag of Wordsモデルの初期化
vectorizer = CountVectorizer()

# モデルの適用
bag_of_words = vectorizer.fit_transform(sentences)

# 結果の表示
pd.DataFrame(bag_of_words.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,about,development,hot,in,informatics,is,materials,tell,topic,us
0,1,1,1,1,2,1,3,1,1,1


## TF-IDF

In [4]:
# サンプルテキスト
sentences = [
    "Tell us about Materials Informatics. Materials Informatics is a hot topic in materials development."
]

# TF-IDFベクトル化
tfidf_vectorizer = TfidfVectorizer()

# モデルの適用
tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)

# 結果の表示
pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

Unnamed: 0,about,development,hot,in,informatics,is,materials,tell,topic,us
0,0.218218,0.218218,0.218218,0.218218,0.436436,0.218218,0.654654,0.218218,0.218218,0.218218


## Word2Vec

In [16]:
# サンプルテキスト
sentences = [
    "Tell us about Materials Informatics.",
    "Materials Informatics is a hot topic in materials development."
]

# 句読点を削除、小文字化をしてトークン化
tokenized_sentences = [
    re.sub(r"[^\w\s]", "", sentence).lower().split() for sentence in sentences
]

print("Tokenized Sentences:", tokenized_sentences)

# Word2Vecモデルの初期化と訓練
model = Word2Vec(tokenized_sentences, vector_size=100, window=5, min_count=1)

# 各単語のベクトルを取得し、データフレームに変換
word_vectors_list = []
word_list = []

for sentence in tokenized_sentences:
    for word in sentence:
        if word in model.wv:
            word_list.append(word)
            word_vectors_list.append(model.wv[word])  # 単語ごとのベクトルを取得

df_word_vectors = pd.DataFrame(word_vectors_list, index=word_list)
print("Word Vectors:")
display(df_word_vectors)  # 各単語のベクトル

# 各文のベクトルを計算
sentence_vectors = []
for sentence in tokenized_sentences:
    word_vectors = [model.wv[word] for word in sentence if word in model.wv.key_to_index]
    if word_vectors:
        sentence_vector = np.mean(word_vectors, axis=0)
        sentence_vectors.append(sentence_vector)
    else:
        sentence_vectors.append(np.zeros(model.vector_size))

# 文ベクトルをデータフレームに変換
df_sentence_vectors = pd.DataFrame(sentence_vectors)
print("Sentence Vectors:")
display(df_sentence_vectors)

# 文ベクトル全体の平均を計算
document_vector = np.mean(sentence_vectors, axis=0)

# 全体のベクトルをデータフレームに変換
df_document_vector = pd.DataFrame(document_vector.reshape(1, -1))
print("Document Vector:")
display(df_document_vector)

Tokenized Sentences: [['tell', 'us', 'about', 'materials', 'informatics'], ['materials', 'informatics', 'is', 'a', 'hot', 'topic', 'in', 'materials', 'development']]
Word Vectors:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
tell,0.007089,-0.001568,0.007947,-0.009489,-0.008029,-0.00664,-0.004003,0.004989,-0.003814,-0.00832,...,0.007512,0.001498,-0.001265,0.005768,-0.00564,3.9e-05,0.009457,-0.005481,0.003814,-0.008113
us,-0.005156,-0.006668,-0.007777,0.008311,-0.001982,-0.006855,-0.004154,0.005144,-0.002869,-0.00375,...,-0.008977,0.008592,0.004047,0.00747,0.009746,-0.00729,-0.00904,0.005836,0.009391,0.003507
about,-0.009579,0.008943,0.004165,0.009235,0.006644,0.002925,0.009804,-0.004425,-0.006803,0.004227,...,-0.005085,0.001131,0.002883,-0.001536,0.009932,0.00835,0.002416,0.007118,0.005891,-0.005581
materials,-0.000536,0.000236,0.005103,0.009009,-0.009303,-0.007117,0.006459,0.008973,-0.005015,-0.003763,...,0.001631,0.00019,0.003474,0.000218,0.009619,0.005061,-0.008917,-0.007042,0.000901,0.006393
informatics,-0.00862,0.003666,0.00519,0.005742,0.007467,-0.006168,0.001106,0.006047,-0.00284,-0.006174,...,0.001088,-0.001576,0.002197,-0.007882,-0.002717,0.002663,0.005347,-0.002392,-0.00951,0.004506
materials,-0.000536,0.000236,0.005103,0.009009,-0.009303,-0.007117,0.006459,0.008973,-0.005015,-0.003763,...,0.001631,0.00019,0.003474,0.000218,0.009619,0.005061,-0.008917,-0.007042,0.000901,0.006393
informatics,-0.00862,0.003666,0.00519,0.005742,0.007467,-0.006168,0.001106,0.006047,-0.00284,-0.006174,...,0.001088,-0.001576,0.002197,-0.007882,-0.002717,0.002663,0.005347,-0.002392,-0.00951,0.004506
is,0.008168,-0.004443,0.008985,0.008254,-0.004435,0.000303,0.004274,-0.003926,-0.00556,-0.006512,...,0.002058,-0.004004,-0.008241,0.006278,-0.001949,-0.000666,-0.001771,-0.004536,0.004062,-0.00427
a,-0.008727,0.00213,-0.000874,-0.009319,-0.009428,-0.001411,0.004432,0.003704,-0.006499,-0.006873,...,0.009071,0.008938,-0.008208,-0.003012,0.009887,0.005104,-0.001588,-0.008692,0.002962,-0.006676
hot,0.008132,-0.004457,-0.001068,0.001006,-0.000191,0.001148,0.006114,-2e-05,-0.003246,-0.001511,...,-0.002701,0.000444,-0.003537,-0.000419,-0.000709,0.000823,0.008195,-0.005737,-0.00166,0.005572


Sentence Vectors:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.00336,0.000922,0.002926,0.004562,-0.001041,-0.004771,0.001842,0.004146,-0.004268,-0.003556,...,-0.000766,0.001967,0.002267,0.000808,0.004188,0.001764,-0.000148,-0.000392,0.002098,0.000142
1,-0.001934,0.001221,0.000917,0.002013,-0.001022,-0.001253,0.003235,0.003937,-0.003838,-0.00252,...,0.000442,0.000456,0.000314,-0.001722,0.004893,0.001657,0.001146,-0.004173,0.000461,0.000262


Document Vector:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.002647,0.001071,0.001921,0.003287,-0.001032,-0.003012,0.002538,0.004042,-0.004053,-0.003038,...,-0.000162,0.001212,0.00129,-0.000457,0.004541,0.001711,0.000499,-0.002282,0.001279,0.000202


## Transformer 系列のモデルを用いたベクトル化

In [17]:
# モデルのロード
model = SentenceTransformer("all-MiniLM-L6-v2")

# サンプルテキスト
sentences = [
    "Tell us about Materials Informatics. Materials Informatics is a hot topic in materials development."
]

# 文のベクトル化
sentence_embeddings = model.encode(sentences)

pd.DataFrame(sentence_embeddings.reshape(1, -1))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.027799,-0.017065,-0.063819,0.038567,0.077383,-0.042008,0.076295,0.046873,-0.103452,0.060175,...,0.015203,0.080945,-0.069885,0.024362,0.05567,0.018277,0.060813,0.010531,0.026846,0.035051


## 実行環境のライブラリverを保存

In [18]:
# Pythonのverを確認
!python3 -V

Python 3.11.11


In [19]:
!pip freeze > requirements_text_vectorization.txt

In [20]:
from google.colab import files

files.download('requirements_text_vectorization.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>