In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# サンプルテキスト
sentences = ['Tell us about Materials Informatics. Materials Informatics is a hot topic in materials development.']

# Bag of Wordsモデルの初期化
vectorizer = CountVectorizer()

# モデルの適用
bag_of_words = vectorizer.fit_transform(sentences)

# 結果の表示
print(bag_of_words.toarray())
print(vectorizer.get_feature_names_out())


[[1 1 1 1 2 1 3 1 1 1]]
['about' 'development' 'hot' 'in' 'informatics' 'is' 'materials' 'tell'
 'topic' 'us']


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# サンプルテキスト
sentences = ['Tell us about Materials Informatics. Materials Informatics is a hot topic in materials development.']

# TF-IDFベクトル化
tfidf_vectorizer = TfidfVectorizer()

# モデルの適用
tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)

# 結果の表示
print(tfidf_matrix.toarray())
print(tfidf_vectorizer.get_feature_names_out())


[[0.21821789 0.21821789 0.21821789 0.21821789 0.43643578 0.21821789
  0.65465367 0.21821789 0.21821789 0.21821789]]
['about' 'development' 'hot' 'in' 'informatics' 'is' 'materials' 'tell'
 'topic' 'us']


In [3]:
from gensim.models import Word2Vec
import numpy as np

# サンプルテキスト
sentences = ['Tell us about Materials Informatics. Materials Informatics is a hot topic in materials development.']

# Word2Vecモデルの初期化と訓練
model = Word2Vec(sentences,
                 vector_size=100,
                 window=5,
                 min_count=1
                 )

# 各文のベクトルを表示
for sentence in sentences:
    word_vectors = [model.wv[word] for word in sentence if word in model.wv.key_to_index]
    sentence_vector = np.mean(word_vectors, axis=0)
    print(sentence_vector)

[-1.9191124e-03  1.9315518e-03  1.1100147e-03  1.9077945e-03
  2.0610406e-04 -1.6489318e-03  2.0105324e-03  3.8164491e-03
 -3.2290018e-03 -2.4694263e-03  1.2226918e-03 -2.0307829e-03
 -4.2606032e-04  7.8102050e-04  8.7051431e-04 -6.7774393e-04
  2.4758417e-03  1.2640398e-03 -2.8001196e-03 -4.4544316e-03
  5.4592121e-04 -1.2628229e-04  4.3689697e-03 -7.6193997e-04
  1.8404066e-04  4.9722468e-04 -6.1112660e-04  1.7580837e-03
 -2.0144626e-03  1.0807632e-03  1.7223611e-03 -1.9228380e-03
  5.7941774e-04 -2.7148058e-03 -5.5578613e-04  1.6052041e-03
  2.1559410e-03  2.1569419e-04 -6.8414293e-04  3.9055398e-05
  1.3411512e-03 -1.0500193e-03 -2.9219005e-03  1.1177887e-03
  1.3583481e-03  3.0102051e-04 -1.2438818e-03  9.8189405e-05
  6.7189569e-04  8.4854179e-04  3.1484160e-04 -1.6922524e-03
 -7.4712548e-04 -8.5977791e-04 -1.5678286e-03 -7.9915370e-04
  9.8279596e-04 -1.4932604e-03 -7.0057664e-04  4.4498296e-04
 -3.5472677e-04 -5.8648444e-04  2.6262235e-03 -1.0158815e-03
 -1.1605487e-03  2.81006

In [5]:
from sentence_transformers import SentenceTransformer
	
# モデルのロード
model = SentenceTransformer('all-MiniLM-L6-v2')
	
# サンプルテキスト
sentences = ['Tell us about Materials Informatics. Materials Informatics is a hot topic in materials development.']
	
# 文のベクトル化
sentence_embeddings = model.encode(sentences)
	
# 各文のベクトルを表示
for sentence, embedding in zip(sentences, sentence_embeddings):
    		print(embedding)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[-2.77985651e-02 -1.70654710e-02 -6.38188645e-02  3.85672487e-02
  7.73833692e-02 -4.20080461e-02  7.62950554e-02  4.68733460e-02
 -1.03451863e-01  6.01753891e-02 -7.78516904e-02 -2.09038332e-02
  4.35080342e-02  5.66060878e-02  4.98813093e-02 -2.07026657e-02
 -3.43178287e-02 -1.88807715e-02 -1.64197050e-02  1.92649420e-02
  5.45002222e-02  7.95379281e-02  5.52619658e-02 -1.50733581e-02
 -6.04570694e-02  1.98761784e-02  4.14988436e-02 -5.20930961e-02
  4.37159874e-02 -3.45854387e-02 -9.80974361e-03  1.07011430e-01
  2.96820123e-02  3.27014402e-02 -4.87940758e-02  3.86687033e-02
 -4.48148325e-02 -8.44214335e-02 -5.33419959e-02  1.20233069e-03
 -4.30473797e-02 -1.31969787e-02 -1.59288161e-02 -1.99758243e-02
  1.22480551e-02 -1.47013497e-02  3.94559640e-04 -6.70878589e-02
 -5.92520982e-02 -2.08120663e-02 -2.01453120e-02 -9.35207084e-02
  2.62778159e-02 -3.16250138e-02 -6.86391965e-02  1.55890267e-02
  8.63396078e-02 -1.79380050e-03 -5.81585243e-03 -2.82000806e-02
  5.26186191e-02 -1.01309