In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# サンプルテキスト
sentences = ['Tell us about Materials Informatics. Materials Informatics is a hot topic in materials development.']

# Bag of Wordsモデルの初期化
vectorizer = CountVectorizer()

# モデルの適用
bag_of_words = vectorizer.fit_transform(sentences)

# 結果の表示
print(bag_of_words.toarray())
print(vectorizer.get_feature_names_out())


[[1 1 1 1 2 1 3 1 1 1]]
['about' 'development' 'hot' 'in' 'informatics' 'is' 'materials' 'tell'
 'topic' 'us']


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# サンプルテキスト
sentences = ['Tell us about Materials Informatics.']

# TF-IDFベクトル化
tfidf_vectorizer = TfidfVectorizer()

# モデルの適用
tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)

# 結果の表示
print(tfidf_matrix.toarray())
print(tfidf_vectorizer.get_feature_names_out())


[[0.4472136 0.4472136 0.4472136 0.4472136 0.4472136]]
['about' 'informatics' 'materials' 'tell' 'us']


In [6]:
from gensim.models import Word2Vec
import numpy as np

# サンプルテキスト
sentences = ['Tell us about Materials Informatics.']

# Word2Vecモデルの初期化と訓練
model = Word2Vec(sentences,
                 vector_size=100,
                 window=5,
                 min_count=1
                 )

# 各文のベクトルを表示
for sentence in sentences:
    word_vectors = [model.wv[word] for word in sentence if word in model.wv.key_to_index]
    sentence_vector = np.mean(word_vectors, axis=0)
    print(sentence_vector)

[-2.1797905e-03  1.6589301e-03  5.8459840e-04  1.5391984e-03
  1.0815222e-03 -1.7936298e-03  1.8282192e-03  3.8006173e-03
 -3.0035668e-03 -2.2738699e-03  7.6256419e-04 -2.1877417e-03
 -9.7897416e-04  8.5941469e-04  8.8217575e-04 -3.1100694e-04
  2.4284152e-03  1.4040241e-03 -1.9482790e-03 -4.1181641e-03
  9.4437721e-04 -3.0209278e-04  4.2257141e-03 -9.1968040e-04
  4.3061169e-04  5.4578285e-04 -9.4450224e-04  1.9187866e-03
 -1.8381564e-03  7.7972055e-04  2.2008857e-03 -1.5116888e-03
  3.9013225e-04 -2.3712947e-03 -3.7220033e-04  1.3211550e-03
  1.8836082e-03  7.2593696e-04 -2.7831475e-04  2.0262942e-04
  1.5620745e-03 -1.2833839e-03 -2.3860077e-03  1.2332475e-03
  1.4626342e-03  6.1335752e-04 -1.0499618e-03 -2.0808007e-05
  1.0347271e-03  6.2614836e-04  5.5599154e-04 -1.4212636e-03
 -2.3895453e-04 -9.2512078e-04 -1.2359156e-03 -8.4326550e-04
  8.2906982e-04 -1.0891865e-03 -1.3063211e-04  6.3340802e-04
 -4.3797807e-04 -3.7227236e-04  2.7589414e-03 -5.9280137e-04
 -9.6611917e-04  2.36788

In [5]:
from sentence_transformers import SentenceTransformer
	
# モデルのロード
model = SentenceTransformer('all-MiniLM-L6-v2')
	
# サンプルテキスト
sentences = ['Tell us about Materials Informatics.']
	
# 文のベクトル化
sentence_embeddings = model.encode(sentences)
	
# 各文のベクトルを表示
for sentence, embedding in zip(sentences, sentence_embeddings):
    		print(embedding)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[-4.50960137e-02  2.00210344e-02 -3.21245678e-02  1.43184909e-03
  5.14301360e-02 -4.56097722e-02  8.13956261e-02  8.27503577e-02
 -5.13389073e-02  6.37632832e-02 -4.62777466e-02 -2.34087091e-03
  3.67399827e-02  4.95042764e-02 -1.24838832e-03 -2.00663935e-02
 -1.96949448e-02 -2.27476582e-02 -4.75356309e-03 -2.34705824e-02
  7.38689229e-02  5.44358268e-02  7.20300078e-02 -7.54760997e-03
 -5.89695275e-02  5.02607301e-02 -1.13075245e-02 -8.10740963e-02
  9.23936516e-02 -9.65571776e-02 -8.83917417e-03  7.72606656e-02
  7.14210123e-02  2.14256197e-02 -4.08767723e-02  5.71301728e-02
  4.04598936e-02 -7.83321336e-02 -3.99369299e-02  6.64898055e-03
 -3.70827913e-02 -4.37721685e-02 -1.57778561e-02 -1.45155657e-02
  5.93632124e-02  7.55614310e-04 -3.83709325e-03 -3.87434736e-02
 -4.82256226e-02  1.59138869e-02 -3.12349647e-02 -4.45192903e-02
 -4.26934706e-03 -2.01791860e-02 -1.60550475e-02 -1.85665721e-03
  8.55633244e-02  1.68599300e-02 -2.76838243e-02 -3.35196070e-02
  7.81847164e-02 -5.52143