In [None]:
# Troubleshooting: if this notebook raises errors, run the following first:
#   conda activate NLP_py38
#   cde data download
#
# The first step is to collect data. Here a single example sentence is used to
# demonstrate the simplest text preprocessing that happens before model training.
#
# One of the key concepts in NLP is tokenization, i.e. splitting sentences into
# words. This cell and the following ones show how this is done with gensim and
# with a toolkit that I developed (alloy2vec).

from alloy2vec.processing import MaterialsTextProcessor

example_sentence = "New York University is one of the best universities in the world."

text_processor = MaterialsTextProcessor()
text_processor.process(example_sentence)

In [None]:
# Tokenize the same sentence directly with gensim's tokenizer.

from gensim.utils import tokenize

example_sentence = "New York University is one of the best universities in the world."
[token for token in tokenize(example_sentence)]

In [None]:
# Stop words such as "is", "one", "the" and "of" are not really useful when they
# are included in the training dataset, so remove them from the sentence.
# (strip_punctuation is imported here as well; it is used in the next cell.)
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation

example_sentence = "New York University is one of the best universities in the world."
remove_stopwords(example_sentence)

In [None]:
# Similarly, punctuation marks such as "," and "." are not important to include.
# Strip them from the sentence that already had its stop words removed.

example_sentence = "New York University is one of the best universities in the world."
words_no_stopwords = remove_stopwords(example_sentence)
strip_punctuation(words_no_stopwords)

In [None]:
# Look at the tokenized words again, now that the text has been preprocessed
# (stop words removed, punctuation stripped).

words_no_stopwords_punctuation = strip_punctuation(words_no_stopwords)
text_processor.process(words_no_stopwords_punctuation)

In [None]:
# The time-consuming training step is skipped here; instead, load a model that
# I trained earlier. Loading may take a while because the model file is large.
# NOTE: gensim model files are pickle-based, so only load files from a source
# you trust.
from gensim.models import Word2Vec

model_path = "alloy2vec/training/models/model_121520"
w2v_model = Word2Vec.load(model_path)

In [None]:
# Inspect the one-dimensional vector stored for a word, and its length
# (the dimensionality of the embedding).

word = "excellent"
word_vector = w2v_model.wv.get_vector(word)
print(f"word vector of {word} :", word_vector)
print("dimension size:", len(word_vector))

In [None]:
# A simple example: list the words most similar to "excellent".
# Try different words if you like. If the word is not in the model's
# vocabulary the lookup fails (gensim raises a KeyError); in that case,
# try a different word.
word = "excellent"
similar_words = w2v_model.wv.most_similar(word)
similar_words

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Plot a horizontal bar chart of the words most similar to `word` (defined in
# an earlier cell), ranked by cosine similarity.

# Number of nearest neighbours to retrieve and plot; raise it (e.g. to 200)
# for a longer ranking.
top_n = 20

a = w2v_model.wv.most_similar(word, topn=top_n)
print(a)

# Iterate over what was actually returned instead of a hard-coded count of 20:
# the result can be shorter than `top_n` (small vocabulary), and changing
# `top_n` must change the plot accordingly.
word_candidates = [candidate for candidate, _ in a]
cosine_similarity = [score for _, score in a]

word_num = np.arange(len(word_candidates))
fig, ax = plt.subplots()
ax.barh(word_num, cosine_similarity, color='blue', align='center')
ax.set_yticks(word_num)
ax.set_yticklabels(word_candidates)
ax.invert_yaxis()  # labels read top-to-bottom, most similar word first
# Fixed x-window; bars for similarities outside 0.4-0.95 are not fully shown.
ax.set_xlim((0.4, 0.95))
ax.set_xlabel('cosine similarity')
ax.set_title('Ranking of cosine similarity for "' + word + '"')

In [None]:
# Try a chemical concept: the full names of chemical elements.
# The query asks for the single word that relates to "Fe" the way
# "magnesium" relates to "Mg" (vector arithmetic: magnesium - Mg + Fe).

positive_terms = ["magnesium", "Fe"]
negative_terms = ["Mg"]
w2v_model.wv.most_similar(positive=positive_terms, negative=negative_terms, topn=1)