In [1]:
import torch
import torch.nn
import gensim

In [2]:
# Toy vocabulary: each word maps to the integer index later used to look it up
# in the embedding layer. Indices follow the order of the list, starting at 0.
vocab = {
    word: index
    for index, word in enumerate(
        ['청년', 'AI', '한국', '북한', '인공지능', '대한민국', '실습']
    )
}

In [3]:
# Load a previously saved gensim Word2Vec model; the relative path means the
# file 'newskor.model' must be present in the current working directory.
model = gensim.models.Word2Vec.load('newskor.model')

In [4]:
## Show the pretrained vector of a word registered in vocab ##
model.wv['청년']

array([-0.29592422,  0.20896403,  0.35601947, -0.27826557, -0.14623529,
       -0.15336423,  0.39505485, -0.23473871,  0.36711314,  0.61967134,
       -0.0710306 , -0.16104251, -0.12217963,  0.09757436,  0.06636167,
        0.04816947,  0.03682688,  0.05979806,  0.21835425, -0.19626673,
        0.13421342,  0.3241241 , -0.04354725,  0.05034014,  0.10816982,
        0.00693497,  0.1521896 , -0.02264175,  0.04431422,  0.03903328,
       -0.03925376,  0.37178087, -0.04169319, -0.1605893 , -0.08643936,
       -0.06168719,  0.25112796,  0.41481146, -0.05257788,  0.3159834 ,
       -0.23209138,  0.20746532,  0.02771806,  0.1175475 , -0.01354244,
       -0.05445127, -0.36227027, -0.13261822,  0.06570031,  0.1968582 ,
        0.06873151,  0.03088307, -0.09757509, -0.0280185 , -0.02685368,
       -0.0512948 ,  0.13486277,  0.09436584,  0.02388141, -0.02398871,
       -0.00239473,  0.11502101, -0.11450242, -0.07380942,  0.04474166,
       -0.04530367,  0.00131406, -0.0600784 ,  0.42133868, -0.25

In [5]:
## previous version ##
#dim=10
#emb_mtx = torch.nn.Embedding(len(vocab), dim)

## new version ##
#dim=model.wv.vector_size 
#weights=[[weight for 청년], [weight for AI], [weight for 한국], ...]  # one row per vocab word, in index order
#emb_mtx = torch.nn.Embedding.from_pretrained(weights)


In [6]:
def get_weights(vocab, embd):
    """Build the pretrained embedding weight matrix for ``vocab``.

    Row ``i`` of the result holds the vector of the word whose vocabulary
    index is ``i``, so the matrix can be handed directly to
    ``torch.nn.Embedding.from_pretrained``.

    Args:
        vocab: Mapping from word to integer index. Indices are expected to be
            the contiguous range ``0 .. len(vocab) - 1``; the insertion order
            of the mapping does not matter.
        embd: Object exposing ``embd.wv[word]`` that returns the word's vector
            (e.g. a loaded gensim ``Word2Vec`` model).

    Returns:
        A tensor with one row per vocabulary word, ordered by index. An empty
        ``vocab`` yields an empty tensor.

    Raises:
        Whatever ``embd.wv[word]`` raises for a word it does not know
        (presumably ``KeyError`` for gensim models -- verify for your version).
    """
    # Order rows by vocabulary index, not by dict insertion order: the
    # embedding layer is addressed by index, so row i must belong to index i.
    ordered_words = sorted(vocab, key=vocab.get)

    # Convert each vector on its own and stack them; calling torch.tensor() on
    # a Python list of numpy arrays is slow and triggers a PyTorch warning.
    rows = [torch.tensor(embd.wv[word]) for word in ordered_words]
    if not rows:
        # torch.stack rejects an empty list; keep the empty-tensor result.
        return torch.tensor([])
    return torch.stack(rows)

In [7]:
# Embedding width reported by the loaded Word2Vec model.
dim = model.wv.vector_size
# Pretrained vectors for every word in vocab, one row per word.
weights = get_weights(vocab, model)
# freeze=True is from_pretrained's default, spelled out here: the lookup table
# is created with its weights excluded from gradient updates.
emb_mtx = torch.nn.Embedding.from_pretrained(weights, freeze=True)

In [8]:
## Practice 1 ##

# Look up the word embedding of '청년' through the embedding layer.
idx = torch.tensor([vocab['청년']], dtype=torch.long) # one-element int64 index tensor for '청년'
print(idx)
emb_mtx(idx) # last expression of the cell, so the looked-up row is displayed

tensor([0])


tensor([[-0.2959,  0.2090,  0.3560, -0.2783, -0.1462, -0.1534,  0.3951, -0.2347,
          0.3671,  0.6197, -0.0710, -0.1610, -0.1222,  0.0976,  0.0664,  0.0482,
          0.0368,  0.0598,  0.2184, -0.1963,  0.1342,  0.3241, -0.0435,  0.0503,
          0.1082,  0.0069,  0.1522, -0.0226,  0.0443,  0.0390, -0.0393,  0.3718,
         -0.0417, -0.1606, -0.0864, -0.0617,  0.2511,  0.4148, -0.0526,  0.3160,
         -0.2321,  0.2075,  0.0277,  0.1175, -0.0135, -0.0545, -0.3623, -0.1326,
          0.0657,  0.1969,  0.0687,  0.0309, -0.0976, -0.0280, -0.0269, -0.0513,
          0.1349,  0.0944,  0.0239, -0.0240, -0.0024,  0.1150, -0.1145, -0.0738,
          0.0447, -0.0453,  0.0013, -0.0601,  0.4213, -0.2519, -0.3670, -0.1256,
         -0.0362, -0.2166, -0.1241, -0.3782, -0.2877, -0.3046, -0.5067, -0.2351,
          0.0524, -0.3830, -0.0945, -0.0414, -0.1875, -0.1994,  0.4680, -0.2031,
          0.4220, -0.1267,  0.1588,  0.2103,  0.0156,  0.0248, -0.0025, -0.2248,
         -0.1427,  0.4242, -