In [8]:
import torch
import torch.nn
import gensim

In [20]:
# Word -> row index of the embedding matrix.
# A word's position in the list below is its index (0-based).
vocab = {
    word: position
    for position, word in enumerate(
        ['청년', 'AI', '한국', '북한', '인공지능', '대한민국', '실습']
    )
}

In [9]:
# Load the saved Word2Vec model from 'newskor.model' (a path relative to the
# notebook's working directory). Later cells read word vectors through model.wv.
model = gensim.models.Word2Vec.load('newskor.model')

In [16]:
## Show the pretrained vector of a word registered in vocab ##
model.wv['청년']

array([ 0.4144971 , -0.04933349, -0.13524283, -0.12233347,  0.2630673 ,
        0.05872554,  0.12592898,  0.6087635 , -0.0278977 ,  0.02563845,
       -0.17880718,  0.44012332, -0.07183573,  0.46530873, -0.10077506,
       -0.16226067,  0.22495694, -0.215634  , -0.28365844,  0.15576017,
        0.35248455, -0.44805998,  0.60364306, -0.07989203,  0.2050995 ,
       -0.13987018,  0.24449357,  0.2418413 , -0.12705767,  0.05245615,
        0.10481708, -0.0792683 ,  0.24522097, -0.10463832,  0.02946578,
       -0.04422118, -0.16302438, -0.39533076, -0.15083945,  0.4125302 ,
        0.09905284, -0.25825524,  0.14984927, -0.45641854, -0.1071222 ,
        0.15623213,  0.18478355,  0.09599242,  0.01177217, -0.01427261,
        0.10405446, -0.34556302, -0.50897235,  0.05642816, -0.17298825,
        0.18189093, -0.21719372, -0.10462724, -0.20299925, -0.10222501,
       -0.27638334,  0.26093516,  0.073791  ,  0.23263739,  0.35804176,
        0.12325641,  0.05927807,  0.0374659 ,  0.14060229, -0.14

In [19]:
## previous version ##
#dim=10
#emb_mtx = torch.nn.Embedding(len(vocab), dim)

## new version ##
#dim=model.wv.vector_size 
#weights=[[weight for 청년], [weight for AI], [weight for 한국], ...]
#emb_mtx = torch.nn.Embedding.from_pretrained(weights)


In [27]:
def get_weights(vocab, embd):
    """Build the embedding weight matrix for ``vocab`` from a word-vector model.

    Row ``i`` of the result holds the vector of the word whose vocab index is
    ``i``, so the matrix can be indexed directly with the values stored in
    ``vocab`` regardless of the order in which the words were inserted.

    Args:
        vocab: Mapping from word to its non-negative integer row index.
        embd: Model whose ``wv`` attribute returns a 1-D vector for
            ``wv[word]``.

    Returns:
        A 2-D ``torch.Tensor`` with ``max(vocab.values()) + 1`` rows and one
        column per vector component. Rows whose index is not assigned to any
        word are left as zeros. An empty ``vocab`` yields an empty tensor.

    Raises:
        ValueError: If ``vocab`` contains a negative index.

    Any lookup error raised by ``embd.wv`` for an unknown word propagates to
    the caller.
    """
    if not vocab:
        return torch.tensor([])
    if min(vocab.values()) < 0:
        raise ValueError("vocab indices must be non-negative")

    n_rows = max(vocab.values()) + 1
    weights = None
    for word, idx in vocab.items():
        # Convert one vector at a time: torch.tensor on a list of numpy arrays
        # takes a slow path and warns, and torch.tensor copies the data, so the
        # model's own storage is never aliased.
        vec = torch.tensor(embd.wv[word])
        if weights is None:
            # Width and dtype are taken from the first vector seen.
            weights = torch.zeros((n_rows, vec.shape[0]), dtype=vec.dtype)
        # Place the vector at the word's own index, not at its iteration position.
        weights[idx] = vec
    return weights

In [29]:
# Embedding width, read from the loaded word2vec model.
# NOTE(review): dim is not used by the cells visible here.
dim=model.wv.vector_size
# One pretrained vector per vocab entry, as returned by get_weights.
weights = get_weights(vocab, model)
# Build the embedding layer directly from those vectors. No `freeze` argument is
# passed, so from_pretrained's library default applies (frozen weights per the
# PyTorch docs -- confirm for the installed version).
emb_mtx = torch.nn.Embedding.from_pretrained(weights)

In [30]:
## Practice 1 ##

# Look up the word embedding of '청년'.
# The layer is indexed by integer id, so the word's vocab index is wrapped in a
# one-element long tensor before being passed to the embedding layer.
idx = torch.tensor([vocab['청년']], dtype=torch.long) # make index tensor for '청년'
print(idx)
emb_mtx(idx) # feed index tensor to emb_mtx

tensor([0])


tensor([[ 0.4145, -0.0493, -0.1352, -0.1223,  0.2631,  0.0587,  0.1259,  0.6088,
         -0.0279,  0.0256, -0.1788,  0.4401, -0.0718,  0.4653, -0.1008, -0.1623,
          0.2250, -0.2156, -0.2837,  0.1558,  0.3525, -0.4481,  0.6036, -0.0799,
          0.2051, -0.1399,  0.2445,  0.2418, -0.1271,  0.0525,  0.1048, -0.0793,
          0.2452, -0.1046,  0.0295, -0.0442, -0.1630, -0.3953, -0.1508,  0.4125,
          0.0991, -0.2583,  0.1498, -0.4564, -0.1071,  0.1562,  0.1848,  0.0960,
          0.0118, -0.0143,  0.1041, -0.3456, -0.5090,  0.0564, -0.1730,  0.1819,
         -0.2172, -0.1046, -0.2030, -0.1022, -0.2764,  0.2609,  0.0738,  0.2326,
          0.3580,  0.1233,  0.0593,  0.0375,  0.1406, -0.1437,  0.2593, -0.1408,
          0.0720, -0.0961, -0.2975, -0.1195, -0.1858,  0.0982, -0.0484, -0.1080,
          0.1188, -0.1626, -0.2228,  0.1169, -0.3478, -0.2076,  0.1804, -0.0265,
          0.1640, -0.0711,  0.1947, -0.3146,  0.2675, -0.1954,  0.3326,  0.5606,
          0.0210, -0.0124, -