## 데이터 읽어들이기

In [1]:
def _read_tokenized_ratings(path):
    """Read a tokenized ratings file into a list of ``[tokens, label]`` pairs.

    Each line must contain exactly three fields separated by the U+241E
    character; the first field is ignored, the second is a whitespace-separated
    token string and the third is the label (kept as a string).

    Args:
        path: Path to the UTF-8 encoded ratings file.

    Returns:
        A list of ``[list_of_tokens, label]`` entries, one per line.

    Raises:
        ValueError: If a line does not split into exactly three fields.
    """
    rows = []
    # The context manager closes the file even if a malformed line raises.
    with open(path, "r", encoding="utf-8") as ratings_file:
        for line in ratings_file:
            _, tokens, label = line.strip().split("\u241E")
            rows.append([tokens.split(), label])
    return rows


train_data = _read_tokenized_ratings("data/ratings_train_tokenized.txt")
test_data = _read_tokenized_ratings("data/ratings_test_tokenized.txt")

In [2]:
import pandas as pd
# Preview the first ten [tokens, label] rows of the training data as a table.
pd.DataFrame(train_data[:10])

Unnamed: 0,0,1
0,"[아, 더, 빙, ., ., 진짜, 짜증, 나, 네요, 목소리]",0
1,"[흠, ., .., 포스터, 보고, 초딩, 영화, 줄, ., ..., 오버, 연기,...",1
2,"[너무, 재, 밓었다그래서보는것을추천한다]",0
3,"[교도소, 이야기, 구먼, ., ., 솔직히, 재미, 는, 없, 다, ., ., 평...",0
4,"[사이몬페그, 의, 익살, 스런, 연기, 가, 돋보였, 던, 영화, !, 스파이더맨...",1
5,"[막, 걸음마, 뗀, 3, 세, 부터, 초등, 학교, 1, 학년, 생, 인, 8, ...",0
6,"[원작, 의, 긴장감, 을, 제대로, 살려, 내, 지, 못했, 다, .]",0
7,"[별, 반개, 도, 아깝, 다, 욕, 나온다, 이응경, 길용우, 연기, 생활, 이,...",0
8,"[액션, 이, 없, 는데, 도, 재미, 있, 는, 몇, 안, 되, 는, 영화]",1
9,"[왜, 케, 평점, 이, 낮, 은, 건데, ?, 꽤, 볼, 만한데, ., ., 헐리...",1


## 임베딩 읽어들이기

In [3]:
import numpy as np
from collections import defaultdict

def load_ft_embeddings(path):
    """Load word vectors from a text vector file and scale them to unit length.

    The first line of the file is skipped (treated as a header). Every other
    line is expected to look like ``word v1 v2 ... vn`` with single-space
    separators. Lines that carry no vector components (e.g. blank lines) are
    ignored.

    Args:
        path: Path to the UTF-8 encoded vector file.

    Returns:
        A ``defaultdict(list)`` mapping each word to a ``numpy.ndarray``
        divided by its Euclidean norm. An all-zero vector is stored unchanged
        rather than being turned into NaNs by a division by zero.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = defaultdict(list)
    # Stream the file line by line and make sure the handle is closed.
    with open(path, "r", encoding="utf-8") as vec_file:
        next(vec_file, None)  # skip the header line (no-op on an empty file)
        for line in vec_file:
            fields = line.strip().split(" ")
            if len(fields) < 2:
                # No vector components on this line; nothing to store.
                continue
            vec = np.array([float(el) for el in fields[1:]])
            norm = np.linalg.norm(vec)
            # Guard against 0/0, which would poison later sums with NaN.
            embeddings[fields[0]] = vec / norm if norm != 0 else vec
    return embeddings

In [4]:
# Load the word vectors once; get_sentence_vector reads this module-level mapping.
embeddings = load_ft_embeddings("embedding/fasttext.vec")

## word to sentence-level embedding

In [5]:
def get_sentence_vector(tokens, dim=100, embedding_table=None):
    """Build a unit-length sentence vector by averaging word vectors.

    Tokens that have no entry in the embedding table contribute nothing to the
    sum. The sum is divided by the total token count (when there is more than
    one token) and then scaled to unit length.

    Args:
        tokens: Sequence of token strings.
        dim: Dimensionality of the word vectors in the table.
        embedding_table: Optional mapping from token to vector. When omitted,
            the module-level ``embeddings`` mapping is used.

    Returns:
        A ``numpy.ndarray`` of shape ``(dim,)``: the normalized sentence vector,
        or all zeros when no token is found in the table.
    """
    table = embeddings if embedding_table is None else embedding_table
    vector = np.zeros(dim)
    for token in tokens:
        # .get() does a single lookup and never triggers a defaultdict factory.
        word_vec = table.get(token)
        if word_vec is not None:
            vector += word_vec
    if len(tokens) > 1:
        # Only rescales the sum; the normalization below fixes the final length.
        vector /= len(tokens)
    vector_norm = np.linalg.norm(vector)
    if vector_norm == 0:
        # No known tokens: return a zero vector instead of dividing by zero.
        return np.zeros(dim)
    return vector / vector_norm

## train

In [6]:
# "Training" just memorizes the data: one sentence vector per training
# example (stored as float16) plus the matching label at the same index.
train_labels = [label for _tokens, label in train_data]
train_table = np.zeros([len(train_data), 100], dtype=np.float16)
for row, (tokens, _label) in enumerate(train_data):
    train_table[row] = get_sentence_vector(tokens)

## predict

In [7]:
def predict(tokens):
    """Return the label of the training sentence that scores highest.

    The query sentence vector is compared against every row of
    ``train_table`` with a dot product, and the label stored at the
    best-scoring row index is returned.
    """
    query = get_sentence_vector(tokens)
    best_row = np.argmax(np.dot(train_table, query))
    return train_labels[best_row]

In [8]:
# Show the first test example to see what a [tokens, label] pair looks like.
print(test_data[0])

[['굳', 'ㅋ'], '1']


In [9]:
# Classify the tokens of the first test example.
predict(test_data[0][0])

'1'

In [10]:
def predict_by_batch(sentences, labels):
    """Score a batch of tokenized sentences against the training table.

    Args:
        sentences: Iterable of token lists, one per sentence.
        labels: Gold labels aligned with ``sentences``.

    Returns:
        A tuple ``(preds, eval_score)`` where ``preds`` holds, for each
        sentence, the index of the best-scoring row of ``train_table`` and
        ``eval_score`` is the number of sentences whose best row carries the
        gold label.
    """
    queries = np.array([get_sentence_vector(tokens) for tokens in sentences])
    # Columns of the score matrix correspond to the sentences in the batch.
    nearest_rows = np.argmax(np.dot(train_table, queries.T), axis=0)
    num_correct = sum(
        1 for row, gold in zip(nearest_rows, labels) if train_labels[row] == gold
    )
    return nearest_rows, num_correct

## evaluation

In [11]:
# Evaluate the test set in fixed-size batches and count correct predictions.
batch_size = 3000
data_size = len(test_data)
# Integer ceiling division: exact for any size, and an empty test set gives
# zero batches instead of a single empty batch being sent to predict_by_batch.
num_batches = (data_size + batch_size - 1) // batch_size
eval_score = 0
for batch_num in range(num_batches):
    start_index = batch_num * batch_size
    # Slicing past the end is safe, so the last batch is simply shorter.
    features = test_data[start_index:start_index + batch_size]
    batch_tokenized_sentences = [tokens for tokens, _label in features]
    batch_labels = [label for _tokens, label in features]
    preds, curr_eval_score = predict_by_batch(batch_tokenized_sentences, batch_labels)
    eval_score += curr_eval_score

In [12]:
# Report the number of correct predictions, the test-set size and the accuracy.
print("# of correct:", eval_score, ", total:", len(test_data), ", score:", eval_score / len(test_data))

# of correct: 35533 , total: 49997 , score: 0.7107026421585295
