## 09-11 문서 벡터를 이용한 추천 시스템
(Recommendation System using Document Embedding)

#### 1. 데이터 로드

In [18]:
# 데이터 다운로드 링크 : https://drive.google.com/file/d/15Q7DZ7xrJsI2Hji-WbkU9j1mwnODBd5A/view?usp=sharing

import urllib.request
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import re
import gzip
from PIL import Image
from io import BytesIO
from nltk.tokenize import RegexpTokenizer
import nltk
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Download the dataset, load it into a DataFrame, and print the total number of documents.
# The CSV is read from the same path it was downloaded to, so the cell works on a
# fresh checkout without relying on a separately prepared local file.
csv_path = "data.csv"
urllib.request.urlretrieve("https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/09.%20Word%20Embedding/dataset/data.csv", filename=csv_path)
df = pd.read_csv(csv_path)
print('전체 문서의 수 :',len(df))

전체 문서의 수 : 2382


In [3]:
# Show only the first five rows of the DataFrame.
df.head()

Unnamed: 0.2,Unnamed: 0.1,Desc,Unnamed: 0,author,genre,image_link,rating,title
0,0,We know that power is shifting: From West to E...,0.0,Moisés Naím,Business,https://i.gr-assets.com/images/S/compressed.ph...,3.63,The End of Power: From Boardrooms to Battlefie...
1,1,Following the success of The Accidental Billio...,1.0,Blake J. Harris,Business,https://i.gr-assets.com/images/S/compressed.ph...,3.94,"Console Wars: Sega, Nintendo, and the Battle t..."
2,2,How to tap the power of social software and ne...,2.0,Chris Brogan,Business,https://i.gr-assets.com/images/S/compressed.ph...,3.78,Trust Agents: Using the Web to Build Influence...
3,3,William J. Bernstein is an American financial ...,3.0,William J. Bernstein,Business,https://i.gr-assets.com/images/S/compressed.ph...,4.2,The Four Pillars of Investing
4,4,Amazing book. And I joined Steve Jobs and many...,4.0,Akio Morita,Business,https://i.gr-assets.com/images/S/compressed.ph...,4.05,Made in Japan: Akio Morita and Sony


In [4]:
# 줄거리에 해당하는 열인 'Desc 열'이 중요
# 해당 열에 있는 데이터에 대해서 Word2Vec을 학습하기 위해 전처리를 진행
# 해당 열에 대해서 전처리를 수행하고 'cleaned'라는 열에 저장
def _removeNonAscii(s):
    return "".join(i for i in s if ord(i)<128)

def make_lower_case(text):
    """Return a lower-cased copy of *text*."""
    lowered = text.lower()
    return lowered

def remove_stop_words(text, stops=None):
    """Return *text* with stop words removed.

    The text is split on whitespace, tokens found in *stops* are dropped, and
    the remaining tokens are joined back with single spaces.

    Args:
        text: The string to filter. Matching is exact, so callers that want
            case-insensitive filtering should lower-case the text first.
        stops: Optional collection of words to drop. When omitted, the NLTK
            English stop-word list is used.

    Returns:
        The filtered text as a single space-separated string.
    """
    if stops is None:
        # Build the default stop-word set only once and reuse it on later
        # calls; this function is applied once per row of the DataFrame.
        stops = getattr(remove_stop_words, "_english_stops", None)
        if stops is None:
            stops = frozenset(stopwords.words("english"))
            remove_stop_words._english_stops = stops
    return " ".join(w for w in text.split() if w not in stops)

def remove_html(text):
    """Return *text* with every '<...>' span (shortest match) removed."""
    return re.sub('<.*?>', r'', text)

def remove_punctuation(text):
    """Return only the alphabetic words of *text*, joined by single spaces.

    Every maximal run of ASCII letters is kept as a word; digits, punctuation
    and whitespace act as separators and are discarded.

    Args:
        text: The string to clean.

    Returns:
        The letter-only words separated by single spaces ('' if there are none).
    """
    # A plain regex match yields the same tokens as a non-gap RegexpTokenizer
    # without constructing a tokenizer object on every call.
    words = re.findall(r'[a-zA-Z]+', text)
    return " ".join(words)


# Build the 'cleaned' column from 'Desc' by applying the preprocessing steps in order.
# HTML tags are stripped before stop-word and punctuation removal: the punctuation
# step keeps only runs of letters, so if it ran first it would delete '<' and '>'
# and leave the tag names behind as ordinary words, making remove_html a no-op.
df['cleaned'] = df['Desc'].apply(_removeNonAscii)
df['cleaned'] = df.cleaned.apply(make_lower_case)
df['cleaned'] = df.cleaned.apply(remove_html)
df['cleaned'] = df.cleaned.apply(remove_stop_words)
df['cleaned'] = df.cleaned.apply(remove_punctuation)

In [5]:
# Show only the first five entries of the 'cleaned' column.
df['cleaned'].head()

0    know power shifting west east north south pres...
1    following success accidental billionaires mone...
2    tap power social software networks build busin...
3    william j bernstein american financial theoris...
4    amazing book joined steve jobs many akio morit...
Name: cleaned, dtype: object

In [6]:
# Rows whose cleaned text ended up empty are converted to NaN and then dropped.
# The replaced column is assigned back explicitly: calling replace(..., inplace=True)
# on a column selection is not guaranteed to write through to the DataFrame.
df['cleaned'] = df['cleaned'].replace('', np.nan)
df = df[df['cleaned'].notna()]
print('전체 문서의 수 :',len(df))

전체 문서의 수 : 2381


In [7]:
# Tokenize each cleaned document on whitespace and collect the token lists in `corpus`.
# `corpus` is the training input for Word2Vec.
corpus = [document.split() for document in df['cleaned']]

#### 2. 사전 훈련된 워드 임베딩 사용하기

- 사전 훈련된 Word2Vec을 로드하고 초기 단어 벡터값으로 사용

In [36]:
# NOTE(review): every line of this cell is commented out, so nothing here runs.
# Before re-enabling it, confirm two things — TODO confirm:
#   * that `workers = -1` is a value gensim's Word2Vec accepts (presumably a
#     positive thread count is expected);
#   * that the path given to intersect_word2vec_format matches where the .bin
#     file lives (the open() call below uses './data/...', the loader does not).
# # urllib.request.urlretrieve("https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz", \
# #                            filename="GoogleNews-vectors-negative300.bin.gz")

# f = open('./data/GoogleNews-vectors-negative300.bin','rb')
# data = f.read()    # bytes

# word2vec_model = Word2Vec(vector_size = 300, window=5, min_count = 2, workers = -1)
# word2vec_model.build_vocab(corpus)
# word2vec_model.wv.intersect_word2vec_format('GoogleNews-vectors-negative300.bin', lockf=1.0, binary=True)
# word2vec_model.train(corpus, total_examples = word2vec_model.corpus_count, epochs = 15)