## 문서 벡터를 이용한 추천 시스템(Recommendation System using Document Embedding)

## Library Import

In [1]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=False)

Mounted at /content/gdrive


In [120]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from nltk.tokenize import word_tokenize, sent_tokenize
import copy
import pickle
from collections import OrderedDict
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

전체 리뷰 데이터의 string형태를 토큰화된 corpus로 전처리 하는 과정

(1회만 수행 후 결과를 pickle로 저장하고 그 후에는 해당 경로를 인자로 입력하여 호출)

In [34]:
# df = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/deeplearning_NLP/perfume/word embedding_hyun/data/dataset_210626_215600.csv')
# df.drop('Unnamed: 0', axis=1, inplace=True)
# sent_text = df['lemmatizated']
# # 원본 데이터의 str부분을 전처리
# nltk.download('punkt')
# normalized_text = []
# for string in sent_text:
#     tokens = re.sub(r"[^a-z0-9]+", " ", string.lower())
#     tokens = ' '.join([w for w in tokens.split() if len(w)>=3])
#     normalized_text.append(tokens)
# result = []
# result = [word_tokenize(sentence) for sentence in normalized_text]

# corpus_dir = '/content/gdrive/MyDrive/Colab Notebooks/deeplearning_NLP/perfume/word embedding_hyun/data/tokenized_doc.pickle'

# # save
# with open(corpus_dir, 'wb') as f:
#     pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)

### 1. 사용자 입력 문장을 토큰화하고 전체 리뷰 데이터에 추가

In [36]:
def sentence_preprocessing(tokenized_doc_path, user_sentence):
  '''토큰화된 전체 리뷰 불러오기'''
  with open(tokenized_doc_path, 'rb') as f:
    result = pickle.load(f)

  '''사용자 입력 문장 전처리'''
  '''규원님 전처리 라이브러리 사용하는 방식으로 코드 수정해야함'''
  user_sentence = user_sentence.replace("[^a-zA-Z]", " ")
  # 길이가 3이하인 단어는 제거 (길이가 짧은 단어 제거)
  user_sentence = ' '.join([w for w in user_sentence.split() if len(w)>=3])
  # 전체 단어에 대한 소문자 변환
  user_sentence = user_sentence.lower()
  nltk.download('stopwords')
  # NLTK로부터 불용어 로드
  stop_words = stopwords.words('english') 
  tokenized_doc = user_sentence.split() # 토큰화
  tokenized_doc = [item for item in tokenized_doc if item not in stop_words] # 불용어 제거
  nltk.download('wordnet')
  n = WordNetLemmatizer()
  tokenized_doc = [n.lemmatize(item) for item in tokenized_doc] # 표제어 추출

  '''토큰화된 데이터에 사용자 문장 추가'''
  final_result = copy.deepcopy(result)
  final_result.append(tokenized_doc)

  return final_result

### 2. 사전 훈련된 워드 임베딩 로드하여 단어 벡터 평균 계산

In [17]:
# 단어 벡터 평균 구하기
def vectors(model_path, document_list):
    # 모델 로드
    from gensim.models import Word2Vec, KeyedVectors
    word2vec_model = KeyedVectors.load_word2vec_format(model_path)

    document_embedding_list = []

    # 각 문서에 대해서
    for line in document_list:
        doc2vec = None
        count = 0
        for word in line:
            if word in word2vec_model.vocab:
                count += 1
                # 해당 문서에 있는 모든 단어들의 벡터값을 더한다.
                if doc2vec is None:
                    doc2vec = word2vec_model[word]
                else:
                    doc2vec = doc2vec + word2vec_model[word]
        
        if doc2vec is None:
            doc2vec = np.empty(100,)
            doc2vec[:] = 0
            document_embedding_list.append(doc2vec)
        else:
            # 단어 벡터를 모두 더한 벡터의 값을 문서 길이로 나눠준다.
            doc2vec = doc2vec / count
            document_embedding_list.append(doc2vec)

    # 각 문서에 대한 문서 벡터 리스트를 리턴
    return document_embedding_list

### 3. 문서 간 유사도 계산

향수 데이터에서는 전체 문서간의 코사인 유사도 매트릭스가 아닌 같은 라벨 내에서 사용자 입력문장과의 유사도 매트릭스를 구해야 함

In [40]:
def recommendations(df_path, document_embedding_list):

    df = pd.read_csv(df_path)
    df.drop('Unnamed: 0', axis=1, inplace=True)

    # 다른 문서들과의 유사도 측정
    similarity = cosine_similarity([document_embedding_list[-1]], document_embedding_list[0:-1])

    perfumes = df[['name', 'review']]

    # 전체 cosine유사도 행렬에서 사용자 입력 문장과 가장 유사한 순으로 리뷰 정렬
    sim_scores = list(enumerate(similarity.reshape(-1,1)))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:10]

    # 가장 유사한 리뷰 10개의 인덱스
    per_indices = [i[0] for i in sim_scores]

    # 전체 데이터프레임에서 해당 인덱스의 행만 추출. 5개의 행을 가진다.
    recommend = df.iloc[per_indices].reset_index(drop=True)

    top3_df = pd.DataFrame(columns=['name','similarity','review'])

    # 데이터프레임으로부터 순차적으로 출력
    recommend_perfume = []
    for index, row in recommend.iterrows():
      if len(recommend_perfume)==3:
        break
      if row['name'] in recommend_perfume:
        continue
      else:
        recommend_perfume.append(row['name'])
        top3_df = top3_df.append({'name':row['name'], 'similarity':sim_scores[index][1], 'review':row['review']},ignore_index=True)
      print('Top {}'.format(len(recommend_perfume)))
      print('향수 명: ' ,row['name'])
      print('유사도: ',sim_scores[index][1])
      print('리뷰: ', row['review'])
      print()
      print()
    
    return top3_df

전체 실행

In [27]:
user_sentence = 'The guitarist of the band Sensual and sexy Wearing a shirt and ripped jeans Sweet and drowsy eyes He soaked in sweat in the heat of the stage'

In [37]:
df_path = '/content/gdrive/MyDrive/Colab Notebooks/deeplearning_NLP/perfume/word embedding_hyun/data/dataset_210626_215600.csv'
tokenized_doc_path = '/content/gdrive/MyDrive/Colab Notebooks/deeplearning_NLP/perfume/word embedding_hyun/data/tokenized_doc.pickle'
model_path = '/content/gdrive/MyDrive/Colab Notebooks/deeplearning_NLP/perfume/word embedding_hyun/model/w2v_10window'

In [41]:
final_result = sentence_preprocessing(tokenized_doc_path, user_sentence)
document_embedding_list = vectors(model_path, final_result)
top3_df = recommendations(df_path, document_embedding_list)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Top 1
향수 명:  Shalimar Eau de Parfum Guerlain for women
유사도:  [0.91948322]
리뷰:  Shalimar is hands down the sexiest, most seductive perfume a woman can smell of. Beautiful, stunning. Must be accompanied by a nice outfit and a pair of heels.
Hair in waves, perfectly painted lips..
Not for a gum chewing airhead whose thong is showing above her jeans.


Top 2
향수 명:  Narciso Rodriguez For Her Narciso Rodriguez for women
유사도:  [0.91844152]
리뷰:  This is certainly modern, and makes me smile.  I think of a well dressed women, who starts the evening making herself up with nice makeup, maybe from the Clinique counter.  She washes her hair with fancy shampoo and wears expensive leather boots.  Fall is over, winter is coming, and by midnight it is cold out, but the air 

In [42]:
top3_df

Unnamed: 0,name,similarity,review
0,Shalimar Eau de Parfum Guerlain for women,[0.9194832199600513],"Shalimar is hands down the sexiest, most seduc..."
1,Narciso Rodriguez For Her Narciso Rodriguez fo...,[0.9184415210691973],"This is certainly modern, and makes me smile. ..."
2,Fahrenheit Christian Dior for men,[0.9164512658515446],"This beast can be ultra male, leather, boots, ..."


### 4. 키워드 추출 및 하이라이트 색상 지정

reference code : https://towardsdatascience.com/textrank-for-keyword-extraction-by-python-c0bae21bcec0

In [77]:
class TextRank4Keyword():
  from collections import OrderedDict
  import numpy as np
  import spacy
  from spacy.lang.en.stop_words import STOP_WORDS

  nlp = spacy.load('en_core_web_sm')
  """Extract keywords from text"""
  
  def __init__(self):
    self.d = 0.85 # damping coefficient, usually is .85
    self.min_diff = 1e-5 # convergence threshold
    self.steps = 10 # iteration steps
    self.node_weight = None # save keywords and its weight

  
  def set_stopwords(self, stopwords):  
    """Set stop words"""
    for word in STOP_WORDS.union(set(stopwords)):
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True
  
  def sentence_segment(self, doc, candidate_pos, lower):
    """Store those words only in cadidate_pos"""
    sentences = []
    for sent in doc.sents:
        selected_words = []
        for token in sent:
            # Store words only with cadidate POS tag
            if token.pos_ in candidate_pos and token.is_stop is False:
                if lower is True:
                    selected_words.append(token.text.lower())
                else:
                    selected_words.append(token.text)
        sentences.append(selected_words)
    return sentences
      
  def get_vocab(self, sentences):
    """Get all tokens"""
    vocab = OrderedDict()
    i = 0
    for sentence in sentences:
        for word in sentence:
            if word not in vocab:
                vocab[word] = i
                i += 1
    return vocab
  
  def get_token_pairs(self, window_size, sentences):
    """Build token_pairs from windows in sentences"""
    token_pairs = list()
    for sentence in sentences:
        for i, word in enumerate(sentence):
            for j in range(i+1, i+window_size):
                if j >= len(sentence):
                    break
                pair = (word, sentence[j])
                if pair not in token_pairs:
                    token_pairs.append(pair)
    return token_pairs
      
  def symmetrize(self, a):
    return a + a.T - np.diag(a.diagonal())
  
  def get_matrix(self, vocab, token_pairs):
    """Get normalized matrix"""
    # Build matrix
    vocab_size = len(vocab)
    g = np.zeros((vocab_size, vocab_size), dtype='float')
    for word1, word2 in token_pairs:
        i, j = vocab[word1], vocab[word2]
        g[i][j] = 1
        
    # Get Symmeric matrix
    g = self.symmetrize(g)
    
    # Normalize matrix by column
    norm = np.sum(g, axis=0)
    g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
    
    return g_norm

  
  def get_keywords(self, number=10):
    """Return top number keywords"""
    node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
    dic = dict()
    for i, (key, value) in enumerate(node_weight.items()):
        dic[key]=value
        if i > number:
            break
    return dic
      
      
  def analyze(self, text, 
            candidate_pos=['NOUN', 'PROPN'], 
            window_size=4, lower=False, stopwords=list()):
    """Main function to analyze text"""
    
    # Set stop words
    self.set_stopwords(stopwords)
    
    # Pare text by spaCy
    doc = nlp(text)
    
    # Filter sentences
    sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
    
    # Build vocabulary
    vocab = self.get_vocab(sentences)
    
    # Get token_pairs from windows
    token_pairs = self.get_token_pairs(window_size, sentences)
    
    # Get normalized matrix
    g = self.get_matrix(vocab, token_pairs)
    
    # Initionlization for weight(pagerank value)
    pr = np.array([1] * len(vocab))
    
    # Iteration
    previous_pr = 0
    for epoch in range(self.steps):
        pr = (1-self.d) + self.d * np.dot(g, pr)
        if abs(previous_pr - sum(pr))  < self.min_diff:
            break
        else:
            previous_pr = sum(pr)

    # Get weight for each node
    node_weight = dict()
    for word, index in vocab.items():
        node_weight[word] = pr[index]
    
    self.node_weight = node_weight

In [126]:
def keyword_highlighter(user_sentence, top3_df, model_path):
  """사용자 입력문장과 추천문장에서 키워드 추출"""
  tr4w = TextRank4Keyword()

  from gensim.models import Word2Vec, KeyedVectors
  word2vec_model = KeyedVectors.load_word2vec_format(model_path)

  # 사용자 입력 문장의 키워드 추출
  user_keyword = []
  tr4w.analyze(user_sentence, candidate_pos = ['NOUN', 'PROPN', 'ADJ'], window_size=5, lower=False)
  user_keyword = list(tr4w.get_keywords(100).keys())
  user_keyword = [word for word in user_keyword if word in word2vec_model.vocab] # 임베딩 벡터에 없는 단어는 제외
  print('사용자 문장에서 추출된 키워드 : ' ,user_keyword)

  # 추천 향수 리뷰의 키워드 추출하여 dataframe에 cloumn으로 저장
  top3_keyword = []
  for i in range(0,len(top3_df)):
    tr4w.analyze(top3_df['review'][i], candidate_pos = ['NOUN', 'PROPN', 'ADJ'], window_size=5, lower=False)
    keywords = list(tr4w.get_keywords(100).keys())
    keywords = [word for word in keywords if word in word2vec_model.vocab] # 임베딩 벡터에 없는 단어는 제외
    top3_keyword.append(keywords)

  top3_df['keywords']=top3_keyword

  """하이라이트 컬러 할당"""
  import random
  import colorsys

  custom_palette = []
  for i in range(0, len(user_keyword)):
    r = random.random()
    h,s,l = r, 1, 0.82
    r,g,b = colorsys.hls_to_rgb(h, l, s)
    r,g,b = int(r*255),int(g*255),int(b*255)
    color = '#%02x%02x%02x' % (r,g,b)
    custom_palette.append(color)
  
  # user keyword에 랜덤 파스텔 컬러 할당
  user_dict = {word : custom_palette[i] for i,word in enumerate(user_keyword)}
  user_dict

  # 추천 리뷰 keyword에 컬러 할당
  color_list = []
  for i in range(0, len(top3_df)):
    top3_dict = dict.fromkeys(top3_df['keywords'][i])
    index = 0
    for uw in user_dict.keys():
      for tw in top3_dict.keys():
        # 임계값 0.6로 잡아봄
        if word2vec_model.similarity(uw, tw) > 0.65:
          # 컬러 할당이 안되어있는 상태라면 처음 값 넣어줌
          if top3_dict[tw] is None:
            top3_dict[tw] = list(user_dict.items())[index]
          # 컬러 할당이 되어있는 상태라면 유사도 더 높은 컬러로 넣어줌
          elif word2vec_model.similarity(uw, tw) > word2vec_model.similarity(top3_dict[tw][0], tw): #이전 user word와 비교
            top3_dict[tw] = list(user_dict.items())[index]
      index+=1
    color_list.append(top3_dict)

  # 데이터 프레임에 색상 정보 추가
  top3_df['colors']=color_list

  return top3_df

In [127]:
top3_df = keyword_highlighter(user_sentence,top3_df, model_path)

사용자 문장에서 추출된 키워드 :  ['shirt', 'sexy', 'jeans', 'sweat', 'heat', 'stage', 'band', 'guitarist', 'eyes']


In [128]:
for i in top3_df.colors:
  print(i)

{'gum': None, 'chewing': ('guitarist', '#a3aefe'), 'lips': ('guitarist', '#a3aefe'), 'airhead': ('guitarist', '#a3aefe'), 'hands': ('guitarist', '#a3aefe'), 'sexiest': None, 'seductive': ('sexy', '#fea3d9'), 'perfume': None, 'stunning': None, 'nice': None, 'outfit': ('jeans', '#a3a6fe'), 'pair': None, 'waves': ('guitarist', '#a3aefe'), 'thong': ('guitarist', '#a3aefe'), 'woman': None, 'jeans': ('jeans', '#a3a6fe')}
{'cold': None, 'hair': ('shirt', '#a3fedc'), 'skin': None, 'night': ('jeans', '#a3a6fe'), 'air': None, 'man': None, 'musky': None, 'scent': None, 'strip': None, 'crisp': None, 'evening': ('jeans', '#a3a6fe'), 'nice': None, 'makeup': None, 'friends': None, 'dresser': ('eyes', '#c9a3fe'), 'bed': None, 'hours': None, 'cleansing': ('guitarist', '#a3aefe'), 'midnight': None, 'middle': None, 'room': None, 'fancy': None, 'shampoo': None, 'expensive': None, 'leather': None, 'women': ('eyes', '#c9a3fe'), 'counter': None, 'bare': None, 'thin': None, 'winter': None, 'city': None, 'boot

In [119]:
top3_df

Unnamed: 0,name,similarity,review,keywords,colors
0,Shalimar Eau de Parfum Guerlain for women,[0.9194832199600513],"Shalimar is hands down the sexiest, most seduc...","[gum, chewing, lips, airhead, hands, sexiest, ...","{'gum': None, 'chewing': ('guitarist', '#fea3a..."
1,Narciso Rodriguez For Her Narciso Rodriguez fo...,[0.9184415210691973],"This is certainly modern, and makes me smile. ...","[cold, hair, skin, night, air, man, musky, sce...","{'cold': None, 'hair': None, 'skin': None, 'ni..."
2,Fahrenheit Christian Dior for men,[0.9164512658515446],"This beast can be ultra male, leather, boots, ...","[sexy, boots, bike, leather, day, girlfriend, ...","{'sexy': ('sexy', '#b7a3fe'), 'boots': ('guita..."
