In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

Mounted at /content/drive


### Data Load

In [145]:
import pandas as pd
import numpy as np
dataset = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/deeplearning_NLP/perfume/data/20_perfume_all_reviews.csv')
dataset.head()

Unnamed: 0,name,accords,reivew
0,CK One Calvin Klein (unisex) ~ 1994,"['citrus', 'green', 'woody', 'powdery', 'aroma...",I needed to smell good for DeForest Kelley. It...
1,CK One Calvin Klein (unisex) ~ 1994,"['citrus', 'green', 'woody', 'powdery', 'aroma...",This would be the fragrance of my dreams if it...
2,CK One Calvin Klein (unisex) ~ 1994,"['citrus', 'green', 'woody', 'powdery', 'aroma...","Fresh, bright, effortless, androgynous and coo..."
3,CK One Calvin Klein (unisex) ~ 1994,"['citrus', 'green', 'woody', 'powdery', 'aroma...",What a BANGER. 1000% worth the hype. If you're...
4,CK One Calvin Klein (unisex) ~ 1994,"['citrus', 'green', 'woody', 'powdery', 'aroma...",Quite mediocre. Nice fresh scent. But nothing ...


In [3]:
# Nan값을 ''로 채우기
dataset = dataset.fillna('')

In [4]:
# 향수 종류
perfumes = dataset.name.unique()

In [5]:
# 20개의 unisex 향수 종류에 대한 정보가 있음
perfumes

array(['CK One Calvin Klein (unisex) ~ 1994',
       'Tobacco Vanille Tom Ford (unisex) ~ 2007',
       'Un Jardin Sur Le Nil Hermès (unisex) ~ 2005',
       'Chergui Serge Lutens (unisex) ~ 2005',
       'Mugler Cologne Mugler (unisex) ~ 2001',
       'Black Bvlgari (unisex) ~ 1998',
       "02 L'Air du Desert Marocain Tauer Perfumes (unisex) ~ 2005",
       'Philosykos Diptyque (unisex) ~ 1996',
       'Coromandel Eau de Parfum Chanel (unisex) ~ 2016',
       '4711 Original Eau de Cologne 4711 (unisex)',
       'Noir de Noir Tom Ford (unisex) ~ 2007',
       'Neroli Portofino Tom Ford (unisex) ~ 2011',
       "Voyage d'Hermes Hermès (unisex) ~ 2010",
       "Concentre d'Orange Verte Hermès (unisex) ~ 2004",
       'Ambre Sultan Serge Lutens (unisex) ~ 2000',
       'Oud Wood Tom Ford (unisex) ~ 2007',
       'Fève Délicieuse Christian Dior (unisex) ~ 2015',
       'Un Jardin En Mediterranee Hermès (unisex) ~ 2003',
       'Tuscan Leather Tom Ford (unisex) ~ 2007',
       'Musc Ravage

In [6]:
# 향수 종류 별로 모든 review를 합치는 함수
def join_review(dataset=None):
    perfumes = dataset.name.unique()
    join_data=pd.DataFrame(columns=['name', 'accords', 'review'])

    for i in range(len(perfumes)):
      perfume = dataset[dataset['name']==perfumes[i]]
      accord = perfume.loc[:,"accords"].unique()
      perfume = perfume.reset_index(drop=True)
      # 각 향수의 리뷰 데이터를 순회하며 리뷰를 concat
      r = ''
      for j in range(len(perfume)):
        r = ' '.join([r, perfume.loc[:,"reivew"][j]])
      join_data = join_data.append({'name':perfumes[i], 'accords': accord,'review':r}, ignore_index=True)
    return join_data

In [7]:
# main accord를 뽑아내는 함수

def extract_main(dataset=None):
  accord_df = dataset.copy()
  accord_df.drop(['accords','review'], axis=1, inplace=True)

  import re
  accord_list = []
  for i in range(len(join_df)):
    text = dataset['accords'][i][0]
    regex = '[a-z]{1,}'
    p = re.compile(regex)
    m = p.finditer(text)
    accords = [n[0] for n in m]
    accord_list.append(accords)

  accord_df['accords'] = accord_list
  accord_df['main'] = accord_df['accords'].apply(lambda x : x[0])
  accord_df['review'] = dataset['review']

  return accord_df

In [43]:
join_df = join_review(dataset)
join_df = extract_main(join_df)

In [144]:
join_df.head(3)

Unnamed: 0,index,name,accords,main,review
0,0,CK One Calvin Klein (unisex) ~ 1994,"[citrus, green, woody, powdery, aromatic, fres...",citrus,I needed to smell good for DeForest Kelley. I...
1,1,Tobacco Vanille Tom Ford (unisex) ~ 2007,"[vanilla, sweet, tobacco, warm, spicy, fruity,...",vanilla,"When I finally got my decant of TV, very popu..."
2,2,Un Jardin Sur Le Nil Hermès (unisex) ~ 2005,"[citrus, aromatic, floral, fresh, spicy, green...",citrus,tangerine twist\nicy gin\nrepotting the tomat...


In [10]:
documents = join_df.review

### Preprocessing

In [124]:
perfume_df = pd.DataFrame({'document':documents})
# 특수 문자 제거
perfume_df['clean_doc'] = perfume_df['document'].str.replace("[^a-zA-Z]", " ")
# 길이가 3이하인 단어는 제거 (길이가 짧은 단어 제거)
perfume_df['clean_doc'] = perfume_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
# 전체 단어에 대한 소문자 변환
perfume_df['clean_doc'] = perfume_df['clean_doc'].apply(lambda x: x.lower())

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# NLTK로부터 불용어 로드
stop_words = stopwords.words('english') 
tokenized_doc = perfume_df['clean_doc'].apply(lambda x: x.split()) # 토큰화
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words]) # 불용어 제거

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
n = WordNetLemmatizer()
tokenized_doc = tokenized_doc.apply(lambda x: [n.lemmatize(item) for item in x]) # 표제어 추출
print(tokenized_doc[:5])

# 사용자 정의 불용어 리스트(임시) 불러오기
# 결과 확인하면서 like, would, hour 등 향수 표현시 너무 많이 쓰이는 표현과 향수명에 들어가는 단어와 브랜드명 추가했음

custom_stop_words = []
f = open("/content/drive/MyDrive/Colab Notebooks/deeplearning_NLP/perfume/stopword_sample.txt", "r")
lines = f.readlines()
for line in lines:
    line = line.strip()
    if(len(line)!=1):
        custom_stop_words.append(line)
f.close()

tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in custom_stop_words]) # 사용자 정의 불용어 제거

print(tokenized_doc[:5])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
0    [needed, smell, good, deforest, kelley, starlo...
1    [finally, decant, popular, beloved, many, unde...
2    [tangerine, twist, repotting, tomato, plant, f...
3    [purchased, bell, several, year, quite, powder...
4    [make, mistake, lovely, bought, jack, neroli, ...
Name: clean_doc, dtype: object
0    [needed, deforest, kelley, starlog, magazine, ...
1    [finally, decant, popular, beloved, underwhelm...
2    [tangerine, twist, repotting, tomato, plant, f...
3    [purchased, bell, several, powdery, lot, amber...
4    [mistake, lovely, jack, neroli, close, sampled...
Name: clean_doc, dtype: object


### Encoding & Vocab Set

In [125]:
from gensim import corpora
# 정수 인코딩과 빈도수 생성
dictionary = corpora.Dictionary(tokenized_doc)
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
print(corpus[0])

[(0, 1), (1, 2), (2, 5), (3, 1), (4, 1), (5, 1), (6, 2), (7, 8), (8, 1), (9, 1), (10, 1), (11, 2), (12, 2), (13, 8), (14, 2), (15, 3), (16, 11), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 4), (32, 1), (33, 1), (34, 3), (35, 2), (36, 6), (37, 1), (38, 1), (39, 3), (40, 2), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 2), (47, 7), (48, 3), (49, 4), (50, 1), (51, 2), (52, 5), (53, 1), (54, 5), (55, 16), (56, 10), (57, 2), (58, 3), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 5), (67, 1), (68, 1), (69, 7), (70, 1), (71, 1), (72, 9), (73, 7), (74, 10), (75, 3), (76, 3), (77, 1), (78, 1), (79, 1), (80, 1), (81, 6), (82, 2), (83, 1), (84, 1), (85, 2), (86, 4), (87, 1), (88, 1), (89, 6), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 4), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 2), (107, 1), (108, 1), (109, 3), (110,

In [126]:
# 총 학습된 단어 수
len(dictionary)

12750

### Model

In [17]:
# !pip install pyLDAvis==2.1.2

Collecting pyLDAvis==2.1.2
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K     |▏                               | 10kB 16.8MB/s eta 0:00:01[K     |▍                               | 20kB 21.4MB/s eta 0:00:01[K     |▋                               | 30kB 15.8MB/s eta 0:00:01[K     |▉                               | 40kB 14.5MB/s eta 0:00:01[K     |█                               | 51kB 7.8MB/s eta 0:00:01[K     |█▏                              | 61kB 9.0MB/s eta 0:00:01[K     |█▍                              | 71kB 7.8MB/s eta 0:00:01[K     |█▋                              | 81kB 8.6MB/s eta 0:00:01[K     |█▉                              | 92kB 9.2MB/s eta 0:00:01[K     |██                              | 102kB 7.5MB/s eta 0:00:01[K     |██▎                             | 112kB 7.5MB/s eta 0:00:01[K     |██▍                             | 122kB 7.5MB/s eta 

In [127]:
import gensim
NUM_TOPICS = 4 #토픽 개수 지정
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=20)
# topics = ldamodel.print_topics()
# for topic in topics:
#     print(topic)

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

In [128]:
from gensim.models.coherencemodel import CoherenceModel

# Compute Coherence Score using c_v
coherence_model_lda = CoherenceModel(model=ldamodel, texts=tokenized_doc, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score (c_v): ', coherence_lda)

# Compute Coherence Score using UMass
coherence_model_lda = CoherenceModel(model=ldamodel, texts=tokenized_doc, dictionary=dictionary, coherence="u_mass")
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score (u_mass): ', coherence_lda)
# u_mass는 0에 가까울수록 완벽한 일관성

Coherence Score (c_v):  0.4555578252622955

Coherence Score (u_mass):  -0.191855495044696


In [129]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

In [104]:
# 훈련된 LDA모델에 전체 데이터가 정수 인코딩 된 결과를 넣어줘야함
for i, topic_list in enumerate(ldamodel[corpus]):
    print(i,'번째 문서의 topic 비율은',topic_list)

0 번째 문서의 topic 비율은 [(2, 0.6147777), (3, 0.38512254)]
1 번째 문서의 topic 비율은 [(1, 0.99882585)]
2 번째 문서의 topic 비율은 [(0, 0.010026215), (2, 0.9820833)]
3 번째 문서의 topic 비율은 [(1, 0.13177264), (2, 0.86815095)]
4 번째 문서의 topic 비율은 [(3, 0.99986553)]
5 번째 문서의 topic 비율은 [(2, 0.99832493)]
6 번째 문서의 topic 비율은 [(1, 0.046353515), (2, 0.95357853)]
7 번째 문서의 topic 비율은 [(2, 0.9998633)]
8 번째 문서의 topic 비율은 [(1, 0.9817952), (3, 0.017844463)]
9 번째 문서의 topic 비율은 [(3, 0.999875)]
10 번째 문서의 topic 비율은 [(1, 0.99989337)]
11 번째 문서의 topic 비율은 [(3, 0.99968314)]
12 번째 문서의 topic 비율은 [(0, 0.66561997), (1, 0.014598368), (2, 0.22536471), (3, 0.094416946)]
13 번째 문서의 topic 비율은 [(3, 0.99979323)]
14 번째 문서의 topic 비율은 [(1, 0.99658114)]
15 번째 문서의 topic 비율은 [(1, 0.35904917), (2, 0.015115394), (3, 0.6257946)]
16 번째 문서의 topic 비율은 [(1, 0.045244098), (2, 0.95468354)]
17 번째 문서의 topic 비율은 [(0, 0.974179), (2, 0.021100473)]
18 번째 문서의 topic 비율은 [(1, 0.9998847)]
19 번째 문서의 topic 비율은 [(1, 0.91960484), (3, 0.07348516)]


In [142]:
# 데이터 프레임으로 결과 시각화
def make_topictable_per_doc(ldamodel, corpus, dataset):
    
    topic_table = pd.DataFrame()

    # 몇 번째 문서인지를 의미하는 문서 번호와 해당 문서의 토픽 비중을 한 줄씩 꺼내온다.
    for i, topic_list in enumerate(ldamodel[corpus]):
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list            
        doc = sorted(doc, key=lambda x: (x[1]), reverse=True)
        # 각 문서에 대해서 비중이 높은 토픽순으로 토픽을 정렬한다.
        # EX) 정렬 전 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (10번 토픽, 5%), (12번 토픽, 21.5%), 
        # Ex) 정렬 후 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (12번 토픽, 21.5%), (10번 토픽, 5%)
        # 48 > 25 > 21 > 5 순으로 정렬이 된 것.

        # 모든 문서에 대해서 각각 아래를 수행
        for j, (topic_num, prop_topic) in enumerate(doc): #  몇 번 토픽인지와 비중을 나눠서 저장한다.
            if j == 0:  # 정렬을 한 상태이므로 가장 앞에 있는 것이 가장 비중이 높은 토픽
                topic_table = topic_table.append(pd.Series([dataset.loc[i,'name'], dataset.loc[i,'main'], dataset.loc[i,'accords'], int(topic_num), round(prop_topic,4), topic_list]), ignore_index=True)
                # 가장 비중이 높은 토픽과, 가장 비중이 높은 토픽의 비중과, 전체 토픽의 비중을 저장한다.
            else:
                break
    return(topic_table)

In [139]:
for i in range(NUM_TOPICS):
  print("토픽 {}을 구성하는 가장 중요한 단어 10개 :".format(i))
  print(ldamodel.show_topic(i))
  print()

토픽 0을 구성하는 가장 중요한 단어 10개 :
[('green', 0.012373825), ('fresh', 0.010593336), ('leather', 0.009392214), ('summer', 0.0069262995), ('skin', 0.0065271617), ('citrus', 0.0063121356), ('leaf', 0.005351816), ('sweet', 0.004941531), ('floral', 0.003380978), ('tree', 0.0033458755)]

토픽 1을 구성하는 가장 중요한 단어 10개 :
[('neroli', 0.012589812), ('fresh', 0.011908492), ('citrus', 0.010359838), ('orange', 0.009706103), ('clean', 0.008605717), ('summer', 0.008569778), ('price', 0.0057255626), ('skin', 0.005493252), ('green', 0.005370621), ('portofino', 0.0053562797)]

토픽 2을 구성하는 가장 중요한 단어 10개 :
[('amber', 0.010422339), ('vanilla', 0.007468511), ('rose', 0.0072865956), ('skin', 0.0065694274), ('sweet', 0.0065126712), ('patchouli', 0.006396091), ('wood', 0.005218093), ('dark', 0.0043435055), ('spice', 0.0037531913), ('chocolate', 0.0036039585)]

토픽 3을 구성하는 가장 중요한 단어 10개 :
[('tobacco', 0.017068502), ('vanilla', 0.014879125), ('sweet', 0.012085575), ('skin', 0.0074916435), ('tonka', 0.005375215), ('gourmand', 0

In [143]:
pd.set_option('display.max_columns', None)
topictable = make_topictable_per_doc(ldamodel, corpus, join_df)
topictable.columns = ['향수명', '탑 어코드', '어코드', '가장 비중이 높은 토픽', '가장 높은 토픽의 비중', '각 토픽의 비중']
topictable.sort_values(by=['가장 비중이 높은 토픽'], axis=0)

Unnamed: 0,향수명,탑 어코드,어코드,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,CK One Calvin Klein (unisex) ~ 1994,citrus,"[citrus, green, woody, powdery, aromatic, fres...",0.0,0.7894,"[(0, 0.78935045), (1, 0.2105469)]"
17,Un Jardin En Mediterranee Hermès (unisex) ~ 2003,woody,"[woody, aromatic, citrus, green, fresh, spicy,...",0.0,0.9996,"[(0, 0.9996306)]"
2,Un Jardin Sur Le Nil Hermès (unisex) ~ 2005,citrus,"[citrus, aromatic, floral, fresh, spicy, green...",0.0,0.9999,"[(0, 0.9998777)]"
12,Voyage d'Hermes Hermès (unisex) ~ 2010,warm,"[warm, spicy, citrus, green, aromatic, woody, ...",0.0,0.9992,"[(0, 0.9992499)]"
18,Tuscan Leather Tom Ford (unisex) ~ 2007,leather,"[leather, fruity, animalic, sweet, amber, smoky]",0.0,0.9008,"[(0, 0.90075827), (2, 0.07376417), (3, 0.02543..."
7,Philosykos Diptyque (unisex) ~ 1996,green,"[green, woody, sweet, fresh, fruity, lactonic,...",0.0,0.9999,"[(0, 0.9998817)]"
13,Concentre d'Orange Verte Hermès (unisex) ~ 2004,citrus,"[citrus, fresh, spicy, woody, aromatic, green,...",1.0,0.9952,"[(1, 0.99517375)]"
11,Neroli Portofino Tom Ford (unisex) ~ 2011,citrus,"[citrus, white, floral, fresh, spicy, aromatic]",1.0,0.9991,"[(1, 0.9991018)]"
9,4711 Original Eau de Cologne 4711 (unisex),citrus,"[citrus, aromatic, fresh, spicy, woody, white,...",1.0,0.9999,"[(1, 0.9998704)]"
4,Mugler Cologne Mugler (unisex) ~ 2001,citrus,"[citrus, white, floral, fresh, spicy, aromatic...",1.0,0.9998,"[(1, 0.99983823)]"
