In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

Mounted at /content/drive


### Data Load

In [2]:
import pandas as pd
import numpy as np
dataset = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/deeplearning_NLP/perfume/data/final_data.csv')
dataset.head()

Unnamed: 0,gender,name,accords,review
0,female,Alien Mugler for women,"['white floral', 'amber', 'woody']","Got a sample of this today, and my 9 year old ..."
1,female,Alien Mugler for women,"['white floral', 'amber', 'woody']",First impressions. Test on blotter.\nI have he...
2,female,Alien Mugler for women,"['white floral', 'amber', 'woody']",This perfume reminds me of my best friend. Act...
3,female,Alien Mugler for women,"['white floral', 'amber', 'woody']",Imagine tripping over your own feet and fallin...
4,female,Alien Mugler for women,"['white floral', 'amber', 'woody']",Gorgeous Gorgeous Blend ..\nLove the scent...\...


In [3]:
# Nan값을 ''로 채우기
dataset = dataset.fillna('')

In [4]:
# 향수 종류
perfumes = dataset.name.unique()

In [5]:
print("향수 개수 : {}".format(len(perfumes)))

향수 개수 : 89


In [6]:
# 향수 종류 별로 모든 review를 합치는 함수
def join_review(dataset=None):
    perfumes = dataset.name.unique()
    join_data=pd.DataFrame(columns=['name', 'accords', 'review'])

    for i in range(len(perfumes)):
      perfume = dataset[dataset['name']==perfumes[i]]
      accord = perfume.loc[:,"accords"].unique()
      perfume = perfume.reset_index(drop=True)
      # 각 향수의 리뷰 데이터를 순회하며 리뷰를 concat
      r = ''
      for j in range(len(perfume)):
        r = ' '.join([r, perfume.loc[:,"review"][j]])
      join_data = join_data.append({'name':perfumes[i], 'accords': accord,'review':r}, ignore_index=True)
    return join_data

In [7]:
# main accord를 뽑아내는 함수

def extract_main(dataset=None):
  accord_df = dataset.copy()
  accord_df.drop(['accords','review'], axis=1, inplace=True)

  import re
  accord_list = []
  for i in range(len(join_df)):
    text = dataset['accords'][i][0]
    regex = '[a-z]{1,}'
    p = re.compile(regex)
    m = p.finditer(text)
    accords = [n[0] for n in m]
    accord_list.append(accords)

  accord_df['accords'] = accord_list
  accord_df['main'] = accord_df['accords'].apply(lambda x : x[0])
  accord_df['review'] = dataset['review']

  return accord_df

In [8]:
join_df = join_review(dataset)
join_df = extract_main(join_df)

In [9]:
join_df.head(3)

Unnamed: 0,name,accords,main,review
0,Alien Mugler for women,"[white, floral, amber, woody]",white,"Got a sample of this today, and my 9 year old..."
1,Coco Mademoiselle Chanel for women,"[citrus, woody, patchouli, sweet, white, flora...",citrus,I remember smelling this when it first came o...
2,Black Orchid Tom Ford for women,"[warm, spicy, earthy, woody, sweet, amber, pat...",warm,"It smells good. In my opinion, it's over-hype..."


In [10]:
documents = join_df.review

### Preprocessing

In [11]:
perfume_df = pd.DataFrame({'document':documents})
# 특수 문자 제거
perfume_df['clean_doc'] = perfume_df['document'].str.replace("[^a-zA-Z]", " ")
# 길이가 3이하인 단어는 제거 (길이가 짧은 단어 제거)
perfume_df['clean_doc'] = perfume_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
# 전체 단어에 대한 소문자 변환
perfume_df['clean_doc'] = perfume_df['clean_doc'].apply(lambda x: x.lower())

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# NLTK로부터 불용어 로드
stop_words = stopwords.words('english') 
tokenized_doc = perfume_df['clean_doc'].apply(lambda x: x.split()) # 토큰화
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words]) # 불용어 제거

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
n = WordNetLemmatizer()
tokenized_doc = tokenized_doc.apply(lambda x: [n.lemmatize(item) for item in x]) # 표제어 추출
print(tokenized_doc[:5])

# 사용자 정의 불용어 리스트(임시) 불러오기
# 결과 확인하면서 like, would, hour 등 향수 표현시 너무 많이 쓰이는 표현과 향수명에 들어가는 단어와 브랜드명 추가했음

custom_stop_words = []
f = open("/content/drive/MyDrive/Colab Notebooks/deeplearning_NLP/perfume/stopword_sample.txt", "r")
lines = f.readlines()
for line in lines:
    line = line.strip()
    if(len(line)!=1):
        custom_stop_words.append(line)
f.close()

tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in custom_stop_words]) # 사용자 정의 불용어 제거

print(tokenized_doc[:5])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
0    [sample, today, year, daughter, thought, smell...
1    [remember, smelling, first, came, huge, keira,...
2    [smell, good, opinion, hyped, though, putting,...
3    [first, girl, perfume, really, nightlife, type...
4    [greatest, love, ground, cumin, ground, coconu...
Name: clean_doc, dtype: object
0    [sample, today, daughter, thought, smelling, s...
1    [remember, smelling, came, huge, keira, knight...
2    [opinion, hyped, putting, anyone, others, welc...
3    [girl, nightlife, type, complex, dark, suit, c...
4    [greatest, ground, cumin, ground, coconut, gro...
Name: clean_doc, dtype: object


### Encoding & Vocab Set

In [12]:
from gensim import corpora
# 정수 인코딩과 빈도수 생성
dictionary = corpora.Dictionary(tokenized_doc)
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 3), (5, 1), (6, 2), (7, 7), (8, 4), (9, 1), (10, 1), (11, 5), (12, 44), (13, 1), (14, 1), (15, 1), (16, 1), (17, 6), (18, 43), (19, 2), (20, 29), (21, 127), (22, 2), (23, 1), (24, 1), (25, 3), (26, 1), (27, 1), (28, 1), (29, 1), (30, 3), (31, 1), (32, 2), (33, 1), (34, 2), (35, 1), (36, 1), (37, 1), (38, 2), (39, 1), (40, 3), (41, 1), (42, 4), (43, 1), (44, 3), (45, 2), (46, 1), (47, 31), (48, 6), (49, 2), (50, 1), (51, 1), (52, 3), (53, 1), (54, 2), (55, 1), (56, 2), (57, 1), (58, 7), (59, 1), (60, 1), (61, 1), (62, 2), (63, 2), (64, 4), (65, 1), (66, 3), (67, 40), (68, 2), (69, 1), (70, 1), (71, 1), (72, 1), (73, 17), (74, 157), (75, 1), (76, 1), (77, 2), (78, 2), (79, 13), (80, 1), (81, 16), (82, 6), (83, 10), (84, 1), (85, 7), (86, 7), (87, 46), (88, 5), (89, 7), (90, 3), (91, 1), (92, 1), (93, 2), (94, 1), (95, 2), (96, 1), (97, 2), (98, 1), (99, 5), (100, 1), (101, 2), (102, 1), (103, 25), (104, 1), (105, 3), (106, 2), (107, 1), (108, 3), (109

In [13]:
# 총 학습된 단어 수
len(dictionary)

55034

### Model

In [17]:
!pip install pyLDAvis==2.1.2

Collecting pyLDAvis==2.1.2
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 4.8MB/s 
Collecting funcy
  Downloading https://files.pythonhosted.org/packages/44/52/5cf7401456a461e4b481650dfb8279bc000f31a011d0918904f86e755947/funcy-1.16-py2.py3-none-any.whl
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (setup.py) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-2.1.2-py2.py3-none-any.whl size=97712 sha256=67c65bb84e27881e89d6ef345c859fb87f9d1531e75e8adff4ed29dc4b6dd8e0
  Stored in directory: /root/.cache/pip/wheels/98/71/24/513a99e58bb6b8465bae4d2d5e9dba8f0bef8179e3051ac414
Successfully built pyLDAvis
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-1.16 pyLDAvis-2.1.2


In [24]:
import gensim
NUM_TOPICS = 4 #토픽 개수 지정
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=20)
# topics = ldamodel.print_topics()
# for topic in topics:
#     print(topic)

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

In [15]:
from gensim.models.coherencemodel import CoherenceModel

# Compute Coherence Score using c_v
coherence_model_lda = CoherenceModel(model=ldamodel, texts=tokenized_doc, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score (c_v): ', coherence_lda)

# Compute Coherence Score using UMass
coherence_model_lda = CoherenceModel(model=ldamodel, texts=tokenized_doc, dictionary=dictionary, coherence="u_mass")
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score (u_mass): ', coherence_lda)
# u_mass는 0에 가까울수록 완벽한 일관성

Coherence Score (c_v):  0.32610609658760686

Coherence Score (u_mass):  -0.08867637094050607


In [25]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

* 토픽 0 : 아쿠아틱
* 토픽 1 : 토바코
* 토픽 2 : 플로럴
* 토픽 3 : 프루티

In [26]:
# 훈련된 LDA모델에 전체 데이터가 정수 인코딩 된 결과를 넣어줘야함
for i, topic_list in enumerate(ldamodel[corpus]):
    print(i,'번째 문서의 topic 비율은',topic_list)

0 번째 문서의 topic 비율은 [(2, 0.99379677)]
1 번째 문서의 topic 비율은 [(0, 0.12288558), (2, 0.8750732)]
2 번째 문서의 topic 비율은 [(1, 0.098921515), (2, 0.8960525)]
3 번째 문서의 topic 비율은 [(2, 0.9937292)]
4 번째 문서의 topic 비율은 [(1, 0.012987003), (2, 0.8310398), (3, 0.15571904)]
5 번째 문서의 topic 비율은 [(2, 0.9970753)]
6 번째 문서의 topic 비율은 [(1, 0.16385676), (2, 0.55585986), (3, 0.27131933)]
7 번째 문서의 topic 비율은 [(0, 0.85804427), (2, 0.1412345)]
8 번째 문서의 topic 비율은 [(0, 0.14954792), (2, 0.7519443), (3, 0.09538024)]
9 번째 문서의 topic 비율은 [(0, 0.02008221), (2, 0.97580653)]
10 번째 문서의 topic 비율은 [(0, 0.044846836), (2, 0.9481904)]
11 번째 문서의 topic 비율은 [(0, 0.37543043), (2, 0.622602)]
12 번째 문서의 topic 비율은 [(2, 0.33097503), (3, 0.66770345)]
13 번째 문서의 topic 비율은 [(0, 0.06229335), (2, 0.9368991)]
14 번째 문서의 topic 비율은 [(0, 0.06472367), (2, 0.5335793), (3, 0.40168825)]
15 번째 문서의 topic 비율은 [(0, 0.010038575), (2, 0.9846659)]
16 번째 문서의 topic 비율은 [(2, 0.98507404), (3, 0.012882356)]
17 번째 문서의 topic 비율은 [(2, 0.9872581)]
18 번째 문서의 topic 비율은 [(2, 0.25

In [27]:
# 데이터 프레임으로 결과 시각화
def make_topictable_per_doc(ldamodel, corpus, dataset):
    
    topic_table = pd.DataFrame()

    # 몇 번째 문서인지를 의미하는 문서 번호와 해당 문서의 토픽 비중을 한 줄씩 꺼내온다.
    for i, topic_list in enumerate(ldamodel[corpus]):
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list            
        doc = sorted(doc, key=lambda x: (x[1]), reverse=True)
        # 각 문서에 대해서 비중이 높은 토픽순으로 토픽을 정렬한다.
        # EX) 정렬 전 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (10번 토픽, 5%), (12번 토픽, 21.5%), 
        # Ex) 정렬 후 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (12번 토픽, 21.5%), (10번 토픽, 5%)
        # 48 > 25 > 21 > 5 순으로 정렬이 된 것.

        # 모든 문서에 대해서 각각 아래를 수행
        for j, (topic_num, prop_topic) in enumerate(doc): #  몇 번 토픽인지와 비중을 나눠서 저장한다.
            if j == 0:  # 정렬을 한 상태이므로 가장 앞에 있는 것이 가장 비중이 높은 토픽
                topic_table = topic_table.append(pd.Series([dataset.loc[i,'name'], dataset.loc[i,'main'], dataset.loc[i,'accords'], int(topic_num), round(prop_topic,4), topic_list]), ignore_index=True)
                # 가장 비중이 높은 토픽과, 가장 비중이 높은 토픽의 비중과, 전체 토픽의 비중을 저장한다.
            else:
                break
    return(topic_table)

In [28]:
for i in range(NUM_TOPICS):
  print("토픽 {}을 구성하는 가장 중요한 단어 10개 :".format(i))
  print(ldamodel.show_topic(i))
  print()

토픽 0을 구성하는 가장 중요한 단어 10개 :
[('fresh', 0.010273245), ('skin', 0.0071287714), ('citrus', 0.0054140994), ('summer', 0.005348741), ('projection', 0.005041931), ('compliment', 0.0048183003), ('sweet', 0.0048055984), ('homme', 0.004729978), ('clean', 0.0045807217), ('blue', 0.004368994)]

토픽 1을 구성하는 가장 중요한 단어 10개 :
[('vanilla', 0.0103500765), ('tobacco', 0.009045602), ('leather', 0.008528754), ('sweet', 0.0076107797), ('skin', 0.006599176), ('shalimar', 0.0050725457), ('wood', 0.0042219926), ('strong', 0.0038275712), ('warm', 0.0035221672), ('projection', 0.003321451)]

토픽 2을 구성하는 가장 중요한 단어 10개 :
[('sweet', 0.0135190105), ('skin', 0.0076597813), ('vanilla', 0.007376921), ('patchouli', 0.0058520543), ('angel', 0.0058434336), ('floral', 0.0057183397), ('strong', 0.005661978), ('sexy', 0.003939801), ('woman', 0.003910309), ('jasmine', 0.0038766493)]

토픽 3을 구성하는 가장 중요한 단어 10개 :
[('aventus', 0.017335715), ('rose', 0.008584165), ('sweet', 0.0070723942), ('batch', 0.0066899066), ('skin', 0.00638544

In [29]:
pd.set_option('display.max_columns', None)
topictable = make_topictable_per_doc(ldamodel, corpus, join_df)
topictable.columns = ['향수명', '탑 어코드', '어코드', '가장 비중이 높은 토픽', '가장 높은 토픽의 비중', '각 토픽의 비중']
topictable.sort_values(by=['가장 비중이 높은 토픽'], axis=0)

Unnamed: 0,향수명,탑 어코드,어코드,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
44,Wood Sage & Sea Salt Jo Malone London for wome...,aromatic,"[aromatic, salty, marine, citrus, musky, herba...",0.0,0.8792,"[(0, 0.8791943), (1, 0.015517315), (2, 0.10494..."
55,Un Jardin Sur Le Nil Hermès for women and men,citrus,"[citrus, aromatic, floral, fresh, spicy, green...",0.0,0.8397,"[(0, 0.8397382), (2, 0.15199739)]"
61,La Nuit de l'Homme Yves Saint Laurent for men,aromatic,"[aromatic, warm, spicy, lavender, woody, fresh...",0.0,0.9362,"[(0, 0.93622863), (1, 0.038943842), (2, 0.0118..."
22,Chance Eau Tendre Chanel for women,floral,"[floral, fruity, citrus, sweet, fresh, spicy, ...",0.0,0.6112,"[(0, 0.6112026), (2, 0.38859212)]"
62,Acqua di Giò Profumo Giorgio Armani for men,aromatic,"[aromatic, marine, fresh, spicy, amber, smoky,...",0.0,0.9913,"[(0, 0.99132144)]"
...,...,...,...,...,...,...
50,Hacivat Nishane for women and men,woody,"[woody, citrus, mossy, fruity, earthy, sweet, ...",3.0,0.9693,"[(0, 0.021633292), (3, 0.96932536)]"
45,Baccarat Rouge 540 Extrait de Parfum Maison Fr...,amber,"[amber, almond, woody, warm, spicy, animalic, ...",3.0,0.7652,"[(1, 0.14969607), (2, 0.08038722), (3, 0.765166)]"
37,Cedrat Boise Mancera for women and men,woody,"[woody, fruity, citrus, powdery, aromatic, lea...",3.0,0.9029,"[(0, 0.034976263), (1, 0.061811704), (3, 0.902..."
74,Explorer Montblanc for men,woody,"[woody, citrus, amber, aromatic, musky, oud, f...",3.0,0.9120,"[(0, 0.08779054), (3, 0.9119839)]"


In [30]:
topictable['가장 비중이 높은 토픽'].value_counts()

2.0    27
1.0    26
0.0    21
3.0    15
Name: 가장 비중이 높은 토픽, dtype: int64

In [31]:
topic_data = topictable.copy()
topic_data.drop(['탑 어코드' , '어코드', '가장 높은 토픽의 비중', '각 토픽의 비중'], axis = 1 , inplace=True)
topic_data

Unnamed: 0,향수명,가장 비중이 높은 토픽
0,Alien Mugler for women,2.0
1,Coco Mademoiselle Chanel for women,2.0
2,Black Orchid Tom Ford for women,2.0
3,Black Opium Yves Saint Laurent for women,2.0
4,Hypnotic Poison Christian Dior for women,2.0
...,...,...
84,Spicebomb Extreme Viktor&Rolf for men,1.0
85,Cool Water Davidoff for men,0.0
86,Herod Parfums de Marly for men,1.0
87,L’Homme Ideal Eau de Parfum Guerlain for men,1.0


In [32]:
topic_data.rename(columns={'향수명':'name', '가장 비중이 높은 토픽':'label'}, inplace=True)
topic_data

Unnamed: 0,name,label
0,Alien Mugler for women,2.0
1,Coco Mademoiselle Chanel for women,2.0
2,Black Orchid Tom Ford for women,2.0
3,Black Opium Yves Saint Laurent for women,2.0
4,Hypnotic Poison Christian Dior for women,2.0
...,...,...
84,Spicebomb Extreme Viktor&Rolf for men,1.0
85,Cool Water Davidoff for men,0.0
86,Herod Parfums de Marly for men,1.0
87,L’Homme Ideal Eau de Parfum Guerlain for men,1.0


In [34]:
topic_data.to_csv('/content/drive/MyDrive/Colab Notebooks/deeplearning_NLP/perfume/data/labeled_data.csv')