# 향수 상세정보 크롤링 데이터 적재 스크립트
## 관련 테이블 정보
- evaluation_code, evaluation_field_option, fragrantica_evaluation
- perfume, perfume_brand
- perfume_accord, note, perfume_note

## TODO
- 이슈 사항에 대한 처리
- perfume_brand: 초기 display_order, image_url 추가
- perfume: image_url에 대해 s3 url로 변경(현재 프라그란티카 url)
- 한영 dictionary를 이용하여 한글로 변환 후 적재하는 로직 추가

In [8]:
!pip install pymysql



In [21]:
# test
import pymysql


# MySQL connection settings (local development instance).
conn = pymysql.connect(
    host='127.0.0.1',
    port=3306,
    user='root',
    password='secret',
    db='purple',
    charset='utf8'
)

# Smoke test: list the tables of the connected database.
# try/finally makes sure the connection is released even when the query or
# the printing below raises.
try:
    cursor = conn.cursor()

    sql = "SHOW TABLES;"
    cursor.execute(sql)
    result = cursor.fetchall()
    for table in result:
        # Each fetched row is a sequence; the table name is its first column.
        print(table[0])

    conn.commit()
finally:
    conn.close()

evaluation_code
evaluation_field_option
favorite
fragrantica_evaluation
note
perfume
perfume_accord
perfume_brand
perfume_note
rating
review
user
user_preference_note


In [22]:
import os
import json

# Crawled JSON data: every *.json file directly inside DIRECTORY_PATH is parsed
# and appended to JSON_DATA_LIST (one entry per file).
DIRECTORY_PATH = './crawling_data'
JSON_DATA_LIST = []
# os.listdir() returns names in arbitrary order; sorting keeps the load order
# (and therefore the order in which later cells process the files) reproducible.
for filename in sorted(os.listdir(DIRECTORY_PATH)):
    if not filename.endswith('.json'):
        continue
    file_path = os.path.join(DIRECTORY_PATH, filename)

    with open(file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)
        JSON_DATA_LIST.append(data)

In [23]:
# Code set for the evaluation fields and their options.
#
# Each row is (field key, field code, field name, options) and each option is
# (option key, option code, option name). The keys are the lookup keys used by
# the loader when it resolves crawled field/option names to codes, so their
# spelling and case must stay exactly as written here.
_EVALUATION_FIELDS = (
    ("LONGEVITY", "EF001", "지속력", (
        ("very weak", "EO101", "매우 약함"),
        ("weak", "EO102", "약함"),
        ("moderate", "EO103", "보통"),
        ("long lasting", "EO104", "오래감"),
        ("eternal", "EO105", "매우 오래감"),
    )),
    ("SILLAGE", "EF002", "시야주", (
        ("intimate", "EO201", "향 여운이 약함"),
        ("moderate", "EO202", "보통"),
        ("strong", "EO203", "향 여운이 강함"),
        ("enormous", "EO204", "향 여운이 매우 강함"),
    )),
    ("seasonData", "EF003", "계절감/시간", (
        ("spring", "EO301", "봄"),
        ("summer", "EO302", "여름"),
        ("fall", "EO303", "가을"),
        ("winter", "EO304", "겨울"),
        ("day", "EO305", "낮"),
        ("night", "EO306", "밤"),
    )),
    ("GENDER", "EF004", "성별", (
        ("male", "EO401", "남성"),
        ("more male", "EO402", "남성에 가까운"),
        ("unisex", "EO403", "중성"),
        ("more female", "EO404", "여성에 가까운"),
        ("female", "EO405", "여성"),
    )),
)

# Expand the table into the nested mapping:
#   field key -> {"code", "name", "type": "FIELD", "options": {option key -> {...}}}
evaluation_codes = {
    field_key: {
        "code": field_code,
        "name": field_name,
        "type": "FIELD",
        "options": {
            option_key: {
                "code": option_code,
                "name": option_name,
                "type": "OPTION",
            }
            for option_key, option_code, option_name in field_options
        },
    }
    for field_key, field_code, field_name, field_options in _EVALUATION_FIELDS
}


In [24]:
from datetime import datetime

# Last id handed out by generate_id(); lets it keep ids strictly increasing.
_last_generated_id = 0


def generate_id():
    '''
    Return a unique int id derived from the current local time.

    The id is the current time formatted as yyyyMMddHHmmssSSSSS (the 6-digit
    microsecond field with its last digit dropped, i.e. 10-microsecond
    resolution) and converted to int.

    Two calls inside the same 10-microsecond tick would format to the same
    value, so the previously returned id is remembered and the result is
    bumped to ``previous + 1`` whenever the clock has not moved past it.
    Ids are therefore strictly increasing within one process; nothing here
    coordinates with other processes or with ids already stored elsewhere.
    '''
    global _last_generated_id
    timestamp_id = int(datetime.now().strftime('%Y%m%d%H%M%S%f')[:-1])
    _last_generated_id = max(timestamp_id, _last_generated_id + 1)
    return _last_generated_id

def get_or_generate_id(query, variables):
    '''
    Return the id of an existing row, or a newly generated id when none exists.

    Executes `query` with `variables` on the module-level `cursor` and fetches
    one row. When a row comes back its first column is returned; when no row
    matches, generate_id() supplies a fresh id.
    '''
    cursor.execute(query, variables)
    row = cursor.fetchone()
    # fetchone() yields None when the SELECT matched nothing.
    if row is None:
        return generate_id()
    return row[0]

In [25]:
# sql_queries

# INSERT statements, one clause per line. "%s" marks a driver-side parameter.

# evaluation_code / evaluation_field_option (INSERT IGNORE)
insert_evaluation_code_query = (
    "INSERT IGNORE INTO evaluation_code(code, name, type) "
    "VALUES(%s, %s, %s);"
)
insert_evaluation_field_option_query = (
    "INSERT IGNORE INTO evaluation_field_option"
    "(evaluation_field_option_id, field_code, option_code) "
    "VALUES(%s, %s, %s);"
)

# perfume_brand: image_url and display_order are written as empty strings.
insert_perfume_brand_query = (
    "INSERT IGNORE INTO perfume_brand(brand_name, image_url, display_order) "
    "VALUES(%s, '', '');"
)
# perfume: on a duplicate key only image_url is refreshed.
insert_perfume_query = (
    "INSERT INTO perfume(perfume_id, brand_name, perfume_name, image_url) "
    "VALUES(%s, %s, %s, %s) "
    "ON DUPLICATE KEY UPDATE image_url = VALUES(image_url);"
)

# fragrantica_evaluation: on a duplicate key only votes is refreshed.
insert_fragrantica_evaluation_query = (
    "INSERT INTO fragrantica_evaluation"
    "(fragrantica_evaluation_id, perfume_id, field_code, option_code, votes) "
    "VALUES(%s, %s, %s, %s, %s) "
    "ON DUPLICATE KEY UPDATE votes = VALUES(votes);"
)

# note / perfume_note (INSERT IGNORE)
insert_note_query = "INSERT IGNORE INTO note(name) VALUES(%s);"
insert_perfume_note_query = (
    "INSERT IGNORE INTO perfume_note"
    "(perfume_note_id, perfume_id, note_name, perfume_note_type) "
    "VALUES(%s, %s, %s, %s);"
)

# perfume_accord: on a duplicate key only accord_value is refreshed.
insert_perfume_accord_query = (
    "INSERT INTO perfume_accord"
    "(perfume_accord_id, perfume_id, note_name, accord_value) "
    "VALUES(%s, %s, %s, %s) "
    "ON DUPLICATE KEY UPDATE accord_value = VALUES(accord_value);"
)

# Id lookups: each SELECT returns the id column of the row identified by the
# columns in its WHERE clause.
select_perfume_perfume_id_query = (
    "SELECT perfume_id FROM perfume "
    "WHERE brand_name=%s AND perfume_name=%s;"
)
select_fragrantica_evaluation_id_query = (
    "SELECT fragrantica_evaluation_id FROM fragrantica_evaluation "
    "WHERE perfume_id=%s AND field_code=%s AND option_code=%s;"
)
select_perfume_note_id_query = (
    "SELECT perfume_note_id FROM perfume_note "
    "WHERE perfume_id=%s AND note_name=%s AND perfume_note_type=%s;"
)
select_perfume_accord_id_query = (
    "SELECT perfume_accord_id FROM perfume_accord "
    "WHERE perfume_id=%s AND note_name=%s;"
)

In [26]:
import pymysql
            
# Loads the evaluation code set and every crawled perfume in JSON_DATA_LIST
# into MySQL: evaluation_code, evaluation_field_option, perfume_brand, perfume,
# fragrantica_evaluation, perfume_accord, note and perfume_note.

# Id lookup for evaluation_field_option, so that a re-run reuses the existing
# id through get_or_generate_id() like every other table does instead of
# generating a brand-new id each time.
select_evaluation_field_option_id_query = (
    "SELECT evaluation_field_option_id "
    "FROM evaluation_field_option "
    "WHERE field_code=%s AND option_code=%s;"
)

# Keys of perfume_data['perfumeProperties'] that are loaded; others are skipped.
field_key_list = ["seasonData", "LONGEVITY", "SILLAGE", "GENDER"]

# Key inside perfume_data['noteTopMiddleBase'] -> perfume_note_type to store.
note_type_by_section = {
    "Top Notes": "TOP",
    "Middle Notes": "MIDDLE",
    "Base Notes": "BASE",
}


def _save_evaluations(perfume_id, field_key, votes_by_option):
    '''Upsert one fragrantica_evaluation row per option of `field_key`.

    `votes_by_option` maps an option key of evaluation_codes[field_key] to its
    vote value. An unknown field or option key raises KeyError.
    '''
    field = evaluation_codes[field_key]
    field_code = field['code']
    for option_key, votes in votes_by_option.items():
        option_code = field['options'][option_key]['code']
        fragrantica_evaluation_id = get_or_generate_id(
            select_fragrantica_evaluation_id_query,
            (perfume_id, field_code, option_code)
        )
        cursor.execute(
            insert_fragrantica_evaluation_query,
            (fragrantica_evaluation_id, perfume_id, field_code, option_code, votes)
        )


def _save_notes(perfume_id, note_sections):
    '''Insert the note and perfume_note rows for the Top/Middle/Base sections.

    `note_sections` is perfume_data['noteTopMiddleBase']; a section that is
    absent or empty is skipped.
    '''
    for section, perfume_note_type in note_type_by_section.items():
        for note_name in note_sections.get(section) or ():
            # note
            cursor.execute(insert_note_query, (note_name,))

            # perfume_note
            perfume_note_id = get_or_generate_id(
                select_perfume_note_id_query,
                (perfume_id, note_name, perfume_note_type)
            )
            cursor.execute(
                insert_perfume_note_query,
                (perfume_note_id, perfume_id, note_name, perfume_note_type)
            )


# MySQL connection settings (local development instance).
conn = pymysql.connect(
    host='127.0.0.1',
    port=3306,
    user='root',
    password='secret',
    db='purple',
    charset='utf8'
)

# Module-level cursor: get_or_generate_id() and the helpers above run on it.
cursor = conn.cursor()

# try/finally releases the cursor and connection even when a row fails;
# anything not yet committed at that point is not committed by this cell.
try:
    print("=============insert start=============")
    # evaluation_code & evaluation_field_option
    for field_data in evaluation_codes.values():

        # evaluation_code(field)
        field_code, field_name, code_type = field_data['code'], field_data['name'], field_data['type']
        cursor.execute(insert_evaluation_code_query, (field_code, field_name, code_type))

        for option_data in field_data['options'].values():

            # evaluation_code(option)
            option_code, option_name, code_type = option_data['code'], option_data['name'], option_data['type']
            cursor.execute(insert_evaluation_code_query, (option_code, option_name, code_type))

            # evaluation_field_option
            evaluation_field_option_id = get_or_generate_id(
                select_evaluation_field_option_id_query,
                (field_code, option_code)
            )
            cursor.execute(
                insert_evaluation_field_option_query,
                (evaluation_field_option_id, field_code, option_code)
            )

    conn.commit()
    print("===========evaluation_code & evaludation_field_option complete.===========")

    for brand_data in JSON_DATA_LIST:
        for perfume_data in brand_data:
            # perfume_brand
            # TODO: add the initial display_order and image_url
            brand_name = perfume_data['companyName']
            # Pass a real 1-tuple: "(brand_name)" is only a parenthesised string.
            cursor.execute(insert_perfume_brand_query, (brand_name,))

            # perfume
            # TODO: switch image_url to the S3 url (currently the Fragrantica url)
            perfume_name = perfume_data['perfumeName']
            print(f"start: {brand_name} > {perfume_name}")

            perfume_id = get_or_generate_id(
                select_perfume_perfume_id_query,
                (brand_name, perfume_name)
            )
            image_url = perfume_data.get('thumbnailSrc')
            cursor.execute(
                insert_perfume_query,
                (perfume_id, brand_name, perfume_name, image_url)
            )

            # fragrantica_evaluation(seasonData)
            _save_evaluations(perfume_id, 'seasonData', perfume_data['seasonData'])

            # fragrantica_evaluation(perfumeProperties)
            for field_key, field_data in perfume_data['perfumeProperties'].items():
                if field_key not in field_key_list:
                    continue
                _save_evaluations(perfume_id, field_key, field_data)

            # perfume_accord
            for note_name, accord_value in perfume_data['notes'].items():
                perfume_accord_id = get_or_generate_id(
                    select_perfume_accord_id_query,
                    (perfume_id, note_name)
                )
                cursor.execute(
                    insert_perfume_accord_query,
                    (perfume_accord_id, perfume_id, note_name, accord_value)
                )

            # note & perfume_note (Top / Middle / Base)
            _save_notes(perfume_id, perfume_data['noteTopMiddleBase'])
            print(f"no error: {brand_name} > {perfume_name}")

    conn.commit()
    print("===========perfume data committed.===========")
finally:
    cursor.close()
    conn.close()


start: Marc Jacobs > Daisy Love
no error: Marc Jacobs > Daisy Love
start: Marc Jacobs > Daisy Eau So Fresh
no error: Marc Jacobs > Daisy Eau So Fresh
start: Marc Jacobs > Daisy Dream
no error: Marc Jacobs > Daisy Dream
start: Marc Jacobs > Perfect
no error: Marc Jacobs > Perfect
start: Marc Jacobs > Daisy
no error: Marc Jacobs > Daisy
start: Marc Jacobs > Daisy Love Pop
no error: Marc Jacobs > Daisy Love Pop
start: Marc Jacobs > Daisy Love Eau So Sweet Petals
no error: Marc Jacobs > Daisy Love Eau So Sweet Petals
start: Marc Jacobs > Daisy Love Skies
no error: Marc Jacobs > Daisy Love Skies
start: Marc Jacobs > Daisy Ever So Fresh
no error: Marc Jacobs > Daisy Ever So Fresh
start: Marc Jacobs > Perfect Intense
no error: Marc Jacobs > Perfect Intense
start: Byredo > Blanche
no error: Byredo > Blanche
start: Byredo > La Tulipe
no error: Byredo > La Tulipe
start: Byredo > Rose Of No Man's Land
no error: Byredo > Rose Of No Man's Land
start: Byredo > Casablanca Lily
no error: Byredo > Casa

no error: Chloé > Nomade Naturelle Eau de Parfum
start: Chloé > Nomade Absolu de Parfum
no error: Chloé > Nomade Absolu de Parfum
start: Le Labo > Another 13
no error: Le Labo > Another 13
start: Le Labo > Bergamote 22
no error: Le Labo > Bergamote 22
start: Le Labo > Santal 33
no error: Le Labo > Santal 33
start: Le Labo > The Matcha 26
no error: Le Labo > The Matcha 26
start: Le Labo > Rose 31
no error: Le Labo > Rose 31
start: Le Labo > The Noir 29
no error: Le Labo > The Noir 29
start: Le Labo > Baie 19
no error: Le Labo > Baie 19
start: Le Labo > Fleur d'Oranger 27
no error: Le Labo > Fleur d'Oranger 27
start: Le Labo > Lys 41
no error: Le Labo > Lys 41
start: Le Labo > Jasmin 17
no error: Le Labo > Jasmin 17
start: Le Labo > Lavande 31
no error: Le Labo > Lavande 31
start: Le Labo > Tonka 25
no error: Le Labo > Tonka 25
start: Le Labo > Ylang 49
no error: Le Labo > Ylang 49
start: Le Labo > Neroli 36
no error: Le Labo > Neroli 36
start: Le Labo > Vetiver 46
no error: Le Labo > Ve

no error: Giorgio Armani > Acqua di Giò Profumo
start: Giorgio Armani > Acqua di Gioia
no error: Giorgio Armani > Acqua di Gioia
start: Giorgio Armani > My Way
no error: Giorgio Armani > My Way
start: Giorgio Armani > Armani Code
no error: Giorgio Armani > Armani Code
start: Giorgio Armani > Armani Code Profumo
no error: Giorgio Armani > Armani Code Profumo
start: Giorgio Armani > Si
no error: Giorgio Armani > Si
start: Giorgio Armani > Sì Intense
no error: Giorgio Armani > Sì Intense
start: Montblanc > Legend Spirit
no error: Montblanc > Legend Spirit
start: Montblanc > Starwalker Extreme
no error: Montblanc > Starwalker Extreme
start: Montblanc > Explorer
no error: Montblanc > Explorer
start: Montblanc > Legend Eau de Parfum
no error: Montblanc > Legend Eau de Parfum
start: Montblanc > Explorer Ultra Blue
no error: Montblanc > Explorer Ultra Blue
start: Montblanc > Emblem Intense
no error: Montblanc > Emblem Intense
start: Montblanc > Signature
no error: Montblanc > Signature
start: 

# 이슈: 필요 데이터가 없는 경우 확인

In [27]:
# Scans JSON_DATA_LIST and reports perfumes that lack a thumbnail or note
# sections, then prints the totals.
total_perfume_count = 0
note_error_perfume_count = 0
image_not_exist_perfume_count = 0

# (key inside perfume_data['noteTopMiddleBase'], label used in the report line)
note_section_labels = (
    ("Top Notes", "Top"),
    ("Middle Notes", "Middle"),
    ("Base Notes", "Base"),
)

print("===========start checking===========")
for brand_data in JSON_DATA_LIST:
    total_perfume_count += len(brand_data)
    for perfume_data in brand_data:
        # perfume_brand
        brand_name = perfume_data['companyName']

        # perfume
        perfume_name = perfume_data['perfumeName']

        image_url = perfume_data.get('thumbnailSrc')
        if image_url is None:
            image_not_exist_perfume_count += 1
            print(f"향수 섬네일 없음 <- 브랜드명: {brand_name}, 향수명: {perfume_name}")

        # A section counts as missing when it is absent or empty; the loader
        # cell skips a section under the same truthiness test.
        note_sections = perfume_data['noteTopMiddleBase']
        missing_labels = [
            label
            for section, label in note_section_labels
            if not note_sections.get(section)
        ]

        if len(missing_labels) == len(note_section_labels):
            note_error_perfume_count += 1
            print(f"Top/Middle/Base 노트 정보 없음 <- 브랜드명: {brand_name}, 향수명: {perfume_name}")
        else:
            # Report every missing section, not only the first one found.
            for label in missing_labels:
                print(f"{label} 노트 정보 없음 <- 브랜드명: {brand_name}, 향수명: {perfume_name}")

print("===========all checked===========")
print(f"총 향수 개수: {total_perfume_count}")
print(f"이미지 없는 향수 개수: {image_not_exist_perfume_count}")
print(f"Top/Middle/Base 노트 정보 없는 향수 개수: {note_error_perfume_count}")

Top/Middle/Base 노트 정보 없음 <- 브랜드명: Penhaligon's, 향수명: Blenheim Bouquet
Top/Middle/Base 노트 정보 없음 <- 브랜드명: Penhaligon's, 향수명: Clandestine Clara
Top/Middle/Base 노트 정보 없음 <- 브랜드명: Penhaligon's, 향수명: Much Ado About The Duke
Top/Middle/Base 노트 정보 없음 <- 브랜드명: Jo Malone London, 향수명: Poppy & Barley
Top/Middle/Base 노트 정보 없음 <- 브랜드명: Jo Malone London, 향수명: Cypress & Grapevine Cologne Intense
향수 섬네일 없음 <- 브랜드명: Jo Malone London, 향수명: Ginger Beer Cologne
Top/Middle/Base 노트 정보 없음 <- 브랜드명: Acqua di Parma, 향수명: Acqua di Parma Blu Mediterraneo - Fico di Amalfi
Top/Middle/Base 노트 정보 없음 <- 브랜드명: Acqua di Parma, 향수명: Essenza di Colonia
Top/Middle/Base 노트 정보 없음 <- 브랜드명: Acqua di Parma, 향수명: Peonia Nobile
Top/Middle/Base 노트 정보 없음 <- 브랜드명: Chloé, 향수명: Jasminum Sambac
Top/Middle/Base 노트 정보 없음 <- 브랜드명: Le Labo, 향수명: Bergamote 22
Top/Middle/Base 노트 정보 없음 <- 브랜드명: Le Labo, 향수명: Santal 33
Top/Middle/Base 노트 정보 없음 <- 브랜드명: Le Labo, 향수명: The Matcha 26
Top/Middle/Base 노트 정보 없음 <- 브랜드명: Le Labo, 향수명: Baie 19
Top/Middl