# Converting User and Whisky Names to Indices
---
- 크롤링 과정 중 어떠한 오류로 인해 인코딩이 잘못되어 위스키 이름이 불일치하는 경우가 발생했습니다.
- 위스키 이름, 유저명을 index로 변환하면서 불일치하는 데이터도 수정하도록 합니다.
---

## 1. whisky_index 파일 생성
## 2. username_index 파일 생성
## 3. rating 파일 수정 - user_id, whisky_id, rating으로 수정
## 4. whisky 파일 수정 - whisky_name to whisky_id

In [10]:
import os
import pandas as pd
import numpy as np

In [11]:
# Show the working directory; the relative "../dataset/..." paths used by the
# read_csv / to_csv calls in this notebook are resolved against it.
os.getcwd()

'C:\\Users\\SSAFY\\Desktop\\Whizzle\\S08P22A805\\model\\preprocessing'

In [12]:
# Load the ratings table; index_col=0 uses the CSV's first column as the row index.
rating = pd.read_csv("../dataset/rating.csv", index_col=0, encoding='utf-8')

In [13]:
# Load the whisky metadata table; index_col=0 uses the CSV's first column as the row index.
whisky = pd.read_csv("../dataset/whisky.csv", index_col=0, encoding='utf-8')

In [14]:
# Summarise the ratings frame: column dtypes, non-null counts and memory use.
rating.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 908176 entries, 0 to 908175
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    908176 non-null  int64  
 1   whisky_id  908176 non-null  int64  
 2   rating     908176 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 27.7 MB


In [16]:
# Preview the ratings frame.
rating

Unnamed: 0,user_id,whisky_id,rating
0,0,0,9.0
1,1,0,7.0
2,2,0,9.0
3,3,0,10.0
4,4,0,9.0
...,...,...,...
908171,4828,3534,4.0
908172,119513,3534,2.0
908173,119514,3534,8.0
908174,3123,3534,4.0


In [17]:
# Preview the whisky metadata frame.
whisky

Unnamed: 0,whisky_id,link,image,name,avr_rating,category,location,total_rating,price_tier,abv,...,herbal,oily,body,rich,sweet,salty,vanilla,tart,fruity,floral
0,0,/spirits/hibiki-21-year,https://ip-distiller.imgix.net/images/spirits/...,Hibiki 21 Year,9.01,Blended,Japan,861.0,5,43.00,...,30,20,80,80,85,15,20,25,85,50
1,1,/spirits/highland-park-18,https://ip-distiller.imgix.net/images/spirits/...,Highland Park 18 Year,8.89,Peated Single Malt,"Islands, Scotland",2988.0,4,43.00,...,20,40,70,80,70,40,50,50,70,20
2,2,/spirits/michter-s-20-year-kentucky-straight-b...,https://ip-distiller.imgix.net/images/spirits/...,Michter's 20 Year Kentucky Straight Bourbon (2...,9.00,Bourbon,"Kentucky, USA",10.0,5,57.10,...,0,15,80,90,85,5,30,25,35,0
3,3,/spirits/george-t-stagg-bourbon-fall-2019,https://ip-distiller.imgix.net/images/spirits/...,George T. Stagg Bourbon (Fall 2019),9.06,Bourbon,"Kentucky, USA",629.0,4,58.45,...,50,20,60,60,45,0,60,60,45,0
4,4,/spirits/bowmore-mizunara-cask-finish,https://ip-distiller.imgix.net/images/spirits/...,Bowmore Mizunara Cask Finish,7.86,Peated Single Malt,"Islay, Scotland",22.0,5,53.90,...,30,10,75,75,60,20,30,20,50,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3530,3530,/spirits/wild-turkey-spiced,https://ip-distiller.imgix.net/images/spirits/...,Wild Turkey Spiced,5.92,Flavored Whiskey,"Kentucky, USA",13.0,1,43.00,...,0,0,40,55,85,0,25,10,10,0
3531,3531,/spirits/seagram-s-seven-crown-american-blende...,https://ip-distiller.imgix.net/images/spirits/...,Seagram's 7 Crown American Blended Whiskey,4.31,Blended American Whiskey,USA,237.0,1,40.00,...,20,0,30,0,100,0,100,0,60,10
3532,3532,/spirits/11-wells-single-malt-whiskey,https://ip-distiller.imgix.net/images/spirits/...,11 Wells Single Malt Whiskey,6.00,American Single Malt,"Minnesota, USA",2.0,3,42.00,...,10,40,10,10,90,0,50,20,30,10
3533,3533,/spirits/immortal-spirits-early-whiskey,https://ip-distiller.imgix.net/images/spirits/...,Immortal Spirits Early Whiskey,2.00,Other Whiskey,"Oregon, USA",1.0,2,44.50,...,100,80,0,0,40,0,20,0,0,0


In [18]:
# Count the distinct users and the distinct whiskies that appear in the ratings.
print(rating['user_id'].nunique())
print(rating['whisky_id'].nunique())

119515
3466


In [19]:
# Count the distinct whisky names in the metadata table.
print(whisky['name'].nunique())

3535


In [20]:
# Summarise the whisky metadata frame: column dtypes and non-null counts.
whisky.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3535 entries, 0 to 3534
Data columns (total 24 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   whisky_id     3535 non-null   int64  
 1   link          3535 non-null   object 
 2   image         3535 non-null   object 
 3   name          3535 non-null   object 
 4   avr_rating    3466 non-null   float64
 5   category      3535 non-null   object 
 6   location      3535 non-null   object 
 7   total_rating  3466 non-null   float64
 8   price_tier    3535 non-null   int64  
 9   abv           3535 non-null   float64
 10  cask_type     2981 non-null   object 
 11  smoky         3535 non-null   int64  
 12  peaty         3535 non-null   int64  
 13  spicy         3535 non-null   int64  
 14  herbal        3535 non-null   int64  
 15  oily          3535 non-null   int64  
 16  body          3535 non-null   int64  
 17  rich          3535 non-null   int64  
 18  sweet         3535 non-null 

## 1. whisky_index data

### whisky_name to index dictionary
---
- rating data에서 whisky_name 의 인코딩 문제 발생
- 또한, 위스키 정보는 있지만, rating이 존재하지 않는 위스키도 존재
- 따라서, whisky 를 기준으로 index정하고, mapping 합니다.

In [21]:
# Give every distinct whisky name a 1-based integer id, numbered in the order
# the names are returned by unique() on the metadata table.
whisky2Idx = {
    name: idx
    for idx, name in enumerate(whisky['name'].unique(), start=1)
}

whisky2Idx

{'Hibiki 21 Year': 1,
 'Highland Park 18 Year': 2,
 "Michter's 20 Year Kentucky Straight Bourbon (2021 Release)": 3,
 'George T. Stagg Bourbon (Fall 2019)': 4,
 'Bowmore Mizunara Cask Finish': 5,
 'Macallan M': 6,
 'Port Ellen 32 Year 1979 (2012 Special Release)': 7,
 'Thomas H. Handy Sazerac Rye (Fall 2015)': 8,
 'William Larue Weller Bourbon (Fall 2015)': 9,
 'Balvenie Tun 1509 Batch 1': 10,
 'Booker\'s Bourbon Batch 2015-01 "Big Man, Small Batch"': 11,
 'Four Roses Limited Edition Single Barrel Bourbon (2013)': 12,
 'Macallan Rare Cask': 13,
 'Four Roses Limited Edition Single Barrel Bourbon (2014)': 14,
 'Macallan Fine Oak 21 Year': 15,
 'Thomas H. Handy Sazerac Rye (Fall 2014)': 16,
 "Parker's Heritage Promise of Hope": 17,
 'George Dickel x Leopold Bros Collaboration Blend (2021 Release)': 18,
 'Booker\'s Bourbon Batch 2021-01 "Donohoe\'s Batch"': 19,
 'Bushmills 28 Year The Rare Casks Cognac Cask Single Malt': 20,
 'Bunnahabhain 2008 Mòine Bordeaux Red Wine Cask Matured': 21,
 "

In [22]:
# Display the whisky name -> id mapping again.
whisky2Idx

{'Hibiki 21 Year': 1,
 'Highland Park 18 Year': 2,
 "Michter's 20 Year Kentucky Straight Bourbon (2021 Release)": 3,
 'George T. Stagg Bourbon (Fall 2019)': 4,
 'Bowmore Mizunara Cask Finish': 5,
 'Macallan M': 6,
 'Port Ellen 32 Year 1979 (2012 Special Release)': 7,
 'Thomas H. Handy Sazerac Rye (Fall 2015)': 8,
 'William Larue Weller Bourbon (Fall 2015)': 9,
 'Balvenie Tun 1509 Batch 1': 10,
 'Booker\'s Bourbon Batch 2015-01 "Big Man, Small Batch"': 11,
 'Four Roses Limited Edition Single Barrel Bourbon (2013)': 12,
 'Macallan Rare Cask': 13,
 'Four Roses Limited Edition Single Barrel Bourbon (2014)': 14,
 'Macallan Fine Oak 21 Year': 15,
 'Thomas H. Handy Sazerac Rye (Fall 2014)': 16,
 "Parker's Heritage Promise of Hope": 17,
 'George Dickel x Leopold Bros Collaboration Blend (2021 Release)': 18,
 'Booker\'s Bourbon Batch 2021-01 "Donohoe\'s Batch"': 19,
 'Bushmills 28 Year The Rare Casks Cognac Cask Single Malt': 20,
 'Bunnahabhain 2008 Mòine Bordeaux Red Wine Cask Matured': 21,
 "

In [23]:
# Turn the name -> id mapping into a two-column lookup table, id column first.
name_id_pairs = list(whisky2Idx.items())
whisky_index = pd.DataFrame(name_id_pairs, columns=["name", "whisky_id"])[["whisky_id", "name"]]
whisky_index

Unnamed: 0,whisky_id,name
0,1,Hibiki 21 Year
1,2,Highland Park 18 Year
2,3,Michter's 20 Year Kentucky Straight Bourbon (2...
3,4,George T. Stagg Bourbon (Fall 2019)
4,5,Bowmore Mizunara Cask Finish
...,...,...
3530,3531,Wild Turkey Spiced
3531,3532,Seagram's 7 Crown American Blended Whiskey
3532,3533,11 Wells Single Malt Whiskey
3533,3534,Immortal Spirits Early Whiskey


## 2. user_index data 

### username to index dictionary

In [26]:
# Give every distinct username an integer id (0-based in this cell), numbered
# in the order the names are returned by unique() on the ratings.
# NOTE(review): the output recorded under this cell shows integer keys and
# 1-based values, so it appears to come from a different version of this
# cell — confirm which numbering is intended.
user2Idx = {
    name: idx for idx, name in enumerate(rating['username'].unique())
}

print(len(user2Idx))
user2Idx

119515


{0: 1,
 1: 2,
 2: 3,
 3: 4,
 4: 5,
 5: 6,
 6: 7,
 7: 8,
 8: 9,
 9: 10,
 10: 11,
 11: 12,
 12: 13,
 13: 14,
 14: 15,
 15: 16,
 16: 17,
 17: 18,
 18: 19,
 19: 20,
 20: 21,
 21: 22,
 22: 23,
 23: 24,
 24: 25,
 25: 26,
 26: 27,
 27: 28,
 28: 29,
 29: 30,
 30: 31,
 31: 32,
 32: 33,
 33: 34,
 34: 35,
 35: 36,
 36: 37,
 37: 38,
 38: 39,
 39: 40,
 40: 41,
 41: 42,
 42: 43,
 43: 44,
 44: 45,
 45: 46,
 46: 47,
 47: 48,
 48: 49,
 49: 50,
 50: 51,
 51: 52,
 52: 53,
 53: 54,
 54: 55,
 55: 56,
 56: 57,
 57: 58,
 58: 59,
 59: 60,
 60: 61,
 61: 62,
 62: 63,
 63: 64,
 64: 65,
 65: 66,
 66: 67,
 67: 68,
 68: 69,
 69: 70,
 70: 71,
 71: 72,
 72: 73,
 73: 74,
 74: 75,
 75: 76,
 76: 77,
 77: 78,
 78: 79,
 79: 80,
 80: 81,
 81: 82,
 82: 83,
 83: 84,
 84: 85,
 85: 86,
 86: 87,
 87: 88,
 88: 89,
 89: 90,
 90: 91,
 91: 92,
 92: 93,
 93: 94,
 94: 95,
 95: 96,
 96: 97,
 97: 98,
 98: 99,
 99: 100,
 100: 101,
 101: 102,
 102: 103,
 103: 104,
 104: 105,
 105: 106,
 106: 107,
 107: 108,
 108: 109,
 109: 110,
 110: 11

In [27]:
# Turn the username -> id mapping into a two-column lookup table, id column first.
username_id_pairs = list(user2Idx.items())
user_index = pd.DataFrame(username_id_pairs, columns=["username", "user_id"])[["user_id", "username"]]
user_index

Unnamed: 0,user_id,username
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4
...,...,...
119510,119511,119510
119511,119512,119511
119512,119513,119512
119513,119514,119513


## 이미 저장된 user_index 수정

In [30]:
# Reload the previously saved user lookup table, replacing the in-memory
# user_index; index_col=0 uses the CSV's first column as the row index.
user_index = pd.read_csv("../dataset/user_index.csv", index_col=0, encoding="UTF-8")

In [32]:
# Shift every saved user id up by one.
# NOTE(review): not idempotent — running this cell again shifts the ids again.
user_index["user_id"] = user_index["user_id"] + 1
user_index

Unnamed: 0,user_id,username
0,1,AngusBennion
1,2,ANTHONY-DIGIROLAMO
2,3,Heejin-Lim
3,4,whisky_lover
4,5,WhiskeyThreePutt
...,...,...
119510,119511,Hyppolite-Pierre-Jr
119511,119512,Vladim-r-apek
119512,119513,virglynn1
119513,119514,Will-Rosenfeld


## 3. rating 파일 수정

### rating data convert
---
- convert username to user_id
- convert whisky_name to whisky_id
- rating data에 이름 불일치하는 부분 찾아 올바른 id로 매핑해줍니다.

In [13]:
# Translate the name columns into id columns through the two lookup dicts.
# Names that are not keys of a dict come out as NaN and are repaired below.
rating = rating.assign(
    user_id=rating['username'].map(user2Idx),
    whisky_id=rating['whisky_name'].map(whisky2Idx),
)

In [14]:
# Whisky names as they appear (garbled) in the ratings, each paired with the
# correctly spelled name that is a key of whisky2Idx.
garbled_to_clean = [
    ("Bunnahabhain 2008 MÃ²ine Bordeaux Red Wine Cask Matured", "Bunnahabhain 2008 Mòine Bordeaux Red Wine Cask Matured"),
    ("Lagavulin FÃ¨is Ã\x8cle 2015", "Lagavulin Fèis Ìle 2015"),
    ("SÃ¤ntis Malt Appenzeller Single Malt Edition Dreifaltigkeit", "Säntis Malt Appenzeller Single Malt Edition Dreifaltigkeit"),
    ("Bruichladdich Laddie Origins FÃ¨is Ã\x8cle 2021", "Bruichladdich Laddie Origins Fèis Ìle 2021"),
    ("Bowmore 15 Year FÃ¨is Ã\x8cle 2019", "Bowmore 15 Year Fèis Ìle 2019"),
    ("Caol Ila 22 Year FÃ¨is Ã\x8cle 2019", "Caol Ila 22 Year Fèis Ìle 2019"),
    ("WhistlePig The Boss Hog ?? The Samurai Scientist", "WhistlePig The Boss Hog 六: The Samurai Scientist"),
    ("Parker's Heritage Barrel Finished in Orange CuraÃ§ao Barrels", "Parker's Heritage Barrel Finished in Orange Curaçao Barrels"),
    ('Port Charlotte PC11 EÃ²rna Na h-Alba', "Port Charlotte PC11 Eòrna Na h-Alba"),
    ('Waterford The CuvÃ©e', "Waterford The Cuvée"),
    ('Belle Meade Bourbon MourvÃ¨dre Cask Finish', "Belle Meade Bourbon Mourvèdre Cask Finish"),
    ('Mackmyra GrÃ¶nt Te', "Mackmyra Grönt Te"),
    ('Ledaig 1998 Pedro XimÃ©nez Sherry Cask Finish', 'Ledaig 1998 Pedro Ximénez Sherry Cask Finish'),
    ('Glenmorangie SpÃ¬os', "Glenmorangie Spìos"),
    ('Mackmyra Moment BÃ¤rnsten', "Mackmyra Moment Bärnsten"),
    ('Bunnahabhain CeÃ²banach', "Bunnahabhain Ceòbanach"),
]

# Garbled name -> whisky id; a clean name missing from whisky2Idx raises KeyError.
unmatched_dict = {
    garbled: whisky2Idx[clean] for garbled, clean in garbled_to_clean
}

In [15]:
# List the rating rows whose whisky_id is still missing.
rating[rating["whisky_id"].isna()]

Unnamed: 0,username,whisky_name,rating,user_id,whisky_id
8234,Steve-Pod,Bunnahabhain 2008 MÃ²ine Bordeaux Red Wine Cas...,8.0,573,
8235,EttParDroppar,Bunnahabhain 2008 MÃ²ine Bordeaux Red Wine Cas...,8.0,6558,
8236,Exelixi,Bunnahabhain 2008 MÃ²ine Bordeaux Red Wine Cas...,10.0,264,
8237,anwinterott,Bunnahabhain 2008 MÃ²ine Bordeaux Red Wine Cas...,9.0,6559,
8238,BCBourbon,Bunnahabhain 2008 MÃ²ine Bordeaux Red Wine Cas...,8.0,6560,
...,...,...,...,...,...
5421,Benjamin-Dumaz,Bunnahabhain CeÃ²banach,8.0,21752,
5422,Chase-Heck,Bunnahabhain CeÃ²banach,8.0,72797,
5423,Jason-T-Hauser,Bunnahabhain CeÃ²banach,8.0,2963,
5424,RequiresSugar,Bunnahabhain CeÃ²banach,8.0,4879,


In [16]:
# Fill the missing whisky ids by looking the garbled name up in unmatched_dict;
# the mask is computed once and reused on both sides of the assignment.
missing_id = rating["whisky_id"].isnull()
rating.loc[missing_id, "whisky_id"] = rating.loc[missing_id, "whisky_name"].map(unmatched_dict)

In [17]:
# Re-check for rating rows without a whisky_id after the repair.
still_missing = rating["whisky_id"].isna()
rating[still_missing]

Unnamed: 0,username,whisky_name,rating,user_id,whisky_id


#### whisky id mapping 완료
---
- 이제 rating 에서 필요없어진 username, whisky_name을 제거해줍니다.
- convert whisky_id dtype from float to int

In [18]:
# Preview the ratings frame with the id columns added.
rating

Unnamed: 0,username,whisky_name,rating,user_id,whisky_id
0,AngusBennion,Hibiki 21 Year,9.0,0,0.0
1,ANTHONY-DIGIROLAMO,Hibiki 21 Year,7.0,1,0.0
2,Heejin-Lim,Hibiki 21 Year,9.0,2,0.0
3,whisky_lover,Hibiki 21 Year,10.0,3,0.0
4,WhiskeyThreePutt,Hibiki 21 Year,9.0,4,0.0
...,...,...,...,...,...
11254,Cptndrinksalot,Triple Crown Blended Whiskey,4.0,4828,3534.0
11255,Will-Rosenfeld,Triple Crown Blended Whiskey,2.0,119513,3534.0
11256,Conrad-Hasper,Triple Crown Blended Whiskey,8.0,119514,3534.0
11257,LRScotchDude,Triple Crown Blended Whiskey,4.0,3123,3534.0


In [19]:
# Keep only the id columns and the score, in a fixed column order, and store
# whisky_id as an integer instead of a float.
id_columns = ["user_id", "whisky_id", "rating"]
processed_rating = (
    rating
    .drop(columns=['username', 'whisky_name'])
    .reindex(columns=id_columns)
)
processed_rating = processed_rating.astype({"whisky_id": int})
processed_rating.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 908176 entries, 0 to 11258
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    908176 non-null  int64  
 1   whisky_id  908176 non-null  int32  
 2   rating     888132 non-null  float64
dtypes: float64(1), int32(1), int64(1)
memory usage: 24.3 MB


## 4. whisky 파일 수정

In [20]:
# Preview the whisky metadata frame before the id column is added.
whisky

Unnamed: 0,link,image,name,avr_rating,category,location,total_rating,cost_rank,abv,cask_type,...,herbal,oily,full_bodied,rich,sweet,salty,vanilla,tart,fruity,floral
0,/spirits/hibiki-21-year,https://ip-distiller.imgix.net/images/spirits/...,Hibiki 21 Year,4.52,Blended,Japan,861,5,43.00,"ex-bourbon American oak, ex-sherry European oa...",...,30,20,80,80,85,15,20,25,85,50
1,/spirits/highland-park-18,https://ip-distiller.imgix.net/images/spirits/...,Highland Park 18 Year,4.47,Peated Single Malt,"Islands, Scotland",2987,4,43.00,ex-sherry,...,20,40,70,80,70,40,50,50,70,20
2,/spirits/michter-s-20-year-kentucky-straight-b...,https://ip-distiller.imgix.net/images/spirits/...,Michter's 20 Year Kentucky Straight Bourbon (2...,4.61,Bourbon,"Kentucky, USA",10,5,57.10,"new, charred American oak",...,0,15,80,90,85,5,30,25,35,0
3,/spirits/george-t-stagg-bourbon-fall-2019,https://ip-distiller.imgix.net/images/spirits/...,George T. Stagg Bourbon (Fall 2019),4.61,Bourbon,"Kentucky, USA",628,4,58.45,"new, charred American oak",...,50,20,60,60,45,0,60,60,45,0
4,/spirits/bowmore-mizunara-cask-finish,https://ip-distiller.imgix.net/images/spirits/...,Bowmore Mizunara Cask Finish,4.08,Peated Single Malt,"Islay, Scotland",22,5,53.90,"ex-bourbon, ex-sherry, new Mizunara oak",...,30,10,75,75,60,20,30,20,50,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3530,/spirits/wild-turkey-spiced,https://ip-distiller.imgix.net/images/spirits/...,Wild Turkey Spiced,3.00,Flavored Whiskey,"Kentucky, USA",13,1,43.00,"New, Charred American Oak",...,0,0,40,55,85,0,25,10,10,0
3531,/spirits/seagram-s-seven-crown-american-blende...,https://ip-distiller.imgix.net/images/spirits/...,Seagram's 7 Crown American Blended Whiskey,2.21,Blended American Whiskey,USA,237,1,40.00,oak,...,20,0,30,0,100,0,100,0,60,10
3532,/spirits/11-wells-single-malt-whiskey,https://ip-distiller.imgix.net/images/spirits/...,11 Wells Single Malt Whiskey,3.00,American Single Malt,"Minnesota, USA",2,3,42.00,"new, charred American oak",...,10,40,10,10,90,0,50,20,30,10
3533,/spirits/immortal-spirits-early-whiskey,https://ip-distiller.imgix.net/images/spirits/...,Immortal Spirits Early Whiskey,1.00,Other Whiskey,"Oregon, USA",1,2,44.50,"new, charred American oak",...,100,80,0,0,40,0,20,0,0,0


In [21]:
# Look up each whisky's id by its name; a name that is not a key of
# whisky2Idx would come out as NaN.
whisky['whisky_id'] = whisky['name'].map(whisky2Idx)

In [22]:
# Move the `whisky_id` column to the front of the frame.
# The column is taken out by name rather than by position: slicing off "the
# last column" would drop an unrelated feature column (and the insert would
# then fail on the duplicate name) whenever `whisky_id` already existed in
# the frame and so was not appended at the end by the assignment above.
whisky.insert(0, 'whisky_id', whisky.pop('whisky_id'))

In [23]:
# Preview the whisky frame with whisky_id as its first column.
whisky

Unnamed: 0,whisky_id,link,image,name,avr_rating,category,location,total_rating,cost_rank,abv,...,herbal,oily,full_bodied,rich,sweet,salty,vanilla,tart,fruity,floral
0,0,/spirits/hibiki-21-year,https://ip-distiller.imgix.net/images/spirits/...,Hibiki 21 Year,4.52,Blended,Japan,861,5,43.00,...,30,20,80,80,85,15,20,25,85,50
1,1,/spirits/highland-park-18,https://ip-distiller.imgix.net/images/spirits/...,Highland Park 18 Year,4.47,Peated Single Malt,"Islands, Scotland",2987,4,43.00,...,20,40,70,80,70,40,50,50,70,20
2,2,/spirits/michter-s-20-year-kentucky-straight-b...,https://ip-distiller.imgix.net/images/spirits/...,Michter's 20 Year Kentucky Straight Bourbon (2...,4.61,Bourbon,"Kentucky, USA",10,5,57.10,...,0,15,80,90,85,5,30,25,35,0
3,3,/spirits/george-t-stagg-bourbon-fall-2019,https://ip-distiller.imgix.net/images/spirits/...,George T. Stagg Bourbon (Fall 2019),4.61,Bourbon,"Kentucky, USA",628,4,58.45,...,50,20,60,60,45,0,60,60,45,0
4,4,/spirits/bowmore-mizunara-cask-finish,https://ip-distiller.imgix.net/images/spirits/...,Bowmore Mizunara Cask Finish,4.08,Peated Single Malt,"Islay, Scotland",22,5,53.90,...,30,10,75,75,60,20,30,20,50,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3530,3530,/spirits/wild-turkey-spiced,https://ip-distiller.imgix.net/images/spirits/...,Wild Turkey Spiced,3.00,Flavored Whiskey,"Kentucky, USA",13,1,43.00,...,0,0,40,55,85,0,25,10,10,0
3531,3531,/spirits/seagram-s-seven-crown-american-blende...,https://ip-distiller.imgix.net/images/spirits/...,Seagram's 7 Crown American Blended Whiskey,2.21,Blended American Whiskey,USA,237,1,40.00,...,20,0,30,0,100,0,100,0,60,10
3532,3532,/spirits/11-wells-single-malt-whiskey,https://ip-distiller.imgix.net/images/spirits/...,11 Wells Single Malt Whiskey,3.00,American Single Malt,"Minnesota, USA",2,3,42.00,...,10,40,10,10,90,0,50,20,30,10
3533,3533,/spirits/immortal-spirits-early-whiskey,https://ip-distiller.imgix.net/images/spirits/...,Immortal Spirits Early Whiskey,1.00,Other Whiskey,"Oregon, USA",1,2,44.50,...,100,80,0,0,40,0,20,0,0,0


In [24]:
# Summarise the whisky frame: column dtypes and non-null counts.
whisky.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3535 entries, 0 to 3534
Data columns (total 24 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   whisky_id     3535 non-null   int64  
 1   link          3535 non-null   object 
 2   image         3535 non-null   object 
 3   name          3535 non-null   object 
 4   avr_rating    3464 non-null   float64
 5   category      3535 non-null   object 
 6   location      3535 non-null   object 
 7   total_rating  3535 non-null   int64  
 8   cost_rank     3535 non-null   int64  
 9   abv           3535 non-null   float64
 10  cask_type     2981 non-null   object 
 11  smoky         3535 non-null   int64  
 12  peaty         3535 non-null   int64  
 13  spicy         3535 non-null   int64  
 14  herbal        3535 non-null   int64  
 15  oily          3535 non-null   int64  
 16  full_bodied   3535 non-null   int64  
 17  rich          3535 non-null   int64  
 18  sweet         3535 non-null 

## 파일 저장
---
- rating, whisky, whisky_index, user_index 기본 파일 생성

In [25]:
# Show the working directory before saving; the "../dataset/..." output paths
# are resolved against it.
os.getcwd()

'C:\\Users\\SSAFY\\Desktop\\Whisky\\preprocessing'

In [33]:
# Inspect the whisky lookup table before it is written out.
whisky_index

Unnamed: 0,whisky_id,name
0,1,Hibiki 21 Year
1,2,Highland Park 18 Year
2,3,Michter's 20 Year Kentucky Straight Bourbon (2...
3,4,George T. Stagg Bourbon (Fall 2019)
4,5,Bowmore Mizunara Cask Finish
...,...,...
3530,3531,Wild Turkey Spiced
3531,3532,Seagram's 7 Crown American Blended Whiskey
3532,3533,11 Wells Single Malt Whiskey
3533,3534,Immortal Spirits Early Whiskey


In [34]:
# Inspect the user lookup table before it is written out.
user_index

Unnamed: 0,user_id,username
0,1,AngusBennion
1,2,ANTHONY-DIGIROLAMO
2,3,Heejin-Lim
3,4,whisky_lover
4,5,WhiskeyThreePutt
...,...,...
119510,119511,Hyppolite-Pierre-Jr
119511,119512,Vladim-r-apek
119512,119513,virglynn1
119513,119514,Will-Rosenfeld


In [28]:
# Inspect the id-only ratings frame.
processed_rating

Unnamed: 0,user_id,whisky_id,rating
0,0,0,9.0
1,1,0,7.0
2,2,0,9.0
3,3,0,10.0
4,4,0,9.0
...,...,...,...
11254,4828,3534,4.0
11255,119513,3534,2.0
11256,119514,3534,8.0
11257,3123,3534,4.0


In [35]:
# Inspect the whisky metadata frame before it is written out.
whisky

Unnamed: 0,whisky_id,link,image,name,avr_rating,category,location,total_rating,price_tier,abv,...,herbal,oily,body,rich,sweet,salty,vanilla,tart,fruity,floral
0,0,/spirits/hibiki-21-year,https://ip-distiller.imgix.net/images/spirits/...,Hibiki 21 Year,9.01,Blended,Japan,861.0,5,43.00,...,30,20,80,80,85,15,20,25,85,50
1,1,/spirits/highland-park-18,https://ip-distiller.imgix.net/images/spirits/...,Highland Park 18 Year,8.89,Peated Single Malt,"Islands, Scotland",2988.0,4,43.00,...,20,40,70,80,70,40,50,50,70,20
2,2,/spirits/michter-s-20-year-kentucky-straight-b...,https://ip-distiller.imgix.net/images/spirits/...,Michter's 20 Year Kentucky Straight Bourbon (2...,9.00,Bourbon,"Kentucky, USA",10.0,5,57.10,...,0,15,80,90,85,5,30,25,35,0
3,3,/spirits/george-t-stagg-bourbon-fall-2019,https://ip-distiller.imgix.net/images/spirits/...,George T. Stagg Bourbon (Fall 2019),9.06,Bourbon,"Kentucky, USA",629.0,4,58.45,...,50,20,60,60,45,0,60,60,45,0
4,4,/spirits/bowmore-mizunara-cask-finish,https://ip-distiller.imgix.net/images/spirits/...,Bowmore Mizunara Cask Finish,7.86,Peated Single Malt,"Islay, Scotland",22.0,5,53.90,...,30,10,75,75,60,20,30,20,50,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3530,3530,/spirits/wild-turkey-spiced,https://ip-distiller.imgix.net/images/spirits/...,Wild Turkey Spiced,5.92,Flavored Whiskey,"Kentucky, USA",13.0,1,43.00,...,0,0,40,55,85,0,25,10,10,0
3531,3531,/spirits/seagram-s-seven-crown-american-blende...,https://ip-distiller.imgix.net/images/spirits/...,Seagram's 7 Crown American Blended Whiskey,4.31,Blended American Whiskey,USA,237.0,1,40.00,...,20,0,30,0,100,0,100,0,60,10
3532,3532,/spirits/11-wells-single-malt-whiskey,https://ip-distiller.imgix.net/images/spirits/...,11 Wells Single Malt Whiskey,6.00,American Single Malt,"Minnesota, USA",2.0,3,42.00,...,10,40,10,10,90,0,50,20,30,10
3533,3533,/spirits/immortal-spirits-early-whiskey,https://ip-distiller.imgix.net/images/spirits/...,Immortal Spirits Early Whiskey,2.00,Other Whiskey,"Oregon, USA",1.0,2,44.50,...,100,80,0,0,40,0,20,0,0,0


## user, whisky index 1부터 시작하도록 수정

In [36]:
# Shift both id columns of the ratings up by one so that ids start at 1.
# NOTE(review): not idempotent — running this cell again shifts the ids again.
# whisky2Idx above already numbers from 1, so confirm this shift is only
# applied to ids that are still 0-based.
rating['user_id'] = rating['user_id'] + 1
rating['whisky_id'] = rating['whisky_id'] + 1

In [37]:
# Inspect the ratings frame after the id shift.
rating

Unnamed: 0,user_id,whisky_id,rating
0,1,1,9.0
1,2,1,7.0
2,3,1,9.0
3,4,1,10.0
4,5,1,9.0
...,...,...,...
908171,4829,3535,4.0
908172,119514,3535,2.0
908173,119515,3535,8.0
908174,3124,3535,4.0


In [40]:
# Shift the whisky ids up by one so that they start at 1.
# NOTE(review): not idempotent — running this cell again shifts the ids again.
whisky['whisky_id'] = whisky['whisky_id'] + 1
whisky

Unnamed: 0,whisky_id,link,image,name,avr_rating,category,location,total_rating,price_tier,abv,...,herbal,oily,body,rich,sweet,salty,vanilla,tart,fruity,floral
0,1,/spirits/hibiki-21-year,https://ip-distiller.imgix.net/images/spirits/...,Hibiki 21 Year,9.01,Blended,Japan,861.0,5,43.00,...,30,20,80,80,85,15,20,25,85,50
1,2,/spirits/highland-park-18,https://ip-distiller.imgix.net/images/spirits/...,Highland Park 18 Year,8.89,Peated Single Malt,"Islands, Scotland",2988.0,4,43.00,...,20,40,70,80,70,40,50,50,70,20
2,3,/spirits/michter-s-20-year-kentucky-straight-b...,https://ip-distiller.imgix.net/images/spirits/...,Michter's 20 Year Kentucky Straight Bourbon (2...,9.00,Bourbon,"Kentucky, USA",10.0,5,57.10,...,0,15,80,90,85,5,30,25,35,0
3,4,/spirits/george-t-stagg-bourbon-fall-2019,https://ip-distiller.imgix.net/images/spirits/...,George T. Stagg Bourbon (Fall 2019),9.06,Bourbon,"Kentucky, USA",629.0,4,58.45,...,50,20,60,60,45,0,60,60,45,0
4,5,/spirits/bowmore-mizunara-cask-finish,https://ip-distiller.imgix.net/images/spirits/...,Bowmore Mizunara Cask Finish,7.86,Peated Single Malt,"Islay, Scotland",22.0,5,53.90,...,30,10,75,75,60,20,30,20,50,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3530,3531,/spirits/wild-turkey-spiced,https://ip-distiller.imgix.net/images/spirits/...,Wild Turkey Spiced,5.92,Flavored Whiskey,"Kentucky, USA",13.0,1,43.00,...,0,0,40,55,85,0,25,10,10,0
3531,3532,/spirits/seagram-s-seven-crown-american-blende...,https://ip-distiller.imgix.net/images/spirits/...,Seagram's 7 Crown American Blended Whiskey,4.31,Blended American Whiskey,USA,237.0,1,40.00,...,20,0,30,0,100,0,100,0,60,10
3532,3533,/spirits/11-wells-single-malt-whiskey,https://ip-distiller.imgix.net/images/spirits/...,11 Wells Single Malt Whiskey,6.00,American Single Malt,"Minnesota, USA",2.0,3,42.00,...,10,40,10,10,90,0,50,20,30,10
3533,3534,/spirits/immortal-spirits-early-whiskey,https://ip-distiller.imgix.net/images/spirits/...,Immortal Spirits Early Whiskey,2.00,Other Whiskey,"Oregon, USA",1.0,2,44.50,...,100,80,0,0,40,0,20,0,0,0


In [42]:
# Write the four tables to ../dataset as UTF-8 CSVs. The row index is written
# as the first column (to_csv default); the read_csv calls in this notebook
# read it back with index_col=0.
# NOTE(review): this saves `rating`, not `processed_rating` — confirm that
# `rating` holds only user_id / whisky_id / rating at this point.
whisky_index.to_csv("../dataset/whisky_index.csv", encoding="UTF-8")
user_index.to_csv("../dataset/user_index.csv", encoding="UTF-8")
rating.to_csv("../dataset/rating.csv", encoding="UTF-8")
whisky.to_csv("../dataset/whisky.csv", encoding="UTF-8")