### Vivino API[explore]

 - country_code : shipping country
 - state : shipping location
 - currency_code
 - country_codes : wine country code list
 - grape_filter
 - min_rating
 - order_by
 - order
 - price_range_max : ~500
 - price_range_min : 0~
 - wine_type_ids : (1, 2, 3, 4, 7, 24)

In [4]:
# Vivino 수집 클래스

import time
import requests
import pandas as pd

class VivinoAPI:
    """Small HTTP helper for the Vivino JSON endpoints used in this notebook.

    Every request is sent with the browser-like ``headers`` below.
    """

    # Request headers (User-Agent and referer) attached to every call.
    headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
               "referer":"https://www.vivino.com/explore?e=eJzLLbI1VMvNzLM1UMtNrLA1NTBQS660DQ1WSwYSLmoFQNn0NNuyxKLM1JLEHLX8ohTblNTiZLX8pErbgqLM5FS18pLoWKAqMGUEoUwglDmEMobKmQBNLgYyEovUim2dHQF9pSex"}

    def get(self, url, **kwargs):
        """Send a GET request and return the response headers and decoded JSON.

        Args:
            url: Endpoint to request.
            **kwargs: Query-string parameters, forwarded as ``params``.

        Returns:
            A dict ``{"headers": <response headers>, "json": <decoded body>}``
            for a 200 response.

        Raises:
            requests.HTTPError: For any status other than 200 or 429.
            requests.RequestException: If the request itself fails.
        """
        # ``requests.get`` does not raise on HTTP error statuses, so the status
        # has to be inspected on the normal path. A loop (instead of recursion)
        # keeps long rate-limit waits from growing the call stack.
        while True:
            r = requests.get(url, headers=self.headers, params=kwargs)
            time.sleep(0.02)

            if r.status_code == 200:
                return {"headers": r.headers, "json": r.json()}

            if r.status_code == 429:
                # Rate limited: wait, then send the same request again.
                time.sleep(10)
                continue

            print(f"status code :{r.status_code}")
            # Raises for 4xx/5xx; the explicit raise below covers any other
            # non-200 status so callers never receive None.
            r.raise_for_status()
            raise requests.HTTPError(
                f"unexpected status code {r.status_code} for {url}", response=r
            )

In [5]:
# 생산국 정보 수집

import os

# Client used for the country download below.
api = VivinoAPI()

# Create the local "csv" directory when it is not there yet.
if not os.path.exists("./csv"):
    os.mkdir("csv")

# Download the country list and keep only the four columns used later on.
countries = api.get("https://www.vivino.com/api/countries?language=en")["json"]
country_columns = ["code", "name", "users_count", "wines_count"]
df_country = pd.DataFrame(countries["countries"], columns=country_columns)
df_country.to_csv(".../csv/scraping/country.csv", index=False)

In [6]:
# Create the CSV files (header row only) that the collection cells append to.

# The wine rows are later appended to wines.csv with header=False and read
# back with pd.read_csv, so the header has to be written to that same file.
df_wine = pd.DataFrame(columns=["winery","year","id","name","title","acidity","intensity","sweetness","tannin","price","type_id","country"])
df_wine.to_csv(".../csv/scraping/wines.csv", index=False)

# Same for the reviews file.
df_review = pd.DataFrame(columns=["id", "year", "wine_id", "user_rating", "user_id"])
df_review.to_csv(".../csv/scraping/reviews.csv", index=False)

In [7]:
# Build the collection-status CSV: one row per (country, wine type) pair.

wine_type_ids = ["1", "2", "3", "4", "7", "24"]

# Only countries that report at least one wine are tracked.
has_wines = df_country["wines_count"] != 0
country_codes = df_country.loc[has_wines, "code"].to_list()

# Every country is combined with every wine type, countries varying slowest.
pairs = [(code, type_id) for code in country_codes for type_id in wine_type_ids]

# wines_cnt starts at -1 and latest_idx at 0 for every pair.
d = {
    "country_code": [code for code, _ in pairs],
    "wine_type_id": [type_id for _, type_id in pairs],
    "wines_cnt": [-1] * len(pairs),
    "latest_idx": [0] * len(pairs),
}

df_status = pd.DataFrame(d)
df_status.to_csv(".../csv/scraping/status.csv", index=False)

In [8]:
# 와인 정보 수집 함수

def _dig(record, *keys, default=-1):
    """Follow ``keys`` through nested containers; return ``default`` on a gap."""
    try:
        for key in keys:
            record = record[key]
    except (TypeError, KeyError, IndexError):
        # A None/non-subscriptable level or a missing key both mean "no value".
        return default
    return record


def get_wines(country_code:str, wine_type_id:int):
    """Collect the explore-API wine listing for one country and wine type.

    Args:
        country_code: Wine country code, sent as ``country_codes[]``.
        wine_type_id: Wine type id, sent as ``wine_type_ids[]``.

    Returns:
        A DataFrame with the columns winery, year, id, name, title, acidity,
        intensity, sweetness, tannin, price, type_id and country. Rows that
        repeat an (id, year) pair are dropped and the index is reset. A field
        whose path cannot be followed in the response is stored as -1.
    """
    url = "https://www.vivino.com/api/explore/explore?per_page=50"

    # Output column -> path of keys inside one entry of the "matches" list.
    field_paths = (
        ("winery", ("vintage", "wine", "winery", "name")),
        ("year", ("vintage", "year")),
        ("id", ("vintage", "wine", "id")),
        ("name", ("vintage", "wine", "name")),
        ("title", ("vintage", "name")),
        ("acidity", ("vintage", "wine", "taste", "structure", "acidity")),
        ("intensity", ("vintage", "wine", "taste", "structure", "intensity")),
        ("sweetness", ("vintage", "wine", "taste", "structure", "sweetness")),
        ("tannin", ("vintage", "wine", "taste", "structure", "tannin")),
        ("price", ("price", "amount")),
        ("type_id", ("vintage", "wine", "type_id")),
        ("country", ("vintage", "wine", "region", "country", "name")),
    )
    columns = [column for column, _ in field_paths]
    result = []

    params = {
        "country_code":"us",
        "state":"ca",
        "currency_code":"USD",
        "country_codes[]":[country_code],
        "grape_filter":"varietal",
        "min_rating":"1",
        "order_by":"price",
        "order":"asc",
        "price_range_max":"500",
        "price_range_min":"0",
        "wine_type_ids[]":[wine_type_id],
        "page":1
    }
    client = VivinoAPI()

    # The first request is only used to learn how many records match.
    total_cnt = client.get(url, **params)["json"]["explore_vintage"]["records_matched"]
    # Pages needed at 50 records per page, plus one extra page (same upper
    # bound as before this rewrite).
    total_page = -(-total_cnt // 50) + 1

    while params["page"] <= total_page:
        payload = client.get(url, **params)["json"]

        for match in payload["explore_vintage"]["matches"]:
            result.append([_dig(match, *path) for _, path in field_paths])

        print(f"\rGetting wine data about country:{country_code} wine_type:{wine_type_id} page {params['page']}:total {total_page}", end=" ")
        params["page"] += 1

    _df = pd.DataFrame(data=result, columns=columns)
    _df.drop_duplicates(["id", "year"], inplace=True)
    _df.reset_index(drop=True, inplace=True)
    print("")
    print("Complete Getting Wines")
    return _df

In [9]:
# 리뷰 수집 함수

def get_reviews(wine_id, year):
    """Collect every review page for one wine, optionally limited to a year.

    Args:
        wine_id: Wine id used in the reviews endpoint path.
        year: Vintage year to filter on. A missing value (None / NaN) requests
            the reviews without a year filter.

    Returns:
        A DataFrame with the columns id, year, wine_id, user_rating and
        user_id, one row per review. ``wine_id`` is stored as an int when it
        was passed as a whole-number float.
    """
    def _as_int_if_whole(value):
        # Values taken from a pandas row can arrive as floats (1234.0); they
        # would otherwise be rendered as "1234.0" in the URL and query string.
        if isinstance(value, float) and value.is_integer():
            return int(value)
        return value

    wine_id = _as_int_if_whole(wine_id)
    year = _as_int_if_whole(year)
    year_missing = str(year) == "nan" or bool(pd.isna(year))

    url = f"https://www.vivino.com/api/wines/{wine_id}/reviews"
    params = {"per_page": 50}
    if not year_missing:
        params["year"] = year

    result = []
    page = 1
    requester = VivinoAPI()

    while True:
        print(f"\r Getting reviews about wine {wine_id}-{year} Page {page}", end="")
        params["page"] = page
        r = requester.get(url, **params)["json"]

        # An empty page marks the end of the listing.
        if not r["reviews"]:
            print("")
            break

        for review in r["reviews"]:
            result.append([
                review["id"],
                review["vintage"]["year"],
                wine_id,
                review["rating"],
                review["user"]["id"]
            ])

        page += 1

    print("Complete Getting Reviews")
    return pd.DataFrame(data=result, columns=["id", "year", "wine_id", "user_rating", "user_id"])

In [None]:
# Collect wine information for every (country, wine type) pair not done yet.

df_status = pd.read_csv(".../csv/scraping/status.csv")

for i, r in df_status.iterrows():
    # wines_cnt is -1 until the pair has been collected.
    if r["wines_cnt"] != -1:
        continue

    wines = get_wines(r["country_code"], r["wine_type_id"])
    wines.to_csv(".../csv/scraping/wines.csv", header=False, index=False, mode="a")

    # Record the number of collected rows and save the status right away, so
    # pairs that are already done are skipped when the cell is run again.
    same_country = df_status["country_code"] == r["country_code"]
    same_type = df_status["wine_type_id"] == r["wine_type_id"]
    df_status.loc[same_country & same_type, "wines_cnt"] = len(wines)
    df_status.to_csv(".../csv/scraping/status.csv", index=False)

In [None]:
# Recompute wines_cnt in the status CSV from the collected wines.

# The counts come from the wine rows, so wines.csv must be loaded here;
# status.csv has no "country" / "type_id" columns to filter on.
df_wine = pd.read_csv(".../csv/scraping/wines.csv")

for i, r in df_status.iterrows():
    # wines.csv stores the country name, the status table the country code.
    country_name = df_country.loc[df_country["code"]==r["country_code"], "name"].values[0]

    in_group = (df_wine["country"]==country_name)&(df_wine["type_id"]==r["wine_type_id"])
    id_cnt = len(df_wine.loc[in_group, ["id"]].drop_duplicates())

    status_row = (df_status["country_code"]==r["country_code"])&(df_status["wine_type_id"]==r["wine_type_id"])
    df_status.loc[status_row, "wines_cnt"] = id_cnt

df_status.to_csv(".../csv/scraping/status.csv", index=False)

In [None]:
# Collect reviews for every wine, resuming from the progress in status.csv.
#
# latest_idx is treated as the number of wines already finished for a
# (country, wine type) pair, i.e. the position of the next wine to fetch.
# Starting from the initial value 0 this also covers pairs with a single wine,
# which a "wines_cnt == latest_idx + 1" test would skip without fetching.
# NOTE: a status file written with the previous meaning (index of the last
# finished wine) still works; the last finished wine is fetched once more.

df_status = pd.read_csv(".../csv/scraping/status.csv")
df_wine = pd.read_csv(".../csv/scraping/wines.csv")
df_country = pd.read_csv(".../csv/scraping/country.csv")

for i, r in df_status.iterrows():
    if r["wines_cnt"]==0:
        continue

    country_name = df_country.loc[df_country["code"]==r["country_code"], "name"].values[0]
    wines = df_wine.loc[(df_wine["country"]==country_name)&(df_wine["type_id"]==r["wine_type_id"]), ["id", "year"]].drop_duplicates(["id"])
    wines.reset_index(drop=True, inplace=True)

    start = int(r["latest_idx"])
    if start >= len(wines):
        # Every wine of this pair has been processed already.
        continue

    status_row = (df_status["country_code"]==r["country_code"])&(df_status["wine_type_id"]==r["wine_type_id"])

    # Iterate the two columns directly: iterrows() would build one Series per
    # row and turn the integer id into a float whenever "year" is a float
    # column.
    remaining = zip(wines["id"].iloc[start:], wines["year"].iloc[start:])
    for j, (wine_id, year) in enumerate(remaining, start=start):
        reviews = get_reviews(wine_id, year)
        reviews.to_csv(".../csv/scraping/reviews.csv", header=False, index=False, mode='a')

        # Save progress after every wine so an interrupted run can resume.
        df_status.loc[status_row, "latest_idx"] = j + 1
        df_status.to_csv(".../csv/scraping/status.csv", index=False)