<a href="https://colab.research.google.com/github/mj-woo/Dataverse/blob/main/Dataverse_Uploading_Movie_Dataset_Automation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Modules / Set API Token

In [None]:
import json
import requests
from requests import Response
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Load the API tokens of every external service from a local JSON file so that
# no secret is hard-coded in the notebook.
# (A `global` statement has no effect at module level, so none is needed here.)
with open('./sample_data/credentials.json', 'r', encoding='utf-8') as file:
    credentials = json.load(file)

kobis_key: str = credentials["kobis_key"]  # kobis api token
kmdb_key: str = credentials["kmdb_key"]  # kmdb api token
tmdb_key: str = credentials["tmdb_key"]  # tmdb api token
dataverse_key: str = credentials["dataverse_key"]  # dataverse api token

# Functions

## get_kobis_movie_code(movie_name: str)

In [None]:
def get_kobis_movie_code(movie_name: str) -> list:
    '''
    get movie identification code(s) ("movieCd") from kobis api
    :param movie_name: movie title
    :return: a list of kobis movie identification codes of the movies whose title exactly
             matches movie_name and whose production country is South Korea
             (empty list when nothing matches)
    '''
    # movie list API request url
    url: str = "http://kobis.or.kr/kobisopenapi/webservice/rest/movie/searchMovieList.json"

    # query parameter (a single request is made, so at most 100 search results are inspected)
    params: dict = {"key": kobis_key,
                    "itemPerPage": 100,
                    "movieNm": movie_name}
    # HTTP response
    response: Response = requests.get(url, params=params)

    # JSON deserialize
    response_json = json.loads(response.text)

    # read "movieList" with a fallback: a result without "totCnt"/"movieList" then yields
    # an empty list instead of raising TypeError on `None > 0`
    movie_list = response_json["movieListResult"].get("movieList") or []

    # keep movieCd only if the movie title exactly matches and the production country is South Korea
    return [movie["movieCd"] for movie in movie_list
            if movie["movieNm"] == movie_name and movie["repNationNm"] in ("한국", "대한민국")]

In [None]:
# Example: look up the kobis movie codes for the title '마녀'
get_kobis_movie_code('마녀')

['20229257', '20228817', '20185341', '20136864']

## get_kobis_metadata(movie_code: str)

In [None]:
def get_kobis_metadata(movie_code: str):
    '''
    fetch the detailed information of a single movie from the kobis API
    :param movie_code: kobis movie identification code
    :return: the "movieInfo" object of the kobis response
    '''
    # movie information endpoint
    endpoint: str = "http://www.kobis.or.kr/kobisopenapi/webservice/rest/movie/searchMovieInfo.json"

    # send the API key together with the requested movie code
    query: dict = {"key": kobis_key, "movieCd": movie_code}
    reply: Response = requests.get(endpoint, params=query)

    # decode the JSON body and unwrap the movie information
    payload = json.loads(reply.text)
    movie_info_result = payload["movieInfoResult"]
    return movie_info_result['movieInfo']

In [None]:
# get_kobis_metadata expects a single movie code (str), so pass the first code that was found
get_kobis_metadata(get_kobis_movie_code('마녀')[0])

## get_kmdb_metadata(movie_name: str)

In [None]:
def get_kmdb_metadata(movie_name: str):
    '''
    get movie metadata from kmdb api
    :param: movie_name: movie name
    :return: kmdb movie metadata (the "Result" entry of the first "Data" element),
             or None when the search has no result
    '''

    url: str = "http://api.koreafilm.or.kr/openapi-data2/wisenut/search_api/search_json2.jsp?collection=kmdb_new2"

    params: dict = {"ServiceKey": kmdb_key,
                    "listCount": 1000,
                    "title": movie_name}

    # HTTP response
    response: Response = requests.get(url, params=params)

    # JSON deserialize
    response_json = json.loads(response.text)
    if response_json["TotalCount"] > 0:
        return response_json["Data"][0]["Result"]

    # made explicit: the "no result" case returns None
    return None

In [None]:
# Example: fetch the kmdb search results for the title '히어로'
get_kmdb_metadata('히어로')

## producer(company), distributor(company), posterUrl(posters)

In [None]:
def producer(company: dict):
    '''
    return the company name when the given company is a producer
    :param: company: company information (an item of kobis_metadata["companys"])
    :return: company name when the company part is "제작사" (producer), otherwise None
    '''
    is_producer = company["companyPartNm"] == "제작사"
    return company["companyNm"] if is_producer else None

def distributor(company: dict):
    '''
    return the company name when the given company is a distributor
    :param: company: company information (an item of kobis_metadata["companys"])
    :return: company name when the company part is "배급사" (distributor), otherwise None
    '''
    # guard clause: anything that is not a distributor yields None
    if company["companyPartNm"] != "배급사":
        return None
    return company["companyNm"]

def posterUrl(posters: str):
    '''
    get the first poster url and convert its http:// scheme into https:// for a security reason
    :param: posters: poster urls separated by "|"
    :return: a list holding the first poster url (a list holding "" when posters is empty)
    '''
    first_poster = posters.split("|")[0]
    # rewrite the scheme prefix only: a blanket replace("http", "https") would turn an
    # already secure "https://" url into "httpss://" and also touch "http" inside the path
    if first_poster.startswith("http://"):
        first_poster = "https://" + first_poster[len("http://"):]
    return [first_poster]

## change_genre(kobis_genre_list)

In [None]:
def change_genre(kobis_genre_list: list):
    '''
    change genre names
    :param: kobis_genre_list: movie genres (modified in place)
    :return: the same list with changed movie genre names
    '''
    # kobis genre name -> replacement genre name; genres not listed here are kept as they are
    genre_map = {
        '미스터리': '스릴러',
        '가족': '드라마',
        '판타지': 'SF/판타지',
        '전쟁': '액션',
        'SF': 'SF/판타지',
        '멜로/로맨스': '로맨스',
        '공포(호러)': '공포',
    }
    for index, genre in enumerate(kobis_genre_list):
        kobis_genre_list[index] = genre_map.get(genre, genre)
    return kobis_genre_list

def genre_modification(kobis_genre_list: list):
    '''
    modify kobis movie genres to customize to our movie trailer website genres
    :param: kobis_genre_list: movie genres
    :return: a list with modified movie genre names, or -1 when the movie must be skipped
    '''
    # work on a copy so that the caller's list is left untouched
    genres = list(kobis_genre_list)

    # skip movies with 'western', 'adult (erotic)', 'documentary', 'other' genres
    blacklist = {'서부극(웨스턴)', '성인물(에로)', '다큐멘터리', '기타'}
    if any(genre in blacklist for genre in genres):
        return -1

    # process 'performance', 'musical' genres:
    # skip the movie when it is the only genre left, otherwise drop the genre
    for excluded in ('공연', '뮤지컬'):
        if excluded in genres:
            if len(genres) == 1:
                return -1
            genres.remove(excluded)

    # process 'historical play' genre: replace it with 'drama' when it is the only genre,
    # otherwise drop it
    if '사극' in genres:
        if len(genres) == 1:
            genres[0] = '드라마'
        else:
            genres.remove('사극')

    genres = change_genre(genres)

    # eliminate any redundant genre; dict.fromkeys keeps the first-seen order,
    # whereas the order of list(set(...)) is arbitrary
    return list(dict.fromkeys(genres))

## trailer(movieCd): choose the trailer url by using BeautifulSoup

In [None]:
vodUrl_list = []

def trailer(movieCd: str):
    '''
    use BeautifulSoup to scrape movie trailer url(s) from kmdb website and append to vodUrl_list
    :param: movieCd: kmdb movie identification code (its first character and the rest are
                     used as two separate parts of the page url)
    :return: None (the global vodUrl_list is emptied first and then filled)
    '''
    vodUrl_list.clear()
    alpha = movieCd[0]
    number = movieCd[1:]
    # the f-string already inserts both parts, so no extra .format() call is needed
    url = f"http://kmdb.or.kr/db/kor/detail/movie/{alpha}/{number}/own/videoData"
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')

    for li in soup.find_all('li', class_='pt0'):
        label = li.find('span', attrs={'class': "timeline-musicvideo"})
        # skip list items that lack the label or the link instead of failing on None
        if label is None or li.a is None or not li.a.get('href'):
            continue
        # keep only the items labelled '예고편' (trailer)
        if label.get_text() != '예고편':
            continue
        # characters 20..35 of the link are used as the pFileNm value of the player url
        result = li.a['href'][20:36]
        vodUrl_list.append(f"https://www.kmdb.or.kr/trailer/trailerPlayPop?pFileNm={result}")

## write(metadata_base): create json file

In [None]:
def write(metadata_base: dict):
    '''
    create a json file named after the movie title under ./sample_data/movie/
    :param: metadata_base: movie metadata (must contain "title")
    :return: None
    '''
    # a "/" inside the title would be read as a directory separator, so replace it
    title = str(metadata_base["title"]).replace("/", "_")
    file_path = f"./sample_data/movie/{title}.json"
    json_data = json.dumps(metadata_base, ensure_ascii=False)
    # ensure_ascii=False keeps non-ASCII text as-is, so open the file as UTF-8 explicitly
    # instead of relying on the platform default encoding
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(json_data)

## get_metadata_per_movie(movie_name), get_movie_metadata(kobis_metadata, movie_name), make_film_dataset(movie_name, movie_metadata)

In [None]:
def get_metadata_per_movie(movie_name: str):
    '''
    for every kobis movie code found for movie_name, build the movie metadata and upload it on Dataverse as a separate dataset
    :param: movie_name: movie name
    :return: None
    '''
    for movie_code in get_kobis_movie_code(movie_name):
        metadata = get_movie_metadata(get_kobis_metadata(movie_code), movie_name)
        if metadata != -1:
            # the metadata could be built: upload it and show the server answer
            print(movie_name)
            print(make_film_dataset(movie_name, metadata))
        else:
            # -1 marks a movie that must not be uploaded
            print("No Match .. Skipping this movie")

In [None]:
def _find_kmdb_match(kmdb_metadata: list, director_name: str):
    '''
    find the first kmdb search result that lists the given director
    :param: kmdb_metadata: kmdb search results
    :param: director_name: director name to look for
    :return: the matching kmdb movie, or None when no result matches
    '''
    for movie in kmdb_metadata:
        # tolerate results without director information
        directors = (movie.get("directors") or {}).get("director") or []
        for director in directors:
            if director.get("directorNm") == director_name:
                return movie
    return None


def get_movie_metadata(kobis_metadata: dict, movie_name: str):
    '''
    merge the kobis and kmdb metadata of one movie into a dict, save it as a json file and return it
    :param: kobis_metadata: kobis movie metadata of the movie
    :param: movie_name: movie name (used to search kmdb)
    :return: json metadata format to upload on Dataverse dataset, or -1 when the movie must be skipped
    '''
    # Do not upload on dataverse if directors value is null
    # (checked before the kmdb request so that no request is spent on a movie that is skipped anyway)
    if not kobis_metadata["directors"]:
        print('No director')
        return -1

    # Do not upload on dataverse if kmdb metadata value is null
    kmdb_metadata = get_kmdb_metadata(movie_name)
    if kmdb_metadata is None:
        print("No matching movie (kobis & KMDB)")
        return -1

    # Do not upload on dataverse if no kmdb result lists the first kobis director
    movie_match = _find_kmdb_match(kmdb_metadata, kobis_metadata["directors"][0]["peopleNm"])
    if movie_match is None:
        print("No matching movie (kobis & KMDB)")
        return -1

    # keep only the first producer / distributor name, if there is any
    first_producer = next(
        (name for name in map(producer, kobis_metadata["companys"]) if name is not None), None)
    first_distributor = next(
        (name for name in map(distributor, kobis_metadata["companys"]) if name is not None), None)

    # Do not upload on dataverse if the genres are excluded by genre_modification
    genre = genre_modification([genre_item["genreNm"] for genre_item in kobis_metadata["genres"]])
    if genre == -1:
        return -1

    synopsis = movie_match["plots"]["plot"][0]

    # unique keywords among the first seven, at most five of them;
    # dict.fromkeys keeps the original order, whereas set() would pick an arbitrary subset
    keywords = list(dict.fromkeys(movie_match["keywords"].split(",")[:7]))[:5]

    # posterUrl returns a list, so the former comparison with "" could never match;
    # fall back to the placeholder when there is no non-empty url
    poster = posterUrl(movie_match["posters"])
    if not any(poster):
        poster = "정보 없음"

    # trailer() fills the global vodUrl_list; keep at most three trailer urls
    trailer(movie_match["DOCID"])
    vod_urls = vodUrl_list[:3]

    # Do not upload on dataverse if vodUrl value is null (as the focus is on creating a movie trailer website; may later be customized)
    if not vod_urls:
        return -1

    metadata_base = {
        "title": kobis_metadata["movieNm"],
        "titleEng": kobis_metadata["movieNmEn"],
        "genre": genre,
        "synopsis": synopsis,
        "openDate": kobis_metadata["openDt"],
        "runningTimeMinute": kobis_metadata["showTm"],
        "actors": [actor["peopleNm"] for actor in kobis_metadata["actors"][:5]],
        "directors": [director["peopleNm"] for director in kobis_metadata["directors"]],
        "producer": [] if first_producer is None else [first_producer],
        "distributor": [] if first_distributor is None else [first_distributor],
        "keywords": keywords,
        "posterUrl": poster,
        "vodUrl": vod_urls,
    }

    write(metadata_base)
    return metadata_base

In [None]:
def make_film_dataset(movie_name: str, movie_metadata: dict):
    '''
    create one dataset in the "movies" collection of the SNU dataverse
    :param: movie_name: movie name; written into the first citation field (dataset title)
    :param: movie_metadata: movie metadata; stored as JSON text in the dataset description
    :return: body text of the REST API response
    '''
    parent = "movies"
    server_url = "https://snu.dataverse.ac.kr"
    url = f"{server_url}/api/dataverses/{parent}/datasets"

    headers = {"X-Dataverse-key": dataverse_key,
               "Content-Type": "application/json"}

    # start from the default dataset template
    with open("./sample_data/dataset-default.json", "r", encoding='utf-8') as template_file:
        body: dict = json.load(template_file)

    # fill the two citation fields that differ per movie
    citation_fields = body["datasetVersion"]["metadataBlocks"]["citation"]["fields"]
    citation_fields[0]["value"] = movie_name
    description = json.dumps(movie_metadata, ensure_ascii=False)
    citation_fields[3]["value"][0]["dsDescriptionValue"]["value"] = description

    return requests.post(url, headers=headers, json=body).text

# Clear all datasets in a dataverse collection

In [None]:
def clear_dataverse_collection():
    '''
    delete every dataset listed in the "movies" dataverse collection and print what is left
    :param: None
    :return: None
    '''
    url = "https://snu.dataverse.ac.kr/api/dataverses/movies/contents"
    headers = {"X-Dataverse-key": dataverse_key}

    response = requests.get(url, headers=headers)
    body = json.loads(response.text)
    for item in body["data"]:
        # the contents listing can also hold sub-collections (type "dataverse");
        # only datasets may be sent to the dataset delete endpoint
        if item.get("type", "dataset") != "dataset":
            continue
        dataset_id = item["id"]
        dataset_url = f"https://snu.dataverse.ac.kr/api/datasets/{dataset_id}"
        delete_response = requests.delete(dataset_url, headers=headers)
        # report a failed deletion instead of ignoring it
        if not delete_response.ok:
            print(f"Failed to delete dataset {dataset_id}: {delete_response.text}")

    # list the collection again to show the remaining contents
    response = requests.get(url, headers=headers)
    body = json.loads(response.text)
    print(body)

In [None]:
# WARNING: this sends delete requests for the contents of the "movies" collection on the Dataverse server
clear_dataverse_collection()

{'status': 'OK', 'data': []}


# Movie List to Upload (Read from KOBIS excel file)
### Total: 5505 rows in raw dataset (1450 uploaded on Dataverse)

In [None]:
# location of the KOBIS movie list excel file
file_path = "./sample_data/kobis_movies_list 3.xlsx"
movie_list = pd.read_excel(file_path)
# discard the first three rows and renumber the remaining ones
movie_list = movie_list.drop(range(3))
movie_list.reset_index(drop = True, inplace = True)

# use the first remaining row as the column names, then remove that row from the data
new_column_names = movie_list.iloc[0]
movie_list.columns = new_column_names
movie_list = movie_list.iloc[1:].reset_index(drop = True)
# preview the first five rows
movie_list[:5]

Unnamed: 0,영화명,영화명(영문),제작연도,제작국가,유형,장르,제작상태,감독,제작사,'성인물(에로)' 포함 여부,감독 2명 이상 여부
0,퀴어 마이 프렌즈,Queer My Friends,2022,한국,장편,"다큐멘터리,드라마,가족",개봉예정,서아현,시소픽쳐스,0,0
1,데시벨,Decibel,2021,한국,장편,액션,개봉,황인호,"이스트드림시노펙스(주),(주)기억속의매미",0,0
2,잠,Sleep,2022,한국,장편,미스터리,개봉예정,유재선,(주)루이스 픽쳐스,0,0
3,낭만여행,Romance trip,2023,한국,장편,"코미디,드라마",개봉예정,박동기,낭만필름,0,0
4,더 문,The Moon,2023,한국,장편,"SF,액션,드라마",개봉예정,김용화,(주)블라드스튜디오,0,0


In [None]:
# build and upload the metadata for every movie title in the "영화명" (movie name) column
for movie in movie_list["영화명"]:
  get_metadata_per_movie(movie)