In [122]:
import csv
import json
import os
import random
import re

import numpy as np
import pandas as pd
import scipy.spatial.distance as scidist
from sklearn.preprocessing import normalize

# Installation

Go to [https://datasets.imdbws.com/](https://datasets.imdbws.com/), download `title.basics.tsv.gz` and `title.ratings.tsv.gz`, and unzip them into this folder (the notebook reads `title.basics.tsv` and `title.ratings.tsv` from the working directory). For the covers, go to [http://mmcv.csie.ncku.edu.tw/~wtchu/projects/MoviePoster/](http://mmcv.csie.ncku.edu.tw/~wtchu/projects/MoviePoster/), download the Movie Poster Dataset, and put all images into a folder named `covers` inside this folder (e.g., with `mv ~/Downloads/Movie_Poster_Dataset/**/*.jpg data/imdb/covers`).

In [114]:
# Load the two IMDb dumps from the working directory.
# `ratings` is read with pandas' default type inference.
ratings = pd.read_csv('title.ratings.tsv', sep='\t')

# sucks, but it's mixed data everywhere -> read every title.basics column as a plain string
basics_columns = [
    'tconst',
    'titleType',
    'primaryTitle',
    'originalTitle',
    'isAdult',
    'startYear',
    'endYear',
    'runtimeMinutes',
    'genres',
]
movies = pd.read_csv(
    'title.basics.tsv',
    sep="\t",
    header=0,
    # The file is plain tab-separated text, not quoted CSV. With the default
    # quote handling a literal `"` inside a title can make the parser glue
    # several fields (or whole rows) together, so quote processing is disabled.
    quoting=csv.QUOTE_NONE,
    converters=dict.fromkeys(basics_columns, str),
)


In [115]:
# keep only rows flagged as non-adult whose title type is "movie"
# (both columns were loaded as strings, hence the comparison against "0")
movie_filter = '(isAdult == "0") & (titleType == "movie")'
movies = movies.query(movie_filter)

In [116]:
# read covers!
# Each poster is expected to be named "<tconst>.jpg"; the IMDb id is recovered
# from the file name so the covers can later be joined on `tconst`.
tconst_re = r'(tt\d+)\.jpg'  # raw string: `\d` and `\.` are regex escapes, not string escapes

# Match every directory entry once and drop the ones that are not posters
# (hidden files, thumbnails, ...) instead of crashing on `None.group(1)`.
cover_matches = [(name, re.match(tconst_re, name)) for name in os.listdir('./covers')]
cover_matches = [(name, match) for name, match in cover_matches if match is not None]

# `cover_files` and `tconsts` stay index-aligned: one entry per matching file.
cover_files = [name for name, _ in cover_matches]
tconsts = [match.group(1) for _, match in cover_matches]

movie_covers = pd.DataFrame({'tconst': tconsts, 'cover': cover_files})


In [117]:
# join ratings, title metadata and cover file names on the IMDb id;
# the inner joins keep only titles that appear in all three tables
movie_ratings = (
    ratings
    .merge(movies, how='inner', on='tconst')
    .merge(movie_covers, how='inner', on='tconst')
)

In [118]:
# drop titles with fewer than 1000 votes
vote_filter = '(numVotes >= 1000)'
movie_ratings = movie_ratings.query(vote_filter)
# number of remaining rows -- should give around 7K movies
len(movie_ratings)

6874

In [119]:
# normalize the variables we use for distance (since votes are on a completely different scale than ratings)
# - averageRating is divided by 10 (presumably the top of the rating scale -- TODO confirm)
# - numVotes is min-max scaled, so the least-voted movie maps to 0 and the most-voted to 1
#
# `.assign` returns a new frame instead of writing columns into the filtered
# result of the previous step, which pandas may flag with SettingWithCopyWarning.
num_votes = movie_ratings['numVotes']
movie_ratings = movie_ratings.assign(
    averageRating_unit=movie_ratings['averageRating'].to_numpy() / 10,
    numVotes_unit=(num_votes - num_votes.min()) / (num_votes.max() - num_votes.min()),
)
movie_ratings

Unnamed: 0,tconst,averageRating,numVotes,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,cover,averageRating_unit,numVotes_unit
0,tt0079285,5.1,10035,movie,Saturn 3,Saturn 3,0,1980,\N,88,"Adventure,Horror,Sci-Fi",tt0079285.jpg,0.51,0.003273
2,tt0080339,7.7,249810,movie,Airplane!,Airplane!,0,1980,\N,88,Comedy,tt0080339.jpg,0.77,0.090146
3,tt0080360,6.9,36981,movie,Altered States,Altered States,0,1980,\N,102,"Horror,Sci-Fi,Thriller",tt0080360.jpg,0.69,0.013036
4,tt0080365,6.3,28046,movie,American Gigolo,American Gigolo,0,1980,\N,117,"Crime,Drama,Mystery",tt0080365.jpg,0.63,0.009799
5,tt0080377,6.1,20283,movie,Any Which Way You Can,Any Which Way You Can,0,1980,\N,116,"Action,Comedy",tt0080377.jpg,0.61,0.006986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7894,tt4819560,6.6,2505,movie,Saving Mr. Wu,Jie jiu Wu xian sheng,0,2015,\N,106,"Action,Crime,Drama",tt4819560.jpg,0.66,0.000545
7895,tt4935446,7.0,2456,movie,Heart of a Dog,Heart of a Dog,0,2015,\N,75,Documentary,tt4935446.jpg,0.70,0.000528
7897,tt4944352,7.3,2379,movie,Heneral Luna,Heneral Luna,0,2015,\N,118,"Action,Biography,History",tt4944352.jpg,0.73,0.000500
7898,tt5049302,6.3,2125,movie,The Priests,Geomeun sajedeul,0,2015,\N,103,"Mystery,Thriller",tt5049302.jpg,0.63,0.000408


In [123]:

# let's take some movies at random for efficiency
random.seed(1)
N = 250
# DataFrame.sample draws from NumPy's random state, not from the stdlib `random`
# module, so `random.seed` alone does not make the subset reproducible; the seed
# has to be handed to pandas explicitly.
movie_ratings = movie_ratings.sample(N, random_state=1)

# display label per movie: "<primaryTitle> (<startYear>)"
movie_names = movie_ratings['primaryTitle'] + ' (' + movie_ratings['startYear'] +')'

# N x N matrix of Euclidean distances (pdist's default metric) in the
# (normalised rating, normalised votes) plane
movie_dists = scidist.squareform(scidist.pdist(movie_ratings[['averageRating_unit','numVotes_unit']]))

# one set of genre labels per movie, plus the union over the whole sample
movie_genres = movie_ratings['genres'].str.split(',').apply(set)
all_genres = set.union(*movie_genres)

# 0/1 membership table: one row per movie, one column per genre.
# Columns follow the sorted genre names so the table does not depend on set iteration order.
genre_columns = sorted(all_genres)
setdist_table = np.array([
    [1 if genre in genres else 0 for genre in genre_columns]
    for genres in movie_genres
])
# N x N Jaccard distances between the genre sets; the built-in metric name
# avoids calling a Python function once per pair of movies.
setdists = scidist.squareform(scidist.pdist(setdist_table.astype(bool), 'jaccard'))


In [127]:
# Build the export payload (a plain dict, despite the name) BEFORE opening the
# file: opening with 'w' truncates it, so a failure while converting the data
# would otherwise leave an empty imdb_<N>.json behind.
jsonstr = {
    'E': movie_names.tolist(),    # movie labels
    'EA': movie_dists.tolist(),   # pairwise distances between movies (rating/votes)
    # genre labels per movie; sorted because set iteration order is not stable across runs
    'SR': [sorted(genres) for genres in movie_genres],
    'S': sorted(all_genres),      # every genre label occurring in the sample
    'SA': setdists.tolist()       # pairwise Jaccard distances between the movies' genre sets
}
# ensure_ascii=False keeps non-ASCII titles readable; the file is written as UTF-8
with open(f'imdb_{N}.json', 'w', encoding='utf8') as f:
    json.dump(jsonstr, f, ensure_ascii=False)