In [1]:
import pandas as pd
import numpy as np
import scipy.spatial.distance as scidist
from scipy.stats import zscore
from sklearn.preprocessing import normalize
import json
import os
import random
import re

# Installation

Go to [https://datasets.imdbws.com/](https://datasets.imdbws.com/), download `title.basics.tsv.gz` and `title.ratings.tsv.gz`, and unzip them into this folder so that `title.basics.tsv` and `title.ratings.tsv` sit next to this notebook. For the covers, go to [http://mmcv.csie.ncku.edu.tw/~wtchu/projects/MoviePoster/](http://mmcv.csie.ncku.edu.tw/~wtchu/projects/MoviePoster/), download the Movie Poster Dataset, and move all images into a folder named `covers` inside this folder (e.g., with `mv ~/Downloads/Movie_Poster_Dataset/**/*.jpg data/imdb/covers`).

In [2]:
# Ratings table; it is joined to the other tables on `tconst` further down.
ratings = pd.read_csv('title.ratings.tsv', sep='\t')

# The basics file has mixed data in its columns, so every column is run
# through str() and kept as text rather than letting pandas guess a dtype.
movies = pd.read_csv(
    'title.basics.tsv',
    sep="\t",
    header=0,
    converters=dict.fromkeys(
        [
            'tconst',
            'titleType',
            'primaryTitle',
            'originalTitle',
            'isAdult',
            'startYear',
            'endYear',
            'runtimeMinutes',
            'genres',
        ],
        str,
    ),
)


In [3]:
# keep only rows that are flagged non-adult and whose title type is "movie"
movies = movies[movies['isAdult'].eq("0") & movies['titleType'].eq("movie")]

In [4]:
# read covers!
cover_files = os.listdir('./covers')

# Cover files are named "<tconst>.jpg" (e.g. "tt0079285.jpg"). The pattern is a
# raw string so that "\d" reaches the regex engine instead of being an invalid
# string escape.
tconst_re = r'(tt\d+)\.jpg'
tconst_pattern = re.compile(tconst_re)

# Pair every file with its IMDb id, skipping entries that do not look like a
# cover (hidden files such as .DS_Store, stray folders, ...). Without the
# check, re.match() returns None for those and .group(1) raises AttributeError.
tconsts = []
matched_cover_files = []
for file_name in cover_files:
    found = tconst_pattern.match(file_name)
    if found is None:
        continue
    tconsts.append(found.group(1))
    matched_cover_files.append(file_name)

movie_covers = pd.DataFrame({'tconst': tconsts, 'cover': matched_cover_files})


In [5]:
# Inner-join ratings -> basics -> covers on the IMDb id, so only titles that
# appear in all three tables remain.
movie_ratings = (
    ratings
    .merge(movies, how='inner', on='tconst')
    .merge(movie_covers, how='inner', on='tconst')
)

In [6]:
# keep only movies with 1000 votes or more
movie_ratings = movie_ratings[movie_ratings['numVotes'] >= 1000]
# the row count should land at roughly 7K movies
len(movie_ratings)

6874

In [8]:
# Normalize the variables we use for distance (votes are on a completely
# different scale than ratings). Both columns are standardized to z-scores.
# Open question: it is not settled whether standardization or normalization is
# the right choice here; dividing the rating by 10 and min-max scaling the vote
# count were the alternatives considered.
#
# .assign() builds a new frame instead of writing columns into the filtered
# frame from the previous step, which avoids pandas' SettingWithCopyWarning.
movie_ratings = movie_ratings.assign(
    averageRating_unit=zscore(movie_ratings['averageRating']),
    numVotes_unit=zscore(movie_ratings['numVotes']),
)
movie_ratings

Unnamed: 0,tconst,averageRating,numVotes,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,cover,averageRating_unit,numVotes_unit
0,tt0079285,5.1,10035,movie,Saturn 3,Saturn 3,0,1980,\N,88,"Adventure,Horror,Sci-Fi",tt0079285.jpg,-1.185437,-0.410585
2,tt0080339,7.7,249810,movie,Airplane!,Airplane!,0,1980,\N,88,Comedy,tt0080339.jpg,1.409943,0.923128
3,tt0080360,6.9,36981,movie,Altered States,Altered States,0,1980,\N,102,"Horror,Sci-Fi,Thriller",tt0080360.jpg,0.611364,-0.260701
4,tt0080365,6.3,28046,movie,American Gigolo,American Gigolo,0,1980,\N,117,"Crime,Drama,Mystery",tt0080365.jpg,0.012431,-0.310401
5,tt0080377,6.1,20283,movie,Any Which Way You Can,Any Which Way You Can,0,1980,\N,116,"Action,Comedy",tt0080377.jpg,-0.187214,-0.353582
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7894,tt4819560,6.6,2505,movie,Saving Mr. Wu,Jie jiu Wu xian sheng,0,2015,\N,106,"Action,Crime,Drama",tt4819560.jpg,0.311898,-0.452469
7895,tt4935446,7.0,2456,movie,Heart of a Dog,Heart of a Dog,0,2015,\N,75,Documentary,tt4935446.jpg,0.711187,-0.452742
7897,tt4944352,7.3,2379,movie,Heneral Luna,Heneral Luna,0,2015,\N,118,"Action,Biography,History",tt4944352.jpg,1.010654,-0.453170
7898,tt5049302,6.3,2125,movie,The Priests,Geomeun sajedeul,0,2015,\N,103,"Mystery,Thriller",tt5049302.jpg,0.012431,-0.454583


In [9]:

# let's take some movies at random for efficiency
# DataFrame.sample() draws from NumPy's random state, not from Python's
# `random` module, so random.seed() alone does not make the sample repeatable.
# The seed is therefore also passed explicitly through random_state.
SEED = 1
random.seed(SEED)
N = 50
movie_ratings = movie_ratings.sample(N, random_state=SEED)

# Display label per movie: "Title (year)".
movie_names = movie_ratings['primaryTitle'] + ' (' + movie_ratings['startYear'] +')'

# Square matrix of pairwise distances (pdist's default metric) in the
# standardized (rating, votes) plane.
movie_dists = scidist.squareform(scidist.pdist(movie_ratings[['averageRating_unit','numVotes_unit']]))

# One set of genre names per movie, plus the union over all sampled movies.
movie_genres = movie_ratings['genres'].str.split(',').apply(set)
all_genres = set.union(*movie_genres)

# 0/1 membership table: one row per movie, one column per genre in all_genres.
setdist_table = np.array([
    [1 if genre in genres else 0 for genre in all_genres]
    for genres in movie_genres
])
# Square matrix of pairwise Jaccard distances between the genre sets.
setdists = scidist.squareform(scidist.pdist(setdist_table, scidist.jaccard))


In [50]:
# haha fun part let's make actual glyph images!
from PIL import Image, ImageDraw, ImageFont

def expand2square(pil_img, background_color=(0,0,0)):
    """Pad an image to a square canvas, centring it along the shorter axis.

    Args:
        pil_img: image object exposing ``size`` and ``mode``.
        background_color: fill colour for the padding (defaults to (0,0,0)).

    Returns:
        ``pil_img`` itself when it is already square; otherwise a new image
        whose side equals the longer dimension of the input, with the input
        pasted in the middle.
    """
    width, height = pil_img.size
    if width == height:
        return pil_img

    side = max(width, height)
    # Exactly one of the two offsets is non-zero: the one along the short axis.
    offset = ((side - width) // 2, (side - height) // 2)
    canvas = Image.new(pil_img.mode, (side, side), background_color)
    canvas.paste(pil_img, offset)
    return canvas

def add_margin(pil_img, top, right, bottom, left, color):
    """Return a new image with ``pil_img`` placed on a larger, solid canvas.

    Args:
        pil_img: image object exposing ``size`` and ``mode``.
        top, right, bottom, left: margin added on each side, in pixels.
        color: fill colour of the added margin.

    Returns:
        A new image of size (width + left + right, height + top + bottom)
        with the original pasted at (left, top).
    """
    src_width, src_height = pil_img.size
    padded_size = (src_width + right + left, src_height + top + bottom)
    canvas = Image.new(pil_img.mode, padded_size, color)
    canvas.paste(pil_img, (left, top))
    return canvas


# Render one 300x400 glyph per sampled movie: the cover squared and resized to
# 300x300, with a 100px black strip underneath that carries ten rating circles,
# the average rating (left) and the vote count (right).

# Loop-invariant setup: create the output folder if it is missing (save()
# fails otherwise) and load the font once instead of once per cover.
os.makedirs('./test', exist_ok=True)
fnt = ImageFont.truetype("Arial.ttf", 40)

STAR_SIZE = 25   # diameter of one rating circle, in pixels
STAR_GAP = 2     # horizontal gap between neighbouring circles
STAR_LEFT = 10   # x of the first circle
STAR_TOP = 325   # y of the circle row (inside the bottom strip)

covers = zip(movie_ratings['cover'], movie_ratings['averageRating'], movie_ratings['numVotes'])
for c, rating, votes in covers:
    # splitext copes with file names containing more than one dot, where a
    # two-way unpack of c.split('.') raises ValueError. `ext` keeps its dot.
    tconst, ext = os.path.splitext(c)
    with Image.open(f'./covers/{c}') as im:
        # Convert to RGB so the RGB colour tuples used below are valid for
        # every cover regardless of its stored mode; convert() also returns a
        # new image, so the opened file is left untouched.
        im2 = expand2square(im.convert('RGB'))
        im2 = im2.resize((300,300))
        im2 = add_margin(im2, 0,0,100,0,(0,0,0))

        d = ImageDraw.Draw(im2)
        d.text((5, 395), "{:.2f}".format(rating), font=fnt, fill=(255, 255, 255), anchor='ls')
        d.text((295, 395), "{:,}".format(votes), font=fnt, fill=(255, 255, 255), anchor='rs')

        # Ten circles: the first round(rating) are filled, the rest outlined.
        # Every circle is offset by size + gap from its predecessor; the
        # previous max(0, star-1) term left no gap between the first two.
        filled = round(rating)
        for star in range(10):
            x = STAR_LEFT + star * (STAR_SIZE + STAR_GAP)
            box = (x, STAR_TOP, x + STAR_SIZE, STAR_TOP + STAR_SIZE)
            if star < filled:
                d.ellipse(box, fill=(255,255,0))
            else:
                d.ellipse(box, outline=(255,255,0))

        im2.save(f'./test/{tconst}_thumb{ext}', "JPEG")
        # TODO add path to dataframe

0 0.75 300 400
1 0.75 300 400
2 0.75 300 400
3 0.75 300 400
4 0.75 300 400
5 0.75 300 400
6 0.75 300 400
7 0.75 300 400
8 0.75 300 400
9 0.75 300 400
10 0.75 300 400
11 0.75 300 400
12 0.75 300 400
13 0.75 300 400
14 0.75 300 400
15 0.75 300 400
16 0.75 300 400
17 0.75 300 400
18 0.75 300 400
19 0.75 300 400
20 0.75 300 400
21 0.75 300 400
22 0.75 300 400
23 0.75 300 400
24 0.75 300 400
25 0.75 300 400
26 0.75 300 400
27 0.75 300 400
28 0.75 300 400
29 0.75 300 400
30 0.75 300 400
31 0.75 300 400
32 0.75 300 400
33 0.75 300 400
34 0.75 300 400
35 0.75 300 400
36 0.75 300 400
37 0.75 300 400
38 0.75 300 400
39 0.75 300 400
40 0.75 300 400
41 0.75 300 400
42 0.75 300 400
43 0.75 300 400
44 0.75 300 400
45 0.75 300 400
46 0.75 300 400
47 0.75 300 400
48 0.75 300 400
49 0.75 300 400


In [127]:
# Write the sampled data set to imdb_<N>.json. Keys:
#   E  - movie labels             EA - pairwise rating/vote distance matrix
#   SR - genre list per movie     S  - all genres
#   SA - pairwise Jaccard distance matrix of the genre sets
with open(f'imdb_{N}.json', 'w', encoding='utf8') as f:
    jsonstr = dict(
        E=movie_names.tolist(),
        EA=movie_dists.tolist(),
        SR=[list(genres) for genres in movie_genres],
        S=list(all_genres),
        SA=setdists.tolist(),
    )
    # ensure_ascii=False keeps non-ASCII titles readable in the utf8 file
    json.dump(jsonstr, f, ensure_ascii=False)