In [137]:
import csv
import json
import os
import random
import re

import numpy as np
import pandas as pd
import scipy.spatial.distance as scidist
from scipy.stats import zscore
from sklearn.preprocessing import normalize

# Installation

Go to [https://datasets.imdbws.com/](https://datasets.imdbws.com/), download `title.basics.tsv.gz` and `title.ratings.tsv.gz`, and unzip both into the folder that contains this notebook. For the covers, go to [http://mmcv.csie.ncku.edu.tw/~wtchu/projects/MoviePoster/](http://mmcv.csie.ncku.edu.tw/~wtchu/projects/MoviePoster/), download the Movie Poster Dataset, and move all images into a folder named `covers` next to this notebook (e.g., with `mv ~/Downloads/Movie_Poster_Dataset/**/*.jpg data/imdb/covers` if this notebook lives in `data/imdb`).

In [138]:
# Load the ratings table; pandas infers its column types.
ratings = pd.read_csv('title.ratings.tsv', sep='\t')

# sucks, but it's mixed data everywhere: every column of title.basics is
# therefore read as a plain string instead of letting pandas guess a dtype.
basics_columns = [
    'tconst',
    'titleType',
    'primaryTitle',
    'originalTitle',
    'isAdult',
    'startYear',
    'endYear',
    'runtimeMinutes',
    'genres',
]
movies = pd.read_csv(
    'title.basics.tsv',
    sep='\t',
    header=0,
    converters={column: str for column in basics_columns},
    # The file is plain tab-separated text without CSV-style quoting. With the
    # default quoting, a double quote inside a title is treated as a field
    # delimiter: quotes get stripped and an unbalanced quote swallows the
    # following tabs/newlines, silently merging rows. QUOTE_NONE keeps every
    # character literal.
    quoting=csv.QUOTE_NONE,
)


In [139]:
# keep only rows flagged as non-adult whose title type is "movie"
# (both columns were loaded as strings, hence the comparison with "0")
is_not_adult = movies['isAdult'] == "0"
is_movie = movies['titleType'] == "movie"
movies = movies[is_not_adult & is_movie]

In [140]:
# read covers!
# Build a tconst -> cover file lookup from the poster folder. Only files named
# exactly "<tconst>.jpg" (e.g. "tt0118655.jpg") are kept; anything else found
# in ./covers is skipped instead of ending up as a row with an empty tconst.
tconst_re = r'(tt\d+)\.jpg'  # raw string: '\d' is not a valid escape in a normal string literal
cover_pattern = re.compile(tconst_re)

# os.listdir returns entries in arbitrary order; sort for a stable table
cover_matches = [
    (file_name, cover_pattern.fullmatch(file_name))
    for file_name in sorted(os.listdir('./covers'))
]
cover_files = [file_name for file_name, match in cover_matches if match is not None]
tconsts = [match.group(1) for _, match in cover_matches if match is not None]

movie_covers = pd.DataFrame({'tconst': tconsts, 'cover': cover_files})


In [141]:
# Combine the three tables on their shared `tconst` key. Both joins are inner
# joins, so only titles present in ratings, movies AND the cover list survive.
movie_ratings = (
    ratings
    .merge(movies, how='inner', on='tconst')
    .merge(movie_covers, how='inner', on='tconst')
)

In [124]:
# filter for movies that have at least 1000 votes
# .copy() turns the filtered selection into an independent frame, so the
# column assignments in the following cells do not raise pandas'
# SettingWithCopyWarning.
movie_ratings = movie_ratings.query('(numVotes >= 1000)').copy()
# should give around 7K movies
len(movie_ratings)

6874

In [142]:
# startYear was loaded as a string; derive an integer column from it
movie_ratings['year'] = movie_ratings['startYear'].map(int)

In [143]:
# Standardize (z-score) the variables used for the distance computation, since
# votes are on a completely different scale than ratings.
# Open question left by the author: whether normalization (e.g. rating / 10 or
# min-max scaling of the votes) would be a better fit than standardization.
for source_column in ('averageRating', 'numVotes', 'year'):
    movie_ratings[f'{source_column}_unit'] = zscore(movie_ratings[source_column])

In [144]:
# filter to top K genres
# (each genre needs a color, can't do 20-something colors)
K = 5

# one set of genre names per movie
genres = movie_ratings.genres.str.split(',').apply(set)
# sorted() pins the genre order; iterating a set of strings directly can give
# a different order on every interpreter run, which would also change how
# frequency ties are broken below. set().union(...) also copes with no movies.
all_genres = sorted(set().union(*genres))
genre_position = {genre: position for position, genre in enumerate(all_genres)}

# indicator matrix: one row per movie, one 0/1 column per genre
setdist_table = np.array([
    [1 if genre in movie_genre_set else 0 for genre in all_genres]
    for movie_genre_set in genres
])
genre_freq = np.sum(setdist_table, axis=0)
all_genres_sorted = sorted(all_genres, key=lambda g: genre_freq[genre_position[g]], reverse=True)
# NOTE(review): this slice keeps K+1 genres although the heading says "top K";
# behavior is left unchanged here - confirm which one is intended.
limit_to_genres = all_genres_sorted[:(K+1)]

for genre in all_genres:
    movie_ratings[f'genre_{genre}'] = setdist_table[:, genre_position[genre]]

# Drop every movie that carries at least one genre outside the selected ones,
# so that all remaining movies only have genres we want.
other_genre_cols = [f'genre_{genre}' for genre in all_genres if genre not in limit_to_genres]
only_selected_genres = (movie_ratings[other_genre_cols] == 0).all(axis=1)
movie_ratings = movie_ratings[only_selected_genres]
len(movie_ratings)

3348

In [146]:

# let's take some movies at random for efficiency
SEED = 1
N = 10

# DataFrame.sample does not use Python's `random` module, so random.seed()
# alone does not make the subset reproducible; the seed has to be handed to
# sample() itself. random.seed is kept for any code relying on `random`.
random.seed(SEED)
movie_ratings = movie_ratings.sample(N, random_state=SEED)

# display label, e.g. "Title (1999)"
movie_names = movie_ratings['primaryTitle'] + ' (' + movie_ratings['startYear'] + ')'

# pairwise distances (pdist's default metric) between the standardized
# attributes, expanded to a full N x N matrix
movie_dists = scidist.squareform(
    scidist.pdist(movie_ratings[['averageRating_unit', 'numVotes_unit', 'year_unit']])
)

movie_genres = movie_ratings['genres'].str.split(',').apply(set)
# sorted list instead of a raw set: gives a stable genre order between runs
all_genres = sorted(set().union(*movie_genres))

# indicator matrix: one row per sampled movie, one 0/1 column per genre
setdist_table = np.array([
    [1 if genre in genre_set else 0 for genre in all_genres]
    for genre_set in movie_genres
])
# pairwise Jaccard distance between the movies' genre sets; the built-in
# metric on a boolean matrix replaces the per-pair Python callback
setdists = scidist.squareform(scidist.pdist(setdist_table.astype(bool), metric='jaccard'))


In [152]:
# haha fun part let's make actual glyph images!
from PIL import Image, ImageDraw, ImageFont

def expand2square(pil_img, background_color=(0,0,0)):
    """Pad an image to a square canvas without scaling it.

    An already square image is returned as-is (same object). Otherwise a new
    image in the same mode is created whose side equals the longer edge,
    filled with ``background_color``, and the original is pasted onto it,
    centered along its shorter dimension.
    """
    width, height = pil_img.size
    if width == height:
        return pil_img

    side = max(width, height)
    canvas = Image.new(pil_img.mode, (side, side), background_color)
    # exactly one of the two offsets is non-zero: the one along the shorter edge
    canvas.paste(pil_img, ((side - width) // 2, (side - height) // 2))
    return canvas

def add_margin(pil_img, top, right, bottom, left, color):
    """Return a new image with a border of ``color`` around ``pil_img``.

    The canvas has the same mode as the input and grows by ``left``/``right``
    pixels horizontally and ``top``/``bottom`` pixels vertically; the original
    is pasted at offset ``(left, top)``.
    """
    inner_width, inner_height = pil_img.size
    canvas_size = (left + inner_width + right, top + inner_height + bottom)
    canvas = Image.new(pil_img.mode, canvas_size, color)
    canvas.paste(pil_img, (left, top))
    return canvas


# Render one 300x450 glyph per sampled movie: start year in the top margin,
# the squared 300x300 poster below it, then a row of ten rating dots and the
# numeric rating / vote count in the bottom margin.
GLYPH_DIR = './test'
os.makedirs(GLYPH_DIR, exist_ok=True)  # saving fails if the folder is missing

# the font is identical for every glyph, so load it once instead of per movie
fnt = ImageFont.truetype("Arial.ttf", 40)
white = (255, 255, 255)
yellow = (255, 255, 0)
dot_size = 25
dot_gap = 2

for c, year, rating, votes in zip(
    movie_ratings['cover'],
    movie_ratings['startYear'],
    movie_ratings['averageRating'],
    movie_ratings['numVotes'],
):
    # splitext only splits at the last dot (ext keeps its leading "."), so a
    # file name with additional dots cannot break the unpacking
    tconst, ext = os.path.splitext(c)
    with Image.open(f'./covers/{c}') as im:
        # Posters may be stored in a non-RGB mode; the RGB color tuples used
        # below and the JPEG output expect RGB. convert() returns a new image,
        # so no separate copy() is needed.
        im2 = expand2square(im.convert('RGB'))
    im2 = im2.resize((300, 300))
    im2 = add_margin(im2, 50, 0, 100, 0, (0, 0, 0))
    print(tconst)

    d = ImageDraw.Draw(im2)
    d.text((150, 45), year, font=fnt, fill=white, anchor='ms')
    d.text((5, 50+395), "{:.2f}".format(rating), font=fnt, fill=white, anchor='ls')
    d.text((295, 50+395), "{:,}".format(votes), font=fnt, fill=white, anchor='rs')

    # ten dots: filled up to the rounded rating, outlined for the remainder
    y = 325+50
    filled_dots = int(round(float(rating)))
    for star in range(10):
        # Uniform spacing between all dots. The previous offset
        # max(0, star-1)*2 left no gap between the first and second dot.
        x = 10 + star*(dot_size + dot_gap)
        box = (x, y, x+dot_size, y+dot_size)
        if star < filled_dots:
            d.ellipse(box, fill=yellow)
        else:
            d.ellipse(box, outline=yellow)

    im2.save(f'{GLYPH_DIR}/{tconst}_thumb{ext}', "JPEG")
    # TODO add path to dataframe

tt0274812
tt0388500
tt1980986
tt0118655
tt0093565
tt0243255
tt4276752
tt0398375
tt0137338
tt0188863


In [155]:
# Assemble the export before opening the file, so that a failure while
# building it cannot leave a truncated/empty JSON file behind.
# (`jsonstr` is a dict despite its name; the name is kept for compatibility.)
jsonstr = {
    # glyph folder follows the sample size, like the JSON file name below
    # (for N = 10 this is the previously hard-coded "imdb10")
    'glyph_ids': [f'data/imdb/imdb{N}/{t}_thumb.jpg' for t in movie_ratings['tconst']],
    'E': movie_names.tolist(),
    'EA': movie_dists.tolist(),
    # sets have no stable iteration order; sort for a reproducible file
    'SR': [sorted(genre_set) for genre_set in movie_genres],
    'S': sorted(all_genres),
    'SA': setdists.tolist(),
}
with open(f'imdb_{N}.json', 'w', encoding='utf8') as f:
    json.dump(jsonstr, f, ensure_ascii=False)

3003    Comedy,Crime,Drama
Name: genres, dtype: object