In [1]:

# add additional imdb fields

import pandas, pathlib, numpy, datetime, cv2
from PIL import Image, ImageDraw, ImageFont

# Film extract from the previous step; its 'imdb' column is matched against IMDb's tconst ids below.
data = pandas.read_csv(pathlib.Path.cwd().parents[0] / 'data' / '2_extract_with_imdb.csv')

# Directors: crew rows for the extract's titles, expanded to one row per (title, director id).
imdb_director = pandas.read_csv(pathlib.Path.home() / 'imdb' / 'title.crew.tsv', delimiter='\t', low_memory=False)
imdb_director = (
    imdb_director
    .loc[imdb_director['tconst'].isin(list(data['imdb'])), ['tconst', 'directors']]
    .rename(columns={'directors': 'nconst'})
    .assign(nconst=lambda crew: crew['nconst'].str.split(','))
    .explode('nconst')
)

# Resolve director ids to names, then collapse to a single sorted, comma-separated string per title.
imdb_name = pandas.read_csv(pathlib.Path.home() / 'imdb' / 'name.basics.tsv', delimiter='\t', low_memory=False)
imdb_director = (
    imdb_director
    .merge(imdb_name, on='nconst', how='left')
    .loc[:, ['tconst', 'primaryName']]
    .rename(columns={'tconst': 'imdb', 'primaryName': 'imdb_director'})
    .pivot_table(index='imdb', aggfunc=lambda names: ', '.join(sorted(names.unique().astype(str))))
    .reset_index()
)
data = data.merge(imdb_director, on='imdb', how='left')

# Start year and original title for each film.
imdb_title = pandas.read_csv(pathlib.Path.home() / 'imdb' / 'title.basics.tsv', delimiter='\t', low_memory=False)
imdb_title = (
    imdb_title
    .loc[:, ['tconst', 'startYear', 'originalTitle']]
    .rename(columns={'tconst': 'imdb', 'startYear': 'imdb_year', 'originalTitle': 'imdb_title'})
)
data = data.merge(imdb_title, on='imdb', how='left')

print(len(data))
data.head()


2986


Unnamed: 0,link,title,date,director,starring,margaret,david,imdb,imdb_rating,imdb_director,imdb_year,imdb_title
0,https://web.archive.org/web/20041119140541/htt...,"Midsummer Night`s Dream, A",,Michael Hoffman,,no score,3.0,tt0140379,6.4,Michael Hoffman,1999,A Midsummer Night's Dream
1,https://web.archive.org/web/20041119140541/htt...,Cast Away,,Robert Zemeckis,Tom Hanks; Helen Hunt,4.0,4.0,tt0162222,7.8,Robert Zemeckis,2000,Cast Away
2,https://web.archive.org/web/20041119140541/htt...,Crackerjack,,Paul Moloney,Mick Molloy; Judith Lucy; Bill Hunter; Frank W...,4.0,3.5,tt0291832,6.7,Paul Moloney,2002,Crackerjack
3,https://web.archive.org/web/20041119140541/htt...,K:19 The Widowmaker,,Kathryn Bigelow,Harrison Ford; Liam Neeson,3.5,3.0,tt0267626,6.7,Kathryn Bigelow,2002,K-19: The Widowmaker
4,https://web.archive.org/web/20041119140541/htt...,Kissing Jessica Stein,,Charles Herman-Wurmfeld,Heather Juergensen; Jennifer Westfeldt,4.0,3.0,tt0264761,6.4,Charles Herman-Wurmfeld,2001,Kissing Jessica Stein


In [2]:

# sort into critic specific dataframes

def labeller(row):
    """Return the display label '<title> (dir. <director>, <year>)' for one film row.

    `row` is any mapping-like object exposing the keys 'imdb_title',
    'imdb_director' and 'imdb_year' (e.g. a DataFrame row passed by apply).
    """
    title, director, year = row['imdb_title'], row['imdb_director'], row['imdb_year']
    return f"{title} (dir. {director}, {year})"

def critic_specific(name):
    """Build the ranked comparison frame for one critic from the global `data`.

    Parameters
    ----------
    name : str
        Column of `data` holding that critic's scores (e.g. 'margaret', 'david').

    Returns
    -------
    pandas.DataFrame
        Columns 'imdb', `name`, 'imdb_rating', 'label' and 'diff', sorted by
        'diff' (critic score minus halved IMDb rating) in descending order.
        Rows where the critic gave 'no score' or no score at all, and rows
        without an IMDb rating, are excluded. The global `data` is not modified.
    """
    # A missing score is as unusable as an explicit 'no score': it would yield a
    # NaN diff that sorts to the tail of the frame.
    scored = data[name].notna() & ~data[name].isin(['no score'])
    rated = data['imdb_rating'].notna()

    # Explicit copies so the column assignments below write to an independent
    # frame rather than to a selection of `data` (avoids chained-assignment warnings).
    dataframe = data.loc[scored & rated].copy()
    # Plain list rather than apply(axis=1), which returns a DataFrame when no rows remain.
    dataframe['label'] = [labeller(row) for _, row in dataframe.iterrows()]
    # Halve the IMDb rating before comparing -- presumably 10-point IMDb vs 5-star critic scale; confirm.
    dataframe['imdb_rating'] = dataframe['imdb_rating'] / 2

    dataframe = dataframe[['imdb', name, 'imdb_rating', 'label']].copy()
    dataframe['diff'] = dataframe[name].astype('float64') - dataframe['imdb_rating'].astype('float64')
    return dataframe.sort_values(by='diff', ascending=False)

# One ranked frame per critic, built in the order margaret, david.
margaret_data, david_data = (critic_specific(critic) for critic in ('margaret', 'david'))

print(len(margaret_data), len(david_data))
margaret_data.head()


2303 2329


Unnamed: 0,imdb,margaret,imdb_rating,label,diff
237,tt0199626,5.0,2.65,"In the Cut (dir. Jane Campion, 2003)",2.35
1317,tt0403217,5.0,2.9,"Last Days (dir. Gus Van Sant, 2005)",2.1
262,tt0189456,4.0,2.15,"Cut (dir. Kimble Rendall, 2000)",1.85
972,tt0218378,5.0,3.2,"The Claim (dir. Michael Winterbottom, 2000)",1.8
1434,tt0377471,4.5,2.8,"Be Cool (dir. F. Gary Gray, 2005)",1.7


In [3]:

# plot highest and lowest instances

def graphing(data, name, colour, direct):
    """Draw the ten largest critic-vs-IMDb gaps as a labelled chart and save it as a PNG.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the columns `name`, 'imdb_rating', 'label' and 'diff'. It is
        expected to be ordered by 'diff' descending, as critic_specific returns
        it, because the first/last ten rows are taken positionally.
    name : str
        Critic column to plot; its title-cased form is the marker caption and it
        is part of the output filename.
    colour : str
        Fill colour of the critic caption (passed to ImageDraw.text).
    direct : str
        'high' plots the first ten rows of `data`; any other value plots the
        last ten rows.

    Returns
    -------
    None
        The chart is written to <cwd parent>/graph/{name}_{direct}.png.
    """

    def step_horizontal(step):
        # Score axis: 0..5 maps to x = 750..1250 px.
        return int(step*100+750)

    def step_vertical(step):
        # Row axis: row 0 sits at y = 100 px, each further row 50 px lower.
        return int(step*50+100)

    def text_size(font, text):
        # ImageFont.getsize was removed in Pillow 10. The right/bottom edges of
        # getbbox correspond to the (width, height) getsize used to report, so
        # the layout stays the same; fall back to getsize on very old Pillow.
        if hasattr(font, 'getbbox'):
            _, _, right, bottom = font.getbbox(text)
            return right, bottom
        return font.getsize(text)

    # `balance` flips the side on which the gap around the midpoint opens,
    # depending on whether the critic marker is right or left of the IMDb marker.
    if direct == 'high':
        selection = data.head(10).sort_values(by='diff', ascending=False).to_dict(orient='records')
        balance = 1
    else:
        selection = data.tail(10).sort_values(by='diff', ascending=True).to_dict(orient='records')
        balance = -1

    canvas_w, canvas_h = 1400, 650
    canvas = numpy.full((canvas_h, canvas_w, 3), 240, numpy.uint8)

    # Vertical grid lines at scores 0..5, spanning rows 0..9.
    for score in range(0, 6):
        cv2.line(canvas, (step_horizontal(score), step_vertical(0)),
                 (step_horizontal(score), step_vertical(9)), (100, 100, 100), 1)

    # Pixel positions per row, computed once and shared by the line and text passes.
    rows = []
    for n, record in enumerate(selection):
        critic_x = step_horizontal(float(record[name]))
        imdb_x = step_horizontal(float(record['imdb_rating']))
        mid_x = int((critic_x + imdb_x) / 2)
        rows.append((record, step_vertical(n), critic_x, imdb_x, mid_x))

    line_colour = (68, 68, 68)
    for record, y, critic_x, imdb_x, mid_x in rows:
        # Two horizontal segments leave a 50 px gap around the midpoint for the diff text.
        cv2.line(canvas, (imdb_x, y), (mid_x - (25*balance), y), line_colour, 2)
        cv2.line(canvas, (critic_x, y), (mid_x + (25*balance), y), line_colour, 2)
        # Vertical end ticks at the critic score and the IMDb rating.
        cv2.line(canvas, (critic_x, y - 10), (critic_x, y + 10), line_colour, 2)
        cv2.line(canvas, (imdb_x, y - 10), (imdb_x, y + 10), line_colour, 2)

    # Hand the OpenCV (BGR) canvas to Pillow in memory instead of round-tripping through a PNG on disk.
    image_data = Image.fromarray(cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(image_data)

    font_dir = pathlib.Path.cwd().parents[0] / 'font' / 'Raleway' / 'static'
    bold_font = ImageFont.truetype(str(font_dir / 'Raleway-Bold.ttf'), 20)
    ebold_font = ImageFont.truetype(str(font_dir / 'Raleway-ExtraBold.ttf'), 20)

    # Score tick labels, centred on each grid line, once above and once below the rows.
    for s in range(0, 6):
        tick = str(s)
        w, h = text_size(bold_font, tick)
        for offset in (-275, 275):
            draw.text((step_horizontal(s)-(w/2), (canvas_h/2)-(h/2)+offset), tick, font=bold_font, fill="#777777")

    # Caption sizes do not depend on the row, so measure them once.
    critic_slogan = name.title()
    critic_w, critic_h = text_size(ebold_font, critic_slogan)
    imdb_slogan = 'IMDB'
    imdb_w, imdb_h = text_size(ebold_font, imdb_slogan)

    for record, y, critic_x, imdb_x, mid_x in rows:
        # Film label in the left margin, vertically centred on the row.
        w, h = text_size(bold_font, str(record['label']))
        draw.text((50, y-(h/2)), record['label'], font=bold_font, fill="#444444")

        # Absolute difference, centred in the gap between the two segments.
        dff = str(abs(round(record['diff'], 1)))
        w, h = text_size(ebold_font, dff)
        draw.text((mid_x-(w/2), y-(h/1.7)), dff, font=ebold_font, fill="#444444")

        # Captions sit on the outer side of their markers: critic right / IMDb left
        # for 'high', mirrored otherwise.
        if direct == 'high':
            critic_front = critic_x + 20
            imdb_front = imdb_x - imdb_w - 20
        else:
            critic_front = critic_x - critic_w - 20
            imdb_front = imdb_x + 20
        draw.text((critic_front, y-(critic_h/1.7)), critic_slogan, font=ebold_font, fill=colour)
        draw.text((imdb_front, y-(imdb_h/1.7)), imdb_slogan, font=ebold_font, fill="#444444")

    out_path = pathlib.Path.cwd().parents[0] / 'graph' / f'{name}_{direct}.png'
    # Create the output folder on a fresh checkout instead of failing at save time.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    image_data.save(str(out_path))

# Render both charts per critic: all 'high' charts first, then all 'low' charts.
for direction in ('high', 'low'):
    for critic, frame, colour in (('david', david_data, '#FB4D3D'),
                                  ('margaret', margaret_data, '#457b9d')):
        graphing(frame, critic, colour, direction)

print('all done.')


all done.
