Build a recommender based on https://www.kaggle.com/ibtesama/getting-started-with-a-movie-recommendation-system

TODO:
- Deal with duplicate titles.

In [1]:
import pandas as pd 
import numpy as np 
from rake_nltk import Rake
import dill
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Build data

In [2]:
def remove_directors(x, jobtitle):
    """Drop every name that is also a director from the list in ``x[jobtitle]``.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must contain ``jobtitle`` and ``'directors'``, both holding lists
        of names.
    jobtitle : str
        Key of the list to filter.

    Returns
    -------
    The same object ``x``, with ``x[jobtitle]`` replaced by the filtered list.
    """
    # A plain membership test keeps each name at most once per occurrence.
    # The previous nested loop over directors emitted a name once for every
    # director it differed from, and produced nothing when there were no
    # directors at all.
    x[jobtitle] = [job for job in x[jobtitle] if job not in x['directors']]
    return x

def remove_job(x, jobtitle, joblist):
    """Remove from ``x[jobtitle]`` every name that also appears in ``x[joblist]``.

    ``x`` is updated in place (the ``jobtitle`` entry is replaced by a new
    list) and returned, so the function can be used with
    ``DataFrame.apply(..., axis=1)``.
    """
    excluded = x[joblist]
    kept = []
    for person in x[jobtitle]:
        if person in excluded:
            continue
        kept.append(person)
    x[jobtitle] = kept
    return x

def shorten_list(x, size=10):
    """Return a copy of list ``x`` truncated to at most ``size`` elements.

    Anything that is not a list (missing or malformed data) yields an
    empty list.
    """
    if not isinstance(x, list):
        # Missing/malformed data
        return []

    # Slicing a fresh copy covers both cases: lists longer than ``size``
    # are cut down, shorter ones come back whole.
    return list(x)[:size]

def rake_plot(df):
    """Extract RAKE keywords from a film's ``'overview'`` text.

    Parameters
    ----------
    df : mapping (a DataFrame row when used with ``apply(axis=1)``)
        Must contain an ``'overview'`` entry.

    Returns
    -------
    list of str
        The words RAKE scored for the overview. An empty list is returned
        when the overview is not a string (e.g. missing/NaN) or when
        keyword extraction fails, so the resulting column always holds
        lists.
    """
    plot = df['overview']

    # Missing overviews arrive as NaN/None; there is nothing to extract.
    if not isinstance(plot, str):
        return []

    # instantiating Rake, by default it uses english stopwords from NLTK
    # and discards all punctuation characters as well
    r = Rake()

    try:
        # extracting the words by passing the text
        r.extract_keywords_from_text(plot)

        # getting the dictionary with key words as keys and their scores as values
        key_words_dict_scores = r.get_word_degrees()
    except Exception:
        # Best effort: a film whose overview cannot be processed simply
        # contributes no keywords. ``Exception`` (rather than a bare
        # ``except``) lets KeyboardInterrupt/SystemExit through.
        return []

    return list(key_words_dict_scores.keys())

In [3]:
# Load the film table from a dill pickle.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
with open('pickles/all_films.pkl', 'rb') as file:
    all_films = dill.load(file)

# OMDb export; only its 'imdbID' and 'Plot' columns are used below.
omdb = pd.read_json('data/omdb.json')

# Left-join the OMDb plot onto every film (films without an OMDb match keep
# a missing plot), drop the columns the recommender does not use, and
# rename 'Plot' to 'plot'.
df = all_films.merge(omdb[['imdbID', 'Plot']], on='imdbID', how='left') \
    .drop(columns=['budget_imdb', 'revenue_imdb',  
                   'decade', 'budget_adj', 'revenue_adj', 
                   'profit', 'profit_adj', 'budget_bin', 
                   'budget_adj_bin', 'imdbID',  
                   'budget', 'id',
                   'revenue', 'runtime', 'status']) \
    .rename(columns={'Plot':'plot'})

# df['genres'] = [[x['name'] for x in list_dict] for list_dict in df['genres']]

# Each entry of df['credits'] is indexed with 'crew' and 'cast'; every crew
# member carries 'name', 'job' and 'gender', every cast member a 'name'.
# Names of all crew members whose job is exactly 'Director'.
df['directors'] = pd.Series(
    [[x['name'] for x in crew_list if x['job'] == 'Director'] 
         for crew_list in [x['crew'] for x in df['credits']]
    ])

# Gender codes of those same directors, in the same order. The column is
# stored under 'directors_gender'; the Series name 'gender' does not
# become the column label.
df['directors_gender'] = pd.Series(
    [[x['gender'] for x in crew_list if x['job'] == 'Director'] 
        for crew_list in [x['crew'] for x in df['credits']]
    ], name='gender')

# Full cast list (names only), in the order given by the credits.
df['cast'] = pd.Series([[x['name'] for x in cast_list] 
              for cast_list in [x['cast'] for x in df['credits']]])

# First 5 cast members.
df['short_cast'] = df['cast'].apply(shorten_list, size=5)

# First 10 keywords (shorten_list's default size).
df['short_keywords'] = df['keywords'].apply(shorten_list)

df['url'] = 'https://www.imdb.com/title/' + df['imdb_id']

# Replace each film's list of production-company records with just the
# company names. After this line the column holds lists of strings.
df['production_companies'] = pd.Series(
    [[x['name'] for x in companies_list] for companies_list 
     in df['production_companies']])

# Crew job titles that are counted as "writing" credits.
writing_jobs = ['Screenplay', 'Writer','Story', 'Original Story', 'Screenstory',
                'Dialogue', 'Adaptation', 'Scenario Writer', 'Story Artist', 
                'Lyricist', 'Idea', 'Co-Writer', 'Story Editor', 'Script Editor',
                'Original Film Writer', 'Teleplay', 'Script Consultant', 'Musical',
                'Head of Story', 'Creative Producer', 'Story Coordinator', 
                'Story Consultant', 'Story Manager', 'Story Supervisor']

# Unique names per film for each role group. Going through set() removes
# people credited several times but leaves the name order arbitrary.
df['writing'] = pd.Series(
    [list(set(
        [x['name'] for x in crew_list if x['job'] in writing_jobs]))
     for crew_list in [x['crew'] for x in df['credits']]
    ])

df['producers'] = pd.Series(
    [list(set(
        [x['name'] for x in crew_list if x['job'] 
         in ['Producer', 'Co-Producer', 'Associate Producer']])) 
         for crew_list in [x['crew'] for x in df['credits']]
    ])

df['exec_producers'] = pd.Series(
    [list(set(
        [x['name'] for x in crew_list if x['job'] == 'Executive Producer'])) 
         for crew_list in [x['crew'] for x in df['credits']]
    ])

# Make the role groups disjoint so a person is counted in only one of them.
# The passes run in this order and each sees the result of the previous
# ones: directors are removed from writing, producers and exec_producers;
# then producers and exec_producers are removed from writing; finally
# exec_producers are removed from producers.
df = df.apply(remove_job, jobtitle='writing', joblist='directors', axis=1)
df = df.apply(remove_job, jobtitle='producers', joblist='directors', axis=1)
df = df.apply(remove_job, jobtitle='exec_producers', joblist='directors', axis=1)
df = df.apply(remove_job, jobtitle='writing', joblist='producers', axis=1)
df = df.apply(remove_job, jobtitle='writing', joblist='exec_producers', axis=1)
df = df.apply(remove_job, jobtitle='producers', joblist='exec_producers', axis=1)

# RAKE keywords per film. Despite the column name, rake_plot reads the
# 'overview' column, not the OMDb 'plot'.
df['raked_plot'] = df.apply(rake_plot, axis=1)

# Content based filtering

We are now in a good position to define our recommendation function. We'll follow these steps:

- Get the index of the movie given its title.
- Get the list of cosine similarity scores for that particular movie with all movies. Convert it into a list of tuples where the first element is its position and the second is the similarity score.
- Sort the aforementioned list of tuples based on the similarity scores; that is, the second element.
- Get the top 10 elements of this list. Ignore the first element as it refers to self (the movie most similar to a particular movie is the movie itself).
- Return the titles corresponding to the indices of the top elements.

In [4]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendationsdf(title, cosine_sim):
    """Score every film in ``df`` by its similarity to the film called ``title``.

    Relies on the module-level ``df`` (film table) and ``indices``
    (title -> row label lookup).

    Parameters
    ----------
    title : str
        Title to look up in ``indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose row/column positions correspond to
        the row labels of ``df``.

    Returns
    -------
    pandas.DataFrame
        One row per film with a ``'score'`` column, sorted from most to
        least similar. The queried film itself is included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Several films can share one title; the lookup then returns a Series of
    # row labels instead of a single label, which would select several rows
    # of the similarity matrix at once. Use the first matching film.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Similarity of the chosen film to every film, labelled 0..n-1 so that it
    # lines up with df's row labels in the index-on-index merge below.
    score_column = pd.Series(cosine_sim[idx], name='score')

    results = df.merge(score_column, left_index=True, right_index=True)

    return results[['score', 'title', 'raked_plot', 'cleaned_genres', 'cleaned_keywords',
                    'url', 'directors', 'short_cast', 
                    'writing', 'producers', 'year', 'production_companies',
                    'Metacritic_score', 'RT_score','imdb_id', 'directors_gender']] \
            .sort_values('score', ascending=False)

# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    """Normalise names so that multi-word names become single tokens.

    * list  -> list with every element lower-cased and its spaces removed
    * str   -> the same transformation applied to the string
    * other -> ``''`` (e.g. a missing value)
    """
    if isinstance(x, list):
        return [str.lower(item.replace(" ", "")) for item in x]

    if isinstance(x, str):
        return str.lower(x.replace(" ", ""))

    # Neither a list nor a string: treat as missing
    return ''

In [5]:
# Apply clean_data function to your features.
# Each feature gets a 'cleaned_<feature>' companion column.
features = ['short_cast', 'directors', 'genres', 'short_keywords', 'writing', 
            'producers', 'plot', 'overview', 'keywords', 'exec_producers',
            'production_companies']

for feature in features:
    df['cleaned_'+feature] = df[feature].apply(clean_data)

# Strip punctuation from plot.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, in which case nothing would be removed.
df['cleaned_plot'] = df['cleaned_plot'].str.replace(r'[^\w\s]+', '', regex=True)
df['cleaned_overview'] = df['cleaned_overview'].str.replace(r'[^\w\s]+', '', regex=True)

## Make the soup

In [6]:
def create_soup(x):
    """Build the space-separated token string ("soup") for one film.

    Directors, writers, executive producers and genres are included twice
    so they weigh more in the count vectors; cast, producers, production
    companies, keywords and the raked plot words appear once.
    """
    def tokens(column):
        return ' '.join(x[column])

    directors = tokens('cleaned_directors')
    writers = tokens('cleaned_writing')
    exec_producers = tokens('cleaned_exec_producers')
    genres = tokens('cleaned_genres')

    pieces = [
        tokens('cleaned_short_cast'),
        directors, directors,
        writers, exec_producers,
        writers, exec_producers,
        tokens('cleaned_producers'),
        tokens('cleaned_production_companies'),
        genres, genres,
        tokens('cleaned_keywords'),
        tokens('raked_plot'),
    ]

    # split()/join collapses the gaps left by empty pieces
    return ' '.join(' '.join(pieces).split())


# def create_soup(x):
#     soup = 2*(' '.join(x['cleaned_short_cast']) + ' ' + \
#            ' '.join(x['cleaned_directors']) + ' ' + \
#            ' '.join(x['cleaned_writing']) + ' '  + \
#            ' '.join(x['cleaned_exec_producers']) + ' ') + \
#            ' '.join(x['cleaned_producers']) + ' ' + \
#            ' '.join(x['cleaned_production_companies']) + ' ' + \
#            2*(' '.join(x['cleaned_genres']) + ' ' + \
#            ' '.join(x['cleaned_short_keywords']) + ' ')
# #            x['plot']

In [7]:
# Make the soup from the different variables
df['soup'] = df.apply(create_soup, axis=1)

# Renumber the rows 0..n-1 and build the reverse mapping title -> row number.
# Row order is unchanged, so row numbers match positions in the matrices below.
df = df.reset_index(drop=True)
indices = pd.Series(df.index, index=df['title'])

# Token counts per film (English stop words ignored)
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])

# Cosine similarity of every film with every other film; with a single
# argument the matrix is compared against itself
cosine_sim2 = cosine_similarity(count_matrix)

In [9]:
# Result with 5 cast members and adding writing
# Displays every film ranked by similarity to 'Timecrimes' (the film itself
# comes first with score 1.0).
get_recommendationsdf('Timecrimes', cosine_sim2)

Unnamed: 0,score,title,raked_plot,cleaned_genres,cleaned_keywords,url,directors,short_cast,writing,producers,year,production_companies,Metacritic_score,RT_score,imdb_id,directors_gender
8794,1.000000,Timecrimes,"[man, accidentally, gets, finding, travels, ba...","[sciencefiction, thriller]","[scissors, radio, bicycle, timetravel, woods, ...",https://www.imdb.com/title/tt0480669,[Nacho Vigalondo],"[Karra Elejalde, Candela Fernández, Bárbara Go...",[],"[Jordi Rediu, Nahikari Ipiña, Santi Camuñas, D...",2007,"[Arsénico Producciones, Zip Films, Fine Produc...",68,89,tt0480669,[2]
14662,0.200000,Project Almanac,"[construct, one, group, teens, discover, secre...","[sciencefiction, thriller]","[timetravel, timemachine, teenager, foundfootage]",https://www.imdb.com/title/tt2436386,[Dean Israelite],"[Jonny Weston, Virginia Gardner, Sofia Black-D...","[Andrew Stark, Jason Pagan]","[Michael Bay, Andrew Form, Bradley Fuller]",2015,"[Insurge Pictures, Paramount, MTV Films, Plati...",47,36,tt2436386,[0]
7823,0.199223,Brain Dead,"[machine, showdown, psychological, thriller, m...","[horror, action, thriller, sciencefiction]","[sex, showdown, experiment, nightmare, man, in...",https://www.imdb.com/title/tt0099173,[Adam Simon],"[Bill Pullman, Bill Paxton, Nicholas Pryor, Pa...",[],"[Julie Corman, Lynn Whitney]",1990,"[New Horizons, Concorde Pictures]",,17,tt0099173,[2]
12198,0.183533,The Time Shifters,"[crew, evanston, power, plant, nearly, killed,...","[sciencefiction, thriller, action]","[timetravel, disastermovie]",https://www.imdb.com/title/tt0204686,[Mario Philip Azzopardi],"[Casper Van Dien, Catherine Bell, Theresa Sald...","[Gay Walch, Kurt Inderbitzin]",[],2001,[Abandon Pictures],,,tt0204686,[2]
19457,0.177778,Alien Dawn,[],"[thriller, sciencefiction]","[spaceinvasion, southwesternu.s., ragtagresist...",https://www.imdb.com/title/tt2275499,[Neil Johnson],"[Rachelle Dimaria, Brooke Lewis, Alex Bell, Mi...",[],"[Cynthia Martin, Philip Burthem]",2012,[Morphius Film],,,tt2275499,[0]
16270,0.176141,Synchronicity,"[machine, stealing, fi, noir, trying, stop, wo...","[thriller, mystery, sciencefiction]",[],https://www.imdb.com/title/tt2049543,[Jacob Gentry],"[Chad McKnight, Brianne Davis, AJ Bowen, Scott...",[Alex Orr],"[Alexander Motlagh, Christopher Alender]",2015,"[Soapbox Films, Pop Films]",39,,tt2049543,[2]
12446,0.174574,Subject Two,"[killing, tests, assistant, resurrection, form...","[horror, sciencefiction, thriller]",[],https://www.imdb.com/title/tt0492912,[Philip Chidel],"[Christian Oliver, Dean Stapleton, Courtney Ma...",[],[],2006,[],,,tt0492912,[0]
18903,0.173205,Astral,"[consequences, astral, projection, reconnect, ...","[drama, sciencefiction, thriller]",[],https://www.imdb.com/title/tt4765240,[Chris Mul],"[Frank Dillane, Catherine Steadman, Trevor Whi...",[Michael Mul],[],2018,[],,,tt4765240,[0]
15350,0.170251,Paradise Hills,"[sent, learn, high, paradise, hills, reformed,...","[fantasy, sciencefiction]",[womandirector],https://www.imdb.com/title/tt6127004,[Alice Waddington],"[Emma Roberts, Eiza González, Milla Jovovich, ...","[Nacho Vigalondo, Brian DeLeeuw]","[Núria Valls, Adrián Guerra]",2019,"[Nostromo Pictures, Colina Paraiso AIE, Televi...",50,62,tt6127004,[1]
18746,0.168238,Man Vs.,"[routine, episode, doug, five, days, things, g...","[thriller, sciencefiction, horror]",[],https://www.imdb.com/title/tt3584354,[Adam Massey],"[Chris Diamantopoulos, Chloe Bradt, Michael Cr...",[Thomas Michael],[Nicholas Tabarrok],2015,[Darius Films],,50,tt3584354,[2]


In [10]:
# Save the full similarity ranking for 'Timecrimes' to an Excel workbook.
recom = get_recommendationsdf('Timecrimes', cosine_sim2)
recom.to_excel('timecrimes.xlsx')
# NOTE(review): if the filter below is re-enabled, wrap the comparison in
# parentheses -- `&` binds tighter than `>`.
# recom[recom['directors_gender'].apply(lambda x: 1 in x) & recom['score']>0]

In [None]:
# Show up to 300 characters per cell so long list columns are not truncated.
pd.options.display.max_colwidth = 300

## Explore additional crew roles
What about using production companies, or adding full plots from OMDb? Or cinematographers or screenwriters?

In [None]:
def list_col_to_rows(df, list_col):
    """Expand a column of lists into one row per list element.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``list_col`` column holds a list in every row.
    list_col : str
        Name of the list-valued column to expand.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order and a fresh
        0..n-1 index. Every other column's value is repeated once per list
        element; rows whose list is empty disappear.
    """
    # Number of output rows each input row produces.
    lengths = np.fromiter((len(items) for items in df[list_col]),
                          dtype=np.intp, count=len(df))

    # Flatten in pure Python: unlike np.concatenate this works for a
    # zero-row frame and does not coerce element types when empty lists
    # are mixed with non-empty ones.
    flattened = [item for items in df[list_col] for item in items]

    data = {col: np.repeat(df[col].values, lengths)
            for col in df.columns.drop(list_col)}
    data[list_col] = flattened

    return pd.DataFrame(data)[df.columns]

In [None]:
# One list of crew job titles per film, as a single-column frame
jobs = pd.Series(
    [[member['job'] for member in credit['crew']] for credit in df['credits']]
    ).to_frame(name='jobs')

In [None]:
# Expand the per-film job lists so every crew credit becomes its own row.
lst_col = 'jobs'

jobs = list_col_to_rows(jobs, lst_col)

In [None]:
# How often each job title occurs across all crew credits.
jobs['jobs'].value_counts()

In [None]:
# Department of every crew credit: first one list per film, then one row
# per credit
departments = pd.Series(
    [[member['department'] for member in credit['crew']] for credit in df['credits']]
    ).to_frame(name='depts')

departments = list_col_to_rows(departments, 'depts')

In [None]:
# How often each department occurs across all crew credits.
departments['depts'].value_counts()

In [None]:
# Job titles of crew credits in the 'Production' department: one list per
# film, then one row per credit
productionjobs = pd.Series(
    [[member['job'] for member in credit['crew'] if member['department']=='Production']
     for credit in df['credits']]
    ).to_frame(name='jobs')

productionjobs = list_col_to_rows(productionjobs, 'jobs')

# The 50 most frequent production job titles
productionjobs['jobs'].value_counts()[0:50]

In [None]:
# Preview the production companies per film.
# df['production_companies'] was already converted to plain lists of company
# names when the data was built, so the names are shown directly; indexing
# each entry with ['name'] again would fail on those strings.
df['production_companies'].rename('companies').to_frame()

# Test out with my IMDb lists

In [9]:
# Function that takes in an IMDb id as input and outputs most similar movies
def my_list_recommendations(imdb_id, cosine_sim):
    """Score every film in ``df`` by its similarity to the film ``imdb_id``.

    Uses the module-level ``df`` and ``indices`` (here keyed by IMDb id).
    Returns the selected columns plus ``'score'``, sorted from most to least
    similar.
    """
    wanted_columns = ['score', 'title', 'url', 'directors', 'short_cast',
                      'writing', 'producers', 'year', 'production_companies',
                      'Metacritic_score', 'RT_score', 'IMdb_score', 'critic_score',
                      'imdb_id', 'directors_gender']

    # Row of the similarity matrix belonging to this film
    row = indices[imdb_id]

    # Similarity to every film, labelled 0..n-1 to line up with df's index
    similarity = pd.Series(list(cosine_sim[row]), name='score')

    scored = df.merge(similarity, left_index=True, right_index=True)

    return scored[wanted_columns].sort_values('score', ascending=False)

Import IMDb lists.

In [132]:
# One exported list per year, read as Latin-1 text.
my_2019 = pd.read_csv('data/2019.csv', encoding = 'latin')
my_2018 = pd.read_csv('data/2018.csv', encoding = 'latin')
my_2017 = pd.read_csv('data/2017.csv', encoding = 'latin')
my_2016 = pd.read_csv('data/2016.csv', encoding = 'latin')
# Stack the yearly lists into one frame with a fresh 0..n-1 index; a film
# present in several lists appears several times here.
my_all = pd.concat([my_2019, my_2018, my_2017, my_2016], ignore_index=True)

In [138]:
# Films already seen; its 'Const' column is used by get_list_recommendations
# to filter recommendations when seen_it=True.
seen = pd.read_csv('data/seen_it.csv', encoding = 'latin')

To avoid issues with title changes between TMDb and IMDb, we can look films up by their imdb_id instead of their title. First we remove rows with duplicate imdb_ids, then we rebuild the lookup with imdb_id as its index. The imdb_id is what we'll pass to the recommendation function.

In [11]:
# Keep the first row for each imdb_id. The surviving rows keep their original
# index labels (no reset), so the labels still correspond to positions in
# cosine_sim2, which was computed before this drop.
df.drop_duplicates(subset=['imdb_id'], inplace=True)

In [12]:
# Rebind the lookup: imdb_id -> row label. This replaces the earlier
# title-keyed `indices`, so get_recommendationsdf (which looks up by title)
# can no longer be used after this cell.
indices = pd.Series(df.index, index=df['imdb_id'])

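Create a combined column called `critic_score`: use the Metacritic score unless it is missing, in which case fall back to the Rotten Tomatoes score, and then to the IMDb score. Filter on that column for high scores.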

In [13]:
def combined_scores(s):
    """Pick one critic score for a film.

    Preference order: Metacritic, then Rotten Tomatoes, then IMDb. A score
    of 0 means "not available" and falls through to the next source; the
    IMDb score is returned as the last resort even when it is 0.
    """
    for column in ('Metacritic_score', 'RT_score'):
        value = s[column]
        if value != 0:
            return value
    return s['IMdb_score']

In [14]:
# Turn each score column into integers: values that cannot be parsed as
# numbers become NaN, NaN becomes 0 (treated as "no score" downstream)
for score_col in ['Metacritic_score', 'RT_score', 'IMdb_score']:
    df[score_col] = pd.to_numeric(df[score_col], errors='coerce').fillna(0).astype(int)

# Single score per film: Metacritic, else RT, else IMDb
df['critic_score'] = df.apply(combined_scores, axis=1)

Define then run the function for generating top recommendations from lists.

In [120]:
def get_list_recommendations(source_list, critic_score=0, num_matches=1, f_directors=False, seen_it=False):
    """Collect the best recommendations for every film in a personal list.

    Relies on the module-level ``df``, ``cosine_sim2``, ``seen`` (only when
    ``seen_it`` is true) and ``my_list_recommendations``.

    Parameters
    ----------
    source_list : pandas.DataFrame
        Must have a ``'Const'`` column of IMDb ids.
    critic_score : int, default 0
        Minimum ``critic_score`` a recommendation needs. Films whose
        ``critic_score`` is 0 (no score available) are always kept.
    num_matches : int, default 1
        Maximum number of recommendations kept per source film.
    f_directors : bool, default False
        Keep only films with at least one director whose gender code is 1
        (female).
    seen_it : bool, default False
        Also exclude films listed in ``seen['Const']``.

    Returns
    -------
    pandas.DataFrame
        One row per recommendation, sorted by similarity score (highest
        first) with a fresh index. The list-valued people/company columns
        are rendered as comma-separated strings. Films from ``source_list``
        are never recommended themselves. An empty frame with the same
        columns is returned when nothing matches.

    Warns
    -----
    UserWarning
        When some source films could not be looked up and were skipped.
    """
    # Local import: ``warnings`` is not among the notebook's top-level imports.
    import warnings

    output_columns = ['source_film', 'score', 'title', 'url', 'directors',
        'short_cast', 'writing', 'producers', 'year', 'production_companies',
        'Metacritic_score', 'RT_score', 'IMdb_score', 'critic_score', 'directors_gender']
    list_columns = ['directors', 'short_cast', 'writing', 'producers', 'production_companies']

    source_ids = source_list['Const']
    per_film = []
    missing = []

    for film in source_ids:
        # Only the lookups are guarded: a film absent from the catalogue
        # raises KeyError, and ``.item()`` raises ValueError when the id does
        # not match exactly one row. Any other error is a real bug and
        # should surface.
        try:
            recs = my_list_recommendations(film, cosine_sim2)
            source_title = df[df['imdb_id']==film]['title'].item()
        except (KeyError, ValueError):
            missing.append(film)
            continue

        recs['source_film'] = source_title

        # Remove films I have seen recently.
        recs = recs[~recs['imdb_id'].isin(source_ids)]

        if seen_it:
            recs = recs[~recs['imdb_id'].isin(seen['Const'])]

        if f_directors:
            # Pick only female directors.
            recs = recs[recs['directors_gender'].apply(lambda x: 1 in x)]

        # Pick only films with Metascores/RT/IMDb scores over critic_score.
        recs = recs[(recs['critic_score']>=critic_score) 
                | (recs['critic_score']==0)]

        # Keep the highest scoring matches (recs is sorted by score).
        best = recs.iloc[0:num_matches]
        if not best.empty:
            per_film.append(best)

    if missing:
        warnings.warn('{} source film(s) could not be looked up and were skipped'
                      .format(len(missing)))

    if not per_film:
        return pd.DataFrame(columns=output_columns)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    # .copy() gives an independent frame to modify below.
    recs_list = pd.concat(per_film, ignore_index=True)[output_columns].copy()

    recs_list['year'] = recs_list['year'].astype(int)

    for feature in list_columns:
        recs_list[feature] = recs_list[feature].apply(lambda x: ', '.join([str(i) for i in x]))

    return recs_list.sort_values('score', ascending=False).reset_index(drop=True)

In [76]:
# Up to 5 recommendations per film on my lists (each film counted once),
# restricted to critic_score >= 70 or no score at all.
high_scores = get_list_recommendations(my_all.drop_duplicates(subset='Const'), 
                                       critic_score=70, num_matches=5)
high_scores

Unnamed: 0,source_film,score,title,url,directors,short_cast,writing,producers,year,production_companies,Metacritic_score,RT_score,IMdb_score,critic_score,directors_gender
0,X-Men: Apocalypse,0.583996,X-Men: Days of Future Past,https://www.imdb.com/title/tt1877832,[Bryan Singer],"[Hugh Jackman, James McAvoy, Michael Fassbende...","[Jane Goldman, Matthew Vaughn]","[Luca Marco Paracels, Hutch Parker, Derek Hoff...",2014,"[Revolution Sun Studios, The Donners' Company,...",75,90,80,75,[2]
1,Ghost in the Shell,0.562149,Ghost in the Shell 2.0,https://www.imdb.com/title/tt1260502,[Mamoru Oshii],"[Atsuko Tanaka, Akio Otsuka, Kouichi Yamadera,...",[Kazunori Ito],[],2008,[Production I.G],0,100,80,100,[2]
2,Avengers: Infinity War,0.560937,Captain America: Civil War,https://www.imdb.com/title/tt3498820,"[Anthony Russo, Joe Russo]","[Chris Evans, Robert Downey Jr., Scarlett Joha...","[Christopher Markus, Stephen McFeely]","[Mitchell Bell, Lars P. Winther, Henning Molfe...",2016,[Marvel Studios],75,91,78,75,"[2, 2]"
3,Annie Hall,0.560102,Manhattan,https://www.imdb.com/title/tt0079522,[Woody Allen],"[Woody Allen, Diane Keaton, Michael Murphy, Ma...",[Marshall Brickman],[Charles H. Joffe],1979,"[United Artists, Jack Rollins & Charles H. Jof...",83,95,79,83,[2]
4,From Russia with Love,0.542885,Dr. No,https://www.imdb.com/title/tt0055928,[Terence Young],"[Sean Connery, Ursula Andress, Joseph Wiseman,...","[Richard Maibaum, Berkely Mather, Johanna Harw...","[Albert R. Broccoli, Harry Saltzman]",1962,"[United Artists, Eon Productions, Danjaq]",78,95,73,78,[2]
5,The Thin Man,0.539013,After the Thin Man,https://www.imdb.com/title/tt0027260,[W.S. Van Dyke],"[William Powell, Myrna Loy, Elissa Landi, Jame...","[Frances Goodrich, Albert Hackett, Dashiell Ha...",[Hunt Stromberg],1936,[Metro-Goldwyn-Mayer],0,100,77,100,[2]
6,Monty Python and the Holy Grail,0.535955,Monty Python Live (Mostly),https://www.imdb.com/title/tt3872778,"[Eric Idle, Aubrey Powell]","[John Cleese, Terry Gilliam, Eric Idle, Terry ...","[Terry Gilliam, Michael Palin, Graham Chapman,...","[Jim Beach, Fiz Oliver]",2014,"[Hipgnosis Ltd., Picturehouse Entertainment]",0,0,77,77,"[2, 2]"
7,Monty Python and the Holy Grail,0.529772,And Now for Something Completely Different,https://www.imdb.com/title/tt0066765,[Ian MacNaughton],"[Graham Chapman, John Cleese, Terry Gilliam, E...","[Terry Gilliam, Graham Chapman, Michael Palin,...",[Patricia Casey],1971,"[Columbia Pictures, Playboy Productions, Kettl...",0,91,76,91,[2]
8,Monty Python and the Holy Grail,0.518382,The Meaning of Life,https://www.imdb.com/title/tt0085959,"[Terry Gilliam, Terry Jones]","[Terry Gilliam, Graham Chapman, John Cleese, E...","[Graham Chapman, Michael Palin, Eric Idle, Joh...",[John Goldstone],1983,"[Celandine Films, The Monty Python Partnership...",0,86,76,86,"[2, 2]"
9,Mission: Impossible - Fallout,0.515774,Mission: Impossible - Rogue Nation,https://www.imdb.com/title/tt2381249,[Christopher McQuarrie],"[Tom Cruise, Jeremy Renner, Simon Pegg, Rebecc...",[Drew Pearce],"[Bryan Burk, David Ellison, Tom Cruise, J.J. A...",2015,"[Paramount, China Movie Channel, Bad Robot, TC...",75,93,74,75,[2]


In [174]:
# Same as above, additionally excluding films listed in `seen`.
high_scores_not_seen = get_list_recommendations(my_all.drop_duplicates(subset='Const'), 
                                       critic_score=70, num_matches=5, seen_it=True)
high_scores_not_seen

Unnamed: 0,source_film,score,title,url,directors,short_cast,writing,producers,year,production_companies,Metacritic_score,RT_score,IMdb_score,critic_score,directors_gender
0,Ghost in the Shell,0.562149,Ghost in the Shell 2.0,https://www.imdb.com/title/tt1260502,[Mamoru Oshii],"[Atsuko Tanaka, Akio Otsuka, Kouichi Yamadera,...",[Kazunori Ito],[],2008,[Production I.G],0,100,80,100,[2]
1,The Thin Man,0.539013,After the Thin Man,https://www.imdb.com/title/tt0027260,[W.S. Van Dyke],"[William Powell, Myrna Loy, Elissa Landi, Jame...","[Frances Goodrich, Albert Hackett, Dashiell Ha...",[Hunt Stromberg],1936,[Metro-Goldwyn-Mayer],0,100,77,100,[2]
2,Monty Python and the Holy Grail,0.535955,Monty Python Live (Mostly),https://www.imdb.com/title/tt3872778,"[Eric Idle, Aubrey Powell]","[John Cleese, Terry Gilliam, Eric Idle, Terry ...","[Terry Gilliam, Michael Palin, Graham Chapman,...","[Jim Beach, Fiz Oliver]",2014,"[Hipgnosis Ltd., Picturehouse Entertainment]",0,0,77,77,"[2, 2]"
3,Monty Python and the Holy Grail,0.529772,And Now for Something Completely Different,https://www.imdb.com/title/tt0066765,[Ian MacNaughton],"[Graham Chapman, John Cleese, Terry Gilliam, E...","[Terry Gilliam, Graham Chapman, Michael Palin,...",[Patricia Casey],1971,"[Columbia Pictures, Playboy Productions, Kettl...",0,91,76,91,[2]
4,Monty Python and the Holy Grail,0.518382,The Meaning of Life,https://www.imdb.com/title/tt0085959,"[Terry Gilliam, Terry Jones]","[Terry Gilliam, Graham Chapman, John Cleese, E...","[Graham Chapman, Michael Palin, Eric Idle, Joh...",[John Goldstone],1983,"[Celandine Films, The Monty Python Partnership...",0,86,76,86,"[2, 2]"
5,Mission: Impossible - Fallout,0.515774,Mission: Impossible - Rogue Nation,https://www.imdb.com/title/tt2381249,[Christopher McQuarrie],"[Tom Cruise, Jeremy Renner, Simon Pegg, Rebecc...",[Drew Pearce],"[Bryan Burk, David Ellison, Tom Cruise, J.J. A...",2015,"[Paramount, China Movie Channel, Bad Robot, TC...",75,93,74,75,[2]
6,Monty Python and the Holy Grail,0.507116,Monty Python Live at the Hollywood Bowl,https://www.imdb.com/title/tt0084352,"[Ian MacNaughton, Terry Hughes]","[Terry Gilliam, John Cleese, Graham Chapman, E...","[Terry Gilliam, Graham Chapman, Michael Palin,...",[James Rich Jr.],1982,"[HandMade Films, Columbia Pictures]",0,0,79,79,"[2, 2]"
7,John Wick: Chapter 2,0.489805,John Wick: Chapter 3 - Parabellum,https://www.imdb.com/title/tt6146586,[Chad Stahelski],"[Keanu Reeves, Halle Berry, Ian McShane, Laure...","[Marc Abrams, Derek Kolstad, Chris Collins, Sh...","[Erica Lee, Basil Iwanyk, John R. Saunders, Je...",2019,"[Summit Entertainment, Thunder Road Pictures, ...",73,90,76,73,[2]
8,The Front Page,0.444521,Avanti!,https://www.imdb.com/title/tt0068240,[Billy Wilder],"[Jack Lemmon, Juliet Mills, Clive Revill, Edwa...",[I. A. L. Diamond],[],1972,[The Mirisch Corporation],0,88,72,88,[2]
9,Deadpool,0.431517,Deadpool: No Good Deed,https://www.imdb.com/title/tt6612630,[David Leitch],"[Ryan Reynolds, Stan Lee]","[Ryan Reynolds, Paul Wernick, Rhett Reese]",[],2017,[20th Century Fox],0,0,81,81,[2]


In [78]:
# Same thresholds, restricted to films with at least one director whose
# gender code is 1.
high_scores_f = get_list_recommendations(my_all.drop_duplicates(subset='Const'), 
                                       critic_score=70, num_matches=5, f_directors=True)
high_scores_f

Unnamed: 0,source_film,score,title,url,directors,short_cast,writing,producers,year,production_companies,Metacritic_score,RT_score,IMdb_score,critic_score,directors_gender
0,Jupiter Ascending,0.356313,The Matrix,https://www.imdb.com/title/tt0133093,"[Lilly Wachowski, Lana Wachowski]","[Keanu Reeves, Laurence Fishburne, Carrie-Anne...",[],"[Richard Mirisch, Carol Hughes, Joel Silver, D...",1999,"[Village Roadshow Pictures, Groucho II Film Pa...",73,88,87,73,"[1, 1]"
1,Mixed Nuts,0.344016,Sleepless in Seattle,https://www.imdb.com/title/tt0108160,[Nora Ephron],"[Tom Hanks, Meg Ryan, Bill Pullman, Rita Wilso...","[David S. Ward, Jeff Arch]","[Delia Ephron, Gary Foster, James W. Skotchdop...",1993,[TriStar Pictures],72,74,68,72,[1]
2,How to Marry a Millionaire,0.323029,The Incredibly True Adventure of Two Girls in ...,https://www.imdb.com/title/tt0113416,[Maria Maggenti],"[Laurel Holloman, Nicole Ari Parker, Maggie Mo...",[],[Dolly Hall],1995,[Smash Pictures],0,70,65,70,[1]
3,Mixed Nuts,0.309295,Lovely & Amazing,https://www.imdb.com/title/tt0258273,[Nicole Holofcener],"[Catherine Keener, Brenda Blethyn, Emily Morti...",[],[],2001,[Lionsgate Home Entertainment],75,86,68,75,[1]
4,Guess Who's Coming to Dinner,0.308607,Lovely & Amazing,https://www.imdb.com/title/tt0258273,[Nicole Holofcener],"[Catherine Keener, Brenda Blethyn, Emily Morti...",[],[],2001,[Lionsgate Home Entertainment],75,86,68,75,[1]
5,The Shop Around the Corner,0.307125,The Incredibly True Adventure of Two Girls in ...,https://www.imdb.com/title/tt0113416,[Maria Maggenti],"[Laurel Holloman, Nicole Ari Parker, Maggie Mo...",[],[Dolly Hall],1995,[Smash Pictures],0,70,65,70,[1]
6,Moonstruck,0.306452,The Incredibly True Adventure of Two Girls in ...,https://www.imdb.com/title/tt0113416,[Maria Maggenti],"[Laurel Holloman, Nicole Ari Parker, Maggie Mo...",[],[Dolly Hall],1995,[Smash Pictures],0,70,65,70,[1]
7,Moonstruck,0.302896,Lovely & Amazing,https://www.imdb.com/title/tt0258273,[Nicole Holofcener],"[Catherine Keener, Brenda Blethyn, Emily Morti...",[],[],2001,[Lionsgate Home Entertainment],75,86,68,75,[1]
8,Annie Hall,0.297468,The Incredibly True Adventure of Two Girls in ...,https://www.imdb.com/title/tt0113416,[Maria Maggenti],"[Laurel Holloman, Nicole Ari Parker, Maggie Mo...",[],[Dolly Hall],1995,[Smash Pictures],0,70,65,70,[1]
9,The Front Page,0.296733,Lovely & Amazing,https://www.imdb.com/title/tt0258273,[Nicole Holofcener],"[Catherine Keener, Brenda Blethyn, Emily Morti...",[],[],2001,[Lionsgate Home Entertainment],75,86,68,75,[1]


In [111]:
# Export without the row index.
high_scores.to_csv('high_score_recommendations.csv', index=False)

In [109]:
# Export without the row index.
high_scores_f.to_csv('high_score_recommendations_f.csv', index=False)

In [137]:
# Keep only the first row for each recommended title before exporting
# (without the row index).
high_scores_not_seen.drop_duplicates('title').to_csv('high_score_recommendations_not_seen.csv', index=False)

In [154]:
# Render list-valued columns as comma-separated strings.
features = ['directors', 'short_cast', 'writing', 'producers', 'production_companies']

for feature in features:
    # Only join real lists/tuples. get_list_recommendations already returns
    # these columns as strings, and joining a string again would insert a
    # comma between every single character.
    high_scores_not_seen[feature] = high_scores_not_seen[feature].apply(
        lambda x: ', '.join([str(i) for i in x]) if isinstance(x, (list, tuple)) else x)

In [175]:
# Browse the recommendations in reverse title order.
high_scores_not_seen.sort_values('title', ascending=False)

Unnamed: 0,source_film,score,title,url,directors,short_cast,writing,producers,year,production_companies,Metacritic_score,RT_score,IMdb_score,critic_score,directors_gender
562,Love & Friendship,0.237664,Питер FM,https://www.imdb.com/title/tt0813541,[Oksana Bychkova],"[Yekaterina Fedulova, Yevgeni Tsyganov, Irina ...",[Nana Grinshtein],"[Elena Glikman, Alexandr Rodnyansky]",2006,[],0,0,70,70,[1]
1243,Police,0.178783,À Nos Amours,https://www.imdb.com/title/tt0086650,[Maurice Pialat],"[Sandrine Bonnaire, Evelyne Ker, Maurice Piala...",[Arlette Langmann],[Daniel Toscan du Plantier],1983,"[Les Films du Livradois, Gaumont, France 3 Cin...",0,88,73,88,[2]
1476,Jackie,0.141317,mother!,https://www.imdb.com/title/tt5109784,[Darren Aronofsky],"[Jennifer Lawrence, Javier Bardem, Ed Harris, ...",[],"[Ari Handel, Scott Franklin]",2017,[Protozoa Pictures],75,69,66,75,[2]
190,Annie Hall,0.291695,Zelig,https://www.imdb.com/title/tt0086637,[Woody Allen],"[Woody Allen, Mia Farrow, Patrick Horgan, John...",[],"[Robert Greenhut, Michael Peyser]",1983,[Orion Pictures],0,100,77,100,[2]
22,The Purple Rose of Cairo,0.377706,Zelig,https://www.imdb.com/title/tt0086637,[Woody Allen],"[Woody Allen, Mia Farrow, Patrick Horgan, John...",[],"[Robert Greenhut, Michael Peyser]",1983,[Orion Pictures],0,100,77,100,[2]
331,"Play It Again, Sam",0.266593,Zelig,https://www.imdb.com/title/tt0086637,[Woody Allen],"[Woody Allen, Mia Farrow, Patrick Horgan, John...",[],"[Robert Greenhut, Michael Peyser]",1983,[Orion Pictures],0,100,77,100,[2]
529,Captain Fantastic,0.240048,Youth in Oregon,https://www.imdb.com/title/tt3687316,[Joel David Moore],"[Frank Langella, Billy Crudup, Christina Apple...",[Andrew Eisen],[],2017,"[Campfire, Sundial Pictures]",0,0,0,0,[2]
556,The 39 Steps,0.237908,Young and Innocent,https://www.imdb.com/title/tt0029811,[Alfred Hitchcock],"[Nova Pilbeam, Derrick De Marney, Percy Marmon...","[Anthony Armstrong, Gerald Savory, Charles Ben...",[Edward Black],1937,[Gaumont British Picture Corporation],0,100,69,100,[2]
598,Secret Agent,0.234905,Young and Innocent,https://www.imdb.com/title/tt0029811,[Alfred Hitchcock],"[Nova Pilbeam, Derrick De Marney, Percy Marmon...","[Anthony Armstrong, Gerald Savory, Charles Ben...",[Edward Black],1937,[Gaumont British Picture Corporation],0,100,69,100,[2]
626,La La Land,0.232133,Young Man with a Horn,https://www.imdb.com/title/tt0043153,[Michael Curtiz],"[Kirk Douglas, Lauren Bacall, Doris Day, Hoagy...","[Edmund H. North, Carl Foreman]",[Jerry Wald],1950,[Warner Bros. Pictures],0,0,72,72,[2]


In [176]:
# Count how many rows (i.e. source films) recommend each title, attach that
# count to the recommendation rows, keep one row per url, and list the most
# frequently recommended films first.
(high_scores_not_seen.groupby('title')['url']
    .count().reset_index().rename(columns={'url':'count'})
    ).merge(high_scores_not_seen, on='title') \
    .drop_duplicates('url') \
    .sort_values('count', ascending=False)

Unnamed: 0,title,count,source_film,score,url,directors,short_cast,writing,producers,year,production_companies,Metacritic_score,RT_score,IMdb_score,critic_score,directors_gender
590,Lovely & Amazing,18,Mixed Nuts,0.309295,https://www.imdb.com/title/tt0258273,[Nicole Holofcener],"[Catherine Keener, Brenda Blethyn, Emily Morti...",[],[],2001,[Lionsgate Home Entertainment],75,86,68,75,[1]
673,Monday,17,48 Hrs.,0.362608,https://www.imdb.com/title/tt0239655,[SABU],"[Shinichi Tsutsumi, Yasuko Matsuyuki, Masanobu...",[],[],2000,[],0,0,74,74,[2]
497,Inuyasha the Movie 4: Fire on the Mystic Island,16,Star Wars: The Last Jedi,0.254084,https://www.imdb.com/title/tt0473658,[Toshiya Shinohara],"[Kappei Yamaguchi, Satsuki Yukino]",[],[],2004,[SUNRISE],0,0,74,74,[2]
1208,The Incredibly True Adventure of Two Girls in ...,14,How to Marry a Millionaire,0.323029,https://www.imdb.com/title/tt0113416,[Maria Maggenti],"[Laurel Holloman, Nicole Ari Parker, Maggie Mo...",[],[Dolly Hall],1995,[Smash Pictures],0,70,65,70,[1]
1456,Vampires: Out For Blood,14,Westworld,0.303869,https://www.imdb.com/title/tt0375562,[Richard Brandes],"[Kevin Dillon, Vanessa Angel, Jodi Lyn O'Keefe...",[],[],2004,[],0,0,0,0,[2]
1441,Underworld U.S.A.,13,The Fate of the Furious,0.305505,https://www.imdb.com/title/tt0055571,[Samuel Fuller],"[Cliff Robertson, Dolores Dorn, Beatrice Kay, ...",[],[],1961,[Globe Enterprises],0,0,74,74,[2]
158,Brother,13,Shooter,0.370076,https://www.imdb.com/title/tt0118767,[Alexey Balabanov],"[Sergei Bodrov Jr., Svetlana Pismichenko, Mari...",[],[Sergey Selyanov],1997,[CTB Film Company],0,100,80,100,[2]
460,I Always Wanted to Be a Gangster,13,Down by Law,0.316847,https://www.imdb.com/title/tt0827713,[Samuel Benchetrit],"[Anna Mouglalis, Edouard Baer, Jean Rochefort,...",[],"[Marc Missonnier, Olivier Delbosc]",2008,[Virtual Films],0,0,71,71,[2]
227,Crossfire,12,In the Heat of the Night,0.338364,https://www.imdb.com/title/tt0039286,[Edward Dmytryk],"[Robert Young, Robert Mitchum, Robert Ryan, Gl...",[John Paxton],[Adrian Scott],1947,[RKO Radio Pictures],0,81,73,81,[2]
1425,Tropic Thunder: Rain of Madness,12,The Spy Who Dumped Me,0.270746,https://www.imdb.com/title/tt1286750,[],"[Justin Theroux, Jack Black, Steve Coogan, Rob...",[],[],2008,[],0,0,73,73,[]


In [156]:
# Number of distinct titles for each (directors, source_film) pair, largest
# first. The resulting ``title`` column holds that distinct-title count, not
# a film name.
(
    high_scores_not_seen
    .groupby(by=['directors', 'source_film'])['title']
    .nunique()
    .reset_index()
    .sort_values(by='title', ascending=False)
)

Unnamed: 0,directors,source_film,title
1380,Woody Allen,The Purple Rose of Cairo,5
84,Alfred Hitchcock,Secret Agent,5
1379,Woody Allen,"Play It Again, Sam",5
1378,Woody Allen,Annie Hall,5
983,Pedro Almodóvar,Pain and Glory,5
151,Billy Wilder,The Front Page,4
1013,Preston Sturges,The Lady Eve,4
882,"Michael Powell, Emeric Pressburger",Black Narcissus,3
85,Alfred Hitchcock,Shadow of a Doubt,3
86,Alfred Hitchcock,Strangers on a Train,3


In [151]:
# Number of distinct titles per ``directors`` value in ``high_scores_f``,
# largest first. The resulting ``title`` column holds that distinct-title
# count, not a film name.
(
    high_scores_f
    .groupby(by='directors')['title']
    .nunique()
    .reset_index()
    .sort_values(by='title', ascending=False)
)

Unnamed: 0,directors,title
33,Claire Denis,4
76,Kelly Reichardt,4
74,Kathryn Bigelow,4
10,Andrea Arnold,3
116,Nicole Holofcener,3
111,Mira Nair,3
56,Ida Lupino,3
144,Sofia Coppola,3
106,Martha Coolidge,3
43,Elaine May,3


In [148]:
# Attach to every row of ``high_scores_f`` the number of distinct titles
# recorded for its ``directors`` value (column ``count``), order the rows by
# that count and then by ``critic_score`` (both descending), and keep only
# the first row encountered for each title after sorting.
(
    high_scores_f
    .groupby(by='directors')['title']
    .nunique()
    .reset_index(name='count')
    .merge(high_scores_f, on='directors')
    .sort_values(by=['count', 'critic_score'], ascending=False)
    .drop_duplicates(subset='title')
)

Unnamed: 0,directors,count,source_film,score,title,url,short_cast,writing,producers,year,production_companies,Metacritic_score,RT_score,IMdb_score,critic_score,directors_gender
564,Kathryn Bigelow,4,Dunkirk,0.214583,The Hurt Locker,https://www.imdb.com/title/tt0887912,"Jeremy Renner, Anthony Mackie, Brian Geraghty,...",,"Kirk Shaw, Jack Schuster, Nicolas Chartier, Gr...",2008,"Film Capital Europe Funds (FCEF ), First Light...",95,97,76,95,[1]
615,Kathryn Bigelow,4,Clear and Present Danger,0.140469,Zero Dark Thirty,https://www.imdb.com/title/tt1790885,"Jessica Chastain, Jason Clarke, Mark Strong, J...",,"Megan Ellison, Mark Boal",2012,"First Light, Columbia Pictures, Annapurna Pict...",95,91,74,95,[1]
282,Claire Denis,4,Tunes of Glory,0.127076,Beau Travail,https://www.imdb.com/title/tt0209933,"Denis Lavant, Michel Subor, Grégoire Colin, Ni...","Jean-Pol Fargeau, Herman Melville",Patrick Grandperret,2000,"Tanaïs Productions, La Sept-Arte, S.M. Films",91,81,74,91,[1]
643,Kelly Reichardt,4,Certain Women,0.222998,Meek's Cutoff,https://www.imdb.com/title/tt1518812,"Michelle Williams, Bruce Greenwood, Will Patto...",Jonathan Raymond,"David Urrutia, Elizabeth Cuthrell, Neil Kopp, ...",2010,"Oscilloscope, filmscience, Evenstar Films",85,86,65,85,[1]
648,Kelly Reichardt,4,Leave No Trace,0.147708,Old Joy,https://www.imdb.com/title/tt0468526,"Daniel London, Will Oldham, Tanya Smith, Robin...",Jonathan Raymond,,2006,"Washington Square Films, filmscience, Van Hoy/...",84,84,67,84,[1]
289,Claire Denis,4,Jackie,0.097052,White Material,https://www.imdb.com/title/tt1135952,"Isabelle Huppert, Christopher Lambert, Nicolas...","Marie N'Diaye, Lucie Borleteau","Serge Hayat, Pascal Caucheteux",2010,"Why Not Productions, Wild Bunch, France 3 Cinéma",81,87,69,81,[1]
642,Kelly Reichardt,4,Certain Women,0.230646,Wendy and Lucy,https://www.imdb.com/title/tt1152850,"Michelle Williams, Will Patton, Will Oldham, J...",Jonathan Raymond,"Larry Fessenden, Neil Kopp, Anish Savjani",2008,"Oscilloscope, Glass Eye Pix, filmscience, Wash...",80,85,71,80,[1]
259,Claire Denis,4,The Shop Around the Corner,0.280875,Let the Sunshine In,https://www.imdb.com/title/tt6423776,"Juliette Binoche, Xavier Beauvois, Gérard Depa...",Christine Angot,"Philippe Logie, Olivier Delbosc",2017,"Curiosa Films, Cinémage 10",79,0,60,79,[1]
271,Claire Denis,4,Bunny Lake Is Missing,0.176822,High Life,https://www.imdb.com/title/tt4827558,"Robert Pattinson, Juliette Binoche, André Benj...","Geoff Cox, Jean-Pol Fargeau","Claudia Steffen, Klaudia Smieja, D.J. Gugenhei...",2018,"Wild Bunch, ARTE France Cinéma, Canal+, Pandor...",77,83,59,77,[1]
561,Kathryn Bigelow,4,Destroyer,0.244624,Detroit,https://www.imdb.com/title/tt5390504,"John Boyega, Will Poulter, John Krasinski, Ant...",,"Colin Wilson, Mark Boal, Matthew Budman, Megan...",2017,"Annapurna Pictures, First Light",77,84,73,77,[1]


# Get Gav's recommendations

In [None]:
# Read the two yearly CSV exports (2019 first, then 2018). Both are decoded
# with the 'latin' codec rather than the default UTF-8.
gav_2019, gav_2018 = (
    pd.read_csv(csv_path, encoding='latin')
    for csv_path in ('data/gav_2019.csv', 'data/gav_2018.csv')
)

# Stack both years into one frame with a fresh 0..n-1 index.
gav = pd.concat((gav_2019, gav_2018), ignore_index=True)