Build a recommender based on https://www.kaggle.com/ibtesama/getting-started-with-a-movie-recommendation-system

TODO:

In [1]:
import pandas as pd 
import numpy as np 

# Build data

In [2]:
def remove_directors(x, jobtitle):
    x[jobtitle] = [job for job in x[jobtitle] for director in x['directors'] if job != director]
#     x['test'] = [x for x in x['test']]
    return x

def remove_job(x, jobtitle, joblist):
    x[jobtitle] = [job for job in x[jobtitle] if job not in x[joblist]]
    return x

# Returns the list top 5 elements or entire list; whichever is more.
def shorten_list(x):
    if isinstance(x, list):
        names = [i for i in x]
        # Check if more than 10 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 10:
            names = names[:10]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [3]:
import dill
with open('pickles/all_films.pkl', 'rb') as file:
    all_films = dill.load(file)

In [4]:
all_films.columns

Index(['budget', 'credits', 'genres', 'id', 'imdb_id', 'keywords',
       'original_language', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'title', 'vote_average', 'vote_count',
       'budget_imdb', 'revenue_imdb', 'year', 'decade', 'budget_adj',
       'revenue_adj', 'profit', 'profit_adj', 'budget_bin', 'budget_adj_bin',
       'imdbID', 'IMdb_score', 'Metacritic_score', 'RT_score'],
      dtype='object')

In [5]:
all_films[all_films['id']==471507]

Unnamed: 0,budget,credits,genres,id,imdb_id,keywords,original_language,overview,popularity,production_companies,...,budget_adj,revenue_adj,profit,profit_adj,budget_bin,budget_adj_bin,imdbID,IMdb_score,Metacritic_score,RT_score


In [6]:
omdb = pd.read_json('data/omdb.json')
# omdb_df = pd.DataFrame(omdb)

In [7]:
df = all_films.merge(omdb[['imdbID', 'Plot']], on='imdbID', how='left') \
    .drop(columns=['budget_imdb', 'revenue_imdb',  
                   'decade', 'budget_adj', 'revenue_adj', 
                   'profit', 'profit_adj', 'budget_bin', 
                   'budget_adj_bin', 'imdbID', 'IMdb_score', 
                   'RT_score', 'budget', 'id',
                   'revenue', 'runtime', 'status']) \
    .rename(columns={'Plot':'plot'})

In [8]:
# df['genres'] = [[x['name'] for x in list_dict] for list_dict in df['genres']]

df['directors'] = pd.Series(
    [[x['name'] for x in crew_list if x['job'] == 'Director'] 
         for crew_list in [x['crew'] for x in df['credits']]
    ])

df['directors_gender'] = pd.Series(
    [[x['gender'] for x in crew_list if x['job'] == 'Director'] 
        for crew_list in [x['crew'] for x in df['credits']]
    ], name='gender')

df['cast'] = pd.Series([[x['name'] for x in cast_list] 
              for cast_list in [x['cast'] for x in df['credits']]])

df['short_cast'] = df['cast'].apply(shorten_list)

df['short_keywords'] = df['keywords'].apply(shorten_list)

df['url'] = 'https://www.imdb.com/title/' + df['imdb_id']

df['production_companies'] = pd.Series(
    [[x['name'] for x in companies_list] for companies_list in df['production_companies']])

writing_jobs = ['Screenplay', 'Writer','Story', 'Original Story', 'Screenstory',
                'Dialogue', 'Adaptation', 'Scenario Writer', 'Story Artist', 
                'Lyricist', 'Idea', 'Co-Writer', 'Story Editor', 'Script Editor',
                'Original Film Writer', 'Teleplay', 'Script Consultant', 'Musical',
                'Head of Story', 'Creative Producer', 'Story Coordinator', 
                'Story Consultant', 'Story Manager', 'Story Supervisor']

df['writing'] = pd.Series(
    [list(set(
        [x['name'] for x in crew_list if x['job'] in writing_jobs]))
     for crew_list in [x['crew'] for x in df['credits']]
    ])

df['producers'] = pd.Series(
    [list(set(
        [x['name'] for x in crew_list if x['job'] in ['Producer', 'Co-Producer', 'Associate Producer']])) 
         for crew_list in [x['crew'] for x in df['credits']]
    ])

df['exec_producers'] = pd.Series(
    [list(set(
        [x['name'] for x in crew_list if x['job'] == 'Executive Producer'])) 
         for crew_list in [x['crew'] for x in df['credits']]
    ])

In [9]:
df = df.apply(remove_job, jobtitle='writing', joblist='directors', axis=1)
df = df.apply(remove_job, jobtitle='producers', joblist='directors', axis=1)
df = df.apply(remove_job, jobtitle='exec_producers', joblist='directors', axis=1)
df = df.apply(remove_job, jobtitle='writing', joblist='producers', axis=1)
df = df.apply(remove_job, jobtitle='writing', joblist='exec_producers', axis=1)
df = df.apply(remove_job, jobtitle='producers', joblist='exec_producers', axis=1)

# Content based filtering

We are now in a good position to define our recommendation function. These are the following steps we'll follow :-

- Get the index of the movie given its title.
- Get the list of cosine similarity scores for that particular movie with all movies. Convert it into a list of tuples where the first element is its position and the second is the similarity score.
- Sort the aforementioned list of tuples based on the similarity scores; that is, the second element.
- Get the top 10 elements of this list. Ignore the first element as it refers to self (the movie most similar to a particular movie is the movie itself).
- Return the titles corresponding to the indices of the top elements.

In [65]:
pd.options.display.max_colwidth = 200

In [10]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [114]:
df['title_year'] = df['title'] + ' ' + df['year'].apply(str)

pd.Series(df.index, index=df['title_year'])

In [54]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendationsdf(title, cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]
    
    # Get the pairwsie similarity scores of all movies with that movie
    scores = list(enumerate(cosine_sim[idx]))
    
    results = df.merge(pd.Series([x[1] for x in scores], name='score'),
                       left_index=True, right_index=True)

    return results[['score', 'title', 'url', 'directors', 'short_cast', 
                    'writing', 'producers', 'year', 'production_companies',
                    'Metacritic_score', 'imdb_id', 'directors_gender']] \
            .sort_values('score', ascending=False)

# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

def create_soup(x):
    soup = ' '.join(x['cleaned_short_cast']) + ' ' + \
           1*(' '.join(x['cleaned_directors']) + ' ' + \
           ' '.join(x['cleaned_exec_producers']) + ' ') + \
           1*(' '.join(x['cleaned_genres']) + ' ') + \
           1*(' '.join(x['cleaned_keywords']) + ' ') + \
           x['cleaned_plot'] + x['cleaned_overview']

    soup = ' '.join(soup.split())
                                               
    return soup


# def create_soup(x):
#     soup = 2*(' '.join(x['cleaned_short_cast']) + ' ' + \
#            ' '.join(x['cleaned_directors']) + ' ' + \
#            ' '.join(x['cleaned_writing']) + ' '  + \
#            ' '.join(x['cleaned_exec_producers']) + ' ') + \
#            ' '.join(x['cleaned_producers']) + ' ' + \
#            ' '.join(x['cleaned_production_companies']) + ' ' + \
#            2*(' '.join(x['cleaned_genres']) + ' ' + \
#            ' '.join(x['cleaned_short_keywords']) + ' ')
# #            x['plot']


In [55]:
# Apply clean_data function to your features.
features = ['short_cast', 'directors', 'genres', 'short_keywords', 'writing', 
            'producers', 'plot', 'overview', 'keywords', 'exec_producers',
            'production_companies']

for feature in features:
    df['cleaned_'+feature] = df[feature].apply(clean_data)

# Strip punctuation from plot.
df['cleaned_plot'] = df['cleaned_plot'].str.replace(r'[^\w\s]+', '')
df['cleaned_overview'] = df['cleaned_overview'].str.replace(r'[^\w\s]+', '')

# Make the soup from the different variables    
df['soup'] = df.apply(create_soup, axis=1)

# Create the count matrix
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])

# Compute the Cosine Similarity matrix based on the count_matrix
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

# Reset index of our main DataFrame and construct reverse mapping as before
df = df.reset_index(drop=True)
indices = pd.Series(df.index, index=df['title'])

In [58]:
# Result with 5 cast members and adding writing
get_recommendationsdf('Casino', cosine_sim2)

Unnamed: 0,score,title,url,directors,short_cast,writing,producers,year,production_companies,Metacritic_score,imdb_id,directors_gender
3541,1.000000,Casino,https://www.imdb.com/title/tt0112641,[Martin Scorsese],"[Robert De Niro, Sharon Stone, Joe Pesci, Jame...",[Nicholas Pileggi],"[Joseph P. Reidy, Barbara De Fina]",1995,"[Universal Pictures, Syalis DA, Légende Entrep...",73,tt0112641,[2]
3518,0.207514,GoodFellas,https://www.imdb.com/title/tt0099685,[Martin Scorsese],"[Robert De Niro, Ray Liotta, Joe Pesci, Lorrai...",[Nicholas Pileggi],"[Bruce S. Pustin, Irwin Winkler]",1990,[Winkler Films],89,tt0099685,[2]
1861,0.188445,Taxi Driver,https://www.imdb.com/title/tt0075314,[Martin Scorsese],"[Robert De Niro, Jodie Foster, Albert Brooks, ...",[Paul Schrader],"[Julia Phillips, Phillip M. Goldfarb, Michael ...",1976,"[Italo/Judeo Productions, Bill/Phillips, Colum...",94,tt0075314,[2]
10321,0.177822,The Family,https://www.imdb.com/title/tt2404311,[Luc Besson],"[Robert De Niro, Michelle Pfeiffer, Dianna Agr...",[Michael Caleo],[Ryan Kavanaugh],2013,"[Canal+, Grive Productions, EuropaCorp, Relati...",42,tt2404311,[2]
2639,0.150756,Drugstore Cowboy,https://www.imdb.com/title/tt0097240,[Gus Van Sant],"[Matt Dillon, Kelly Lynch, James Le Gros, Heat...","[William S. Burroughs, Daniel Yost]","[Nick Wechsler, Karen Murphy]",1989,[Avenue Pictures Productions],82,tt0097240,[2]
9947,0.139573,Last Vegas,https://www.imdb.com/title/tt1204975,[Jon Turteltaub],"[Robert De Niro, Morgan Freeman, Michael Dougl...",[Dan Fogelman],"[Joseph Drake, Amy Baer, Matt Leonetti, Lauren...",2013,"[Gidden Media, CBS Films, Good Universe, Laure...",48,tt1204975,[2]
6195,0.139573,Ocean's Thirteen,https://www.imdb.com/title/tt0496806,[Steven Soderbergh],"[George Clooney, Brad Pitt, Matt Damon, Andy G...","[Brian Koppelman, David Levien]",[Jerry Weintraub],2007,"[Section Eight, Village Roadshow Pictures, Jer...",62,tt0496806,[2]
2035,0.139573,Mean Streets,https://www.imdb.com/title/tt0070379,[Martin Scorsese],"[Robert De Niro, Harvey Keitel, David Proval, ...",[Mardik Martin],[Jonathan T. Taplin],1973,[Scorsese Productions],96,tt0070379,[2]
7419,0.139573,City By The Sea,https://www.imdb.com/title/tt0269095,[Michael Caton-Jones],"[Robert De Niro, Frances McDormand, James Fran...",[Ken Hixon],[],2002,"[Warner Bros. Pictures, Epsilon Motion Picture...",50,tt0269095,[2]
2434,0.138343,Once Upon a Time in America,https://www.imdb.com/title/tt0087843,[Sergio Leone],"[Robert De Niro, James Woods, Elizabeth McGove...","[Enrico Medioli, Leonardo Benvenuti, Franco Fe...",[Arnon Milchan],1984,"[The Ladd Company, Embassy International Pictu...",,tt0087843,[2]


In [57]:
recom = get_recommendationsdf('Baby Driver', cosine_sim2)
recom
# recom[recom['directors_gender'].apply(lambda x: 1 in x) & recom['score']>0]

Unnamed: 0,score,title,url,directors,short_cast,writing,producers,year,production_companies,Metacritic_score,imdb_id,directors_gender
8674,1.000000,Baby Driver,https://www.imdb.com/title/tt3890160,[Edgar Wright],"[Ansel Elgort, Kevin Spacey, Lily James, Jon B...",[],"[Nira Park, Eric Fellner, Leo Thompson, Tim Be...",2017,"[TriStar Pictures, Big Talk Productions, Media...",86,tt3890160,[2]
2027,0.161165,The Driver,https://www.imdb.com/title/tt0077474,[Walter Hill],"[Ryan O'Neal, Bruce Dern, Isabelle Adjani, Ron...",[],[Lawrence Gordon],1978,"[20th Century Fox, EMI Films]",56,tt0077474,[2]
2078,0.160872,The Getaway,https://www.imdb.com/title/tt0068638,[Sam Peckinpah],"[Steve McQueen, Ali MacGraw, Ben Johnson, Al L...",[Walter Hill],"[Gordon T. Dawson, Mitchell Brower, David Foster]",1972,"[Foster-Brower Productions, First Artists, Sol...",55,tt0068638,[0]
11858,0.154303,Force of Execution,https://www.imdb.com/title/tt2611626,[Keoni Waxman],"[Steven Seagal, Danny Trejo, Ving Rhames, Davi...","[Richard Beattie, Michael Black]","[Steven Seagal, Phillip B. Goldfine, Nicolas C...",2013,"[Steamroller Productions, Voltage Pictures]",,tt2611626,[2]
2219,0.141598,Dirty Mary Crazy Larry,https://www.imdb.com/title/tt0071424,[John Hough],"[Peter Fonda, Susan George, Adam Roarke, Kenne...","[Antonio Santean, Leigh Chapman, James H. Nich...",[Norman T. Herman],1974,"[Academy Pictures Corporation, 20th Century Fox]",52,tt0071424,[2]
9716,0.141186,Den of Thieves,https://www.imdb.com/title/tt1259528,[Christian Gudegast],"[Gerard Butler, Pablo Schreiber, Dawn Olivieri...",[Paul Scheuring],"[Jason Barhydt, Tucker Tooley, Daniel Kaslow, ...",2018,"[Atmosphere Entertainment MM, Diamond Film Pro...",49,tt1259528,[2]
6914,0.138013,Dhoom 2,https://www.imdb.com/title/tt0441048,[Sanjay Gadhvi],"[Hrithik Roshan, Abhishek Bachchan, Aishwarya ...","[Vijay Krishna Acharya, Sameer Anjaan]","[Yash Chopra, Aditya Chopra]",2006,[Yash Raj Films],,tt0441048,[0]
1048,0.134687,King Creole,https://www.imdb.com/title/tt0051818,[Michael Curtiz],"[Elvis Presley, Carolyn Jones, Walter Matthau,...","[Michael V. Gazzo, Herbert Baker]","[Hal B. Wallis, Joseph H. Hazen, Paul Nathan]",1958,"[Paramount, Hal Wallis Productions]",,tt0051818,[2]
8903,0.132314,Fast Five,https://www.imdb.com/title/tt1596343,[Justin Lin],"[Vin Diesel, Paul Walker, Jordana Brewster, Ty...",[Chris Morgan],"[Michael Fottrell, Neal H. Moritz, Vin Diesel]",2011,"[Original Film, Universal Pictures, One Race]",66,tt1596343,[2]
8494,0.131590,Victoria,https://www.imdb.com/title/tt4226388,[Sebastian Schipper],"[Laia Costa, Frederick Lau, Franz Rogowski, Bu...","[Eike Frederik Schulz, Olivia Neergaard-Holm]","[Catherine Baikousis, Jan Dressler, Anatol Nit...",2015,"[Westdeutscher Rundfunk, MonkeyBoy, RadicalMed...",77,tt4226388,[2]


## Add yet more info
What about using production companies, or adding full plots from OMDb? Or cinematographers or screenwriters?

In [30]:
def list_col_to_rows(df, list_col):
    new_df = pd.DataFrame(
        {col:np.repeat(df[col].values, df[list_col].str.len())
            for col in df.columns.drop(list_col)}
            ).assign(**{list_col:np.concatenate(df[list_col].values)})[df.columns]
    return new_df

In [35]:
jobs = pd.Series(
    [[x['job'] for x in crew_list] 
         for crew_list in [x['crew'] for x in df['credits']]
    ]).to_frame(name='jobs')

In [37]:
lst_col = 'jobs'

jobs = list_col_to_rows(jobs, lst_col)

In [None]:
jobs['jobs'].value_counts()

In [333]:
departments = pd.Series(
    [[x['department'] for x in crew_list ] 
         for crew_list in [x['crew'] for x in df['credits']]
    ]).to_frame(name='depts')

departments = list_col_to_rows(departments, 'depts')

In [None]:
departments['depts'].value_counts()

In [None]:
productionjobs = pd.Series(
    [[x['job'] for x in crew_list if x['department']=='Production'] 
         for crew_list in [x['crew'] for x in df['credits']]
    ]).to_frame(name='jobs')

productionjobs = list_col_to_rows(productionjobs, 'jobs')

productionjobs['jobs'].value_counts()[0:50]

In [None]:
pd.Series(
    [[x['name'] for x in companies_list] for companies_list in df['production_companies']],
    name='companies').to_frame()

# Test out with my IMDb lists

In [118]:
# Function that takes in movie title as input and outputs most similar movies
def get_my_recommendationsdf(title_year, cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title_year]
    
    # Get the pairwsie similarity scores of all movies with that movie
    scores = list(enumerate(cosine_sim[idx]))
    
    results = df.merge(pd.Series([x[1] for x in scores], name='score'),
                       left_index=True, right_index=True)

    return results[['score', 'title', 'url', 'directors', 'short_cast', 
                    'writing', 'producers', 'year', 'production_companies',
                    'Metacritic_score', 'imdb_id', 'directors_gender']] \
            .sort_values('score', ascending=False)

In [83]:
my_2019 = pd.read_csv('data/2019.csv', encoding = 'latin')
my_2018 = pd.read_csv('data/2018.csv', encoding = 'latin')
my_2017 = pd.read_csv('data/2017.csv', encoding = 'latin')
my_2016 = pd.read_csv('data/2016.csv', encoding = 'latin')

In [104]:
my_2019['Title'] = my_2019['Title'].str.replace('Kaze no tani no Naushika',
                                                'Nausicaä of the Valley of the Wind')

Goodfellas

In [105]:
my_all = pd.concat([my_2019, my_2018, my_2017, my_2016], ignore_index=True)

In [117]:
my_2019['title_year'] = my_2019['Title'] + ' ' + my_2019['Year'].apply(str)

In [146]:
my_all['title_year'] = my_all['Title'] + ' ' + my_all['Year'].apply(str)

In [None]:
my_all['Title'] = my_all['Title'].str.replace('Goodfellas', 'GoodFellas')

In [132]:
df['title_year'] = df['title'] + ' ' + df['year'].apply(str)

indices = pd.Series(df.index, index=df['title_year'])

In [190]:
rec_list = pd.DataFrame()
film_list = []
missing = []

for film in my_all['title_year']:
    try:
        recommendations = get_my_recommendationsdf(film, cosine_sim2)
        # Remove films I have seen.
        recommendations = recommendations[~recommendations['title'].isin(my_all['Title'])]
        # Pick only female directors.
#         recommendations = recommendations[recommendations['directors_gender'].apply(lambda x: 1 in x)]
        # Pick only films from the 1970s.
#         recommendations = recommendations[(recommendations['year']>=1970) & (recommendations['year']<1980)]
        # Pick only animations.
#         recommendations = recommendations[recommendations['cleaned_genres'].apply(lambda x: 'comedy' in x)]
        # Return the highest scoring match.
        rec_list = rec_list.append(recommendations.iloc[0], ignore_index=True)
        film_list += [film[:-5]]
        
    except:
        missing += [film]
        
rec_list['source_film'] = film_list
rec_list = rec_list[['source_film', 'score', 'title', 'url', 'directors', 
                'short_cast', 'writing', 'producers', 'year',
                'production_companies', 'Metacritic_score', 'directors_gender']]
rec_list['year'] = rec_list['year'].astype(int)

rec_list.sort_values('score', ascending=False).reset_index(drop=True)#.to_csv('recommendations.csv', index=False)

Unnamed: 0,source_film,score,title,url,directors,short_cast,writing,producers,year,production_companies,Metacritic_score,directors_gender
0,Deadpool,0.627572,Deadpool 2,https://www.imdb.com/title/tt5463162,[David Leitch],"[Ryan Reynolds, Josh Brolin, Zazie Beetz, Juli...",[],"[Lauren Shuler Donner, Simon Kinberg, Ryan Rey...",2018,"[Marvel Entertainment, 20th Century Fox]",66,[2]
1,Home Alone,0.620066,Home Alone 2: Lost in New York,https://www.imdb.com/title/tt0104431,[Chris Columbus],"[Macaulay Culkin, Joe Pesci, Catherine O'Hara,...",[],[John Hughes],1992,"[Hughes Entertainment, 20th Century Fox]",,[2]
2,The Santa Clause 2,0.589768,The Santa Clause 3: The Escape Clause,https://www.imdb.com/title/tt0452681,[Michael Lembeck],"[Tim Allen, Elizabeth Mitchell, Eric Lloyd, Li...","[John J. Strauss, Ed Decter]","[Brian Reilly, Robert F. Newmyer, Jeffrey Silver]",2006,"[Santa Frost Productions, Walt Disney Pictures...",32,[2]
3,Analyze This,0.583840,Analyze That,https://www.imdb.com/title/tt0289848,[Harold Ramis],"[Robert De Niro, Billy Crystal, Lisa Kudrow, J...","[Peter Tolan, Peter Steinfeld, Kenneth Lonergan]","[Jane Rosenthal, Paula Weinstein]",2002,"[Face Productions, Village Roadshow Pictures, ...",37,[2]
4,Paddington 2,0.548795,Paddington,https://www.imdb.com/title/tt1109624,[Paul King],"[Ben Whishaw, Hugh Bonneville, Sally Hawkins, ...","[Hamish McColl, Michael Bond]",[David Heyman],2014,"[Heyday Films, StudioCanal, Anton Capital Ente...",77,[2]
5,National Treasure: Book of Secrets,0.547791,National Treasure,https://www.imdb.com/title/tt0368891,[Jon Turteltaub],"[Nicolas Cage, Diane Kruger, Justin Bartha, Se...","[Jim Kouf, Cormac Wibberley, Marianne Wibberley]","[Pat Sandston, Michael E. Uslan, Benjamin Meln...",2004,"[Walt Disney Pictures, Jerry Bruckheimer Films...",39,[2]
6,Avengers: Infinity War,0.543746,Captain America: Civil War,https://www.imdb.com/title/tt3498820,"[Anthony Russo, Joe Russo]","[Chris Evans, Robert Downey Jr., Scarlett Joha...","[Christopher Markus, Stephen McFeely]","[Christoph Fisser, Mitchell Bell, Henning Molf...",2016,[Marvel Studios],75,"[2, 2]"
7,X-Men: Apocalypse,0.536056,Dark Phoenix,https://www.imdb.com/title/tt6565702,[Simon Kinberg],"[Sophie Turner, James McAvoy, Michael Fassbend...","[Chris Claremont, Dave Cockrum, John Byrne]",[Lauren Shuler Donner],2019,"[The Donners' Company, 20th Century Fox, Genre...",43,[2]
8,Thor: Ragnarok,0.532291,Thor: The Dark World,https://www.imdb.com/title/tt1981115,[Alan Taylor],"[Chris Hemsworth, Natalie Portman, Tom Hiddles...","[Don Payne, Christopher Markus, Stephen McFeel...","[Jamie Christopher, David J. Grant, Kevin Feig...",2013,[Marvel Studios],54,[2]
9,The Fate of the Furious,0.531610,Fast & Furious 6,https://www.imdb.com/title/tt1905041,[Justin Lin],"[Vin Diesel, Paul Walker, Dwayne Johnson, Jord...",[],"[Neal H. Moritz, Clayton Townsend, Vin Diesel]",2013,"[Universal Pictures, Relativity Media, Origina...",61,[2]


In [183]:
rec_list[['title', 'year', 'url', 'score', 'Metacritic_score']] \
    .sort_values('score', ascending=False) \
    .reset_index(drop=True) \
    .rename(columns={'score':'match_score'}) \
    .drop_duplicates(subset='title')

Unnamed: 0,title,year,url,match_score,Metacritic_score
0,Analyze That,2002,https://www.imdb.com/title/tt0289848,0.58384,37.0
1,Avengers: Age of Ultron,2015,https://www.imdb.com/title/tt2395427,0.492805,66.0
2,The Avengers,2012,https://www.imdb.com/title/tt0848228,0.445132,69.0
3,John Wick: Chapter 3 - Parabellum,2019,https://www.imdb.com/title/tt6146586,0.402015,73.0
4,Mission: Impossible - Rogue Nation,2015,https://www.imdb.com/title/tt2381249,0.394771,75.0
5,Mo' Better Blues,1990,https://www.imdb.com/title/tt0100168,0.35,61.0
6,Mr. Arkadin,1955,https://www.imdb.com/title/tt0048393,0.344265,
7,Fahrenheit 451,2018,https://www.imdb.com/title/tt0360556,0.305129,
8,Robin and Marian,1976,https://www.imdb.com/title/tt0075147,0.299602,
9,The Long Voyage Home,1940,https://www.imdb.com/title/tt0032728,0.291667,


In [213]:
df[df['title'].str.contains('Star Wars')][['title', 'year', 'cast']]

Unnamed: 0,title,year,cast
1857,Star Wars,1977,"[Mark Hamill, Harrison Ford, Carrie Fisher, Pe..."
4314,Star Wars: Episode I - The Phantom Menace,1999,"[Liam Neeson, Ewan McGregor, Natalie Portman, ..."
5330,Robot Chicken: Star Wars Episode II,2008,"[Seth Green, Bob Bergen, Ahmed Best, Breckin M..."
5559,Robot Chicken: Star Wars,2007,"[Candace Bailey, Bob Bergen, Ahmed Best, Seth ..."
5592,Star Wars: Episode III - Revenge of the Sith,2005,"[Ewan McGregor, Natalie Portman, Hayden Christ..."
6433,Star Wars: Episode II - Attack of the Clones,2002,"[Ewan McGregor, Natalie Portman, Hayden Christ..."
7393,Star Wars: The Clone Wars,2008,"[Tom Kane, Matt Lanter, Ashley Eckstein, James..."
8606,Robot Chicken: Star Wars Episode III,2010,"[Seth Green, Zac Efron, Seth MacFarlane, Carri..."
8651,Rogue One: A Star Wars Story,2016,"[Felicity Jones, Diego Luna, Alan Tudyk, Donni..."
8694,Star Wars: The Force Awakens,2015,"[Daisy Ridley, John Boyega, Harrison Ford, Ada..."


# Get Gav's recommendations

In [242]:
gav_2019 = pd.read_csv('data/gav_2019.csv', encoding = 'latin')
gav_2018 = pd.read_csv('data/gav_2018.csv', encoding = 'latin')
gav = pd.concat([gav_2019, gav_2018], ignore_index=True)

gav['Title'] = gav['Title'].str.replace('Birdman or \(The Unexpected Virtue of Ignorance\)',
                                                'Birdman')
gav['Title'] = gav['Title'].str.replace('Star Wars\: Episode VIII \- The Last Jedi',
                                                'Star Wars: The Last Jedi')
gav['title_year'] = gav['Title'] + ' ' + gav['Year'].apply(str)

In [243]:
rec_list = pd.DataFrame()
film_list = []
missing = []

for film in gav['title_year']:
    try:
        recommendations = get_my_recommendationsdf(film, cosine_sim2)
        # Remove films I have seen.
        recommendations = recommendations[~recommendations['title'].isin(gav['Title'])]
        # Pick only female directors.
#         recommendations = recommendations[recommendations['directors_gender'].apply(lambda x: 1 in x)]
        # Pick only films from the 1970s.
#         recommendations = recommendations[(recommendations['year']>=1970) & (recommendations['year']<1980)]
        # Pick only animations.
#         recommendations = recommendations[recommendations['cleaned_genres'].apply(lambda x: 'comedy' in x)]
        # Return the highest scoring match.
        rec_list = rec_list.append(recommendations.iloc[0], ignore_index=True)
        film_list += [film[:-5]]
        
    except:
        missing += [film]
        
rec_list['source_film'] = film_list
rec_list = rec_list[['source_film', 'score', 'title', 'url', 'directors', 
                'short_cast', 'writing', 'producers', 'year',
                'production_companies', 'Metacritic_score', 'directors_gender']]
rec_list['year'] = rec_list['year'].astype(int)

rec_list.sort_values('score', ascending=False).reset_index(drop=True).to_csv('gav_recommendations.csv', index=False)

In [199]:
rec_list[['title', 'year', 'url', 'score', 'Metacritic_score']] \
    .sort_values('score', ascending=False) \
    .reset_index(drop=True) \
    .rename(columns={'score':'match_score'}) \
    .drop_duplicates(subset='title')

Unnamed: 0,title,year,url,match_score,Metacritic_score
0,Once Upon a Deadpool,2018,https://www.imdb.com/title/,0.641941,
1,Incredibles 2,2018,https://www.imdb.com/title/tt3606756,0.590243,80
2,The Incredibles,2004,https://www.imdb.com/title/tt0317705,0.590243,90
3,Avengers: Infinity War,2018,https://www.imdb.com/title/tt4154756,0.555770,68
4,Paddington 2,2017,https://www.imdb.com/title/tt4468740,0.548795,88
5,Paddington,2014,https://www.imdb.com/title/tt1109624,0.548795,77
6,Thor: The Dark World,2013,https://www.imdb.com/title/tt1981115,0.532291,54
7,Spider-Man: Homecoming,2017,https://www.imdb.com/title/tt2250912,0.488450,73
8,Riley's First Date?,2015,https://www.imdb.com/title/tt4941804,0.474693,
9,The Avengers,2012,https://www.imdb.com/title/tt0848228,0.445132,69
