Build a recommender based on https://www.kaggle.com/ibtesama/getting-started-with-a-movie-recommendation-system

TODO:

In [1]:
import pandas as pd 
import numpy as np 

# Build data

In [2]:
def remove_directors(x, jobtitle):
    x[jobtitle] = [job for job in x[jobtitle] for director in x['directors'] if job != director]
#     x['test'] = [x for x in x['test']]
    return x

def remove_job(x, jobtitle, joblist):
    x[jobtitle] = [job for job in x[jobtitle] if job not in x[joblist]]
    return x

# Returns the list top 5 elements or entire list; whichever is more.
def shorten_list(x):
    if isinstance(x, list):
        names = [i for i in x]
        # Check if more than 10 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 10:
            names = names[:10]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [88]:
import dill
with open('pickles/all_films.pkl', 'rb') as file:
    all_films = dill.load(file)

In [89]:
all_films.shape

(13244, 33)

In [90]:
all_films[all_films['id']==471507]

Unnamed: 0,title,budget,credits,genres,keywords,overview,id,imdb_id,original_language,popularity,...,budget_adj,revenue_adj,profit,profit_adj,budget_bin,budget_adj_bin,imdbID,IMdb_score,Metacritic_score,RT_score
11418,Destroyer,9000000.0,"{'cast': [{'cast_id': 1, 'character': 'Erin Be...","[Thriller, Crime, Drama, Action]","[gang, undercover cop, interrogation, policewo...","Erin Bell is an LAPD detective who, as a young...",471507,tt7137380,en,13.576,...,9000000.0,5580940.0,-3419060.0,-3419060.0,5-10M,5-10M,tt7137380,62.0,62.0,74.0


In [91]:
omdb = pd.read_json('data/omdb.json')
# omdb_df = pd.DataFrame(omdb)

In [92]:
df = all_films.merge(omdb[['imdbID', 'Plot']], on='imdbID', how='left') \
    .drop(columns=['budget_imdb', 'revenue_imdb',  
                   'decade', 'budget_adj', 'revenue_adj', 
                   'profit', 'profit_adj', 'budget_bin', 
                   'budget_adj_bin', 'imdbID', 'IMdb_score', 
                   'RT_score', 'budget', 'id',
                   'revenue', 'runtime', 'status']) \
    .rename(columns={'Plot':'plot'})

In [93]:
# df['genres'] = [[x['name'] for x in list_dict] for list_dict in df['genres']]

df['directors'] = pd.Series(
    [[x['name'] for x in crew_list if x['job'] == 'Director'] 
         for crew_list in [x['crew'] for x in df['credits']]
    ])

df['directors_gender'] = pd.Series(
    [[x['gender'] for x in crew_list if x['job'] == 'Director'] 
        for crew_list in [x['crew'] for x in df['credits']]
    ], name='gender')

df['cast'] = pd.Series([[x['name'] for x in cast_list] 
              for cast_list in [x['cast'] for x in df['credits']]])

df['short_cast'] = df['cast'].apply(shorten_list)

df['short_keywords'] = df['keywords'].apply(shorten_list)

df['url'] = 'https://www.imdb.com/title/' + df['imdb_id']

df['production_companies'] = pd.Series(
    [[x['name'] for x in companies_list] for companies_list in df['production_companies']])

writing_jobs = ['Screenplay', 'Writer','Story', 'Original Story', 'Screenstory',
                'Dialogue', 'Adaptation', 'Scenario Writer', 'Story Artist', 
                'Lyricist', 'Idea', 'Co-Writer', 'Story Editor', 'Script Editor',
                'Original Film Writer', 'Teleplay', 'Script Consultant', 'Musical',
                'Head of Story', 'Creative Producer', 'Story Coordinator', 
                'Story Consultant', 'Story Manager', 'Story Supervisor']

df['writing'] = pd.Series(
    [list(set(
        [x['name'] for x in crew_list if x['job'] in writing_jobs]))
     for crew_list in [x['crew'] for x in df['credits']]
    ])

df['producers'] = pd.Series(
    [list(set(
        [x['name'] for x in crew_list if x['job'] in ['Producer', 'Co-Producer', 'Associate Producer']])) 
         for crew_list in [x['crew'] for x in df['credits']]
    ])

df['exec_producers'] = pd.Series(
    [list(set(
        [x['name'] for x in crew_list if x['job'] == 'Executive Producer'])) 
         for crew_list in [x['crew'] for x in df['credits']]
    ])

In [94]:
df = df.apply(remove_job, jobtitle='writing', joblist='directors', axis=1)
df = df.apply(remove_job, jobtitle='producers', joblist='directors', axis=1)
df = df.apply(remove_job, jobtitle='exec_producers', joblist='directors', axis=1)
df = df.apply(remove_job, jobtitle='writing', joblist='producers', axis=1)
df = df.apply(remove_job, jobtitle='writing', joblist='exec_producers', axis=1)
df = df.apply(remove_job, jobtitle='producers', joblist='exec_producers', axis=1)

# Content based filtering

We are now in a good position to define our recommendation function. These are the following steps we'll follow :-

- Get the index of the movie given its title.
- Get the list of cosine similarity scores for that particular movie with all movies. Convert it into a list of tuples where the first element is its position and the second is the similarity score.
- Sort the aforementioned list of tuples based on the similarity scores; that is, the second element.
- Get the top 10 elements of this list. Ignore the first element as it refers to self (the movie most similar to a particular movie is the movie itself).
- Return the titles corresponding to the indices of the top elements.

In [65]:
pd.options.display.max_colwidth = 200

In [13]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
df['title_year'] = df['title'] + ' ' + df['year'].apply(str)

pd.Series(df.index, index=df['title_year'])

title_year
City Lights 1931                                                                            0
Modern Times 1936                                                                           1
Scram! 1932                                                                                 2
Going Bye-Bye! 1934                                                                         3
M 1931                                                                                      4
The Prisoner of Zenda 1937                                                                  5
Mr. Smith Goes to Washington 1939                                                           6
Them Thar Hills 1934                                                                        7
Our Wife 1931                                                                               8
César 1936                                                                                  9
Gone with the Wind 1939                          

In [15]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendationsdf(title, cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]
    
    # Get the pairwsie similarity scores of all movies with that movie
    scores = list(enumerate(cosine_sim[idx]))
    
    results = df.merge(pd.Series([x[1] for x in scores], name='score'),
                       left_index=True, right_index=True)

    return results[['score', 'title', 'url', 'directors', 'short_cast', 
                    'writing', 'producers', 'year', 'production_companies',
                    'Metacritic_score', 'imdb_id', 'directors_gender']] \
            .sort_values('score', ascending=False)

# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

def create_soup(x):
    soup = ' '.join(x['cleaned_short_cast']) + ' ' + \
           1*(' '.join(x['cleaned_directors']) + ' ' + \
           ' '.join(x['cleaned_exec_producers']) + ' ') + \
           1*(' '.join(x['cleaned_genres']) + ' ') + \
           1*(' '.join(x['cleaned_keywords']) + ' ') + \
           x['cleaned_plot'] + x['cleaned_overview']

    soup = ' '.join(soup.split())
                                               
    return soup


# def create_soup(x):
#     soup = 2*(' '.join(x['cleaned_short_cast']) + ' ' + \
#            ' '.join(x['cleaned_directors']) + ' ' + \
#            ' '.join(x['cleaned_writing']) + ' '  + \
#            ' '.join(x['cleaned_exec_producers']) + ' ') + \
#            ' '.join(x['cleaned_producers']) + ' ' + \
#            ' '.join(x['cleaned_production_companies']) + ' ' + \
#            2*(' '.join(x['cleaned_genres']) + ' ' + \
#            ' '.join(x['cleaned_short_keywords']) + ' ')
# #            x['plot']


In [95]:
# Apply clean_data function to your features.
features = ['short_cast', 'directors', 'genres', 'short_keywords', 'writing', 
            'producers', 'plot', 'overview', 'keywords', 'exec_producers',
            'production_companies']

for feature in features:
    df['cleaned_'+feature] = df[feature].apply(clean_data)

# Strip punctuation from plot.
df['cleaned_plot'] = df['cleaned_plot'].str.replace(r'[^\w\s]+', '')
df['cleaned_overview'] = df['cleaned_overview'].str.replace(r'[^\w\s]+', '')

# Make the soup from the different variables    
df['soup'] = df.apply(create_soup, axis=1)

# Create the count matrix
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])

# Compute the Cosine Similarity matrix based on the count_matrix
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

# Reset index of our main DataFrame and construct reverse mapping as before
df = df.reset_index(drop=True)
indices = pd.Series(df.index, index=df['title'])

In [96]:
# Result with 5 cast members and adding writing
get_recommendationsdf('Casino', cosine_sim2)

Unnamed: 0,score,title,url,directors,short_cast,writing,producers,year,production_companies,Metacritic_score,imdb_id,directors_gender
4052,1.000000,Casino,https://www.imdb.com/title/tt0112641,[Martin Scorsese],"[Robert De Niro, Sharon Stone, Joe Pesci, Jame...",[Nicholas Pileggi],"[Joseph P. Reidy, Barbara De Fina]",1995,"[Universal Pictures, Syalis DA, Légende Entrep...",73.0,tt0112641,[2]
4027,0.207514,GoodFellas,https://www.imdb.com/title/tt0099685,[Martin Scorsese],"[Robert De Niro, Ray Liotta, Joe Pesci, Lorrai...",[Nicholas Pileggi],"[Bruce S. Pustin, Irwin Winkler]",1990,[Winkler Films],89.0,tt0099685,[2]
2343,0.188445,Taxi Driver,https://www.imdb.com/title/tt0075314,[Martin Scorsese],"[Robert De Niro, Jodie Foster, Albert Brooks, ...",[Paul Schrader],"[Michael Phillips, Julia Phillips, Phillip M. ...",1976,"[Italo/Judeo Productions, Bill/Phillips, Colum...",94.0,tt0075314,[2]
11122,0.177822,The Family,https://www.imdb.com/title/tt2404311,[Luc Besson],"[Robert De Niro, Michelle Pfeiffer, Dianna Agr...",[Michael Caleo],[Ryan Kavanaugh],2013,"[Canal+, Grive Productions, EuropaCorp, Relati...",42.0,tt2404311,[2]
9025,0.164488,The Irishman,https://www.imdb.com/title/tt1302006,[Martin Scorsese],"[Robert De Niro, Al Pacino, Joe Pesci, Stephen...",[Steven Zaillian],"[Emma Tillinger Koskoff, Gastón Pavlovich, Irw...",2019,"[Tribeca Productions, Sikelia Productions, Win...",94.0,tt1302006,[2]
3135,0.150756,Drugstore Cowboy,https://www.imdb.com/title/tt0097240,[Gus Van Sant],"[Matt Dillon, Kelly Lynch, James Le Gros, Heat...","[William S. Burroughs, Daniel Yost]","[Karen Murphy, Nick Wechsler]",1989,[Avenue Pictures Productions],82.0,tt0097240,[2]
2520,0.139573,Mean Streets,https://www.imdb.com/title/tt0070379,[Martin Scorsese],"[Robert De Niro, Harvey Keitel, David Proval, ...",[Mardik Martin],[Jonathan T. Taplin],1973,[Scorsese Productions],96.0,tt0070379,[2]
10696,0.139573,Last Vegas,https://www.imdb.com/title/tt1204975,[Jon Turteltaub],"[Robert De Niro, Morgan Freeman, Michael Dougl...",[Dan Fogelman],"[Joseph Drake, Amy Baer, Laurence Mark, Matt L...",2013,"[Gidden Media, CBS Films, Good Universe, Laure...",48.0,tt1204975,[2]
6718,0.139573,Ocean's Thirteen,https://www.imdb.com/title/tt0496806,[Steven Soderbergh],"[George Clooney, Brad Pitt, Matt Damon, Andy G...","[Brian Koppelman, David Levien]",[Jerry Weintraub],2007,"[Section Eight, Village Roadshow Pictures, Jer...",62.0,tt0496806,[2]
7978,0.139573,City By The Sea,https://www.imdb.com/title/tt0269095,[Michael Caton-Jones],"[Robert De Niro, Frances McDormand, James Fran...",[Ken Hixon],[],2002,"[Warner Bros. Pictures, Epsilon Motion Picture...",50.0,tt0269095,[2]


In [57]:
recom = get_recommendationsdf('Baby Driver', cosine_sim2)
recom
# recom[recom['directors_gender'].apply(lambda x: 1 in x) & recom['score']>0]

Unnamed: 0,score,title,url,directors,short_cast,writing,producers,year,production_companies,Metacritic_score,imdb_id,directors_gender
8674,1.000000,Baby Driver,https://www.imdb.com/title/tt3890160,[Edgar Wright],"[Ansel Elgort, Kevin Spacey, Lily James, Jon B...",[],"[Nira Park, Eric Fellner, Leo Thompson, Tim Be...",2017,"[TriStar Pictures, Big Talk Productions, Media...",86,tt3890160,[2]
2027,0.161165,The Driver,https://www.imdb.com/title/tt0077474,[Walter Hill],"[Ryan O'Neal, Bruce Dern, Isabelle Adjani, Ron...",[],[Lawrence Gordon],1978,"[20th Century Fox, EMI Films]",56,tt0077474,[2]
2078,0.160872,The Getaway,https://www.imdb.com/title/tt0068638,[Sam Peckinpah],"[Steve McQueen, Ali MacGraw, Ben Johnson, Al L...",[Walter Hill],"[Gordon T. Dawson, Mitchell Brower, David Foster]",1972,"[Foster-Brower Productions, First Artists, Sol...",55,tt0068638,[0]
11858,0.154303,Force of Execution,https://www.imdb.com/title/tt2611626,[Keoni Waxman],"[Steven Seagal, Danny Trejo, Ving Rhames, Davi...","[Richard Beattie, Michael Black]","[Steven Seagal, Phillip B. Goldfine, Nicolas C...",2013,"[Steamroller Productions, Voltage Pictures]",,tt2611626,[2]
2219,0.141598,Dirty Mary Crazy Larry,https://www.imdb.com/title/tt0071424,[John Hough],"[Peter Fonda, Susan George, Adam Roarke, Kenne...","[Antonio Santean, Leigh Chapman, James H. Nich...",[Norman T. Herman],1974,"[Academy Pictures Corporation, 20th Century Fox]",52,tt0071424,[2]
9716,0.141186,Den of Thieves,https://www.imdb.com/title/tt1259528,[Christian Gudegast],"[Gerard Butler, Pablo Schreiber, Dawn Olivieri...",[Paul Scheuring],"[Jason Barhydt, Tucker Tooley, Daniel Kaslow, ...",2018,"[Atmosphere Entertainment MM, Diamond Film Pro...",49,tt1259528,[2]
6914,0.138013,Dhoom 2,https://www.imdb.com/title/tt0441048,[Sanjay Gadhvi],"[Hrithik Roshan, Abhishek Bachchan, Aishwarya ...","[Vijay Krishna Acharya, Sameer Anjaan]","[Yash Chopra, Aditya Chopra]",2006,[Yash Raj Films],,tt0441048,[0]
1048,0.134687,King Creole,https://www.imdb.com/title/tt0051818,[Michael Curtiz],"[Elvis Presley, Carolyn Jones, Walter Matthau,...","[Michael V. Gazzo, Herbert Baker]","[Hal B. Wallis, Joseph H. Hazen, Paul Nathan]",1958,"[Paramount, Hal Wallis Productions]",,tt0051818,[2]
8903,0.132314,Fast Five,https://www.imdb.com/title/tt1596343,[Justin Lin],"[Vin Diesel, Paul Walker, Jordana Brewster, Ty...",[Chris Morgan],"[Michael Fottrell, Neal H. Moritz, Vin Diesel]",2011,"[Original Film, Universal Pictures, One Race]",66,tt1596343,[2]
8494,0.131590,Victoria,https://www.imdb.com/title/tt4226388,[Sebastian Schipper],"[Laia Costa, Frederick Lau, Franz Rogowski, Bu...","[Eike Frederik Schulz, Olivia Neergaard-Holm]","[Catherine Baikousis, Jan Dressler, Anatol Nit...",2015,"[Westdeutscher Rundfunk, MonkeyBoy, RadicalMed...",77,tt4226388,[2]


## Add yet more info
What about using production companies, or adding full plots from OMDb? Or cinematographers or screenwriters?

In [30]:
def list_col_to_rows(df, list_col):
    new_df = pd.DataFrame(
        {col:np.repeat(df[col].values, df[list_col].str.len())
            for col in df.columns.drop(list_col)}
            ).assign(**{list_col:np.concatenate(df[list_col].values)})[df.columns]
    return new_df

In [35]:
jobs = pd.Series(
    [[x['job'] for x in crew_list] 
         for crew_list in [x['crew'] for x in df['credits']]
    ]).to_frame(name='jobs')

In [37]:
lst_col = 'jobs'

jobs = list_col_to_rows(jobs, lst_col)

In [None]:
jobs['jobs'].value_counts()

In [333]:
departments = pd.Series(
    [[x['department'] for x in crew_list ] 
         for crew_list in [x['crew'] for x in df['credits']]
    ]).to_frame(name='depts')

departments = list_col_to_rows(departments, 'depts')

In [None]:
departments['depts'].value_counts()

In [None]:
productionjobs = pd.Series(
    [[x['job'] for x in crew_list if x['department']=='Production'] 
         for crew_list in [x['crew'] for x in df['credits']]
    ]).to_frame(name='jobs')

productionjobs = list_col_to_rows(productionjobs, 'jobs')

productionjobs['jobs'].value_counts()[0:50]

In [None]:
pd.Series(
    [[x['name'] for x in companies_list] for companies_list in df['production_companies']],
    name='companies').to_frame()

# Test out with my IMDb lists

In [18]:
my_2019 = pd.read_csv('data/2019.csv', encoding = 'latin')
my_2018 = pd.read_csv('data/2018.csv', encoding = 'latin')
my_2017 = pd.read_csv('data/2017.csv', encoding = 'latin')
my_2016 = pd.read_csv('data/2016.csv', encoding = 'latin')

In [19]:
my_all = pd.concat([my_2019, my_2018, my_2017, my_2016], ignore_index=True)

To avoid issues with title changes between TMDb and IMDb, we can use the imdb_ids as the index. To avoid issues here, we will first remove duplicate imdb_ids. Then set the index as the imdb_id. We'll pass this as the thing to match with in the function.

In [100]:
df.drop_duplicates(subset=['imdb_id'], inplace=True)

In [101]:
indices = pd.Series(df.index, index=df['imdb_id'])

In [28]:
# Function that takes in movie title as input and outputs most similar movies
def my_list_recommendations(imdb_id, cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[imdb_id]
    
    # Get the pairwsie similarity scores of all movies with that movie
    scores = list(enumerate(cosine_sim[idx]))
    
    results = df.merge(pd.Series([x[1] for x in scores], name='score'),
                       left_index=True, right_index=True)

    return results[['score', 'title', 'url', 'directors', 'short_cast', 
                    'writing', 'producers', 'year', 'production_companies',
                    'Metacritic_score', 'imdb_id', 'directors_gender']] \
            .sort_values('score', ascending=False)

In [109]:
my_all[my_all['Title']=='Goodfellas']

Unnamed: 0,Position,Const,Created,Modified,Description,Title,URL,Title Type,IMDb Rating,Runtime (mins),Year,Genres,Num Votes,Release Date,Directors,Your Rating,Date Rated
142,84,tt0099685,2018-12-26,2018-12-26,,Goodfellas,https://www.imdb.com/title/tt0099685/,movie,8.7,146,1990,"Biography, Crime, Drama",938121,1990-09-09,Martin Scorsese,,


In [110]:
df[df['title']=='GoodFellas']

Unnamed: 0,title,credits,genres,keywords,overview,imdb_id,original_language,popularity,production_companies,production_countries,...,cleaned_genres,cleaned_short_keywords,cleaned_writing,cleaned_producers,cleaned_plot,cleaned_overview,cleaned_keywords,cleaned_exec_producers,cleaned_production_companies,soup
4027,GoodFellas,"{'cast': [{'cast_id': 16, 'character': 'James ...","[Drama, Crime]","[prison, based on novel or book, florida, 1970...","The true story of Henry Hill, a half-Irish, ha...",tt0099685,en,26.103,[Winkler Films],"[{'iso_3166_1': 'US', 'name': 'United States o...",...,"[drama, crime]","[prison, basedonnovelorbook, florida, 1970s, m...",[nicholaspileggi],"[bruces.pustin, irwinwinkler]",thestoryofhenryhillandhislifeinthemobcoveringh...,thetruestoryofhenryhillahalfirishhalfsicilianb...,"[prison, basedonnovelorbook, florida, 1970s, m...",[barbaradefina],[winklerfilms],robertdeniro rayliotta joepesci lorrainebracco...


In [115]:
imdb_list = my_all.drop_duplicates(subset='Const')
rec_list = pd.DataFrame()
film_list = []
missing = []

for film in imdb_list['Const']:
    try:
        recommendations = my_list_recommendations(film, cosine_sim2)
        # Remove films I have seen.
        recommendations = recommendations[~recommendations['imdb_id'].isin(imdb_list['Const'])]
        # Pick only female directors.
#         recommendations = recommendations[recommendations['directors_gender'].apply(lambda x: 1 in x)]
        # Pick only films from the 1970s.
#         recommendations = recommendations[(recommendations['year']>=1970) & (recommendations['year']<1980)]
        # Pick only animations.
#         recommendations = recommendations[recommendations['cleaned_genres'].apply(lambda x: 'comedy' in x)]
        # Return the highest scoring match.
        rec_list = rec_list.append(recommendations.iloc[0], ignore_index=True)
        film_list += [df[df['imdb_id']==film]['title'].item()]
        
    except:
        missing += [film]
        

rec_list['source_film'] = film_list
rec_list = rec_list[['source_film', 'score', 'title', 'url', 'directors', 
                'short_cast', 'writing', 'producers', 'year',
                'production_companies', 'Metacritic_score', 'directors_gender']]
rec_list['year'] = rec_list['year'].astype(int)

rec_list.sort_values('score', ascending=False).reset_index(drop=True)#.to_csv('recommendations.csv', index=False)

In [81]:
rec_list[['title', 'year', 'url', 'score', 'Metacritic_score']] \
    .sort_values('score', ascending=False) \
    .reset_index(drop=True) \
    .rename(columns={'score':'match_score'}) \
    .drop_duplicates(subset='title')

Unnamed: 0,title,year,url,match_score,Metacritic_score
0,Analyze That,2002,https://www.imdb.com/title/tt0289848,0.58384,37.0
1,Avengers: Infinity War,2018,https://www.imdb.com/title/tt4154756,0.55577,68.0
2,The Meaning of Life,1983,https://www.imdb.com/title/tt0085959,0.535231,
3,The Avengers,2012,https://www.imdb.com/title/tt0848228,0.445132,69.0
4,Creed,2015,https://www.imdb.com/title/tt3076658,0.429669,82.0
5,John Wick: Chapter 3 - Parabellum,2019,https://www.imdb.com/title/tt6146586,0.402015,73.0
6,Mission: Impossible - Rogue Nation,2015,https://www.imdb.com/title/tt2381249,0.394771,75.0
7,Mo' Better Blues,1990,https://www.imdb.com/title/tt0100168,0.35,61.0
8,Mr. Arkadin,1955,https://www.imdb.com/title/tt0048393,0.344265,
9,Robin and Marian,1976,https://www.imdb.com/title/tt0075147,0.299602,


In [82]:
missing

['tt0083702', 'tt4157220', 'tt6920356', 'tt8811382', 'tt8526872', 'tt0119092']

# Get Gav's recommendations

In [83]:
gav_2019 = pd.read_csv('data/gav_2019.csv', encoding = 'latin')
gav_2018 = pd.read_csv('data/gav_2018.csv', encoding = 'latin')
gav = pd.concat([gav_2019, gav_2018], ignore_index=True)

In [103]:
imdb_list = gav
rec_list = pd.DataFrame()
film_list = []
missing = []

for film in imdb_list['Const']:
    try:
        recommendations = my_list_recommendations(film, cosine_sim2)
        # Remove films I have seen.
        recommendations = recommendations[~recommendations['imdb_id'].isin(imdb_list['Const'])]
        # Pick only female directors.
#         recommendations = recommendations[recommendations['directors_gender'].apply(lambda x: 1 in x)]
        # Pick only films from the 1970s.
#         recommendations = recommendations[(recommendations['year']>=1970) & (recommendations['year']<1980)]
        # Pick only animations.
#         recommendations = recommendations[recommendations['cleaned_genres'].apply(lambda x: 'comedy' in x)]
        # Return the highest scoring match.
        rec_list = rec_list.append(recommendations.iloc[0], ignore_index=True)
        film_list += [imdb_list[imdb_list['Const']==film]['Title'].item()]
        
    except:
        missing += [film]
        

rec_list['source_film'] = film_list
rec_list = rec_list[['source_film', 'score', 'title', 'url', 'directors', 
                'short_cast', 'writing', 'producers', 'year',
                'production_companies', 'Metacritic_score', 'directors_gender']]
rec_list['year'] = rec_list['year'].astype(int)

rec_list.sort_values('score', ascending=False).reset_index(drop=True)#.to_csv('recommendations.csv', index=False)

Unnamed: 0,source_film,score,title,url,directors,short_cast,writing,producers,year,production_companies,Metacritic_score,directors_gender
0,Star Wars: Episode VIII - The Last Jedi,0.633446,Star Wars: The Force Awakens,https://www.imdb.com/title/tt2488496,[J.J. Abrams],"[Harrison Ford, Mark Hamill, Carrie Fisher, Ad...",[Lawrence Kasdan],"[Kathleen Kennedy, Michael Arndt, Bryan Burk]",2015,"[Truenorth Productions, Lucasfilm, Bad Robot]",81.0,[2]
1,Deadpool 2,0.627572,Deadpool,https://www.imdb.com/title/tt1431045,[Tim Miller],"[Ryan Reynolds, Morena Baccarin, Ed Skrein, T....",[Rhett Reese],"[Simon Kinberg, Ryan Reynolds, Lauren Shuler D...",2016,"[Marvel Entertainment, TSG Entertainment, 20th...",65.0,[2]
2,Avengers: Endgame,0.555770,Avengers: Infinity War,https://www.imdb.com/title/tt4154756,"[Anthony Russo, Joe Russo]","[Robert Downey Jr., Chris Hemsworth, Chris Eva...","[Christopher Markus, Stephen McFeely]","[Mitchell Bell, Kevin Feige]",2018,[Marvel Studios],68.0,"[2, 2]"
3,Thor: Ragnarok,0.532291,Thor: The Dark World,https://www.imdb.com/title/tt1981115,[Alan Taylor],"[Chris Hemsworth, Natalie Portman, Tom Hiddles...","[Robert Rodat, Christopher Markus, Stephen McF...","[Kenneth Branagh, David J. Grant, Jamie Christ...",2013,"[Marvel Studios, Walt Disney Pictures]",54.0,[2]
4,Spider-Man: Far from Home,0.488450,Spider-Man: Homecoming,https://www.imdb.com/title/tt2250912,[Jon Watts],"[Tom Holland, Michael Keaton, Robert Downey Jr...","[Chris McKenna, John Francis Daley, Christophe...","[Eric Hauserman Carroll, Kevin Feige, Amy Pascal]",2017,"[Marvel Studios, Columbia Pictures, Pascal Pic...",73.0,[2]
5,Captain Marvel,0.445132,The Avengers,https://www.imdb.com/title/tt0848228,[Joss Whedon],"[Robert Downey Jr., Chris Evans, Chris Hemswor...",[Zak Penn],[Kevin Feige],2012,[Marvel Studios],69.0,[2]
6,Black Panther,0.444949,Captain America: Civil War,https://www.imdb.com/title/tt3498820,"[Anthony Russo, Joe Russo]","[Chris Evans, Robert Downey Jr., Scarlett Joha...","[Christopher Markus, Stephen McFeely]","[Charlie Woebcken, Lars P. Winther, Trinh Tran...",2016,[Marvel Studios],75.0,"[2, 2]"
7,Shaun the Sheep Movie,0.384308,A Close Shave,https://www.imdb.com/title/tt0112691,[Nick Park],"[Peter Sallis, Anne Reid, Peter Hawkins, Justi...",[Bob Baker],"[Carla Shelley, Michael Rose]",1996,[Aardman Animations],,[2]
8,Inside Out,0.365148,Red's Dream,https://www.imdb.com/title/tt0093832,[John Lasseter],[],[],[],1987,[Pixar],,[2]
9,Incredibles 2,0.359211,Red's Dream,https://www.imdb.com/title/tt0093832,[John Lasseter],[],[],[],1987,[Pixar],,[2]


In [104]:
rec_list.sort_values('score', ascending=False).reset_index(drop=True).to_csv('gav_recommendations.csv', index=False)

In [85]:
rec_list[['title', 'year', 'url', 'score', 'Metacritic_score']] \
    .sort_values('score', ascending=False) \
    .reset_index(drop=True) \
    .rename(columns={'score':'match_score'}) \
    .drop_duplicates(subset='title')

Unnamed: 0,title,year,url,match_score,Metacritic_score
0,Star Wars: The Force Awakens,2015,https://www.imdb.com/title/tt2488496,0.633446,81.0
1,Deadpool,2016,https://www.imdb.com/title/tt1431045,0.627572,65.0
2,Avengers: Infinity War,2018,https://www.imdb.com/title/tt4154756,0.555770,68.0
3,Thor: The Dark World,2013,https://www.imdb.com/title/tt1981115,0.532291,54.0
4,Spider-Man: Homecoming,2017,https://www.imdb.com/title/tt2250912,0.488450,73.0
5,Riley's First Date?,2015,https://www.imdb.com/title/tt4941804,0.474693,
6,The Avengers,2012,https://www.imdb.com/title/tt0848228,0.445132,69.0
7,Captain America: Civil War,2016,https://www.imdb.com/title/tt3498820,0.444949,75.0
8,Jack-Jack Attack,2005,https://www.imdb.com/title/tt0455565,0.387298,
9,A Close Shave,1996,https://www.imdb.com/title/tt0112691,0.384308,
