In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the CSV and discard the unneeded 'Unnamed: 0' column in one step
df = pd.read_csv('cleaned_score.csv').drop(columns=['Unnamed: 0'])

In [3]:
#df_score = df.copy()
#df_score = df_score.dropna(subset=['score'])
#df_score = df_score.loc[df_score['score'] >=5.5 ]

In [4]:
# Turn all characters into lower case and delete all spaces
def lower_space(x):
    """Return the string `x` lower-cased with every space character removed."""
    without_spaces = "".join(x.split(" "))
    return without_spaces.lower()

In [5]:
# Columns kept for the recommender. Every one of them except 'release_year'
# and 'score' is normalised with lower_space().
features=['type','title','director','cast','country','release_year','rating','listed_in','description','score']
text_features = [name for name in features if name not in ('release_year', 'score')]

# Select the columns first and copy afterwards, so df_lower is an independent
# frame and the column assignments below do not emit SettingWithCopyWarning.
df_lower = df[features].copy()
for feature in text_features:
    df_lower[feature] = df_lower[feature].apply(lower_space)

# Keep at most the first three comma-separated cast names
df_lower['cast'] = df_lower['cast'].apply(lambda names: ','.join(names.strip().split(',')[:3]))
df_lower.head(2)

Unnamed: 0,type,title,director,cast,country,release_year,rating,listed_in,description,score
0,movie,dickjohnsonisdead,kirstenjohnson,nodata,unitedstates,2020,pg-13,documentaries,"asherfathernearstheendofhislife,filmmakerkirst...",7.4
1,tvshow,blood&water,nodata,"amaqamata,khosingema,gailmabalane",southafrica,2021,tv-ma,"internationaltvshows,tvdramas,tvmysteries","aftercrossingpathsataparty,acapetownteensetsou...",6.6


In [6]:
# function for creating bag-of-words
def create_soup(x):
    """Build the text "soup" for one row.

    Joins the row's type, title, director, cast, country, release_year,
    rating, listed_in and description values with single spaces, in that
    order. 'score' is not part of the soup. Every value must already be a
    string.
    """
    soup_fields = ('type', 'title', 'director', 'cast', 'country',
                   'release_year', 'rating', 'listed_in', 'description')
    return ' '.join(x[field] for field in soup_fields)

In [7]:
# Build the bag-of-words column.
# Every cell is stringified first because create_soup concatenates the values
# as text. Series.map is applied per column instead of DataFrame.applymap,
# which is deprecated in recent pandas releases; the result is the same.
df_str = df_lower.apply(lambda column: column.map(str))
df_str['soup'] = df_str.apply(create_soup, axis=1)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Token-count matrix over the soup strings (English stop words removed)
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df_str['soup'])

# Pairwise cosine similarity between all rows; with a single argument the
# matrix is compared against itself
cosine_sim = cosine_similarity(count_matrix)

In [9]:
# Map each normalised movie title to its row position.
# drop=True keeps this cell re-runnable: a plain reset_index() adds an
# 'index' column, and running the cell a second time then fails because
# that column already exists.
df_str = df_str.reset_index(drop=True)
indices = pd.Series(df_str.index, index=df_str['title'])
indices

title
dickjohnsonisdead         0
blood&water               1
ganglands                 2
jailbirdsneworleans       3
kotafactory               4
                       ... 
zodiac                 8789
zombiedumb             8790
zombieland             8791
zoom                   8792
zubaan                 8793
Length: 8794, dtype: int64

In [10]:
def get_recommendations(title, cosine_sim=cosine_sim, min_score=7.0, top_n=10):
    """Return the rows most similar to `title` whose score is at least `min_score`.

    Parameters
    ----------
    title : str
        Title to look up. Spaces are removed and the text is lower-cased
        before it is looked up in the module-level `indices` Series.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; its row/column positions are expected to
        line up with the row positions of `df`. The default is bound when
        this cell is executed, so re-run the cell (or pass the matrix
        explicitly) after recomputing it.
    min_score : float, optional
        Minimum `df['score']` a row needs in order to be recommended.
    top_n : int, optional
        Maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Up to `top_n` rows of `df`, most similar first.

    Raises
    ------
    KeyError
        If the normalised title is not present in `indices`.
    """
    key = title.replace(' ', '').lower()
    idx = indices[key]
    # When several rows share the same normalised title the lookup returns a
    # Series of positions; use the first so cosine_sim[idx] is a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pairwise similarity of every row with the query row, most similar first.
    # sorted() is stable, so ties keep ascending row order.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Look the score column up once instead of once per candidate
    scores = df['score'].to_numpy()

    top_recom = []
    for pos, _ in ranked:
        if len(top_recom) >= top_n:
            break
        # Exclude the query row by position. It is not guaranteed to be the
        # first entry that survives the score filter (its own score may be
        # below min_score, or another row may tie with it).
        if pos != idx and scores[pos] >= min_score:
            top_recom.append(pos)

    # Return the matching rows from the original (un-normalised) frame
    features=['type','title','director','cast','country','release_year','rating','listed_in','description','score']
    return df[features].iloc[top_recom]

In [11]:
# Example query: recommendations for 'Women Behind Bars'
get_recommendations('Women Behind Bars', cosine_sim=cosine_sim)

Unnamed: 0,type,title,director,cast,country,release_year,rating,listed_in,description,score
4494,TV Show,Making a Murderer,No Data,No Data,United States,2018,TV-14,"Crime TV Shows, Docuseries","Filmed over 10 years, this real-life thriller ...",8.6
1061,TV Show,Jeffrey Epstein: Filthy Rich,No Data,No Data,United States,2020,TV-MA,"Crime TV Shows, Docuseries",Stories from survivors fuel this docuseries ex...,7.1
2684,TV Show,The Innocence Files,No Data,No Data,United States,2020,TV-MA,"Crime TV Shows, Docuseries",The Innocence Project unravels missteps and de...,7.9
2825,TV Show,Dirty Money,No Data,No Data,United States,2020,TV-MA,"Crime TV Shows, Docuseries",From crippling payday loans to cars that cheat...,8.1
2944,TV Show,Who Killed Malcolm X?,No Data,No Data,United States,2020,TV-MA,"Crime TV Shows, Docuseries",Decades after the assassination of African Ame...,7.5
3313,TV Show,The Devil Next Door,No Data,No Data,United States,2019,TV-MA,"Crime TV Shows, Docuseries",A Cleveland grandfather is brought to trial in...,7.6
3727,TV Show,The Confession Tapes,No Data,No Data,United States,2019,TV-MA,"Crime TV Shows, Docuseries",This true crime documentary series investigate...,7.5
4980,TV Show,Wild Wild Country,No Data,No Data,United States,2018,TV-MA,"Crime TV Shows, Docuseries",When a controversial cult leader builds a utop...,8.1
1828,TV Show,Unsolved Mysteries,No Data,No Data,No Data,2020,TV-MA,"Crime TV Shows, Docuseries","Real cases of perplexing disappearances, shock...",7.6
3789,TV Show,Killer Ratings,No Data,No Data,No Data,2019,TV-MA,"Crime TV Shows, Docuseries, International TV S...",Brazilian TV personality and politician Wallac...,7.6


In [12]:
# Example query: recommendations for 'Avengers: Infinity War'
get_recommendations('Avengers: Infinity War', cosine_sim=cosine_sim)

Unnamed: 0,type,title,director,cast,country,release_year,rating,listed_in,description,score
6320,Movie,Black Panther,Ryan Coogler,"Chadwick Boseman, Michael B. Jordan, Lupita Ny...",United States,2018,PG-13,"Action & Adventure, Sci-Fi & Fantasy","T'Challa, the superpowered new leader of the h...",7.3
946,Movie,Stargate,Roland Emmerich,"Kurt Russell, James Spader, Jaye Davidson, Viv...","United States, France",1994,PG-13,"Action & Adventure, Sci-Fi & Fantasy",An Egyptologist joins a mission into the unkno...,7.1
6498,Movie,Cloverfield,Matt Reeves,"Lizzy Caplan, Jessica Lucas, T.J. Miller, Mich...",United States,2008,PG-13,"Action & Adventure, Horror Movies, Sci-Fi & Fa...",A going-away party in Manhattan is interrupted...,7.0
7431,Movie,Men in Black,Barry Sonnenfeld,"Tommy Lee Jones, Will Smith, Linda Fiorentino,...",United States,1997,PG-13,"Action & Adventure, Comedies, Sci-Fi & Fantasy",A streetwise cop teams with a veteran governme...,7.3
8391,Movie,The Lord of the Rings: The Return of the King,Peter Jackson,"Elijah Wood, Ian McKellen, Liv Tyler, Viggo Mo...","New Zealand, United States",2003,PG-13,"Action & Adventure, Sci-Fi & Fantasy",Aragorn is revealed as the heir to the ancient...,9.0
8567,Movie,Thor: Ragnarok,Taika Waititi,"Chris Hemsworth, Tom Hiddleston, Cate Blanchet...",United States,2017,PG-13,"Action & Adventure, Comedies, Sci-Fi & Fantasy",To save Asgard from a bloodthirsty goddess of ...,7.9
6167,Movie,Ant-Man and the Wasp,Peyton Reed,"Paul Rudd, Evangeline Lilly, Michael Douglas, ...",United States,2018,PG-13,"Action & Adventure, Comedies, Sci-Fi & Fantasy","Problems big and small pop up when Scott Lang,...",7.1
594,Movie,Star Trek,J.J. Abrams,"Chris Pine, Zachary Quinto, Karl Urban, Zoe Sa...","United States, Germany",2009,PG-13,"Action & Adventure, Sci-Fi & Fantasy",On their first voyage aboard the starship Ente...,7.9
5955,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,2009,PG-13,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi...",7.7
8392,Movie,The Lord of the Rings: The Two Towers,Peter Jackson,"Elijah Wood, Ian McKellen, Liv Tyler, Viggo Mo...","New Zealand, United States",2002,PG-13,"Action & Adventure, Sci-Fi & Fantasy",Frodo and Sam head to Mordor to destroy the On...,8.8
