This notebook follows the development of the algorithms for weighted scoring and a simple search engine for movies related to a chosen actor.

In [1]:
import pandas as pd
import numpy as np

In [2]:
%run ./support_functions.ipynb

In [6]:
# Load the preprocessed movie table; the path is relative to the notebook's
# working directory. Every later cell reads from this `data` frame.
data = pd.read_csv("data_preprocessed_v2.csv")

In [7]:
head(data)

Unnamed: 0,uids,titles,genres,ratings,scores,votes,lengths,directors,stars,descriptions,year
0,tt0066026,MASH,"Comedy, Drama, War",R,7.4,67665,116,Robert Altman,"Donald Sutherland, Elliott Gould, Tom Skerritt...",The staff of a Korean War field hospital use h...,1970
1,tt0065988,Little Big Man,"Adventure, Comedy, Drama",PG-13,7.6,32952,139,Arthur Penn,"Dustin Hoffman, Faye Dunaway, Chief Dan George...","Jack Crabb, looking back from extreme old age,...",1970
2,tt0066011,Love Story,"Drama, Romance",PG,6.9,31300,100,Arthur Hiller,"Ali MacGraw, Ryan O'Neal, John Marley, Ray Mil...",A boy and a girl from different backgrounds fa...,1970
3,tt0065134,Two Mules for Sister Sara,"Adventure, Romance, War",GP,7.0,24029,116,Don Siegel,"Clint Eastwood, Shirley MacLaine, Manolo Fábre...",Nun Sara (Shirley MacLaine) is on the run in M...,1970
4,tt0065421,The AristoCats,"Animation, Adventure, Comedy",G,7.1,94523,78,Wolfgang Reitherman,"Phil Harris, Eva Gabor, Sterling Holloway, Sca...","With the help of a smooth talking tomcat, a fa...",1970


In [8]:
info(data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298048 entries, 0 to 298047
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   uids          298048 non-null  object 
 1   titles        298048 non-null  object 
 2   genres        298048 non-null  object 
 3   ratings       298048 non-null  object 
 4   scores        298048 non-null  float64
 5   votes         298048 non-null  int64  
 6   lengths       298048 non-null  int64  
 7   directors     298048 non-null  object 
 8   stars         298048 non-null  object 
 9   descriptions  298048 non-null  object 
 10  year          298048 non-null  int64  
dtypes: float64(1), int64(3), object(7)
memory usage: 25.0+ MB


In [27]:
data.shape

(298048, 11)

#### Inspect available ratings

In [9]:
data.ratings.unique()

array(['R', 'PG-13', 'PG', 'GP', 'G', 'M/PG', 'Not Rated', 'NC-17', 'X',
       'None', 'M', 'Unrated', 'TV-G', 'TV-14', 'TV-MA', 'TV-PG',
       'Approved', 'TV-13', 'Passed', 'TV-Y7', 'TV-Y7-FV', 'MA-13', 'AO',
       'E10+', '(Banned)', 'Open', 'TV-Y', 'E', '12', 'MA-17', 'T', '18'],
      dtype=object)

#### Inspect available genres

In [18]:
genres = set()

In [36]:
# Each `genres` cell is a comma-separated string; split it, trim the
# whitespace around every label and fold the labels into the running set.
for genre_field in data.genres:
    genres.update(label.strip() for label in genre_field.split(","))

In [37]:
genres

{'Action',
 'Adult',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western',
 'unknown'}

In [42]:
len(genres)

27

#### Inspect available actors

In [38]:
actors = set()

In [39]:
# Walk the comma-separated `stars` strings and record every distinct,
# whitespace-trimmed name in the `actors` set.
for cast_field in data.stars:
    for member in cast_field.split(","):
        actors.add(member.strip())

In [40]:
actors

{'',
 'Menaka Peiris',
 'Anwari',
 'Nick Canterucci',
 'Alkis Giannakas',
 'Nelson Rodríguez',
 'Maria Vittoria Garlanda',
 'Karen Contreras',
 'Anita Ericsson',
 'Franciele Mazetti',
 'Mangal Chowdhury',
 'Parisa Fitz-Henley',
 'Mohammad Ali Abtahi',
 'Svetlana Janjanin',
 'Yue',
 'Bailey Gambertoglio',
 'Rubén Ochandiano',
 'Krishnam Raju',
 'Helmencia Alexander',
 'Yi Wang',
 'Antoni Ruiz',
 'Yessy Gusman',
 'Olga Schoberová',
 'Gabriela Benacková',
 'Yanet Cuevas',
 'Leon Samuel Kilian',
 'Mehmet Birkiye',
 'Kim Haugan Andersen',
 'Mona Hofland',
 'Bogdan Magomedov',
 'Amber Styles',
 'Shannon Barry',
 'Tony Ramos Wright',
 'Chingsum Luk',
 'Rainer-Maria Pfeffer',
 'Martin Compston',
 'Anton Grobler',
 'Russell Dykstra',
 'Enrique Imperio',
 'Robert Braxton',
 'Hongisto',
 'Eugenio Lopez Matos',
 'Deborah Wakeham',
 'Thomas Kerr',
 'Johannes Freitag',
 'Christine Hébert',
 'Justin Bilancieri',
 'Zarina Tadjibaeva',
 'Oxmo Puccino',
 'Achmed Xussein',
 'Sofia Vigliar',
 'Kyoji Yamag

In [41]:
len(actors)

407619

Pull all movies featuring one chosen actor

In [144]:
name = "Jonathan Majors"

In [145]:
# Rows whose `stars` string contains `name`, renumbered from 0.
# Note: str.contains interprets `name` as a regular expression by default.
df = data[data.stars.str.contains(name)].reset_index(drop=True)
df

Unnamed: 0,uids,titles,genres,ratings,scores,votes,lengths,directors,stars,descriptions,year
0,tt5968394,Captive State,"Action, Horror, Sci-Fi",PG-13,6.0,48959,109,Rupert Wyatt,"John Goodman, Ashton Sanders, Jonathan Majors,...",Set in a Chicago neighborhood nearly a decade ...,2019
1,tt4353250,The Last Black Man in San Francisco,Drama,R,7.3,14764,121,Joe Talbot,"Jimmie Fails, Jonathan Majors, Rob Morgan, Tic...",A young man searches for home in the changing ...,2019
2,tt5013984,Gully,"Crime, Drama",,1.5,981,90,Nabil Elderkin,"Amber Heard, John Corbett, Terrence Howard, Jo...","A slightly dystopian vision of LA, we follow t...",2019
3,tt9777644,Da 5 Bloods,"Adventure, Drama, War",R,6.5,40553,154,Spike Lee,"Delroy Lindo, Jonathan Majors, Clarke Peters, ...",Four African-American vets battle the forces o...,2020


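Explore weighted averages for movies pertaining to a chosen actor

Each film's weighted score is $WR = \frac{v}{v+m}R + \frac{m}{v+m}C$, where $R$ is the film's average score, $v$ its number of votes, $m$ the minimum-votes threshold (`min_voters`), and $C$ the mean score of the actor's films that have more than $m$ votes. Films with few votes are therefore pulled towards $C$, while films with many votes keep a score close to their own average.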

In [204]:
def weighted_scoring(df, min_voters=1000):
    """Rank movies by a vote-weighted score.

    Every film's own average score is blended with the mean score of the
    group, weighted by how many votes the film received:

        WR = (v * R + m * C) / (v + m)

    where ``R`` is the film's ``scores`` value, ``v`` its ``votes`` value,
    ``m`` is ``min_voters`` and ``C`` is the mean ``scores`` of the films in
    ``df`` that have more than ``min_voters`` votes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric ``scores`` and ``votes`` columns.
    min_voters : int, default 1000
        Vote threshold ``m``; it selects the films used for the group mean
        and sets the weight given to that mean.

    Returns
    -------
    pandas.DataFrame
        A new frame with an added ``Weighted_score`` column, sorted by that
        column in descending order. The original row labels are kept and
        the input frame is not modified.
    """
    scores = df["scores"].astype(float)
    votes = df["votes"].astype(float)

    # Group mean over the films that clear the vote threshold.
    cluster_score = scores[votes > min_voters].mean()
    if np.isnan(cluster_score):
        # No film clears the threshold: fall back to the plain mean so the
        # result is not an all-NaN column that cannot be ranked.
        cluster_score = scores.mean()

    common_denominator = votes + min_voters
    weighted = (votes * scores + cluster_score * min_voters) / common_denominator

    # assign() builds a new frame, so the caller's `df` is left untouched.
    return df.assign(Weighted_score=weighted).sort_values(
        by="Weighted_score", ascending=False
    )

In [151]:
dfnew = weighted_scoring(df)
dfnew

Unnamed: 0,Film,Genre,Rating,Runtime,Director,Also starring,Description,Year,Weighted_score
1,The Last Black Man in San Francisco,Drama,R,121,Joe Talbot,"Jimmie Fails, Jonathan Majors, Rob Morgan, Tic...",A young man searches for home in the changing ...,2019,7.255595
3,Da 5 Bloods,"Adventure, Drama, War",R,154,Spike Lee,"Delroy Lindo, Jonathan Majors, Clarke Peters, ...",Four African-American vets battle the forces o...,2020,6.502407
0,Captive State,"Action, Horror, Sci-Fi",PG-13,109,Rupert Wyatt,"John Goodman, Ashton Sanders, Jonathan Majors,...",Set in a Chicago neighborhood nearly a decade ...,2019,6.01201
2,Gully,"Crime, Drama",,90,Nabil Elderkin,"Amber Heard, John Corbett, Terrence Howard, Jo...","A slightly dystopian vision of LA, we follow t...",2019,4.074457


In [218]:
def reccommend_byActor(name):
    """Return the movies featuring ``name``, ranked by weighted score.

    Reads the module-level ``data`` frame, keeps the rows whose ``stars``
    string contains ``name`` (literal, case-insensitive match), scores them
    with ``weighted_scoring`` and returns a presentation-ready table.

    Parameters
    ----------
    name : str
        Actor name in any letter case, e.g. ``"jonathan majors"``.

    Returns
    -------
    pandas.DataFrame
        One row per matching movie, best first, with the columns ``Rank``,
        ``Film``, ``Genre``, ``Rating``, ``Runtime``, ``Director``,
        ``Also starring``, ``Description``, ``Year`` and ``Score``.
        ``Also starring`` lists the cast without the searched actor.

    Raises
    ------
    ValueError
        If ``name`` is empty or only whitespace.
    """
    query = name.strip()
    if not query:
        raise ValueError("name must be a non-empty actor name")

    display_name = query.title()
    needle = query.casefold()

    # Literal (regex=False) so characters such as "." or "(" in a name are
    # not interpreted as a pattern; case-insensitive so names that
    # str.title() would mangle (e.g. "LeVar", "McDonald") are still found.
    is_match = data.stars.str.contains(query, case=False, regex=False, na=False)
    matches = data.loc[is_match].reset_index(drop=True)

    # obtain sorted, weighted scores
    ranked = weighted_scoring(matches)

    # drop extraneous columns
    ranked = ranked.drop(["uids", "scores", "votes"], axis=1)

    def other_stars(stars):
        # The searched actor is known to be in every returned movie, so only
        # the remaining cast is shown. Filtering, unlike list.remove, does
        # not raise when the substring match was not an exact cast entry.
        cast = (member.strip() for member in stars.split(","))
        return ", ".join(m for m in cast if m and m.casefold() != needle)

    ranked["stars"] = ranked["stars"].map(other_stars)

    # Renumber in ranked order and expose the 1-based position as "Rank".
    ranked = ranked.reset_index(drop=True)
    ranked.insert(0, "Rank", ranked.index + 1)

    mapper = {"titles": "Film", "genres": "Genre", "ratings": "Rating",
              "lengths": "Runtime", "directors": "Director", "stars": "Also starring",
              "descriptions": "Description", "year": "Year", "Weighted_score": "Score"}

    ranked = ranked.rename(columns=mapper)

    print(f"Top Recommended Movies with {display_name}")
    return ranked

In [219]:
name = "JonatHAn majors"

In [220]:
df = reccommend_byActor(name)
df

Top Recommended Movies with Jonathan Majors


Unnamed: 0,Rank,Film,Genre,Rating,Runtime,Director,Also starring,Description,Year,Score
0,1,The Last Black Man in San Francisco,Drama,R,121,Joe Talbot,"Jimmie Fails, Rob Morgan, Tichina Arnold",A young man searches for home in the changing ...,2019,7.255595
1,2,Da 5 Bloods,"Adventure, Drama, War",R,154,Spike Lee,"Delroy Lindo, Clarke Peters, Norm Lewis",Four African-American vets battle the forces o...,2020,6.502407
2,3,Captive State,"Action, Horror, Sci-Fi",PG-13,109,Rupert Wyatt,"John Goodman, Ashton Sanders, Vera Farmiga",Set in a Chicago neighborhood nearly a decade ...,2019,6.01201
3,4,Gully,"Crime, Drama",,90,Nabil Elderkin,"Amber Heard, John Corbett, Terrence Howard","A slightly dystopian vision of LA, we follow t...",2019,4.074457


In [217]:
# 1-based rank derived from the frame's current positional index.
df["Rank"] = (df.index + 1).tolist()

In [202]:
# For every row, print the cast list with the searched actor taken out.
for row_label in range(len(df)):
    n = name.title()  # title-cased form of the searched name, as stored in the cast strings
    cast = df.loc[row_label, "Also starring"].replace(", ", ",").split(",")
    cast.remove(n)
    print(", ".join(cast))

John Goodman, Ashton Sanders, Vera Farmiga
Jimmie Fails, Rob Morgan, Tichina Arnold
Amber Heard, John Corbett, Terrence Howard
Delroy Lindo, Clarke Peters, Norm Lewis


In [188]:
name

'JonatHAn majors'

In [169]:
df.loc[0].Description

"Set in a Chicago neighborhood nearly a decade after an occupation by an extraterrestrial force, 'Captive State' explores the lives on both sides of the conflict - the collaborators and dissidents."