# Nicole's code starts here

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw IMDb export into a dataframe.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so no column ends up with mixed types.
# NOTE(review): the saved output of this cell ends with the tail of a warning,
# presumably pandas' mixed-type DtypeWarning -- confirm it is gone after this change.
movies = pd.read_csv("IMDb movies.csv", low_memory=False)
print(movies.shape)
movies.head(1)

(85854, 22)


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt11777308,Murder Death Koreatown,Murder Death Koreatown,2020,3/21/2020,"Crime, Horror, Mystery",80,USA,English,,...,,"After his neighbor's real life murder, an unem...",5.1,140,$0,,,,4.0,36.0


In [3]:
# Keep only the movies that report every field the analysis depends on
required_columns = ['budget', 'worlwide_gross_income', 'metascore']
movies = movies.dropna(subset=required_columns)

In [4]:
# Strip the dollar sign and the thousands separators from the money columns.
# The dollar pattern is a raw string so the backslash reaches the regex engine
# untouched; in a normal string literal '\$' is an invalid escape sequence,
# which newer Python versions warn about.
for money_column in ('budget', 'worlwide_gross_income'):
    movies[money_column] = movies[money_column].replace({r'\$': '', ',': ''}, regex=True)

In [5]:
# Budgets quoted in a non-US currency still contain text at this point, so they
# fail numeric parsing; keep only the rows whose budget parses as a number.
budget_as_number = pd.to_numeric(movies['budget'], errors='coerce')
movies = movies[budget_as_number.notna()]

In [6]:
# Parse both money columns as floating point numbers
movies = movies.astype({'budget': float, 'worlwide_gross_income': float})

In [7]:
# Round the worldwide gross to two decimal places
movies['worlwide_gross_income'] = movies['worlwide_gross_income'].round(2)

In [8]:
# Revenue % column: (worldwide gross - budget) / budget * 100, to two decimals
profit = movies['worlwide_gross_income'] - movies['budget']
movies['revenue_percent'] = (profit / movies['budget'] * 100).round(2)

In [9]:
# Discard the columns the analysis does not use, then index the frame by IMDb title id
unused_columns = ['title', 'date_published', 'description', 'usa_gross_income',
                  'reviews_from_users', 'reviews_from_critics']
movies = movies.drop(columns=unused_columns).set_index('imdb_title_id')

In [10]:
# 'budget' and 'worlwide_gross_income' were deliberately left out of the drop list above,
# in case the model can be trained to predict revenues.

In [11]:
# Sanity check after cleaning: print the (rows, columns) shape, then preview the first rows.
# The head() call stays last so the notebook renders it as the cell output.
print(movies.shape)
movies.head()

(6379, 16)


Unnamed: 0_level_0,original_title,year,genre,duration,country,language,director,writer,production_company,actors,avg_vote,votes,budget,worlwide_gross_income,metascore,revenue_percent
imdb_title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
tt6793280,The Devil's Doorway,2018,Horror,76,"Ireland, UK",English,Aislinn Clarke,"Martin Brennan, Aislinn Clarke",23ten,"Lalor Roddy, Ciaran Flynn, Helena Bereen, Laur...",5.2,2571,1.0,516660.0,48.0,51665900.0
tt0457452,The Reception,2005,"Comedy, Drama, Romance",80,USA,English,John G. Young,John G. Young,Black Water Films,"Maggie Burkwit, Chris Burmester, Darien Sills-...",5.8,183,5000.0,18389.0,64.0,267.78
tt0154506,Following,1998,"Crime, Mystery, Thriller",69,UK,English,Christopher Nolan,Christopher Nolan,Next Wave Films,"Jeremy Theobald, Alex Haw, Lucy Russell, John ...",7.5,84550,6000.0,48482.0,60.0,708.03
tt0104815,El mariachi,1992,"Action, Crime, Thriller",81,"Mexico, USA",Spanish,Robert Rodriguez,Robert Rodriguez,Columbia Pictures,"Carlos Gallardo, Consuelo Gómez, Jaime de Hoyo...",6.9,62362,7000.0,2040920.0,73.0,29056.0
tt0390384,Primer,2004,"Drama, Sci-Fi, Thriller",77,USA,"English, French",Shane Carruth,Shane Carruth,ERBP,"Shane Carruth, David Sullivan, Casey Gooden, A...",6.9,96229,7000.0,545436.0,68.0,7691.94


# Nicole's code ends here

# The following code was provided by instructor Dom Labella to help translate our data into a usable format.

In [12]:
# Cap on how many actors are counted for each movie; names listed after this
# position (the lesser-known actors) are ignored to keep the dataframe manageable.
MAX_ACTORS_PER_MOVIE = 3

# Tally of actors, starting out empty
actors_dictionary = dict()

In [13]:
# Tally how often each actor is credited, movie by movie
for cast_text in movies['actors']:

    # only the first MAX_ACTORS_PER_MOVIE comma-separated names are counted
    for raw_name in cast_text.split(',')[:MAX_ACTORS_PER_MOVIE]:

        # drop the whitespace left around the name by the split
        actor_name = raw_name.strip()

        # an actor seen for the first time starts from zero
        actors_dictionary[actor_name] = actors_dictionary.get(actor_name, 0) + 1
            

In [14]:
# Report how many distinct actors were collected
total_actors = len(actors_dictionary.keys())
print(f"Found a total of {total_actors} actors")

Found a total of 7809 actors


In [15]:
# Add one indicator column per actor found, initialised to 0.
#
# All the columns are built as a single block and attached with one concat.
# Inserting thousands of columns one at a time fragments the dataframe, which
# newer pandas versions report with a PerformanceWarning.
actor_columns = list(actors_dictionary)
if actor_columns:
    actor_flags = pd.DataFrame(0.0, index=movies.index, columns=actor_columns)
    # Any column already carrying one of these names is dropped first, so the
    # name ends up on exactly one zero-filled column (e.g. when the cell is re-run).
    movies = pd.concat(
        [movies.drop(columns=actor_columns, errors='ignore'), actor_flags],
        axis=1,
    )

In [16]:
# Mark, for every movie, the actors who are credited in it.
# The (movie id, actor list) pairs are collected up front so the loop never
# reads from the dataframe while it is writing to it.
for movie_id, cast_text in list(movies['actors'].items()):

    # only the first MAX_ACTORS_PER_MOVIE comma-separated names are considered
    for raw_name in cast_text.split(',')[:MAX_ACTORS_PER_MOVIE]:

        # the stripped name is the label of that actor's column
        movies.loc[movie_id, raw_name.strip()] = 1
        

In [17]:
# Debugging check: value counts of the indicator column for one particular actor.
# In the saved run Tom Cruise is flagged in 35 of the movies and not flagged in the other 6344.
# Only the first MAX_ACTORS_PER_MOVIE names of each movie are flagged, so later-listed
# appearances are not included in this count.
movies['Tom Cruise'].value_counts()

0.0    6344
1.0      35
Name: Tom Cruise, dtype: int64

In [18]:
# Same debugging check for the 'Carrie Fisher' column
movies['Carrie Fisher'].value_counts()

0.0    6369
1.0      10
Name: Carrie Fisher, dtype: int64

# Dom's code ends here

# Marianne's Code starts here

In [19]:
# Rank the actors from most to fewest counted movies; this ordering feeds the
# actor drop-down on the website. Negating the count sorts in descending order
# while actors with equal counts keep their original relative order.
ranked_actor_counts = sorted(actors_dictionary.items(), key=lambda pair: -pair[1])
top_actors = dict(ranked_actor_counts)

In [20]:
# top_actors

In [21]:
# Tabulate the ranked actors as a two-column dataframe and preview the top rows
ranked_actor_rows = list(top_actors.items())
actor_df = pd.DataFrame(ranked_actor_rows, columns=['actor', 'total movies'])

actor_df.head()

Unnamed: 0,actor,total movies
0,Robert De Niro,57
1,Nicolas Cage,56
2,Bruce Willis,48
3,Samuel L. Jackson,41
4,Clint Eastwood,40


In [22]:
# 'success' is the label used to train the model / predict the outcome of a movie.
# Every movie starts out as 0, meaning 'not successful'.
movies = movies.assign(success=0)


In [23]:
# Flag a movie as successful (1) when its revenue_percent is at least 200.
# The 200% threshold is based on this article:
# https://io9.gizmodo.com/how-much-money-does-a-movie-need-to-make-to-be-profitab-5747305
#
# One boolean-mask assignment replaces the row-by-row loop. It avoids building a
# full-width row object for every movie, and it flags exactly the rows that meet
# the threshold rather than every row that shares an index label with one.
# Rows below the threshold (or with a missing revenue_percent) are left untouched.
is_successful = movies['revenue_percent'] >= 200
movies.loc[is_successful, 'success'] = 1


In [24]:
# Tally of genres, starting out empty
genre_dictionary = dict()

In [25]:
# Tally how many movies are tagged with each genre
for genre_text in movies['genre']:

    # a movie can list several comma-separated genres
    for raw_genre in genre_text.split(','):

        # drop the whitespace left around the name by the split
        genre_name = raw_genre.strip()

        # a genre seen for the first time starts from zero
        genre_dictionary[genre_name] = genre_dictionary.get(genre_name, 0) + 1

In [26]:
# Report how many distinct genres were collected
total_genres = len(genre_dictionary.keys())
print(f"Found a total of {total_genres} genres")

Found a total of 21 genres


In [27]:
# Add one indicator column per genre, filled with 0.0
for genre_name in genre_dictionary:
    movies[genre_name] = 0.0
    

In [28]:
# Set the indicator to 1 for every genre a movie is tagged with.
# The (movie id, genre list) pairs are collected up front so the loop never
# reads from the dataframe while it is writing to it.
for movie_id, genre_text in list(movies['genre'].items()):

    # a movie can list several comma-separated genres
    for raw_genre in genre_text.split(','):

        # the stripped name is the label of that genre's column
        movies.loc[movie_id, raw_genre.strip()] = 1

In [29]:
# Debugging check: how many movies are flagged (1.0) / not flagged (0.0) as 'Comedy'
movies['Comedy'].value_counts()

0.0    3947
1.0    2432
Name: Comedy, dtype: int64

In [30]:
# Same debugging check for the 'Action' column
movies['Action'].value_counts()

0.0    4803
1.0    1576
Name: Action, dtype: int64

In [31]:
# Tally of directors, starting out empty
director_dictionary = dict()

In [32]:
# Same tally as for actors and genres, this time for directors
for director_text in movies['director']:

    # a movie can list several comma-separated directors; all of them are counted
    for raw_director in director_text.split(','):

        # drop the whitespace left around the name by the split
        director_name = raw_director.strip()

        # a director seen for the first time starts from zero
        director_dictionary[director_name] = director_dictionary.get(director_name, 0) + 1

In [33]:
# Report how many distinct directors were collected
total_director = len(director_dictionary.keys())
print(f"Found a total of {total_director} directors")

Found a total of 2917 directors


In [34]:
# Add one indicator column per director found, initialised to 0.
#
# A director whose name already labels a column (for example someone who is also
# one of the counted actors) keeps the existing column: assigning zeros to it
# again would erase the flags already stored under that name.
# NOTE(review): such a shared column then mixes acting and directing credits.
# Giving director columns their own prefix would keep them apart, but it renames
# columns that other code may rely on -- confirm before changing.
#
# The missing columns are built as a single block and attached with one concat
# rather than inserted one at a time, which would fragment the dataframe.
new_director_columns = [
    director_name for director_name in director_dictionary
    if director_name not in movies.columns
]
if new_director_columns:
    director_flags = pd.DataFrame(0.0, index=movies.index, columns=new_director_columns)
    movies = pd.concat([movies, director_flags], axis=1)

In [35]:
# Mark, for every movie, the directors credited with it.
# The (movie id, director list) pairs are collected up front so the loop never
# reads from the dataframe while it is writing to it.
for movie_id, director_text in list(movies['director'].items()):

    # a movie can list several comma-separated directors; all of them are flagged
    for raw_director in director_text.split(','):

        # the stripped name is the label of that director's column
        movies.loc[movie_id, raw_director.strip()] = 1
        

In [36]:
# Debugging check: frequency of the 0/1 flags in the 'Mel Brooks' column
movies['Mel Brooks'].value_counts()

0.0    6370
1.0       9
Name: Mel Brooks, dtype: int64

In [37]:
# Same debugging check for the 'Steven Spielberg' column
movies['Steven Spielberg'].value_counts()

0.0    6348
1.0      31
Name: Steven Spielberg, dtype: int64

# Marianne's code ends here

In [38]:
# Save the cleaned, one-hot encoded dataframe to file.
# NOTE(review): index=False leaves the index out of the CSV, and an earlier cell made
# 'imdb_title_id' the index -- so the saved file carries no movie identifier column.
# Confirm that this is intended.
movies.to_csv('moviesClean.csv', index=False)