# Nicole's code starts here

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read in csv
# Loads "IMDb movies.csv" from the working directory into the `movies`
# dataframe, then shows its (rows, columns) shape and its first row as a
# quick sanity check.
movies = pd.read_csv("IMDb movies.csv")
print(movies.shape)
movies.head(1)

In [None]:
# Keep only the movies that have a value in every column the analysis
# depends on: budget, worldwide gross income and metascore.
movies = movies.dropna(subset=['budget', 'worlwide_gross_income', 'metascore'])

In [None]:
# Remove "$" and "," from the budget/income columns so the remaining text
# can be parsed as a number.
# The pattern is a raw string: in an ordinary string literal "\$" is an
# invalid escape sequence, which newer Python versions warn about.
for money_column in ('budget', 'worlwide_gross_income'):
    movies[money_column] = movies[money_column].replace(r'[$,]', '', regex=True)

In [None]:
# Remove rows whose currency values are not plain numbers. Amounts in other
# currencies still contain text (a currency code) once "$" and "," are gone,
# so pd.to_numeric(..., errors='coerce') turns them into NaN.
# Both currency columns are checked: the float conversion that follows is
# applied to budget AND worldwide gross income, and astype(float) raises on
# any value that is not numeric.
movies = movies[
    pd.to_numeric(movies['budget'], errors='coerce').notna()
    & pd.to_numeric(movies['worlwide_gross_income'], errors='coerce').notna()
]

In [None]:
# Cast both currency columns from cleaned-up text to float in one call.
movies = movies.astype({'budget': float, 'worlwide_gross_income': float})

In [None]:
# Round worldwide gross income to two decimal places.
movies['worlwide_gross_income'] = movies['worlwide_gross_income'].round(2)

In [None]:
# Revenue %: profit (worldwide gross income minus budget) expressed as a
# percentage of the budget, rounded to two decimals.
# A budget of 0 produces inf (or NaN when the income is 0 as well).
movies['revenue_percent'] = (
    movies['worlwide_gross_income']
    .sub(movies['budget'])
    .div(movies['budget'])
    .mul(100)
    .round(2)
)

In [None]:
# Drop the columns that are not used any further and make the IMDb title id
# the row index.
movies = (
    movies
    .drop(columns=['title', 'date_published', 'description', 'usa_gross_income',
                   'reviews_from_users', 'reviews_from_critics'])
    .set_index('imdb_title_id')
)

In [None]:
# 'budget' and 'worlwide_gross_income' were deliberately left out of the list of
# dropped columns above, in case a model can be trained to predict revenues.

In [None]:
# Sanity check: (rows, columns) shape and the first rows of the cleaned,
# re-indexed dataframe.
print(movies.shape)
movies.head()

# Nicole's code ends here

# The following code provided by instructor Dom Labella to assist with translating our data into a useable format.

In [None]:
# Only the first few actors listed for a movie are counted; the
# lesser-known ones are ignored to keep the dataframe manageable.
MAX_ACTORS_PER_MOVIE = 3

# Maps actor name -> number of movies the actor was counted in.
# Starts out empty.
actors_dictionary = dict()

In [None]:
# Tally, per actor, the number of movies in which the actor is among the
# first MAX_ACTORS_PER_MOVIE names listed.
# Only the 'actors' column is needed, so walk down that column directly.
for cast in movies['actors']:

    # the cast is a single comma-separated string; keep the leading names
    # and trim the whitespace around each one
    for name in (raw.strip() for raw in cast.split(',')[:MAX_ACTORS_PER_MOVIE]):

        # dict.get supplies 0 the first time an actor is seen, so new and
        # already-known actors are handled by the same statement
        actors_dictionary[name] = actors_dictionary.get(name, 0) + 1
            

In [None]:
# Count the total number of actors found
# (the number of distinct names, i.e. keys, in actors_dictionary)
total_actors = len(actors_dictionary)
print(f"Found a total of {total_actors} actors")

In [None]:
# Add one new column to the dataframe for each actor found, initialised
# with 0s (float, as np.zeros would produce).
# All columns are built as one block and attached with a single concat:
# inserting them one at a time fragments the dataframe (pandas emits a
# PerformanceWarning) and gets slower with every column added.
# Names that already exist as a column are skipped so that no existing
# data is overwritten.
new_actor_columns = [actor for actor in actors_dictionary
                     if actor not in movies.columns]
movies = pd.concat(
    [movies, pd.DataFrame(0.0, index=movies.index, columns=new_actor_columns)],
    axis=1,
)

In [None]:
# Walk down the dataframe, movie by movie, and flag the actors that appear
# in each one.
# Iterating the 'actors' column as (index label, cast string) pairs avoids
# iterrows(), which would build a Series spanning every column of the
# frame - one per actor by now - for each movie just to read one field.
for index, cast in movies['actors'].items():

    # for each actor in the list (up to the maximum desired)
    for name in cast.split(',')[:MAX_ACTORS_PER_MOVIE]:

        # trim whitespace so the name matches its column label, then
        # indicate that the actor starred in this movie
        movies.loc[index, name.strip()] = 1
        

In [None]:
# This cell checks the column for a particular actor, just for debugging purposes.
# value_counts() shows how many movies have the flag set (1) and how many do not (0).
# In the original run Tom Cruise appeared in 35 of the movies and not in the other 6344.
movies['Tom Cruise'].value_counts()

In [None]:
# Same debugging check for another actor: number of movies with / without the flag.
movies['Carrie Fisher'].value_counts()

# Dom's code ends here

# Marianne's Code starts here

In [None]:
# Order the actors from most to fewest movies; this ranking feeds the
# drop-down list on the website. Actors with the same count keep the order
# in which they were first encountered (the sort is stable).
top_actors = {
    name: actors_dictionary[name]
    for name in sorted(actors_dictionary, key=actors_dictionary.get, reverse=True)
}

In [None]:
# top_actors

In [None]:
# Two-column table (actor name, number of movies) in ranked order.
actor_df = pd.DataFrame({
    'actor': list(top_actors.keys()),
    'total movies': list(top_actors.values()),
})

actor_df.head()

In [None]:
# Save to file
# Writes the ranked actor table to 'top_actors_for_drop_down.csv' in the
# working directory; index=False leaves the row numbers out of the file.
actor_df.to_csv('top_actors_for_drop_down.csv', index=False)

In [None]:
# Create the 'success' column that will be used to train the model / predict
# the outcome of a movie. Every movie starts at 0, meaning 'not successful'.
movies = movies.assign(success=0)


In [None]:
# Mark a movie as successful (success = 1) when its revenue_percent is at
# least 200. One boolean-mask assignment does this for all rows at once
# instead of looping over the dataframe row by row.
movies.loc[movies['revenue_percent'] >= 200, 'success'] = 1

# success based on 200% revenue based on this article:
# https://io9.gizmodo.com/how-much-money-does-a-movie-need-to-make-to-be-profitab-5747305
# NOTE(review): revenue_percent is (gross - budget) / budget * 100, so a value
# of 200 means the gross is three times the budget - confirm that this is the
# intended threshold.


In [None]:
# Maps genre name -> number of movies tagged with that genre.
# Starts out empty.
genre_dictionary = dict()

In [None]:
# Tally how many movies carry each genre.
# Only the 'genre' column is read, so iterate over it directly: iterrows()
# would build a Series spanning every column of the frame (which by now
# includes one column per actor) for each movie.
for genre_list in movies['genre']:

    # the genres of a movie are one comma-separated string
    for genre in genre_list.split(','):

        # remove whitespace from the name
        genre = genre.strip()

        # dict.get supplies 0 the first time a genre is seen
        genre_dictionary[genre] = genre_dictionary.get(genre, 0) + 1

In [None]:
# count the total number of genres found
# (the number of distinct names, i.e. keys, in genre_dictionary)
total_genres = len(genre_dictionary)
print(f"Found a total of {total_genres} genres")

In [None]:
# Create a column for each genre, initialised with 0s (float, as np.zeros
# would produce).
# The columns are built as one block and attached with a single concat;
# inserting them one by one fragments the dataframe and triggers pandas'
# PerformanceWarning. Names that already exist as a column are skipped so
# that no existing data is overwritten.
new_genre_columns = [genre for genre in genre_dictionary
                     if genre not in movies.columns]
movies = pd.concat(
    [movies, pd.DataFrame(0.0, index=movies.index, columns=new_genre_columns)],
    axis=1,
)
    

In [None]:
# Fill in the genre columns: for every movie, set the column of each of its
# genres to 1.
# Iterating the 'genre' column as (index label, genre string) pairs avoids
# iterrows(), which would build a Series spanning every column of the very
# wide frame for each movie.
for index, genre_list in movies['genre'].items():

    # for each genre in the list
    for genre in genre_list.split(','):

        # trim whitespace so the name matches its column label
        movies.loc[index, genre.strip()] = 1

In [None]:
# Debugging check: number of movies with (1) and without (0) the Comedy genre.
movies['Comedy'].value_counts()

In [None]:
# Debugging check: number of movies with (1) and without (0) the Action genre.
movies['Action'].value_counts()

In [None]:
# Maps director name -> number of movies directed. Starts out empty.
director_dictionary = dict()

In [None]:
# Repeat the actor and genre counting for directors: tally how many movies
# each director is credited on.
# Only the 'director' column is read, so iterate over it directly: iterrows()
# would build a Series spanning every column of the frame (one per actor and
# genre by now) for each movie.
for director_list in movies['director']:

    # co-directors are listed in one comma-separated string
    for director in director_list.split(','):

        # remove any whitespace
        director = director.strip()

        # dict.get supplies 0 the first time a director is seen
        director_dictionary[director] = director_dictionary.get(director, 0) + 1

In [None]:
# Count the total number of directors found
# (the number of distinct names, i.e. keys, in director_dictionary)
total_director = len(director_dictionary)
print(f"Found a total of {total_director} directors")

In [None]:
# Add one new column to the dataframe for each director found, initialised
# with 0s (float, as np.zeros would produce).
# Actor columns are keyed by bare name too, so a director who is also a
# top-billed actor already has a column. Re-initialising that column would
# wipe the flags recorded for the movies the person acted in, so only names
# without a column get a new one; a shared column then means "acted in or
# directed".
# The new columns are attached with a single concat, because inserting them
# one at a time fragments the dataframe (pandas PerformanceWarning).
new_director_columns = [director for director in director_dictionary
                        if director not in movies.columns]
movies = pd.concat(
    [movies, pd.DataFrame(0.0, index=movies.index, columns=new_director_columns)],
    axis=1,
)

In [None]:
# Walk down the dataframe, movie by movie, and flag the director(s) of
# each one.
# Iterating the 'director' column as (index label, director string) pairs
# avoids iterrows(), which would build a Series spanning every column of the
# very wide frame for each movie.
for index, director_list in movies['director'].items():

    # for each director in the list (all of them; there is no cap as there
    # is for actors)
    for director in director_list.split(','):

        # trim whitespace so the name matches its column label, then
        # indicate that this person directed the movie
        movies.loc[index, director.strip()] = 1
        

In [None]:
# Debugging check: number of movies with (1) and without (0) the flag in the
# 'Mel Brooks' column. Actor and director columns are both keyed by bare name.
movies['Mel Brooks'].value_counts()

In [None]:
# Same debugging check for the 'Steven Spielberg' column.
movies['Steven Spielberg'].value_counts()

# Marianne's code ends here

In [None]:
# Save to file
# Writes the fully encoded dataframe to 'moviesClean.csv' in the working directory.
# NOTE(review): imdb_title_id was made the index earlier, so index=False leaves
# the title ids out of the saved file - confirm that this is intended.
movies.to_csv('moviesClean.csv', index=False)