## Final Project Submission

Please fill out:
* Student name: 
* Student pace: self paced / part time / full time
* Scheduled project review date/time: 
* Instructor name: 
* Blog post URL:


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import json
import re

In [None]:
# Rotten Tomatoes movie info: gzip-compressed, tab-separated file.
rt_movie_info = pd.read_csv(
    './zippedData/rt.movie_info.tsv.gz',
    sep='\t',
)
# Kaggle movie production data, decoded as Latin-1.
kaggle_movies = pd.read_csv(
    './kaggleData/movie_production.csv',
    encoding='latin1',
)

# Rotten Tomatoes

In [None]:
def clean_rt(rt_movie_info):
    """

    Takes in rt_movie_info and returns a cleaned dataframe.

    Rows whose 'box_office' value is missing are dropped, then every
    non-digit character is stripped from the remaining 'box_office'
    strings and the column is converted to int64.

    Note: the dataframe passed in is modified in place; the same object
    is returned for convenience.

    Parameters
    ----------
    rt_movie_info : pandas.DataFrame
        Must contain a string-valued 'box_office' column.

    Returns
    -------
    pandas.DataFrame
        The same dataframe, with an integer 'box_office' column.

    Raises
    ------
    ValueError
        If a 'box_office' value contains no digits at all (the stripped
        result cannot be converted to an integer).

    """
    rt_movie_info.dropna(subset=['box_office'], inplace=True)

    for col in ['box_office']:
        # regex=True must be explicit: since pandas 2.0 str.replace treats
        # the pattern as a literal string by default, which would leave the
        # separators in place and make the int64 conversion fail.
        rt_movie_info[col] = rt_movie_info[col].str.replace(r'\D', '', regex=True)
        rt_movie_info[col] = rt_movie_info[col].astype('int64')

    return rt_movie_info





def top_10pct_rt_directors(rt_movie_info, top_n=27):
    """

    Takes in a cleaned rt_movie_info and returns a series of the top
    grossing movie directors, ranked by mean 'box_office'.

    Parameters
    ----------
    rt_movie_info : pandas.DataFrame
        Must contain 'director' and numeric 'box_office' columns.
    top_n : int, optional
        How many directors to keep. Defaults to 27, the cut-off the
        original analysis used for its "top 10%" of directors.

    Returns
    -------
    pandas.Series
        Mean box office indexed by director, sorted in descending order
        and truncated to the first top_n entries.

    """
    mean_gross_by_director = rt_movie_info.groupby(['director'])['box_office'].mean()
    top_10pct_directors = mean_gross_by_director.sort_values(ascending=False).head(top_n)

    return top_10pct_directors





def plot_top_10pct_rt_directors(top_10pct_directors):
    """

    Takes in output of top_10pct_rt_directors and displays the series as a barplot.

    Parameters
    ----------
    top_10pct_directors : pandas.Series
        Values to plot as bar heights; the index supplies the x-axis
        categories (director names).

    Returns
    -------
    None
        The result of plt.show(); the figure is displayed as a side effect.

    """
    plt.figure(figsize=(15, 6))
    plt.title('Top 10% Average Grossing Movie Directors (Rotten Tomatoes)')
    # x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
    # as positional arguments.
    directors_plot = sns.barplot(x=top_10pct_directors.index,
                                 y=top_10pct_directors.values)
    # Rotate the director names so long labels do not overlap.
    directors_plot.set_xticklabels(directors_plot.get_xticklabels(), rotation=80)
    plt.xlabel('Movie Directors')
    plt.ylabel('Average Gross Return (Box Office)')
    return plt.show()




# Kaggle

In [None]:
def create_kaggle_profit_column(kaggle_movies):
    """

    Adds a 'profit' column (gross minus budget) to the kaggle_movies df.

    The dataframe is modified in place and the same object is returned.

    """
    gross = kaggle_movies['gross']
    budget = kaggle_movies['budget']
    kaggle_movies['profit'] = gross - budget
    return kaggle_movies





def kaggle_studio_vs_profit(dataframe):
    """

    Takes kaggle_movies df (with a 'profit' column), averages 'profit'
    per 'company', and returns the 20 companies with the highest mean
    profit as a series sorted in descending order.

    """
    mean_profit_by_company = dataframe.groupby(['company'])['profit'].mean()
    ranked_companies = mean_profit_by_company.sort_values(ascending=False)
    return ranked_companies.head(20)




def kaggle_studio_vs_profit_barplot(kaggle_studio_vs_profit):
    """

    Displays the output of kaggle_studio_vs_profit as a barplot: one bar
    per studio (the series index), bar height taken from the series values.

    """
    studio_names = kaggle_studio_vs_profit.index
    mean_profits = kaggle_studio_vs_profit.values

    plt.figure(figsize=(20, 6))
    plt.title('Top 20 Most Profitable Movie Studios')

    bar_axes = sns.barplot(x=studio_names, y=mean_profits, palette='Blues_r')
    # Tilt the studio names so they stay legible.
    bar_axes.set_xticklabels(bar_axes.get_xticklabels(), rotation=70)

    plt.xlabel('Movie Studios')
    plt.ylabel('Average Profit ($10M)')
    return plt.show()
    
    

    
    
def clean_kaggle_ratings(dataframe):
    """

    Normalises the 'rating' column of the kaggle_movies df.

    'Not specified' is relabelled 'NOT RATED', 'NC-17' is folded into 'R',
    and rows rated 'B15', 'TV-MA', 'TV-PG', 'TV-14' or 'B' are removed.
    The dataframe is modified in place and the same object is returned.

    """
    relabelled = {'Not specified': 'NOT RATED', 'NC-17': 'R'}
    dataframe['rating'] = dataframe['rating'].map(
        lambda value: relabelled[value] if value in ('Not specified', 'NC-17') else value
    )

    excluded_ratings = ['B15', 'TV-MA', 'TV-PG', 'TV-14', 'B']
    rows_to_drop = dataframe.index[dataframe['rating'].isin(excluded_ratings)]
    dataframe.drop(rows_to_drop, inplace=True)

    return dataframe





def rating_vs_num_movies_barplot(dataframe):
    """

    Takes kaggle_movies df and creates a barplot with 'rating' column on x axis
    and number of movies for each rating on y axis.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'rating' column.

    Returns
    -------
    None
        The result of plt.show(); the figure is displayed as a side effect.

    """
    # Count once and reuse for both axes.
    rating_counts = dataframe['rating'].value_counts()

    plt.title('MPAA Rating vs. Number of Movies')
    plt.xlabel('MPAA Rating')
    plt.ylabel('Number of Movies')
    # x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
    # as positional arguments.
    sns.barplot(x=rating_counts.index,
                y=rating_counts.values,
                palette='mako')
    return plt.show()
    
    
    
    
    
def rating_vs_average_profit_barplot(dataframe):
    """

    Takes kaggle_movies df and creates a barplot with rating on the x axis
    and average profit on the y axis (axis label reads '$10M').

    Bars are drawn in the fixed order R, PG-13, PG, NOT RATED, G, UNRATED.
    Bar heights are whatever sns.barplot's default estimator produces from
    the 'profit' values for each rating.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'rating' and 'profit' columns.

    Returns
    -------
    None
        The result of plt.show(); the figure is displayed as a side effect.

    """
    plt.title('MPAA Rating vs. Average Profit ($10M)')
    # x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
    # as positional arguments.
    sns.barplot(x=dataframe['rating'],
                y=dataframe['profit'],
                palette='mako',
                order=['R', 'PG-13', 'PG', 'NOT RATED', 'G', 'UNRATED'])
    plt.xlabel('MPAA Rating')
    plt.ylabel('Average Profit ($10M)')
    return plt.show()
    
    
    
    
    
def genre_vs_profit_barplot(dataframe, genre, profit):
    """

    Displays a barplot of profit per movie genre.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Passed to sns.barplot as its data argument.
    genre
        Passed to sns.barplot as x (presumably the name of the genre
        column in dataframe -- confirm against callers).
    profit
        Passed to sns.barplot as y (presumably the name of the profit
        column in dataframe -- confirm against callers).

    Returns
    -------
    None
        The result of plt.show(); the figure is displayed as a side effect.

    """
    plt.title('Movie Genre vs Profit')
    # x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
    # as positional arguments.
    chart = sns.barplot(x=genre,
                        y=profit,
                        data=dataframe)
    # Tilt genre names so neighbouring labels do not collide.
    chart.set_xticklabels(chart.get_xticklabels(), rotation=60)
    chart.set(xlabel='Movie Genre', ylabel='Profit ($10M)')
    return plt.show()
    

    
    
    
    
def create_kaggle_ratings_dfs_for_subplots(dataframe):
    """

    Splits kaggle_movies into six dfs, one per rating, returned as a list
    in the order R, PG-13, PG, NOT RATED, G, UNRATED. A rating with no
    matching rows yields an empty df.

    """
    wanted_ratings = ('R', 'PG-13', 'PG', 'NOT RATED', 'G', 'UNRATED')
    return [dataframe[dataframe['rating'] == label] for label in wanted_ratings]

    

# def list_to_six_subplot_tuple(dfs_list):
    
#     """
#     Converts list of dataframes returned by create_kaggle_ratings_dfs_for_subplots()
#     into a tuple formatted specifically for the subplots function called
#     create_ratings_subplots_from_dfs_list().
#     """
    
#     nested_df_list = [[], []]
#     for j in range(len(nested_df_list)):
#         for i in range(len(dfs_list)):
#             if i < (len(dfs_list)/2):
#                 nested_df_list[j].append(dfs_list[i])
#             else:
#                 nested_df_list[j].append(dfs_list[i])
#     return nested_df_list
 
    

   
    
# def create_ratings_subplots_from_dfs_list(dfs_list):
    
#     """
#     Takes list of dataframes, one for each rating, and creates a subplot for each.
#     """
    
#     axes_tuple = ((ax1, ax2, ax3), (ax4, ax5, ax6))
#     plot_list = []
#     fig, axes_tuple = plt.subplots(ncols=3, nrows = 2, sharey = True, sharex = True); 
    
#     for a in range(len(axes_tuple)):
#         for axes in range(len(axes_tuple[a])):
#             plot = sns.regplot(dfs_list[a][axes]['runtime'], dfs_list[a][axes]['profit'], ax = axes_tuple[a][axes])
#             plot_list.append(plot)
    
#     return plot_list

    
    
    
    
    
    
def ratings_subplots(dfs_list):
    """
    Draws a 2x3 grid of profit-vs-runtime regression plots, one panel for
    each of the six MPAA movie ratings.

    Parameters
    ----------
    dfs_list : list of pandas.DataFrame
        Six dataframes, each with 'runtime' and 'profit' columns, in the
        order R, PG-13, PG, NOT RATED, G, UNRATED (the order produced by
        create_kaggle_ratings_dfs_for_subplots). Entries beyond the sixth
        are ignored.

    Returns
    -------
    None
        The result of plt.show(); the figure is displayed as a side effect.

    Raises
    ------
    IndexError
        If dfs_list holds fewer than six dataframes.
    """
    # (panel title, regression colour) for each panel, in row-major order.
    panel_specs = (
        ('R', 'black'),
        ('PG-13', 'darkslateblue'),
        ('PG', 'steelblue'),
        ('NOT RATED', 'teal'),
        ('G', 'c'),
        ('UNRATED', 'palegreen'),
    )

    fig, axes_grid = plt.subplots(ncols=3, nrows=2, figsize=(18, 10))
    fig.suptitle('Profit vs. Runtime for each MPAA Rating')

    for position, (axis, (rating_title, plot_color)) in enumerate(
            zip(axes_grid.flat, panel_specs)):
        # Index explicitly so a too-short dfs_list still fails loudly.
        rating_df = dfs_list[position]
        sns.regplot(x=rating_df['runtime'], y=rating_df['profit'],
                    ax=axis, color=plot_color)
        # Set after regplot, which otherwise labels axes with the column names.
        axis.set_xlabel('Runtime (Minutes)')
        axis.set_ylabel('Profit ($10M)')
        axis.set_title(rating_title)

    return plt.show()
    
