### Notebook info:
> **Preprocessing Library** <br/>
> *Preprocessing_Library.ipynb* Version 1.0 <br/>
> Last updated on September 15th, 2021, by Luiz Gustavo Fagundes Malpele. <br/>
    
</div>
<br/><hr/>

<br/>

### Package/library dependencies:

- **matplotlib**, for plots and graphs
- **numpy**, for floating-point ranges
- **plotly**, for plotting aesthetics
- **pandas**, for reading CSV files into data frames
- **datetime**, for time related operations

In [1]:
#import matplotlib.pyplot as plt
import math
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Relative paths of the raw dataset and of its cleaned counterpart
movies_data_path, movies_cleaned_data_path = (
    '../data/movies_streaming_platforms.csv',
    '../data/movies_streaming_platforms_cleaned.csv',
)

In [3]:
def get_color_palette():
    '''
    Returns the standard colors used for visualization and UI.

    The hex codes come back as a 4-tuple, in this order:
    (light blue, dark blue, red, gray).
    '''
    palette = (
        '#0194ED',  # light blue
        '#294D6E',  # dark blue
        '#FF494E',  # red
        '#F0F9FF',  # gray
    )
    return palette

In [4]:
def get_color_platforms_palette():
    '''
    Returns the colors used for platforms-related visualization and UI.

    The hex codes come back as a 4-tuple, in this order:
    (Netflix, Hulu, Prime Video, Disney).
    '''
    platform_colors = (
        '#E50914',  # Netflix
        '#3DBB3D',  # Hulu
        '#00A8E1',  # Prime Video
        '#332765',  # Disney
    )
    return platform_colors

<br/>

## Visualization Plots:

In [5]:
def plot_column_list(movies_data:pd.DataFrame, column_name:{'genres', 'directors', 'country', 'language'}, top:int):
    '''
    Plots a bar chart with the `top` most frequent elements of a list-based column.

    Parameters:
        movies_data: data frame holding the list-based column.
        column_name: name of the list-based column to aggregate.
        top: number of most frequent elements kept in the chart.

    Returns:
        Whatever `Figure.show()` returns; the chart is displayed as a side effect.
    '''
    #Gets unique list elements
    unique_list = unique_list_elements(movies_data, column_name = column_name)

    #Generates dummy variables
    #NOTE(review): assumes the helper adds one column per unique element -- confirm against its definition
    movies_data = get_column_dummies_from_list(movies_data, column_name = column_name, merge_dummies = True)

    #Only the red tone of the standard palette is used by this chart
    _, _, color_red, _ = get_color_palette()

    #Sums every dummy column and builds the series in a single step, instead of
    #enlarging it label by label (every enlargement re-allocates the series)
    elements_series = pd.Series(
        {unique_element: movies_data[unique_element].sum() for unique_element in unique_list},
        dtype = 'float64')
    elements_series = elements_series.sort_values(ascending = False).head(top)

    #Starts Visualizations
    fig_aggregated = go.Figure()

    #Adds the data for the bar chart
    fig_aggregated.add_trace(go.Bar(x = elements_series.index,
                                    y = elements_series.values,
                                    marker_color = color_red))

    #Update Y-axis label
    fig_aggregated.update_yaxes(title_text='Frequency')

    #Standard Figure Layout for Data Visualization
    fig_aggregated.update_layout(
        dict(
            height=600,
            width=1000,
            plot_bgcolor = "#F1F1F3",
            paper_bgcolor = 'white',
            title = 'Bar Chart of all ' + str(column_name)))

    return fig_aggregated.show()

In [6]:
def plot_scores_distribution(movies_data:pd.DataFrame):
    '''
    Builds side-by-side histograms of the 'imdb' and 'rotten_tomatoes' columns.

    Returns the plotly figure; it is not shown by this function.
    '''
    #Only two tones of the standard palette are needed
    _, tone_dark_blue, tone_red, _ = get_color_palette()

    figure = make_subplots(rows=1, cols=2,
                           subplot_titles=('Distribution of IMDB Scores', 'Distribution of Rotten Tomato Scores'),
                           vertical_spacing = 0.05)

    #(column of movies_data, bar color) for each subplot, from left to right
    histogram_specs = [('imdb', tone_red), ('rotten_tomatoes', tone_dark_blue)]

    #One histogram per score column
    for position, (score_column, bar_color) in enumerate(histogram_specs, start = 1):
        histogram = go.Histogram(x = movies_data[score_column],
                                 marker_color = bar_color,
                                 opacity = 0.85,
                                 showlegend = False)
        figure.add_trace(histogram, row=1, col=position)

    #Same Y-axis label on both subplots
    for position in (1, 2):
        figure.update_yaxes(title_text='Frequency', row=1, col=position)

    #Standard Figure Layout for Data Visualization
    layout_settings = dict(
        height=600,
        width=1000,
        plot_bgcolor = "#F1F1F3",
        paper_bgcolor = 'white',
        title = 'Frequency Distribution of Critics\' scores')
    figure.update_layout(layout_settings)

    return figure

In [7]:
def plot_scores_per_platform(movies_data:pd.DataFrame):
    '''
    Plots box plots of the critics' scores for each streaming platform.

    The first subplot row holds the 'rotten_tomatoes' scores and the second row
    the 'imdb' scores. Each row gets one box per platform, built from the movies
    whose platform flag column ('netflix', 'disney', 'hulu', 'prime_video')
    equals True.

    Parameters:
        movies_data: data frame with the two score columns and the four
            platform flag columns.

    Returns:
        The plotly figure; it is not shown by this function.
    '''
    #Get color palette
    color_netflix, color_hulu, color_prime_video, color_disney = get_color_platforms_palette()

    fig_scores = make_subplots(rows=2, cols=1,
                               subplot_titles=('Boxplot of Rotten Tomatoes Scores', 'Boxplot of IMDB Scores'),
                               vertical_spacing = 0.2)

    #(platform flag column, box name, box color), in drawing order
    platforms = (('netflix', 'Netflix', color_netflix),
                 ('disney', 'Disney+', color_disney),
                 ('hulu', 'Hulu', color_hulu),
                 ('prime_video', 'Prime Video', color_prime_video))

    #One subplot row per score column; every box shares the same styling
    #(previously only the Hulu box of the first row had a reduced opacity)
    for row, score_column in enumerate(('rotten_tomatoes', 'imdb'), start = 1):
        for platform_column, platform_name, platform_color in platforms:
            #Comparing with True also yields a valid mask when the flag is stored as 0/1
            on_platform = movies_data[platform_column] == True
            fig_scores.add_trace(go.Box(x = movies_data[score_column][on_platform],
                                        marker_color = platform_color,
                                        showlegend = False,
                                        name = platform_name), row=row, col=1)

        #Update X-axis label of this row
        fig_scores.update_xaxes(title_text='Critics\' Score', row=row, col=1)

    #Standard Figure Layout for Data Visualization
    fig_scores.update_layout(
        dict(
            height=700,
            width=1000,
            plot_bgcolor = "#F1F1F3",
            paper_bgcolor = 'white',
            title = 'Boxplot of Critics\' scores per Streaming Platform'))

    #Returns Fig Scores
    return fig_scores

In [8]:
def plot_runtime_distribution(movies_data:pd.DataFrame):
    '''
    Builds a histogram of the 'runtime' column of movies_data.

    Returns the plotly figure; it is not shown by this function.
    '''
    #Dark blue is the only tone of the standard palette needed here
    _, bar_color, _, _ = get_color_palette()

    #Single-cell subplot grid
    figure = make_subplots(rows=1, cols=1, vertical_spacing = 0.05)

    #Histogram for the distribution of Run Time
    runtime_histogram = go.Histogram(x = movies_data['runtime'],
                                     marker_color = bar_color,
                                     opacity = 0.85)
    figure.add_trace(runtime_histogram, row=1, col=1)

    #Y-axis label
    figure.update_yaxes(title_text='Frequency', row=1, col=1)

    #Standard Figure Layout for Data Visualization
    layout_settings = dict(
        height=600,
        width=1000,
        plot_bgcolor = "#F1F1F3",
        paper_bgcolor = 'white',
        title = 'Frequency Distribution of Run Time')
    figure.update_layout(layout_settings)

    return figure

In [9]:
def plot_platforms_distribution(movies_data:pd.DataFrame):
    '''
    Plots one histogram per streaming platform flag column.

    The columns 'netflix', 'hulu', 'prime_video' and 'disney' of movies_data are
    each drawn in their own subplot, colored with the platforms palette so that
    every bar contrasts with the plot background.

    Parameters:
        movies_data: data frame holding the four platform flag columns.

    Returns:
        The plotly figure; it is not shown by this function.
    '''
    #Get the platforms palette (same colors used by the other per-platform plots)
    color_netflix, color_hulu, color_prime_video, color_disney = get_color_platforms_palette()

    #(flag column, subplot title and trace name, bar color), from left to right
    platforms = (('netflix', 'Netflix', color_netflix),
                 ('hulu', 'Hulu', color_hulu),
                 ('prime_video', 'Prime', color_prime_video),
                 ('disney', 'Disney', color_disney))

    fig_platforms = make_subplots(rows=1, cols=len(platforms),
                                  subplot_titles=[title for _, title, _ in platforms],
                                  vertical_spacing = 0.05)

    for col, (platform_column, platform_title, platform_color) in enumerate(platforms, start = 1):
        #Histogram of the platform flag; the name replaces the default "trace N" legend entry
        fig_platforms.add_trace(go.Histogram(x = movies_data[platform_column],
                                             marker_color = platform_color,
                                             opacity = 0.85,
                                             name = platform_title),
                                row=1, col=col)

        #Each subplot has its own Y scale, so each one gets the label
        fig_platforms.update_yaxes(title_text='Frequency', row=1, col=col)

    #Standard Figure Layout for Data Visualization
    fig_platforms.update_layout(
        dict(
            height=600,
            width=1000,
            plot_bgcolor = "#F1F1F3",
            paper_bgcolor = 'white',
            title = 'Frequency Distribution of Platforms'))

    #Returns Fig Platforms
    return fig_platforms

In [10]:
def plot_age_distribution(movies_data:pd.DataFrame):
    '''
    Plots a histogram of the 'age' column of movies_data.

    Parameters:
        movies_data: data frame holding the 'age' column.

    Returns:
        The plotly figure; it is not shown by this function.
    '''
    #Dark blue is the only tone of the standard palette used here
    _, color_dark_blue, _, _ = get_color_palette()

    #The trailing comma makes subplot_titles a one-element tuple; a bare string
    #would be indexed character by character and the subplot would be titled "D"
    fig_age = make_subplots(rows=1, cols=1,
                            subplot_titles=('Distribution of Age',),
                            vertical_spacing = 0.05)

    #Creates Histogram for the distribution of Age
    fig_age.add_trace(go.Histogram(x = movies_data['age'],
                                   marker_color = color_dark_blue,
                                   opacity = 0.85),
                      row=1, col=1)

    #Update Y-axis label
    fig_age.update_yaxes(title_text='Frequency', row=1, col=1)

    #Standard Figure Layout for Data Visualization
    fig_age.update_layout(
        dict(
            height=600,
            width=1000,
            plot_bgcolor = "#F1F1F3",
            paper_bgcolor = 'white',
            title = 'Frequency Distribution of Age'))

    #Returns Fig Age
    return fig_age

<br/>

### Testing: