### Notebook info:
> **Preprocessing Library** <br/>
> *Preprocessing_Library.ipynb* Version 1.0 <br/>
> Last updated on September 15th, 2021, by Luiz Gustavo Fagundes Malpele. <br/>
    
</div>
<br/><hr/>

<br/>

### Package/library dependencies:

- **matplotlib**, for plots and graphs
- **numpy**, for floating-point ranges and NaN values
- **plotly**, for plotting aesthetics
- **pandas**, for reading CSV files into data frames
- **datetime**, for time related operations

In [1]:
#import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
from datetime import datetime, timedelta
import plotly.express as px 
import plotly.graph_objects as go

In [45]:
# Location of the raw movies CSV (relative to the notebook's working directory)
movies_data_path = '../data/movies_streaming_platforms.csv'
# Location of the cleaned movies CSV; used as the default argument of
# read_cleaned_movies_dataframe
movies_cleaned_data_path = '../data/movies_streaming_platforms_cleaned.csv'

In [6]:
def read_movies_dataframe(path:str):
    '''
    Loads the raw movies CSV located at `path` and applies the basic
    preprocessing: the header row is replaced by snake_case column names,
    the streaming-platform flags are parsed as booleans, the 'index' column
    becomes the DataFrame index and the 'id' and 'type' columns are removed.
    '''
    # Column names that replace the file's own header (which is skipped below)
    column_names = ['index', 'id', 'title', 'year', 'age', 'imdb',
                    'rotten_tomatoes', 'netflix' , 'hulu', 'prime_video',
                    'disney', 'type', 'directors', 'genres', 'country',
                    'language','runtime']

    # One boolean flag per streaming platform
    platform_dtypes = {platform: bool
                       for platform in ('netflix', 'hulu', 'prime_video', 'disney')}

    raw_movies = pd.read_csv(path,
                             names = column_names,
                             skiprows = 1,
                             index_col = 'index',
                             dtype = platform_dtypes)

    # 'id' and 'type' are not needed downstream
    return raw_movies.drop(columns = ['id', 'type'])

In [3]:
def fill_nan_values(df:pd.DataFrame):
    '''
    Replaces missing values column by column and returns the same DataFrame.
    The categorical columns (genres, language, directors, country) get the
    extra category 'Other'; the score columns (rotten_tomatoes, imdb) get an
    empty string.
    '''
    # Placeholder used for the missing entries of each column
    placeholders = {
        'genres': 'Other',
        'language': 'Other',
        'directors': 'Other',
        'country': 'Other',
        'rotten_tomatoes': '',
        'imdb': '',
    }
    for column, placeholder in placeholders.items():
        df[column] = df[column].fillna(placeholder)
    return df

In [4]:
def get_comma_separated_to_list(df:pd.DataFrame):
    '''
    Turns the comma separated text columns (genres, language, directors,
    country) into list-based columns by splitting every entry on ','.
    The DataFrame is modified in place and returned.
    '''
    # expand = False keeps one list per cell instead of one column per element
    for column in ('genres', 'language', 'directors', 'country'):
        df[column] = df[column].str.split(',', expand = False)
    return df

In [5]:
def _score_to_float(scores:pd.Series):
    '''
    Converts a Series of 'value/scale' strings (e.g. '7.8/10', '98/100')
    into floats by keeping only the part before the '/'.
    Missing values and empty strings become NaN.
    '''
    missing = scores.isna()
    # Keep the text before the first '/', so the length of the scale suffix
    # ('/10', '/100', ...) does not matter
    numerators = scores.astype(str).str.split('/', n = 1).str[0].str.strip()
    # Blank out missing/empty entries before the cast; malformed text still
    # raises ValueError in astype, as it did before
    return numerators.mask(missing | (numerators == '')).astype(float)


def get_numeric_scores(df:pd.DataFrame):
    '''
    Transform string-based scores into float-based scores.

    The 'imdb' and 'rotten_tomatoes' columns are converted in place from
    strings such as '7.8/10' and '98/100' to floats (7.8 and 98.0). Empty
    strings and missing values become NaN. The conversion is vectorized and
    does not depend on the DataFrame's index labels, so it also works on
    filtered or re-indexed frames.

    Returns the same DataFrame that was passed in.
    '''
    for column in ('imdb', 'rotten_tomatoes'):
        df[column] = _score_to_float(df[column])
    return df

In [48]:
def prepare_movies_dataframe(path:str, to_csv:bool = False):
    '''
    Calls all preprocessing funtions to prepare and cleanse the movies DataFrame.
    '''
    movies_data = read_movies_dataframe(path = movies_data_path)
    movies_data = fill_nan_values(df = movies_data)
    movies_data = get_comma_separated_to_list(df = movies_data)
    movies_data = get_numeric_scores(df = movies_data)
    
    #Creates a csv file on the data directory
    if to_csv:
        movies_data.to_csv('../data/movies_streaming_platforms_cleaned.csv')
        
    #Returns the cleaned dataframe
    return movies_data

In [49]:
def read_cleaned_movies_dataframe(path:str = movies_cleaned_data_path):
    '''
    Takes the cleaned DataFrame paths' as argment and returns the DataFrame
    '''
    # Reading Movies' DataFrame
    df = pd.read_csv(path, index_col = 'index')

    return df

In [50]:
prepare_movies_dataframe(path = movies_cleaned_data_path, to_csv = True)

Unnamed: 0_level_0,title,year,age,imdb,rotten_tomatoes,netflix,hulu,prime_video,disney,directors,genres,country,language,runtime
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,The Irishman,2019,18+,7.8,98.0,True,False,False,False,[Martin Scorsese],"[Biography, Crime, Drama]",[United States],"[English, Italian, Latin, Spanish, German]",209.0
1,Dangal,2016,7+,8.4,97.0,True,False,False,False,[Nitesh Tiwari],"[Action, Biography, Drama, Sport]","[India, United States, United Kingdom, Austral...","[Hindi, English]",161.0
2,David Attenborough: A Life on Our Planet,2020,7+,9.0,95.0,True,False,False,False,"[Alastair Fothergill, Jonathan Hughes, Keith S...","[Documentary, Biography]",[United Kingdom],[English],83.0
3,Lagaan: Once Upon a Time in India,2001,7+,8.1,94.0,True,False,False,False,[Ashutosh Gowariker],"[Drama, Musical, Sport]","[India, United Kingdom]","[Hindi, English]",224.0
4,Roma,2018,18+,7.7,94.0,True,False,False,False,[Other],"[Action, Drama, History, Romance, War]","[United Kingdom, United States]",[English],52.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9510,Most Wanted Sharks,2020,,,14.0,False,False,False,True,[Other],"[Crime, Reality-TV]",[United States],"[Greek, English]",
9511,Doc McStuffins: The Doc Is In,2020,,,13.0,False,False,False,True,[Chris Anthony Hamilton],[Animation],[United States],[English],23.0
9512,Ultimate Viking Sword,2019,,,13.0,False,False,False,True,[Other],[Other],[United States],[Other],
9513,Hunt for the Abominable Snowman,2011,,,10.0,False,False,False,True,[Dan Oliver],"[Drama, History]",[Other],[Other],
