# MMDb Functions

## Imports

In [1]:
## Basic
import pandas as pd
import numpy as np
# Auxiliary
import os
import calendar
import datetime as dt
from collections import Counter
from itertools import combinations
# For word frequency count
from sklearn.feature_extraction.text import CountVectorizer
# Output
import pickle
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
## Plotting
from IPython.core.display import HTML
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
## APIs
# https://imdbpy.readthedocs.io/en/latest/usage/quickstart.html
from imdb import IMDb
# https://pytrakt.readthedocs.io/en/latest/getstarted.html
from trakt import init
from trakt.users import User
import trakt.core

In [4]:
## Export settings
# Base folder is resolved relative to the current working directory.
path = os.getcwd()
# Build the folder with os.path.join so the separator matches the platform.
# The trailing '' keeps a final separator, which every `src_path + filename`
# concatenation in this notebook relies on.
src_path = os.path.join(path, 'Desktop', 'Data Analysis', 'jupyterlab', 'mmdb', '')
# Export pickle file
pkl_file = 'movies.pkl'
pkl_path = src_path + pkl_file
# Export txt file
txt_export = 'imdb.txt'
txt_path = src_path + txt_export

# Settings

In [5]:
def get_bool(prompt):
    """Ask the user a True/False question until a valid answer is given.

    Parameters
    ----------
    prompt : str
        Text shown by ``input``.

    Returns
    -------
    bool
        ``True`` for "true", ``False`` for "false" (case-insensitive,
        surrounding whitespace ignored).
    """
    answers = {"true": True, "false": False}
    while True:
        # strip() so an accidental leading/trailing space is not rejected
        reply = input(prompt).strip().lower()
        try:
            return answers[reply]
        except KeyError:
            print("Invalid input! Please answer True or False!")

In [6]:
## OPTIONAL
# Run-time switches for the notebook. The first two are asked interactively
# (the cell blocks until both prompts are answered); the last two are fixed.
# Get data: gates the "fetch new movies" paths (read by check_new_movies)
get_data = get_bool("Do you want to get new data (True/False)? ")
# Export data (consumed outside this section of the notebook)
export = get_bool("Do you want to export data (True/False)? ")
# Check data before analysis
data_check = False
# Update metascore and imdbRating / imdbVotes
update = False

Do you want to get new data (True/False)?  false
Do you want to export data (True/False)?  false


# Classes

## Trakt

In [7]:
## Load Trakt API parameters
# Expected file layout, one "name = value" pair per line:
# user = ****
# user_client_id = ****
# user_client_secret = ****
# pin = ****
# A multi-character separator is treated as a regular expression and is only
# supported by the Python parser, so the engine is requested explicitly
# instead of relying on pandas' silent fallback.
trakt_api_login = pd.read_csv(
    src_path + 'trakt_api_login.txt',
    sep=' = ',
    engine='python',
    header=None,
    names=['parameters', 'values'],
).set_index('parameters')

In [8]:
# Trakt API reference: https://trakt.docs.apiary.io/#
# Authenticate against Trakt through OAuth
trakt.core.AUTH_METHOD = trakt.core.OAUTH_AUTH
# Pull the three login parameters out of the credentials table (first value of each row)
user, user_client_id, user_client_secret = (
    trakt_api_login.loc[parameter].values[0]
    for parameter in ('user', 'user_client_id', 'user_client_secret')
)

In [9]:
## Movies watched multiple times (between 2020 and 2022)
# Jojo Rabbit
# Dune (2021)
# The Trip
# Heavy Trip

# The 'last_watched_movies' method from pytrakt only returns the most recent watch of each movie

In [10]:
class TRAKT:
    """TRAKT class - Extract movie information from Trakt API and fill dataframe.

    Relies on the module-level ``user``, ``user_client_id`` and
    ``user_client_secret`` values and on pytrakt's ``init`` / ``User``.
    """

    def __init__(self):
        """Constructor - Initialize dataframe and the Trakt session."""
        self.__df = pd.DataFrame()
        init(user, client_id=user_client_id, client_secret=user_client_secret)

    def clean_id(self, x):
        """Clean movie id string (remove 'tt')."""
        return x.replace('tt', '')

    @staticmethod
    def _rows_to_frame(rows, start_date, timezone='Europe/Lisbon'):
        """Turn ``[title, id, timestamp, year]`` rows into the sorted, filtered frame.

        Timestamps are parsed as UTC, converted to ``timezone`` and made naive;
        only rows strictly later than ``start_date`` are kept.
        """
        df = pd.DataFrame(rows, columns=['title', 'id', 'timestamp', 'year'])
        # Plain column assignment (not ``.loc[:, col] =``) so the column really
        # becomes datetime64; on pandas 2.x the ``.loc`` form keeps object dtype
        # and the ``.dt`` accessor below would fail.
        stamps = pd.to_datetime(df['timestamp'], utc=True)
        # https://stackoverflow.com/questions/55598122/pandas-adding-timezone-offset-to-the-timestamp-after-using-tz-convert
        df['timestamp'] = stamps.dt.tz_convert(timezone).dt.tz_localize(None)
        recent = df[df['timestamp'] > pd.Timestamp(start_date)]
        return recent.sort_values(by='timestamp').reset_index(drop=True)

    def create_df(self, start_date='2018-01-01'):
        """Fill dataframe with data since 'start_date'.

        Returns a frame with columns ``title``, ``id``, ``timestamp`` and
        ``year``, sorted by ``timestamp``. The frame is rebuilt on every call,
        so calling the method twice does not duplicate rows.
        """
        # Access personal account
        self.__my = User(user)
        # Read the watched list once instead of once per field per movie
        watched = self.__my.watched_movies
        rows = [
            [movie.title, self.clean_id(movie.imdb), movie.last_watched_at, movie.year]
            for movie in watched
        ]
        # DataFrame.append was removed in pandas 2.0: build the frame in one go
        self.__df = self._rows_to_frame(rows, start_date)
        return self.__df

## IMDb

In [11]:
class IMDB:
    """IMDb class - Extract movie information from IMDb API and fill dataframe.

    The class queries IMDb through IMDbPY (``IMDb``), flattens the answers
    into one row per movie and merges them with the watch history coming from
    Trakt. Missing data files are signalled with the integer sentinel ``3``,
    which callers test with ``isinstance(..., int)``.
    """
    # Significant keys from method 'get_movie'
    # Do not change the names!
    cols = ['title','year','imdbID','certificates','runtimes','genres',
            'director','writer','cinematographer','cast','countries',
            'languages','metascore','rating','votes','plot outline',
            'production companies','distributors','producer','composer',
            'box office','budget']

    # Column query extract type
    cols_info = ['runtimes','genres','countries','languages','rating','votes','plot outline']
    cols_name = ['director','writer','cinematographer','cast','production companies','distributors','producer','composer']
    # Numeric columns (not including year)
    cols_numeric = ['runtimes', 'metascore', 'rating', 'votes']

    def __init__(self):
        """Constructor - Initialize dataframe."""
        self.__df = pd.DataFrame(columns=self.cols)

        # basic column types
        self.__cols_int = ['year','runtimes','metascore','votes']
        self.__cols_float = ['rating']
        self.__cols_str = ['title','imdbID','certificates','genres','director','writer','cinematographer','cast','countries','languages','plot outline',
                           'production companies','distributors','producer','composer','box office']

    @staticmethod
    def load(filename):
        """Load IMDb file.

        Returns the parsed frame, or the sentinel ``3`` when the file does not
        exist (a message is printed in that case).
        """
        try:
            return pd.read_csv(filename)
        except FileNotFoundError:
            print('File not found!', filename)
            return 3

    def request_db(self, moviesid):
        """Connect with IMDb API and get one raw record per id in ``moviesid``.

        Each record is a dict holding every key of ``cols`` except 'budget'
        (derived later from 'box office'); 'metascore' holds the raw
        critic-review payload.
        """
        # Create an instance of the IMDb class to access API
        ia = IMDb()
        all_movies = []
        for idx in moviesid:
            search = idx.replace('tt', '')
            # Get general movie data
            data = ia.get_movie(search)
            movie = {key: data.get(key) for key in self.cols[:-1]}
            # Get metascore (off imdb)
            movie['metascore'] = ia.get_movie_critic_reviews(search)['data']
            all_movies.append(movie)
        return all_movies

    def convert_columns(self, df):
        """Rename the columns to the canonical names and cast them to their types.

        ``df`` must have either the ``cols`` columns or those plus
        'timestamp', 'date' and 'time'. The frame is modified in place and
        returned.
        """
        to_datetime = ['timestamp', 'date', 'time']
        try:
            df.columns = self.cols
        except ValueError:
            # Length mismatch: the frame also carries the three date columns
            df.columns = self.cols + to_datetime

        # Clean and convert to type
        df[self.__cols_int] = df[self.__cols_int].replace('N/A', 0).fillna(0).astype(int)
        df[self.__cols_float] = df[self.__cols_float].astype(float)
        df[self.__cols_str] = df[self.__cols_str].fillna('N/A').astype(str)
        return df

    # Get values from 'keys' in need of special cleaning.
    @staticmethod
    def get_imdbid(x):
        """Need an imdbID with at least 7 digits."""
        return x.zfill(7)

    @staticmethod
    def get_rated(x):
        """Get the United States certificates (renamed to rated).

        Returns the distinct ratings joined with ', ' in their original order,
        or 'Not Rated' when there is none or ``x`` is unusable.
        """
        notrated = 'Not Rated'
        try:
            ratings = [rated.split(':')[1] for rated in x if 'United States' in rated]
        except (TypeError, IndexError, AttributeError):
            # x is None / not iterable, or an entry is not 'Country:Rating' text
            return notrated
        # dict.fromkeys removes duplicates while keeping a deterministic order
        # (a plain set would shuffle the ratings between runs)
        return ', '.join(dict.fromkeys(ratings)) or notrated

    @staticmethod
    def get_metascore(x):
        """Get values from metascore (0 when the payload is not a mapping)."""
        try:
            return x.get('metascore')
        except (ValueError, AttributeError):
            return 0

    @staticmethod
    def get_boxoffice(x):
        """Get the worldwide gross from the box office information (0 if missing)."""
        try:
            return x.get('Cumulative Worldwide Gross').lstrip('$').split(', ')[0]
        except (ValueError, AttributeError):
            return 0

    @staticmethod
    def get_budget(x):
        """Get budget from box office information (0 if missing)."""
        suffix = ' (estimated)'
        try:
            budget = x.get('Budget').lstrip('$')
        except (ValueError, AttributeError):
            return 0
        # Remove the literal suffix; rstrip(suffix) would strip any trailing
        # run of those *characters* rather than the suffix itself
        if budget.endswith(suffix):
            budget = budget[:-len(suffix)]
        return budget

    def get_info_list(self, x, cols, d):
        """Copy the plain-value fields ``cols`` of record ``x`` into dict ``d``.

        Missing values become 0 (numeric columns), 'N/A' ('plot outline') or
        ['N/A'] (list columns). ``d`` is updated in place and returned.
        """
        for col in cols:
            value = x[col]
            if value is not None:
                d[col] = value
            elif col in self.cols_numeric:
                d[col] = 0
            elif col == 'plot outline':
                d[col] = 'N/A'
            else:
                d[col] = ['N/A']
        return d

    def get_name_list(self, x, cols, d):
        """Copy the person/company fields ``cols`` of record ``x`` into dict ``d``.

        ``None`` entries and duplicates are removed (the same person sometimes
        appears several times); the original order, e.g. cast billing order,
        is preserved. Missing fields become ['N/A'].
        """
        for col in cols:
            entries = x[col]
            if entries is None:
                d[col] = ['N/A']
                continue
            # Order-preserving de-duplication (set() would scramble the order)
            unique = list(dict.fromkeys(entry for entry in entries if entry is not None))
            if col in self.cols_name:
                # clean list: drop falsy entries
                unique = [entry for entry in unique if entry]
            d[col] = unique
        return d

    @staticmethod
    def list_to_string(df, cols):
        """Convert column of lists into column of strings.
        Convert value into same row != df.explode(col).

        Scalar cells (e.g. the 0 placeholder of a missing 'runtimes') are
        converted with ``str`` instead of being iterated.
        """
        def _join(value):
            if isinstance(value, (list, tuple, set)):
                return ', '.join(map(str, value))
            return str(value)

        for col in cols:
            # map is explicitly element-wise; Series.agg may fall back to
            # calling the function on the whole column
            df[col] = df[col].map(_join)
        return df

    def clean_df(self, movies):
        """Clean output from db request.

        ``movies`` is the list returned by ``request_db``. Returns a frame
        with the ``cols`` columns, list fields flattened to ', '-joined text.
        """
        records = []
        for movie in movies:
            info = self.get_info_list(movie, self.cols_info, {})
            names = self.get_name_list(movie, self.cols_name, {})
            record = {
                'title': movie['title'],
                'year': movie['year'],
                'imdbID': self.get_imdbid(movie['imdbID']),
                'certificates': self.get_rated(movie['certificates']),
                'metascore': self.get_metascore(movie['metascore']),
                'box office': self.get_boxoffice(movie['box office']),
                'budget': self.get_budget(movie['box office']),
            }
            record.update(info)
            record.update(names)
            records.append(record)

        # Build the frame once (columns=... fixes the column order) instead of
        # growing it row by row
        df = pd.DataFrame(records, columns=self.cols)

        # Convert column of lists into column of strings
        df = self.list_to_string(df, self.cols_info[:-3])
        df = self.list_to_string(df, self.cols_name)
        return df

    @staticmethod
    def movie_extra_info(moviedb, df, get):
        """Get movie extra info: filming locations OR keywords.

        Parameters
        ----------
        moviedb : IMDb access object providing ``get_movie_locations`` /
            ``get_movie_keywords``.
        df : frame with an 'imdbID' column.
        get : 'locations' or 'keywords'.

        Reads ``src_path + get + '.txt'`` when it exists and only queries the
        movies missing from it (the last rows of ``df``). Returns a one-column
        frame named ``get``; missing values are filled with the text '[]'.
        """
        import ast  # local import: only needed to parse the stored list literals

        def _fetch(ids):
            if get == 'locations':
                query = moviedb.get_movie_locations
            elif get == 'keywords':
                query = moviedb.get_movie_keywords
            else:
                raise ValueError("'get' must be 'locations' or 'keywords', got " + repr(get))
            return [query(idx).get('data').get(get) for idx in ids]

        # Load file
        path_export = src_path + get + '.txt'
        all_ids = list(df['imdbID'])
        try:
            stored = pd.read_csv(path_export, names=[get], header=0)
        except FileNotFoundError:
            print('File not found!')
            stored = None

        if stored is None:
            # If no file: query every movie
            df_extra_info = pd.DataFrame()
            df_extra_info[get] = pd.Series(_fetch(all_ids))
        else:
            df_extra_info = stored
            # Check for new movies
            getMovies = df.shape[0] - stored.shape[0]
            if getMovies > 0:
                info = _fetch(list(df['imdbID'].tail(getMovies)))
                # Convert string array to array. literal_eval only accepts
                # Python literals, so file content is never executed as code.
                parsed = stored[get].apply(ast.literal_eval)
                df_extra_info = pd.DataFrame(pd.concat([parsed, pd.Series(info)], ignore_index=True), columns=[get])

        # Clean the mess
        df_extra_info[get] = df_extra_info[get].fillna('[]')
        return df_extra_info

    def update_imdb_values(self, df, current_year):
        """Update recent movies' metascore, imdbRating and imdbVotes.

        Column names of ``df`` are lower-cased in place; rows whose ``year``
        equals ``current_year`` are queried again and refreshed.
        """
        # Uniform columns
        df.columns = df.columns.str.lower()
        # Get ids of movies from current year (a column selection, so no
        # in-place rename on a slice is needed)
        recent_ids = df.loc[df.year == current_year, 'imdbid']
        # Get updated data and clean it
        movies = self.clean_df(self.request_db(recent_ids))
        # Replace data
        fresh = zip(recent_ids.index, movies.metascore, movies.rating, movies.votes)
        for row, metascore, rating, votes in fresh:
            df.loc[row, 'metascore'] = metascore
            df.loc[row, 'imdbrating'] = rating
            df.loc[row, 'imdbvotes'] = votes
        return df

    def df_handler(self, foo, df_trakt):
        """Check new data with existing df from pickle file.

        ``foo`` is the stored frame, or a non-frame sentinel when nothing is
        stored. Returns ``foo`` extended with the movies that are new in
        ``df_trakt``, a copy of ``foo`` when there are none, or a freshly
        built frame when ``foo`` is not a frame.
        """
        # Check difference
        data, newMovies = check_new_movies(foo, df_trakt)

        # If foo doesn't exist: request everything
        if not isinstance(foo, pd.DataFrame):
            df = pd.DataFrame(self.clean_df(self.request_db(data)))
            # Add timestamp from trakt
            df['timestamp'] = pd.Series(df_trakt['timestamp'])
            return from_timestamp(df).reset_index(drop=True)

        # Copy foo if pickle exists and there are no new movies
        # (a negative difference is treated as "nothing new")
        if newMovies <= 0:
            return foo.copy().reset_index(drop=True)

        # Request db and clean values
        df_movies = pd.DataFrame(self.clean_df(self.request_db(data)))
        # Select last movies (from newMovies size)
        index = data.index[-newMovies:]
        # Add timestamp from trakt
        df_movies['timestamp'] = pd.Series(df_trakt.iloc[index].reset_index()['timestamp'])
        df_movies = from_timestamp(df_movies)
        # Rename columns
        df_movies.columns = foo.columns
        # Concatenate to original df
        df = pd.concat([foo, df_movies], ignore_index=True)
        return df.reset_index(drop=True)

    def get_imdb_data(self, src_path, newMovies, get_data, foo, df_trakt):
        """Load db file and connection to IMDb API.

        Parameters
        ----------
        src_path : folder prefix of 'imdb.txt'.
        newMovies : difference between the stored data and the Trakt data.
        get_data : whether a full download is allowed when no file exists.
        foo : stored frame (or sentinel) passed to ``df_handler``.
        df_trakt : Trakt watch history.

        Returns the movie frame, or the sentinel ``3`` when no data is
        available and nothing was downloaded.
        """
        handler = False
        # Get data from file
        try:
            df = self.load(src_path + 'imdb.txt') # Load file
            df = self.convert_columns(df) # Clean df (fails on the sentinel -> except)
            print('DF shape: ' + str(df.shape))
            print()
        except (AttributeError, OSError, IOError, TypeError) as e:
            df = 3
            print('No data available: ' + str(e))

        # Connection to db
        # Get new movies - newMovies is the difference between df and trakt data
        if isinstance(df, pd.DataFrame) and newMovies > 0:
            # Check difference
            _, newMovies = check_new_movies(df, df_trakt)
            if newMovies > 0:
                print('Adding new movies to df... \n')
                df = self.df_handler(df, df_trakt)
                handler = True
            else:
                print('No new movies! Continue.')
        elif isinstance(df, int): # no files saved (get all movies)
            if get_data:
                print('Get all data. \n')
                df = self.df_handler(foo, df_trakt)
                handler = True
            else:
                print("Variable get_data == 'False'.")
        else:
            # db file exists but no new data has to be retrieved
            print('Leave IMDb alone!')

        # If new data, convert values and prepare columns
        if handler:
            # Convert and rearrange columns
            df = self.convert_columns(df)
            df.columns = [x.lower() for x in df.columns]

        return df

In [12]:
## IMDb API query
#ia = IMDb()
#ia.get_movie_main('14821150')

## Query output example
#{'data': {'localized title': 'Vortex',
#  'cast': [<Person id:0000783[http] name:_Dario Argento_>,
#   <Person id:0495950[http] name:_Françoise Lebrun_>,
# ...
#   <Person id:2582006[http] name:_Jean-Baptiste Thoret_>],
#  'genres': ['Drama'],
#  'runtimes': ['142'],
#  'countries': ['France', 'Belgium', 'Monaco'],
#  'country codes': ['fr', 'be', 'mc'],
#  'language codes': ['fr', 'en'],
#  'color info': ['Color'],
#  'aspect ratio': '1.33 : 1',
#  'certificates': ['Argentina:18',
#   'Canada:PG::(Alberta)',
# ...
#   'United Kingdom:15'],
#  'original air date': '27 Jan 2022 (Portugal)',
#  'rating': 7.5,
#  'votes': 2597,
#  'cover url': 'https://m.media-amazon.com/images/M/MV5BNjMxYTFmNTctNDgyYi00NzQwLTk1NTItMmVkYjk5Yjc1NmI2XkEyXkFqcGdeQXVyMTAyMjQ3NzQ1._V1_SY150_CR1,0,101,150_.jpg',
#  'imdbID': '14821150',
#  'languages': ['French', 'English'],
#  'title': 'Vortex',
#  'year': 2021,
#  'kind': 'movie',
#  'original title': 'Vortex',
#  'director': [<Person id:0637615[http] name:_Gaspar Noé_>],
#  'writer': [<Person id:0637615[http] name:_Gaspar Noé_>],
#  'producer': [<Person id:1902436[http] name:_Toufik Ayadi_>,
#   <Person id:2820211[http] name:_Christophe Barral_>,
# ...
#   <Person id:0917946[http] name:_Edouard Weil_>],
#  'cinematographer': [<Person id:0213424[http] name:_Benoît Debie_>],
#  'editor': [<Person id:1721361[http] name:_Denis Bedlow_>],
#  'editorial department': [<Person id:10730865[http] name:_Sofiane Benabdallah_>,
#   <Person id:1147144[http] name:_Marc Boucrot_>,
# ...
#   <Person id:4871217[http] name:_Kévin Laperrière_>],
#  'production design': [<Person id:0704740[http] name:_Jean Rabasse_>],
#  'art direction': [<Person id:12399578[http] name:_Anna Prat_>],
#  'set decoration': [<Person id:1387131[http] name:_Nathalie Roubaud_>],
#  'costume designer': [<Person id:0115366[http] name:_Corinne Bruand_>],
#  'make up': [<Person id:6267423[http] name:_Joran Muratori_>],
#  'assistant director': [<Person id:7167774[http] name:_Claire Corbetta-Doll_>,
# ...
#   <Person id:0701260[http] name:_David Maria Putorti_>],
#  'art department': [<Person id:8196031[http] name:_Léopold Bossuet_>,
#   <Person id:9223580[http] name:_Louis Boulan_>,
# ...
#   <Person id:9493832[http] name:_Karin Scuderi_>],
#  'sound crew': [<Person id:0009571[http] name:_Jonathan Acbard_>,
#   <Person id:0099125[http] name:_Bertrand Boudaud_>,
# ...
#   <Person id:0946675[http] name:_Ken Yasumoto_>],
#  'visual effects': [<Person id:11269132[http] name:_Mathieu Barbe_>,
#   <Person id:10441690[http] name:_Jérôme Battistelli_>,
# ...
#   <Person id:8947055[http] name:_Annabelle Zoellin_>],
#  'stunt performer': [<Person id:2285249[http] name:_Jérôme Gaspard_>],
#  'camera and electrical department': [<Person id:4569287[http] name:_Emmanuelle Alaitru_>,
#   <Person id:4652245[http] name:_Julien Chassaignon_>,
# ...
#   <Person id:10308825[http] name:_Louis Stoltz_>],
#  'costume department': [<Person id:7956743[http] name:_Constance Allain_>,
#   <Person id:12598757[http] name:_Léa Peixoto_>],
#  'music department': [<Person id:6612986[http] name:_Steve Bouyer_>,
#   <Person id:2247261[http] name:_Pascal Mayer_>],
#  'miscellaneous crew': [<Person id:0231270[http] name:_Laetitia Dom_>,
#   <Person id:4378164[http] name:_François-Xavier Ecochard_>,
#   <Person id:13433456[http] name:_Tamara Saint Léger_>],
#  'thanks': [<Person id:8598239[http] name:_Héloïse Noé_>],
#  'akas': ['Au bord du monde (France)',
# ...
#   "Ma'arbolet (Israel, Hebrew title)"],
#  'production companies': [<Company id:0126452[http] name:_Rectangle Productions_>,
#   <Company id:0889391[http] name:_Wild Bunch International_>,
# ...
#   <Company id:0169751[http] name:_Tax Shelter du Gouvernement Fédéral Belge_>],
#  'distributors': [<Company id:0331328[http] name:_Picturehouse Entertainment_>,
#   <Company id:0091802[http] name:_Xenix Filmdistribution_>,
# ...
#   <Company id:0747471[http] name:_Utopia_>],
#  'special effects': [<Company id:0060901[http] name:_BUF_>],
#  'other companies': [<Company id:0907205[http] name:_Monark_>]},
# 'titlesRefs': {},
# 'namesRefs': {}}

# Functions

## General

In [13]:
# Dataframe styling
# https://www.analyticsvidhya.com/blog/2021/06/style-your-pandas-dataframe-and-make-it-stunning/

In [14]:
def load_pickle():
    """Load the stored movies from the pickle file at ``pkl_path``.

    Returns
    -------
    pandas.DataFrame or int
        The stored frame with lower-cased column names, a UTC 'timestamp'
        column and rows sorted by it; or the sentinel ``3`` when the file
        does not exist (callers test it with ``isinstance``).

    Notes
    -----
    ``pickle.load`` can execute arbitrary code: only load files written by
    this notebook.
    """
    # Load pickle file; the context manager closes the handle even on error
    try:
        with open(pkl_path, "rb") as handle:
            foo = pickle.load(handle)
    except FileNotFoundError:
        print('File not found!')
        foo = 3
    # If foo is df
    if isinstance(foo, pd.DataFrame):
        foo.columns = foo.columns.str.lower()
        # Convert to datetime type and sort by date
        foo['timestamp'] = pd.to_datetime(foo['timestamp'], utc=True)
        foo = foo.sort_values(by='timestamp').reset_index(drop=True)
        foo.info()
    else:
        print('No pickle.')
    return foo

In [15]:
def check_new_movies(foo, df_trakt):
    """Check if foo size is different than df_trakt size (new movies added).

    Parameters
    ----------
    foo : pandas.DataFrame or sentinel
        Stored movies; anything that is not a DataFrame means "nothing stored".
    df_trakt : pandas.DataFrame
        Trakt watch history with an 'id' column.

    Returns
    -------
    (data, newMovies)
        ``data`` holds the ids to query in IMDb (the last ``newMovies`` ids of
        ``df_trakt``, every id when nothing is stored, or an empty list);
        ``newMovies`` is their count and is never negative.

    The module-level flag ``get_data`` decides whether a stored frame is
    compared at all.
    """
    # Initialize data
    data = []
    newMovies = 0
    if not isinstance(foo, pd.DataFrame):
        # If foo doesn't exist: select every movie to query the IMDb API
        newMovies = len(df_trakt)
        data = df_trakt['id']
    elif get_data:
        # New movies - difference between df sizes. A stored frame longer than
        # the Trakt history means "nothing new", not a negative count.
        newMovies = max(len(df_trakt) - len(foo), 0)
        if newMovies > 0:
            # Select data to search in IMDb
            data = df_trakt.iloc[-newMovies:]['id']
    else:
        print('Do nothing.')
    return data, newMovies

In [16]:
def explode(df, lst_cols, fill_value='', preserve_index=False):
    """Give every element of the list column(s) its own row.

    # Example of preparation for "exploding"
    # df_explode_genre.Genre = df_explode_genre.Genre.str.split(',').apply(lambda x: [e.strip() for e in x])

    # https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows

    Parameters
    ----------
    df : pandas.DataFrame
    lst_cols : str or list-like of str
        Column(s) holding lists. With several columns, the lists of one row
        must all have the length of the first column's list.
    fill_value : scalar
        Value used in the list column(s) for rows whose list is empty.
    preserve_index : bool
        Keep the (repeated) original index instead of a fresh RangeIndex.

    Returns
    -------
    pandas.DataFrame
        The other columns come first, in sorted order, followed by
        ``lst_cols``; rows keep the order of the original frame.
    """
    # make sure 'lst_cols' is list-alike
    if (lst_cols is not None
        and len(lst_cols) > 0
        and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]
    # all columns except 'lst_cols'
    idx_cols = df.columns.difference(lst_cols)
    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()
    # preserve original index values
    idx = np.repeat(df.index.values, lens)
    # create "exploded" df; the lists are flattened in Python so a frame whose
    # lists are all empty does not hit np.concatenate's "need at least one
    # array" error
    repeated = {col: np.repeat(df[col].values, lens) for col in idx_cols}
    flattened = {col: [item for items in df.loc[lens > 0, col] for item in items]
                 for col in lst_cols}
    res = pd.DataFrame(repeated, index=idx).assign(**flattened)
    # append those rows that have empty lists
    if (lens == 0).any():
        # at least one list in cells is empty
        # (DataFrame.append was removed in pandas 2.0 -> pd.concat)
        res = pd.concat([res, df.loc[lens == 0, idx_cols]], sort=False).fillna(fill_value)
    # revert the original index order; a stable sort keeps the elements of
    # one list in their original order
    res = res.sort_index(kind='mergesort')
    # reset index if requested
    if not preserve_index:
        res = res.reset_index(drop=True)

    return res

In [17]:
def explode_date(df):
    """Add calendar columns derived from the 'Timestamp' column.

    The columns 'Date', 'DateYear', 'Month', 'MonthName', 'Weekday',
    'WeekdayName' and 'Day' are written to ``df`` itself, which is also
    returned.
    """
    # (new column, how to derive it from the datetime accessor), in output order
    derived = (
        ('Date', lambda when: when.date),
        ('DateYear', lambda when: when.year),
        ('Month', lambda when: when.month),
        ('MonthName', lambda when: when.month_name()),
        ('Weekday', lambda when: when.weekday),
        ('WeekdayName', lambda when: when.strftime("%A")),
        ('Day', lambda when: when.day),
    )
    for name, derive in derived:
        df[name] = derive(df['Timestamp'].dt)
    return df

In [18]:
def split_explode(df, col):
    """Split the comma-separated strings of ``col`` and explode the result.

    ``df[col]`` is replaced in place by lists of whitespace-stripped parts
    before the frame is handed to ``explode``.
    """
    def _strip_all(parts):
        return [part.strip() for part in parts]

    pieces = df[col].str.split(',')
    df[col] = pieces.apply(_strip_all)
    return explode(df, [col])

In [19]:
def total_by_year(df, all_watched=False, docs=False, add_total=False):
    """Get total movies watched by year.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'Title', 'Genre', 'Director' and 'Timestamp'
        (datetime). It is not modified.
    all_watched : bool
        Count every movie, documentaries included.
    docs : bool
        Only used when ``all_watched`` is False: count only documentaries
        (True) or everything except documentaries (False).
    add_total : bool
        Append an 'All' row with the grand total to the second result.
        Defaults to False, which keeps the frames indexed by year only.

    Returns
    -------
    (df_by_year, df_by_year_uc)
        Non-null counts of 'Title' and 'Genre' per year, and the per-year
        'Title' count under the column ('Title', 'count').
    """
    cols = ['Title','Genre','Director','Timestamp']
    df_aux = df[cols].copy()
    # Get documentaries mask; a missing genre counts as "not a documentary"
    is_documentary = df_aux['Genre'].str.contains('Documentary', na=False)
    # Select data to analyze
    if all_watched:
        selected = df_aux
    elif docs:
        selected = df_aux[is_documentary] # only documentaries
    else:
        selected = df_aux[~is_documentary] # don't include documentaries
    # assign/drop build a new frame, so no slice is modified in place
    selected = selected.assign(
        Year=selected['Timestamp'].dt.year,
        # To avoid incorrect match from movies with same title (watched in the
        # same year, like Swan Song (2021)), add 'Director' to 'Title'
        Title=selected['Title'] + ' - ' + selected['Director'],
    ).drop(columns=['Timestamp', 'Director'])
    # Group by Year
    df_by_year = selected.groupby(by='Year').count()
    # Count all movies watched ('nunique' not needed currently since the
    # IMDb API query only returns last watched date)
    df_by_year_uc = selected.groupby(by='Year').agg({'Title': ['count']})
    if add_total:
        # Add total row (DataFrame.append was removed in pandas 2.0)
        rowtotal = df_by_year_uc.sum()
        rowtotal.name = 'All'
        df_by_year_uc = pd.concat([df_by_year_uc, rowtotal.to_frame().T])
    return df_by_year, df_by_year_uc

In [20]:
def top_by_year(df, col, years, TOP=20):
    """Display the TOP values of ``col`` with one count column per year.

    For every year in ``years`` the occurrences of ``col`` are counted with
    ``counter_display``; the yearly counts are outer-merged, summed into an
    'All' column and the ``TOP`` largest rows (ties kept) are shown through
    ``show_all`` with a per-row background gradient over the year columns.
    """
    def _counts_for(year):
        # Counter of `col` for the movies watched in `year`, as ['index', year]
        watched = df[df['Timestamp'].dt.year == year]
        yearly = counter_display(watched, col, 'Title', None).reset_index()
        yearly.columns = ['index', year]
        return yearly

    for year in years:
        if year == years[0]:
            merged = _counts_for(year)
        else:
            # Merge new year counter
            merged = merged.merge(_counts_for(year), how='outer', on='index')

    # Rearrange dataframe
    ranking = merged.rename(columns={'index': col}).set_index(col)
    ranking = ranking.fillna(0).astype(int)
    # Add 'All' column with the sum over the years
    ranking.loc[:, 'All'] = ranking.sum(axis=1)
    # Show TOP rows
    ranking = ranking.nlargest(TOP, 'All', keep='all')
    year_columns = ranking.columns[:-1]
    show_all(ranking.style.background_gradient(subset=year_columns, axis=1))

In [21]:
def from_timestamp(df):
    """Extract date and time from timestamp.

    The 'timestamp' column of ``df`` is parsed as UTC and stored without
    timezone; 'date' and 'time' columns are derived from it. Unparseable or
    missing timestamps become NaT in all three columns. ``df`` is modified in
    place and returned.
    """
    # Convert to datetime and remove timezone
    stamps = pd.to_datetime(df['timestamp'], utc=True).dt.tz_localize(None)
    df['timestamp'] = stamps
    df['date'] = stamps.dt.date
    # Vectorized accessor instead of a per-row datetime.time() call: it also
    # copes with NaT entries
    df['time'] = stamps.dt.time
    return df

In [22]:
#def hard_correct_movie_data(df, loc, idmovie):
#    """Single movie hard correct."""
#    temp = []
#    response = []
#
#    idmovie = idmovie.replace(r'tt', '')
#    ia = IMDb()
#    response = ia.get_movie(idmovie, info='main')
#    temp.append({key: response.get(key) for key in self.cols})
#    correct = self.clean_df(temp)
#    df.iloc[loc, :] = correct.loc[0]
#    
#    return df

In [23]:
def skip_days(df, calendar, year):
    """Print the days by month/by year without a movie watched.

    Parameters
    ----------
    df : pandas.DataFrame
        Watch history with a datetime 'Timestamp' column; ``explode_date``
        adds its calendar columns to this frame.
    calendar : pandas.DataFrame
        One row per month with a datetime 'date' column and a 'daysinmonths'
        column. NOTE(review): the rows of one year are assumed to be ordered
        January first, because month ``m`` is looked up at position ``m - 1``.
    year : int
        Year to report.

    Counting starts at the first month of ``year`` that has a movie. Raises
    ``IndexError`` when no movie was watched in ``year``.
    """
    # Explode df
    df_explode = explode_date(df)
    # Filter by year
    df_explode = df_explode[df_explode['Timestamp'].dt.year == year]
    # Crosstab days by month
    df_explode = pd.crosstab(df_explode['Day'], [df_explode['Month'], df_explode['MonthName']])

    # Get days of complete months in year
    calyear = calendar[calendar.date.dt.year == year].reset_index(drop=True)
    days_in_months = calyear.daysinmonths
    # Complete months this year
    len_months = len(calyear)

    # Get month columns
    col_months = df_explode.columns
    months_with_movies = set(col_months.get_level_values(0))
    start_month = col_months[0][0]

    # Fill days without a movie, month by month
    no_moviedays = []
    for month in range(start_month, len_months + 1): # up to month 12
        if month in months_with_movies:
            # Days of the month with at least one movie
            moviedays = int(df_explode[month].astype(bool).to_numpy().sum())
        else:
            # A month without any movie has no crosstab column at all
            moviedays = 0
        no_moviedays.append(int(days_in_months.iloc[month - 1]) - moviedays) # -1 because index start=0

    # Percentage of days by year without a movie watched; the guard checks
    # the very sum used as divisor
    tracked_days = days_in_months.iloc[start_month - 1:].sum()
    if tracked_days != 0:
        pct_no_moviedays = int((sum(no_moviedays) / tracked_days) * 100)
    else:
        pct_no_moviedays = 0

    print(str(year))
    print('By month: ' + str(no_moviedays))
    print('Total: ' + str(sum(no_moviedays)) + '/' + str(tracked_days))
    print(str(pct_no_moviedays) + ' %')
    print('\n')

In [24]:
def crosstab_by_year(df, index=None, year=None):
    """Cross-tabulate the watched movies, optionally restricted to one year.

    ``index`` selects the rows of the table:
    - None      -> watch year x month (with margins)
    - 'Weekday' -> weekday x month (with margins)
    - 'Month'   -> month x day of month
    For a table that is not supported 'Error!' is printed and None is returned.
    """
    wanted = ['Title', 'Timestamp']

    # No year means every year
    if year is None:
        df_ = df[wanted].copy()
    else:
        df_ = df[df['Timestamp'].dt.year == year][wanted].copy()
    df_ = explode_date(df_)

    # Row levels of the table
    if index is None:
        rows = df_['DateYear']
    else:
        rows = [df_[index], df_[index + 'Name']]

    # Columns depend on the kind of request
    if index is None or index == 'Weekday':
        months = [df_['Month'], df_['MonthName']]
        return pd.crosstab(rows, months, margins=True)
    if index == 'Month':
        return pd.crosstab(rows, df_['Day'])
    print('Error!')

In [25]:
def ratio_weekend(df, year=None):
    """Print the share of movies watched on the weekend (Saturday + Sunday).

    Parameters
    ----------
    df : movies dataframe, forwarded to create_weekday_df.
    year : int or None; restrict the count to one year (None keeps all years).

    The weekday totals are re-indexed on the complete week, so a weekday on
    which no movie was watched counts as 0 instead of shifting the
    weekday/weekend split.
    """
    all_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    df_ = create_weekday_df(df, year)

    # Movies per weekday name; weekdays absent from the data are filled with 0
    per_day = df_.groupby('Weekday')['Movies'].sum().reindex(all_week, fill_value=0)

    # Weekdays vs Weekend, selected by name rather than by position
    weekdays_total = per_day.loc[all_week[:5]].sum()
    weekend_total = per_day.loc[all_week[5:]].sum()
    total = weekdays_total + weekend_total

    # Get ratio (truncated to an integer percentage); no movies at all -> 0 %
    pct_ratio = int(weekend_total / total * 100) if total else 0
    print(str(pct_ratio) + '% of the movies were watched on the weekend!')

In [26]:
def counter_display(df, by, col, TOP=None):
    """Count occurrences of the values in column `by`.

    Parameters
    ----------
    df : source dataframe.
    by : column to count. String cells are split on ', ' and every distinct
        value inside a cell is counted once for that row; other cells are
        counted as they are with value_counts().
    col : name of the single count column in the returned dataframe.
    TOP : None returns everything sorted by count (descending); an integer
        returns the TOP largest counts, ties included.

    Returns
    -------
    DataFrame indexed by value with one column named `col`.
    """
    values = df[by].dropna()
    # Decide the counting strategy from the first non-null value, so the
    # result does not depend on a random sample and empty input is accepted
    is_text = len(values) > 0 and isinstance(values.iloc[0], str)

    if is_text:
        totals = Counter()
        for cell in values:
            # set: a value repeated inside the same cell counts only once
            totals.update({part.strip() for part in cell.split(', ')})
        series = pd.Series(totals, dtype='int64')
    else:
        series = values.value_counts()

    # to_frame(name=...) always names the column `col`, whatever the series name
    df_export = series.to_frame(name=col)
    # Show everything
    if TOP is None:
        return df_export.sort_values(by=col, ascending=False)
    # Show TOP
    return df_export.nlargest(TOP, col, keep='all')

In [27]:
def get_mean_value(df, by, get_avg, TOP=None):
    """Average of `get_avg` (e.g. the movie rating) for each value of `by`.

    Cells of `by` that hold several comma separated values (examples:
    'Genre', 'Actors', etc.) are split and exploded so that every value gets
    its own row before grouping.

    Returns a dataframe indexed by `by` with the columns 'Total' (number of
    titles), 'Movies' (titles joined with '; ') and `get_avg` (the mean).
    With TOP=None everything is returned sorted by 'Total'; otherwise only
    the TOP largest 'Total' values, ties included.
    """
    def _unique_parts(cell):
        # Strip every piece and drop repeats inside one cell
        return list(set(piece.strip() for piece in cell))

    # Work on a copy so the caller's dataframe is left untouched
    exploded = df.copy()
    exploded[by] = exploded[by].str.split(', ').apply(_unique_parts)

    # One row per individual value of 'by'
    exploded = explode(exploded, [by])

    summary = exploded.groupby(by).agg({'Title': ['count', '; '.join], get_avg: 'mean'})
    summary = summary.droplevel(0, axis=1).rename(
        columns={'join': 'Movies', 'count': 'Total', 'mean': get_avg})

    if 'Rating' in get_avg:
        # ratings keep one decimal
        summary[get_avg] = summary[get_avg].round(1)
    else:
        # 'Runtime' or 'imdbVotes' are shown as integers
        summary[get_avg] = summary[get_avg].astype(int)

    if TOP is None:
        return summary.sort_values('Total', ascending=False)
    return summary.nlargest(TOP, 'Total', keep='all')

In [28]:
def director_actors(data, col, TOP):
    """Pair each value of `col` (e.g. a director) with the actors of its movies.

    Parameters
    ----------
    data : movies dataframe with the columns `col` and 'Actors'.
    col : column whose individual values are paired with actors.
    TOP : number of values to keep, taken in the order returned by
        counter_display (most frequent first).

    Returns
    -------
    DataFrame indexed by value with one column 'Actors' holding the array of
    'Actors' cells of every row whose `col` contains that value.
    """
    # Only the first TOP names are needed, so limit before looping
    names = counter_display(data, col, 'Actors', None).reset_index()['index'].head(TOP)

    pairs = {}
    for name in names:
        # regex=False: the name is matched literally, so characters such as
        # '.', '(' or '+' in a name are not interpreted as a pattern
        matches = data[col].str.contains(name, regex=False)
        # Wrapped in a list so from_dict builds a single 'Actors' column
        pairs[name] = [data.loc[matches, 'Actors'].values]

    return pd.DataFrame.from_dict(pairs, orient='index', columns=['Actors'])

In [29]:
def movies_per_day(df, year):
    """Print the average number of movies per day for `year`.

    `df` is looked up with df.loc[year]['Title'], presumably the number of
    movies watched that year (verify against the caller). The current year
    is only counted up to today; any other year uses all of its days.
    """
    watched = df.loc[year]['Title']

    # The running year stops today, a finished year stops on December 31st
    is_current_year = year == pd.Timestamp('now').year
    last_day = dt.date.today() if is_current_year else dt.date(year, 12, 31)
    first_day = dt.date(year, 1, 1)

    # +1 so that the 1st of January itself is counted
    elapsed_days = (last_day - first_day).days + 1
    per_day = watched / elapsed_days
    print(per_day.round(2), 'movies per day in', year)

In [30]:
def weighted_rating(v, R, m, C):
    """Compute the weighted rating of each movie (IMDb formula).

    Parameters
    ----------
    v : number of votes of the movie (scalar or Series).
    R : rating of the movie (scalar or Series).
    m : minimum number of votes.
    C : mean rating.

    Returns
    -------
    (v / (v + m)) * R + (m / (v + m)) * C, rounded to 3 decimals. The
    rounding is applied to the whole sum, and np.round makes it work for
    plain numbers as well as for pandas objects.
    """
    total_votes = v + m
    score = (v / total_votes) * R + (m / total_votes) * C
    return np.round(score, 3)

In [31]:
# Columns that the genre helpers below (most_watched_genre and
# most_watched_genre_combo) select from the movies dataframe
cols_genre = ['Genre','Title','imdbRating','Timestamp']

def most_watched_genre(df, TOP, year):
    """Genres with the most watched movies in `year`.

    Displays the TOP genres by number of movies (ties included) and returns
    the complete per-genre table produced by get_mean_value.
    """
    watched_in_year = df['Timestamp'].dt.year == year
    # One row per (movie, genre)
    genres = split_explode(df[watched_in_year][cols_genre].copy(), 'Genre')
    genre_stats = get_mean_value(genres, 'Genre', 'imdbRating')

    # Only the TOP rows are displayed; the full table is returned
    top_genres = genre_stats.nlargest(TOP, 'Total', keep='all')
    show_all(top_genres)
    return genre_stats

def most_watched_genre_combo(df_, TOP, year):
    """Most watched genre combinations in `year`.

    Groups on the unsplit 'Genre' string and, for every combination, reports
    the min / max / mean imdbRating, the number of titles and the titles of
    the worst and best rated movie. Displays the TOP combinations by count
    (ties included) and returns the complete table.
    """
    in_year = df_['Timestamp'].dt.year == year
    movies = df_[in_year][cols_genre].copy()
    by_combo = movies.groupby(by='Genre')

    def _titles_at(row_labels):
        # Titles of the rows with these index labels, renumbered 0..n-1 so
        # they line up with the reset index of the aggregate
        return movies.reindex(row_labels).reset_index(drop=True)['Title']

    # Rating summary and number of movies per genre combination
    stats = by_combo.agg({'imdbRating': ['min', 'max', 'mean'],
                          'Title': 'count'}).reset_index()

    # Row label of the lowest / highest rated movie of each combination
    # (a missing label is replaced by 0 before the lookup)
    worst_rows = by_combo['imdbRating'].idxmin().fillna(0).astype(int)
    stats[('Title', 'worst')] = _titles_at(worst_rows)

    best_rows = by_combo['imdbRating'].idxmax().fillna(0).astype(int)
    stats[('Title', 'best')] = _titles_at(best_rows)

    stats = stats.set_index([('Genre', '')]).round(1)
    stats.index.rename('Genre', inplace=True)

    # Only the TOP rows are displayed; the full table is returned
    show_all(stats.nlargest(TOP, [('Title', 'count')], keep='all'))
    return stats

In [32]:
def double_group_info(df, cols, TOP=3):
    """Group on two (or more) exploded columns and summarise the movies.

    Every column in `cols` is exploded with split_explode and stripped of
    surrounding spaces. For 'Country' the rows matching 'United States' or
    'United Kingdom' are removed. The result holds, per group, the number of
    titles, the titles joined with '; ' and the mean imdbRating (2 decimals),
    restricted to groups with more than one movie and to the TOP largest
    counts (ties included) inside each first-level group.
    """
    # Top 2 countries, left out of the country groups
    excluded_countries = ['United States', 'United Kingdom']
    excluded_pattern = '|'.join(excluded_countries)
    count_key = ('Title', 'count')
    rating_key = ('imdbRating', 'mean')

    for col in cols:
        df = split_explode(df, col)
        if col == 'Country':
            is_excluded = df[col].str.contains(excluded_pattern)
            df = df[~is_excluded]
        # Clean strings
        df[col] = df[col].apply(lambda text: text.strip(' '))

    grouped = df.groupby(cols).agg({'Title': ['count', '; '.join], 'imdbRating': 'mean'})
    grouped[rating_key] = grouped[rating_key].round(2)
    # Keep only the groups with more than 1 movie
    grouped = grouped[grouped[count_key] > 1]

    def _top_of_group(group):
        # Largest counts first, then keep the TOP of them (ties included)
        ordered = group.sort_values(count_key, ascending=False)
        return ordered.nlargest(TOP, count_key, keep='all')

    return grouped.groupby(level=0, group_keys=False).apply(_top_of_group)

In [33]:
def pairing(df, col, pair=2):
    """Count how often combinations of values appear together in `col`.

    Parameters
    ----------
    df : source dataframe.
    col : column whose cells hold comma separated values.
    pair : size of the combinations to count (default 2).

    Returns
    -------
    DataFrame indexed by 'Pairs' (sorted tuples of `pair` values) with one
    column 'Count' holding the number of rows containing that combination.
    The frame is empty when no row has at least `pair` distinct values.
    """
    counts = Counter()
    for cell in df[col]:
        # Distinct, stripped values of this row; sorted so that a combination
        # always has the same orientation, whatever the order in the cell
        members = sorted({part.strip() for part in cell.split(',')})
        # combinations() yields nothing for a row with fewer than `pair` values
        counts.update(combinations(members, pair))

    # Built column by column so an empty Counter still gives both columns
    df_pairs = pd.DataFrame({
        'Pairs': pd.Series(list(counts.keys()), dtype='object'),
        'Count': pd.Series(list(counts.values()), dtype='int64'),
    })
    return df_pairs.set_index('Pairs')

## Plotting

In [34]:
# Name of the Matplotlib colormap shared by the plotting functions below
# (they load it with plt.get_cmap(plot_cmap))
plot_cmap = 'YlGnBu'

In [35]:
# Date related lists
days = np.arange(1, 32, 1) # day-of-month numbers 1 to 31
weekdays = list(calendar.day_name) # weekday names provided by the calendar module
months = list(calendar.month_name)[1:] # month names; [1:] skips the empty first entry of calendar.month_name

In [36]:
def show_all(df):
    """Display the dataframe in the notebook as an HTML table built with to_html()."""
    markup = df.to_html()
    display(HTML(markup))

In [37]:
def add_labels(x,y):
    """Write every y value as a centred text label above its position in the current plot."""
    n_points = len(x)
    for position in range(n_points):
        value = y[position]
        # The label sits at height `value` and shows `value` itself
        plt.text(position, value, value, ha = 'center')

In [38]:
def add_ticks_vbarplot(x, y, ax):
    """Draw a vertical bar plot on `ax` with value labels and one tick per x.

    Parameters
    ----------
    x : bar positions; min(x)..max(x) are used as tick positions and labels.
    y : bar heights.
    ax : Matplotlib axes to draw on.

    Returns
    -------
    The same axes, for chaining.
    """
    # Plot
    bars = ax.bar(x, y, width=0.8)
    # Write the height above the centre of every bar
    for bar in bars:
        height = bar.get_height()
        label_xpos = bar.get_x() + bar.get_width() / 2
        ax.text(label_xpos, height+0.1, s=f'{height}', ha='center', va='bottom', fontdict={'fontsize':10})

    # One tick per integer between the smallest and the largest x
    tick_positions = np.arange(min(x), max(x)+1)
    ax.xaxis.set_ticks(tick_positions)
    ax.set_xticklabels(tick_positions, rotation=90)
    # Tick.label was removed in Matplotlib 3.8; tick_params is the supported
    # way to set the tick label size and also covers ticks created later
    ax.tick_params(axis='x', labelsize=10)
    ax.yaxis.get_major_locator().set_params(integer=True)
    return ax

In [39]:
def plot_watched_movies(df_all_watched):
    """Bar plot of the number of watched movies by year, with the mean as a dashed line.

    `df_all_watched` is indexed by the values shown on the x axis and has a
    ('Title', 'count') column with the number of movies.
    """
    # Select data
    x_values = df_all_watched.index
    movie_counts = df_all_watched[('Title','count')]
    # Legend text showing the (truncated) mean value
    mean_label = 'mean: ' + str(int(movie_counts.mean()))

    # Plot, one inch of width per bar
    _, ax = plt.subplots(figsize=(len(x_values),6))
    add_ticks_vbarplot(x_values, movie_counts, ax)
    ax.axhline(y=movie_counts.mean(), color='red', ls='--', label=mean_label)
    plt.title('# Watched movies by Year')
    plt.xlabel('Year')
    plt.ylabel('# Movies')
    plt.legend()
    plt.show()

In [40]:
def plot_movies_month_by_year(df_byear):
    """Stacked bar plot of the movies watched each month, one colour per year."""
    counts = crosstab_by_year(df_byear)
    # Keep only the month names as column labels and drop the margin column
    counts = counts.droplevel('Month', axis=1)
    counts = counts.drop('', axis=1)
    plot_mby = counts.T

    # Plot without the 'All' totals
    per_year = plot_mby.drop('All', axis=1)
    ax = per_year.plot(kind='bar', stacked=True, figsize=(len(plot_mby), 6))
    # NOTE(review): `years` is not defined in this function; it is read from
    # notebook-level state and selects which bar container gets value labels
    labelled = ax.containers[len(years)-2]
    ax.bar_label(labelled)
    plt.title('# Watched movies by Month by Year')
    plt.xlabel('Month')
    plt.ylabel('# Movies')
    plt.legend(title='Year')
    plt.show()

In [41]:
def plot_movies_trend(df_byear):
    """Line plot of the movies watched each month, one line per year."""
    monthly = crosstab_by_year(df_byear).droplevel('Month', axis=1)
    # Remove the 'All' margin row and the margin column
    monthly = monthly.drop('All').drop('', axis=1)
    plot_ty = monthly.T

    # Zeros in the last column become NaN so that months yet to come are
    # not drawn as a drop to 0
    last_column = plot_ty.iloc[:,-1]
    plot_ty.iloc[:,-1] = last_column.replace(0, np.nan)

    # Plot, one inch of width per month
    ax = plot_ty.plot(figsize=(len(plot_ty),6))
    tick_positions = np.arange(len(plot_ty.index))
    ax.xaxis.set_ticks(tick_positions)
    ax.xaxis.set_ticklabels(plot_ty.index)
    ax.set_xticklabels(ax.get_xticklabels(), rotation = 90)
    plt.title('# Watched movies trend by Year')
    plt.xlabel('Months')
    plt.ylabel('# Movies')
    plt.show()

In [42]:
def plot_movie_genre_trend(df_trend_genre_year):
    """Heatmap of the number of movies per genre and release year."""
    exploded = split_explode(df_trend_genre_year, 'Genre')
    per_year_genre = exploded.groupby(['Year','Genre']).agg({'Title':'count'})
    per_year_genre = per_year_genre.sort_values(['Year','Title'], ascending=False)

    # Genres as rows, release years as columns
    grid = per_year_genre.unstack().T.reset_index()
    grid = grid.set_index('Genre').drop(['level_0'], axis=1)

    # Figure size follows the table: half an inch per column / row
    width = len(grid.columns) // 2
    height = len(grid.index) // 2

    # Plot
    _, ax = plt.subplots(figsize=(width,height))
    sns.heatmap(grid, annot=True, linewidths=.5, ax=ax)
    ax.set_xticklabels(grid.columns, rotation=90)
    plt.title('Genre count by Movie Release Year')
    plt.xlabel('Release Year')
    plt.show();

In [43]:
def plot_heatmap(df, index, year):
    """Plot a heatmap with the number of movies in each cell.

    Parameters
    ----------
    df : movies dataframe, forwarded to crosstab_by_year.
    index : 'Weekday' (weekday by month) or 'Month' (day of month by month).
    year : int or None; restrict the data to one year (None keeps all years).

    Raises
    ------
    ValueError if `index` is neither 'Weekday' nor 'Month'.
    """
    if index not in ('Weekday', 'Month'):
        raise ValueError("index must be 'Weekday' or 'Month', got " + repr(index))

    DAYS = 31

    def _with_all_months(frame, axis):
        """Zero-fill the calendar months missing along `axis`, in calendar order."""
        present = list(frame.axes[axis])
        if all(name in present for name in months):
            return frame
        # Assumes the crosstab labels use the same month names as the
        # module-level `months` list; labels not found there are kept after
        # the calendar months so that no data is dropped
        extras = [label for label in present if label not in months]
        return frame.reindex(list(months) + extras, axis=axis, fill_value=0)

    # Create crosstab
    plotme = crosstab_by_year(df, index, year)
    plotme = plotme.droplevel(0, axis=0) # Remove the integer level of the row index

    # Weekday by Month
    if index == 'Weekday':
        # Drop the 'All' margin row and column
        plotme = plotme.iloc[:-1,:]
        plotme = plotme.droplevel(0, axis=1).iloc[:,:-1]
        # Missing months are matched by name, so a gap in the middle of the
        # year is filled at the right place
        plotme = _with_all_months(plotme, axis=1)

        # Set plot
        ylabel = index # for plt.ylabel()
        plt.subplots(figsize=(9,4))
        if year is not None:
            plt.title('# Movies over ' + ylabel + ' by Month (in ' + str(year) +')')
        else:
            plt.title('# Movies over  ' + ylabel + ' by Month')

    # Days by Month (whole month)
    else:
        plotme = _with_all_months(plotme, axis=0)
        # Use days as rows and months as columns
        plotme = plotme.T
        # Add missing days (pd.concat replaces DataFrame.append, removed in pandas 2.0)
        missing_days = sorted(set(range(1, DAYS+1)) - set(plotme.index))
        if missing_days:
            padding = pd.DataFrame(0, index=missing_days, columns=plotme.columns)
            plotme = pd.concat([plotme, padding])
        plotme = plotme.sort_index()

        # Set plot
        ylabel = 'Day' # for plt.ylabel()
        plt.subplots(figsize=(8,15))
        if year is not None:
            plt.title('# Movies by Day of ' + index + ' (in ' + str(year) +')')
        else:
            plt.title('# Movies by Day of ' + index)

    # Plot
    shrink = 0.7
    min_val = plotme.values.min()
    max_val = plotme.values.max()
    # Colorbar tick step grows with the range of values
    diff = max_val - min_val
    if diff < 9:
        step = 1
    elif diff < 30:
        step = 2
    else:
        step = 5
    ticks = np.arange(min_val, max_val+1, step) # colorbar ticks
    boundaries = np.arange(min_val, max_val+0.1, 0.01) # colorbar range
    sns.heatmap(plotme, annot=True, fmt="d", cbar_kws={"shrink":shrink, "ticks":ticks, "boundaries":boundaries})
    plt.xlabel('Month')
    plt.ylabel(ylabel)
    plt.show()

In [44]:
def create_weekday_df(df, year):
    """Total movies watched per weekday.

    Returns a dataframe with the columns 'Weekday' (weekday name) and
    'Movies' (total taken from the 'All' margin of the weekday crosstab).
    """
    by_weekday = crosstab_by_year(df, 'Weekday', year)
    # Keep the 'All' column, move the row levels to columns and drop the
    # last row, which is the 'All' margin row
    totals = pd.DataFrame(by_weekday[('All','')]).reset_index()
    totals = totals.iloc[:-1, :]
    # Flatten the column labels to 'Weekday', 'WeekdayName' and 'All'
    totals.columns = totals.columns.droplevel(1)

    new_names = {"WeekdayName": "Weekday", "All": "Movies"}
    return totals[['WeekdayName','All']].rename(columns=new_names)

In [45]:
def plot_weekday_bar(df, year):
    """Bar plot of the number of movies by weekday, followed by the weekend ratio.

    Parameters
    ----------
    df : movies dataframe, forwarded to create_weekday_df and ratio_weekend.
    year : int or None; restrict the data to one year (None keeps all years).
    """
    df_weekday = create_weekday_df(df, year)

    # Bar colours follow the number of movies
    values = df_weekday['Movies'].values
    cmap = plt.get_cmap(plot_cmap)
    norm = plt.Normalize(values.min(), values.max())

    # Plot; x and y are passed by keyword, as seaborn >= 0.12 requires
    _, ax = plt.subplots(figsize=(len(values),6))
    sns.barplot(x='Weekday', y='Movies', data=df_weekday, palette=cmap(norm(values)), ax=ax)
    ax.set_xticklabels(ax.get_xticklabels(), rotation = 90)
    # Head-room above the tallest bar for its value label
    ax.set(ylim=(0, values.max()+values.std()))
    ax.yaxis.get_major_locator().set_params(integer=True)

    if year is not None:
        plt.title('# Watched movies by Weekday (in ' + str(year) +')')
    else:
        plt.title('# Watched movies by Weekday')
    plt.ylabel('# Movies')

    # Value label above every bar
    for i, n in enumerate(df_weekday['Movies']):
        plt.text(i, n+0.5, n, fontdict={'fontsize':10})

    plt.show()

    # Print ratio
    ratio_weekend(df, year)

In [46]:
def plot_week_bar(df, year):
    """Heatmap strip of the movies watched by ISO week of `year`.

    Parameters
    ----------
    df : movies dataframe with the columns 'Title' and 'Timestamp'.
    year : year to plot.

    Weeks start on Monday. Days at the beginning of January that belong to
    week 52/53 of the previous year are moved to that year: they are removed
    from `year` and the matching January days of `year + 1` are added.
    """
    NWEEKS = 52

    watched = df[['Title','Timestamp']].copy()
    stamps = watched['Timestamp']
    # Series.dt.weekofyear was removed in pandas 2.0; isocalendar() replaces it
    iso_week = stamps.dt.isocalendar().week

    # January days still counted in the last week of the previous year
    tail_of_previous = (stamps.dt.month == 1) & (iso_week >= NWEEKS)
    this_year = (stamps.dt.year == year) & ~tail_of_previous
    # Only January of the next year may spill back; without the month test
    # the last days of December of year+1 would be added as well
    spill_over = (stamps.dt.year == year+1) & tail_of_previous
    keep = (this_year | spill_over).to_numpy(dtype=bool, na_value=False)

    # NOTE(review): late-December days that ISO assigns to week 1 of the
    # following year are still counted under week 1 here, as before
    selected = watched.loc[keep].assign(Weekofyear=iso_week[keep].astype(int).to_numpy())

    # Movies per week, with every week from 1 to NWEEKS present (0 if empty)
    counts = selected.groupby('Weekofyear')['Title'].count()
    all_weeks = sorted(set(range(1, NWEEKS+1)) | set(counts.index))
    counts = counts.reindex(all_weeks, fill_value=0)

    # Plot a single row labelled '# Movies' with one column per week
    plotme = counts.rename('# Movies').to_frame().T
    xsize = NWEEKS // 2
    plt.figure(figsize=(xsize, 2))
    sns.heatmap(plotme, annot=True, fmt="d", cbar_kws={"orientation": "horizontal", "pad": 0.5})
    plt.title('# Movies watched by Week of Year (in ' + str(year) +')')
    plt.xlabel('Week of Year')

    plt.show()

In [47]:
def plot_hour_month_bar(df, year):
    """Bar plot of the hours spent watching movies in each month of `year`.

    Parameters
    ----------
    df : movies dataframe with the columns 'Timestamp' and 'Runtime'
        ('Runtime' is divided by 60 to get hours).
    year : year to plot.

    Months without any movie are shown with 0 hours. They are found by name,
    so the result does not depend on the current date.
    """
    MONTHS = len(months)

    # Group by month (the month number keeps the calendar order)
    in_year = df[df['Timestamp'].dt.year==year]
    stamps = in_year['Timestamp']
    df_month = (in_year.groupby([stamps.dt.month, stamps.dt.month_name()])
                .agg({'Runtime':'sum'})
                .droplevel(0, axis=0)
                .reset_index())
    df_month['Runtime'] = (df_month['Runtime'] / 60).astype(int)
    df_month = df_month.rename(columns={'Timestamp':'Month'})

    # Append the months that are missing, then restore the calendar order
    present = set(df_month['Month'])
    missing = [name for name in months if name not in present]
    if missing:
        padding = pd.DataFrame({'Month':missing, 'Runtime':0})
        # pd.concat replaces DataFrame.append, removed in pandas 2.0
        df_month = pd.concat([df_month, padding], ignore_index=True)
        calendar_pos = {name: pos for pos, name in enumerate(months)}
        df_month = df_month.sort_values('Month', key=lambda names: names.map(calendar_pos),
                                        kind='mergesort').reset_index(drop=True)

    # Bar colours follow the number of hours
    cmap = plt.get_cmap(plot_cmap)
    norm = plt.Normalize(df_month.Runtime.min(), df_month.Runtime.max())
    values = df_month.Runtime.values

    # Plotting
    ax = plt.figure(figsize=(MONTHS,6)).gca()
    sns.barplot(x='Month', y='Runtime',data=df_month, palette=cmap(norm(values)))
    ax.set_xticklabels(ax.get_xticklabels(), rotation = 90)
    # Head-room above the tallest bar for its value label
    ax.set(ylim=(0, values.max()+values.std()))
    ax.yaxis.get_major_locator().set_params(integer=True)
    plt.title('# Hours by Month (in ' + str(year) +')')
    plt.ylabel('# Hours')

    # Value label above every bar
    for i, n in enumerate(values):
        plt.text(i, n+0.1, n, fontdict={'fontsize':10})

    plt.show()

In [48]:
def plot_box_swarm(x, y, data, order):
    """Box plot with the individual points drawn on top as a swarm.

    Parameters
    ----------
    x : name of the column on the x axis ('Year' is labelled 'Release Year').
    y : name of the column on the y axis.
    data : dataframe holding both columns.
    order : order of the x categories; its length also sets the figure width.
    """
    # Plot
    xsize = len(order) / 1.5
    _, ax = plt.subplots(figsize=(xsize, 6))
    # x and y are passed by keyword, as seaborn >= 0.12 requires, and both
    # layers are drawn on the same explicit axes
    sns.boxplot(x=x, y=y, data=data, order=order, ax=ax)
    sns.swarmplot(x=x, y=y, data=data, order=order, color=".4", ax=ax)
    # Add grid lines
    ax.yaxis.grid(True)
    ax.xaxis.grid(True)

    if x == 'Year':
        x = 'Release Year'
        ax.set_xlabel(x)

    title = y + ' by ' + x
    ax.set_title(title)
    ax.set_xticklabels(ax.get_xticklabels(), rotation = 90)

In [49]:
def plot_movie_trend(df, year=None):
    """Bar plot of the number of movies by release year.

    With a `year`, the movies watched in that year are counted per release
    'Year'. Without it, `df` is plotted as given, so it must already hold
    the columns 'Year' and 'Title' (presumably the counts; verify against
    the caller).
    """
    single_year = year is not None
    if single_year:
        watched = df[df['Timestamp'].dt.year == year]
        plot_movies_year = watched.groupby(by='Year').agg({'Title': 'count'}).reset_index().copy()
    else:
        plot_movies_year = df.copy()

    release_years = plot_movies_year['Year']
    counts = plot_movies_year['Title']

    # Plot
    # https://towardsdatascience.com/how-to-make-bar-and-hbar-charts-with-labels-using-matplotlib-b701ce70ba9c
    _, ax = plt.subplots(figsize=(30, 6))
    add_ticks_vbarplot(release_years, counts, ax)
    # Head-room above the tallest bar for its value label
    ax.set(ylim=(0, counts.max() + counts.std()))

    title = '# Movies watched by Release Year'
    if single_year:
        title += ' (in ' + str(year) +')'
    plt.title(title)

    plt.xlabel('Release Year')
    plt.ylabel('# Movies')
    plt.show()

In [50]:
def plot_previous_month_stats(df, cols, TOP=5):
    """Plot previous month analysis as a row of horizontal bar charts.

    Parameters
    ----------
    df : movies dataframe with a 'Year' column and every column in `cols`.
    cols : columns holding comma separated values; each gets one subplot
        with the share (in %) of its TOP most frequent values.
    TOP : number of values shown per subplot (default 5).

    The first subplot always shows the share of movies per release decade.
    """
    # Label of the percentage axis, also used as the name of the share column
    xlabel = '%'

    def _barh_share(share, labels, ax, title, ylabel):
        """Draw one horizontal bar chart of percentages and annotate each bar."""
        # Plain arrays and keyword arguments: seaborn >= 0.12 no longer accepts
        # positional x / y, and nothing is left to be aligned on an index
        sns.barplot(x=share.to_numpy(), y=list(labels), orient='h',
                    palette=sns.color_palette('cool'), ax=ax)
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.set_xlabel(xlabel)
        for pos, rate in enumerate(share):
            ax.text(rate+0.1, pos+0.1, str(rate), fontdict={'fontsize':12})

    # Share of movies per release decade. The table is built from the index
    # and values of value_counts(), so it does not depend on how pandas names
    # the resulting series
    decades = df['Year'].astype(int)//10*10 # convert to decade
    decade_share = decades.value_counts(normalize=True)
    df_decade = pd.DataFrame({'Decade': decade_share.index, xlabel: decade_share.to_numpy()})
    df_decade = df_decade.sort_values('Decade', ascending=False).nlargest(TOP, xlabel, keep='all')
    df_decade[xlabel] = df_decade[xlabel].mul(100).round(1)
    df_decade = df_decade.set_index('Decade')

    # Figure: one subplot for the decade plus one per column.
    # squeeze=False keeps `axes` an array even when `cols` is empty
    subplot_size = len(cols)
    fig, axes = plt.subplots(1, subplot_size+1, figsize=(max(subplot_size, 1)*8, 4), squeeze=False)
    axs = axes.ravel()

    # Plot decade (outside of cols list); labels as str so the bars keep the
    # order of the table instead of being sorted as numbers
    _barh_share(df_decade[xlabel], df_decade.index.astype(str), axs[0],
                'Release Decade' + ' (top ' + str(TOP) + ')', 'Decade')

    # Plot col subplots
    for ax, col in zip(axs[1:], cols):
        df_subplots = df[cols].copy()
        # Explode col in order to count the individual values
        df_subplots[col] = df_subplots[col].str.split(',').apply(lambda x: [e.strip() for e in x])
        df_subplots = explode(df_subplots, [col])
        # Count and convert to %
        share = df_subplots[col].value_counts(normalize=True).rename(xlabel)
        share = share.nlargest(TOP, keep='first').mul(100).round(1)
        _barh_share(share, share.index, ax, str(col) + ' (top ' + str(TOP) + ')', col)

    fig.suptitle("Previous Month Analysis")
    fig.tight_layout() # separation between subplots
    plt.show()