# MMDb Functions

## Imports

In [42]:
# standard library
import datetime as dt
import json
import os
import pickle
import re
import time
from collections import Counter
from datetime import date
from itertools import combinations

# third-party
import numpy as np
import pandas as pd
import requests
from dateutil.relativedelta import relativedelta

# https://imdbpy.readthedocs.io/en/latest/usage/data-interface.html
from imdb import IMDb

# plotting
from IPython.core.display import HTML
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

In [43]:
import warnings
warnings.filterwarnings('ignore')

## Classes

### Trakt

In [44]:
# Columns kept from the Trakt export
cols_trakt = [
    'title',
    'tag',
    'timestamp',
    'duration',
]

In [45]:
# Replace movie titles for correct querying OMDb
# Maps a title as it appears in the Trakt export (key) to the title that is
# used for the OMDb query (value). Keys are written as complete titles.
# NOTE(review): several keys contain regex metacharacters ('.', '?'); whenever
# this mapping is applied with regex=True they act as patterns, not literals.
correct_titles = {
    # good matches but appear as different movies on list comparison (vs. - vs)
    'Unbreakable Kimmy Schmidt: Kimmy vs. the Reverend':'Unbreakable Kimmy Schmidt: Kimmy vs the Reverend', 
    'The Mitchells vs. The Machines':'The Mitchells vs The Machines', 
    # mismatched titles
    'Possessor Uncut':'Possessor', 
    "Bram Stoker's Dracula":'Dracula', 
    'Star Wars: The Rise of Skywalker':'Star Wars: Episode IX - The Rise of Skywalker',
    'Pusher II: With Blood on My Hands':'Pusher II', 
    'Crip Camp: A Disability Revolution':'Crip Camp', 
    'Journey to the West: Conquering the Demons':'Journey to the West', 
    # NOTE(review): 'One Child Nation' and 'Born in China' (further down) map to
    # each other — confirm the swap is deliberate.
    'One Child Nation':'Born in China',
    'Charlie Countryman':'The Necessary Death of Charlie Countryman',  
    'Biking Borders - eine etwas andere Reise':'Biking Borders',
    'Born in China':'One Child Nation',
    
    # portuguese
    'Soldier Millions':'Hero on the Front', 

    # spanish / basque
    'Skins':'Pieles', 
    'Timecrimes':'Los cronocrímenes', 
    'Akelarre':'Coven',
    
    # french
    'Atlantics':'Atlantique', 
    'Irreversible':'Irréversible',
    'District B13':'Banlieue 13',
  
    # scandinavian
    'Nightwatch':'Nattevagten',
    
    # other european languages
    'Untamed Romania':'România neîmblânzitã', 
    'Solaris':'Solyaris',
    'Apples':'Mila',
    'Malena':'Malèna',
    
    # middle eastern
    "Where Is My Friend's House?":"Where Is the Friend's House?", 

    # asian
    'Sin-gwa ham-kke: Jwi-wa beol':'Along With the Gods: The Two Worlds', 
    'Sympathy for Lady Vengeance':'Lady Vengeance',
}

# Character-sequence substitutions that Trakt.convert_title applies to titles
# before its latin-1 -> utf-8 round trip.
# NOTE(review): the key is presumably the mis-decoded form of 'Æ' — TODO confirm.
translations = {
    'Ã†': 'Ae'
}

In [46]:
# TRAKT class
class Trakt:
    """Load and clean a Trakt watch history exported from Zenobase as JSON.

    The ``convert_*`` helpers turn the raw export columns into analysis-ready
    values: release year, cleaned title, duration in minutes and a
    timezone-naive watch timestamp.
    """

    def __init__(self):
        self.__df = pd.DataFrame()

    def load(self, filename):
        """Read a Zenobase JSON export and return its events as a DataFrame.

        Parameters
        ----------
        filename : str or path-like
            JSON file whose top-level ``events`` key holds the watch records.

        Returns
        -------
        pandas.DataFrame
            Columns ``title``, ``tag``, ``timestamp`` and ``duration``, sorted
            by ``timestamp`` and re-indexed from 0.
        """
        # Trakt connection with Zenobase (work with exported json file)
        with open(filename, 'r') as f:
            json_data = json.load(f)

        # Normalizing data. DataFrame.append was removed in pandas 2.0; since it
        # was only ever called on an empty frame, the normalised frame itself is
        # the same result.
        df = pd.json_normalize(json_data, record_path=['events'])

        # Select columns
        cols = ['resource.title', 'tag', 'timestamp', 'duration']
        return (df[cols]
                .rename(columns={'resource.title': 'title'})
                .sort_values(by='timestamp')
                .reset_index(drop=True))

    @staticmethod
    def extract_year_from_title(x):
        """Return the digits found between parentheses, e.g. '2021' from 'Dune (2021)'.

        ``x`` is a string Series; the result is the one-column DataFrame
        produced by ``Series.str.extract``.
        """
        return x.str.extract(r'\((\d+)\)')

    @staticmethod
    def convert_duration(x):
        """Convert a duration Series from milliseconds to whole minutes."""
        return pd.to_numeric(x / 1000 / 60).astype(int)

    def convert_timestamp_to_datetime(self, s):
        """Parse a timestamp Series as UTC and drop the timezone information."""
        return pd.to_datetime(s, utc=True).dt.tz_localize(None)

    def convert_timestamp(self, x):
        """Extract ``YYYY-MM-DDTHH:MM:SS`` from each string and parse it.

        Anything after the seconds (fractions, UTC offset) is discarded before
        parsing, so the wall-clock time written in the export is kept.
        """
        # https://programmersought.com/article/74164354616/
        # .iloc[:, 0] turns the one-column extract() result into a Series
        stamps = x.str.extract(r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})').iloc[:, 0]
        return self.convert_timestamp_to_datetime(stamps)

    @staticmethod
    def _fix_mojibake(title):
        """Undo a UTF-8-read-as-latin-1 mix-up; return the input if not applicable."""
        try:
            return title.encode('latin-1').decode('utf8')
        except (UnicodeEncodeError, UnicodeDecodeError, AttributeError):
            # Already correctly decoded text (or a non-string such as NaN)
            return title

    @staticmethod
    def convert_title(x):
        """Clean a title Series so it can be used for querying.

        Steps: drop parenthesised suffixes such as `` (2021)``, apply the
        ``translations`` substitutions, repair mis-decoded characters, then
        map whole titles through ``correct_titles``.
        """
        # regex=True must be explicit: from pandas 2.0 str.replace treats the
        # pattern as a literal by default and would leave the suffix in place.
        x = x.str.replace(r' \((.*?)\)', '', regex=True)
        x = x.replace(translations, regex=True)
        x = x.map(Trakt._fix_mojibake)
        # Exact (non-regex) match: the keys are complete titles, and treating
        # them as patterns mangles titles containing '?' or '.' and rewrites
        # other titles that merely contain a key as a substring.
        return x.replace(correct_titles)

    def convert_columns(self, df):
        """Convert the raw columns of ``df`` in place and return ``df``.

        Adds ``year`` and overwrites ``title``, ``duration`` and ``timestamp``.
        """
        df['year'] = self.extract_year_from_title(df.title)
        df['title'] = self.convert_title(df.title)
        df['duration'] = self.convert_duration(df.duration)
        df['timestamp'] = self.convert_timestamp(df.timestamp)

        return df

### OMDb

In [47]:
# OMDb API key: read from the OMDB_API_KEY environment variable when it is set.
# NOTE(review): the fallback is the key that used to be hard-coded here (taken
# from the DataCamp course below); prefer exporting your own key and removing it.
# https://campus.datacamp.com/courses/intermediate-importing-data-in-python/interacting-with-apis-to-import-data-from-the-web-2?ex=7
apiKey = os.environ.get('OMDB_API_KEY', '72bc447a')

# Fetch Movie Data (HTTPS so the key is not sent in clear text)
data_URL = 'https://www.omdbapi.com/?apikey=' + apiKey

In [48]:
# Titles that need query by iMDB id (generic name; hard to get correct info)
# Maps a title to the IMDb id ('tt…') that OMDB.request_db sends as the `i`
# parameter instead of searching by title.
# NOTE(review): '1984' and 'Nineteen Eighty-Four' deliberately share one id,
# it seems — confirm.
id_titles = {
    'Time':'tt11416746',
    'Little Fish':'tt9735470',
    'Ghost in the Shell':'tt0113568',
    'Suspiria':'tt1034415',
    'Patrick':'tt7618604',
    'The Climb':'tt8637440',
    'The Call':'tt10530176',
    'The Silence':'tt1361835',
    'The Man Who Saved the World':'tt2277106',
    '1984':'tt0087803',
    'Cruella':'tt3228774',
    'Saint George':'tt4895668',
    'Joy':'tt8917752',
    'Luca':'tt12801262',
    'Black Widow':'tt3480822',
    'Fear Street Part One: 1994':'tt6566576',
    'Fear Street Part Two: 1978':'tt9701940',
    'Fear Street Part Three: 1666':'tt9701942',
    'Twelve Monkeys':'tt0114746',
    'The Life Ahead':'tt10627584',
    'Vivo':'tt6338498',
    'The Suicide Squad':'tt6334354',
    'Shadow':'tt6864046',
    'Kate':'tt7737528',
    'Torment':'tt0109731',
    "Dogs Don't Wear Pants":'tt9074574',
    'Lux Aeterna':'tt10272534',
    'Limbo':'tt9138170',
    'Christine':'tt4666726',
    'Finch':'tt3420504',
    'Léon: The Professional':'tt0110413',
    'Leviathan':'tt2802154',
    'The Hunt':'tt8244784',
    'On My Skin':'tt7121252',
    'Los cronocrímenes':'tt0480669',
    'Dune':'tt0087182',
    'The Wind Rises':'tt2013293',
    'Nineteen Eighty-Four':'tt0087803',
    'Banlieue 13':'tt0414852',
    "Don't Look Up":'tt11286314',
    'Long Shot':'tt7344360',
    'Rams':'tt3296658',
    'The Wave':'tt3616916',
    'The Guilty':'tt6742252', 
    'Shorta':'tt11081050',
    'The Trip':'tt13109952',
    'La Belle Verte':'tt0115650',
    'Durante La Tormenta':'tt6908274',
    'Atlantique':'tt10199586',
    'The Necessary Death of Charlie Countryman':'tt1196948',
    'Tigers Are Not Afraid':'tt4823434',
    'Variações: Guardian Angel':'tt2155399',
    'The Bar':'tt5121816',
    'Crazy About Her':'tt11698630',
    'New Order':'tt12474056',
    "Je T'Aime, Je T'Aime":'tt0063152',
    'Mirror':'tt0072443',
    'Irréversible':'tt0290673',
    'Solyaris':'tt0069293',
    'Dr. Strangelove':'tt0057012',
    'Born in China':'tt8923482',
    'Boiling Point':'tt11127680',
    'The Innocents':'tt4028464',
    'Dracula':'tt0103874',
    'Nightmare Alley':'tt7740496'
}

# Titles (keys of id_titles) that must be looked up by IMDb id
id_titles_query = list(id_titles)

In [49]:
# Change back movie names for analysis
# original titles : translated titles
# Used by OMDB.retranslate_titles through Series.replace, i.e. a value is
# swapped only when it matches a key.
replace_titles = {
    'Dracula':"Bram Stoker's Dracula",           
    'România neîmblânzitã':'Untamed Romania',    
    'Koirat eivät käytä housuja':"Dogs Don't Wear Pants",    
    'Mila':'Apples',   
    'Hauru no ugoku shiro':"Howl's Moving Castle",
    'Gake no ue no Ponyo':'Ponyo',    
    'Kôkaku Kidôtai':'Ghost in the Shell',
}

In [50]:
# OMDb class
class OMDB:
    """Query the OMDb API and normalise the columns of its responses."""

    def __init__(self):
        self.__df = pd.DataFrame()

    def convert_to_int(self, x):
        """Replace values containing 'N/A' with 0 and cast the Series to int.

        Used for columns such as 'Year', 'Runtime', 'Metascore', 'imdbVotes'.
        """
        return x.replace('N/A', 0, regex=True).astype(int)

    @staticmethod
    def convert_imdb_rating(x):
        """Cast ratings to float; unparsable values (e.g. 'N/A') become NaN."""
        return pd.to_numeric(x, errors='coerce').astype(float)

    def convert_imdb_votes(self, x):
        """Remove thousands separators (ex: 6,000 to 6000) and cast to int."""
        return self.convert_to_int(x.astype(str).str.replace(',', '', regex=False))

    @staticmethod
    def convert_imdb_id(x):
        """Strip 'tt' so that only the numeric part of the id is left (used to join)."""
        return x.astype(str).str.replace('tt', '', regex=False)

    def convert_runtime(self, x):
        """Turn strings such as '120 min' into integer minutes ('N/A' -> 0)."""
        digits = (x.astype(str)
                   .replace('N/A', '000 min', regex=True)
                   # regex=True must be explicit: pandas >= 2.0 would otherwise
                   # look for the literal two characters backslash + 'D'.
                   .str.replace(r'\D', '', regex=True)
                   # nothing numeric left (e.g. a missing value) -> 0
                   .replace('', '0'))
        return self.convert_to_int(digits)

    @staticmethod
    def convert_timestamp(x):
        """Parse timestamps as UTC and drop the timezone information."""
        return pd.to_datetime(x, utc=True).dt.tz_localize(None)

    def convert_boxoffice(self, x):
        """Turn box office strings such as '$1,234' into integers."""
        return self.convert_to_int(x.astype(str).str.lstrip('$').str.replace(',', '', regex=False))

    @staticmethod
    def convert_country(x):
        """Shorten 'United Kingdom' to 'UK' and 'United States' to 'USA'."""
        replace_country = {
            'United Kingdom':'UK',
            'United States':'USA'
        }
        return x.replace(replace_country, regex=True)

    @staticmethod
    def convert_language(x):
        """Normalise sign language names to their '... Sign Language' form.

        The negative lookahead leaves names that already end in 'Language'
        untouched, so 'Korean Sign Language' no longer turns into
        'Korean Sign Language Language'.
        """
        return x.replace(r'(American|Korean) Sign(?! Language)', r'\1 Sign Language', regex=True)

    def convert_columns(self, df):
        """Convert the OMDb columns of ``df`` in place and return ``df``."""
        df['Runtime'] = self.convert_runtime(df.Runtime)
        df['Language'] = self.convert_language(df.Language)
        df['Country'] = self.convert_country(df.Country)
        df['Metascore'] = self.convert_to_int(df.Metascore)
        df['imdbRating'] = self.convert_imdb_rating(df.imdbRating)
        df['imdbVotes'] = self.convert_imdb_votes(df.imdbVotes)
        df['imdbID'] = self.convert_imdb_id(df.imdbID)
        df['Timestamp'] = self.convert_timestamp(df.Timestamp)
        return df

    @staticmethod
    def retranslate_titles(x):
        """Change movie names back (see ``replace_titles``) for analysis."""
        return x.replace(replace_titles)

    def request_db(self, df, timeout=30):
        """Query OMDb (https://www.omdbapi.com/) once per row of ``df``.

        Titles listed in ``id_titles`` are requested by IMDb id, every other
        title by name.

        Parameters
        ----------
        df : pandas.DataFrame
            Needs a ``title`` column. NOTE: its index is reset in place (the
            old index is kept as a column), as the original code did.
        timeout : float, default 30
            Seconds to wait for each response before ``requests`` raises.

        Returns
        -------
        list of dict
            The decoded JSON response for each row, in row order.
        """
        df.reset_index(inplace=True) # to add timestamp later
        # titles of this batch that need an id query
        by_id = set(id_titles_query).intersection(df.title)

        movies = []
        for movieTitle in df['title']:
            if movieTitle in by_id:
                params = {
                    'type':'movie',
                    'i':id_titles.get(movieTitle)
                }
            else:
                params = {
                    't':movieTitle,
                    'type':'movie',
                }
            # without a timeout a stalled connection would block forever
            movies.append(requests.get(data_URL, params=params, timeout=timeout).json())

        return movies

In [51]:
# Test queries: flip `testing` to True to send one request to OMDb and print the answer
testing = False

if testing:
    # single movie request
    stitle = "Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb" #"Kingdom: Ashin of the North"
    idd = 'tt3420504'

    params = {
        't':stitle,
        'type':'movie',
        #'y': 2021 #syear
        #'i':idd
    }

    responseTest = requests.get(data_URL, params=params, timeout=30).json()
    # A bare expression inside an `if` block is not echoed by the notebook,
    # so the response has to be printed explicitly.
    print(json.dumps(responseTest, indent=2))

### IMDb

In [52]:
# Fields requested from IMDb, in the column order of the resulting DataFrame
cols_imdb = [
    'title', 'year', 'imdbID',
    'runtimes', 'genres',
    'director', 'writer', 'cinematographers', 'cast',
    'countries', 'languages',
    'rating', 'votes',
    'plot outline', 'production companies',
]

In [53]:
# IMDb class
class IMDB:
    """Fetch movie details through IMDbPY and shape them into a DataFrame."""

    def __init__(self):
        self.__df = pd.DataFrame(columns=cols_imdb)

        self.__cols_int = ['year','runtimes','votes']
        self.__cols_str = ['imdbID','genres','director','writer','cinematographers','cast','countries','languages','production companies']

    def load(self, filename):
        """Load a CSV file into a DataFrame."""
        return pd.read_csv(filename)

    def convert_columns(self, df):
        """Cast columns of ``df`` in place and return ``df``.

        Integer columns get 0 for missing values, string columns get 'N/A'.
        """
        df[self.__cols_int] = df[self.__cols_int].fillna(0).astype(int)
        df[self.__cols_str] = df[self.__cols_str].fillna('N/A').astype(str)

        return df

    @staticmethod
    def get_info_list(x, cols, d):
        """Copy ``x[col]`` into ``d`` for every ``col``, filling missing values.

        A value of None becomes 0 for 'rating'/'votes', 'N/A' for
        'plot outline' and ['N/A'] for every other column.

        x - mapping with the movie data; cols - keys to copy; d - dict that is
        updated and returned.
        """
        for col in cols:
            value = x[col]
            if value is not None:
                d[col] = value
            elif col in ('rating', 'votes'):
                d[col] = 0
            elif col == 'plot outline':
                d[col] = 'N/A'
            else:
                d[col] = ['N/A']
        return d

    @staticmethod
    def get_name_list(x, cols, d):
        """Copy the list ``x[col]`` into ``d`` for every ``col``, dropping None items.

        A missing list becomes ['N/A']. For 'writer' the list is additionally
        stripped of falsy items and de-duplicated (the same person sometimes
        appears several times); first-seen order is kept so the result is
        reproducible between runs.

        x - mapping with the movie data; cols - keys to copy; d - dict that is
        updated and returned.
        """
        for col in cols:
            items = x[col]
            if items is None:
                d[col] = ['N/A']
                continue

            present = [item for item in items if item is not None]
            if col == 'writer':
                d[col] = list(dict.fromkeys(item for item in present if item))
            else:
                d[col] = present
        return d

    def clean_df(self, movies):
        """Turn the records returned by ``request_df`` into a DataFrame.

        Parameters
        ----------
        movies : list of dict
            One mapping per movie with (at least) the keys in ``cols_imdb``.

        Returns
        -------
        pandas.DataFrame
            Columns ``cols_imdb``; list-valued columns (all but 'rating',
            'votes' and 'plot outline') are joined into ', '-separated strings.
        """
        cols_info = ['runtimes','genres','countries','languages','rating','votes','plot outline']
        cols_name = ['director','writer','cinematographers','cast','production companies']

        # Collect the rows first and build the frame once; growing a DataFrame
        # one row at a time copies it on every insertion.
        rows = []
        for movie in movies:
            info = self.get_info_list(movie, cols_info, {})
            names = self.get_name_list(movie, cols_name, {})

            # organize columns according to cols_imdb
            rows.append([movie['title'], movie['year'], movie['imdbID'],
                         info['runtimes'], info['genres'],
                         names['director'], names['writer'],
                         names['cinematographers'], names['cast'],
                         info['countries'], info['languages'],
                         info['rating'], info['votes'],
                         info['plot outline'], names['production companies']])

        df = pd.DataFrame(rows, columns=cols_imdb)

        df = list_to_string(df, cols_info[:-3])
        df = list_to_string(df, cols_name)

        return df

    def add_movies(self, df_trakt, df):
        """Return 'Title (year)' strings for Trakt movies not yet in ``df``.

        The number of new movies is the difference between the last valid
        index of ``df_trakt['title']`` and of ``df['og']``; an empty list is
        returned when that difference is 0.
        """
        titles = df_trakt['title']
        add_titles = []

        # index differences (number of movies added)
        idxdif = titles.last_valid_index() - df['og'].last_valid_index()

        if idxdif != 0:
            add_titles = titles.iloc[-idxdif:].values + ' (' + df_trakt.iloc[-idxdif:]['year'].values + ')'

        return add_titles

    def request_df(self, movies, ids=False):
        """Fetch the ``cols_imdb`` fields of every movie from IMDb (https://www.imdb.com/).

        Parameters
        ----------
        movies : sequence
            'Title (year)' search strings, or IMDb ids when ``ids`` is true.
        ids : bool, default False
            Whether ``movies`` holds ids (no search / verification needed).

        Returns
        -------
        list of dict
            One ``{column: value}`` mapping per movie; missing fields are None.

        Raises
        ------
        IndexError
            If neither the 'Title (year)' search nor the title-only search
            returns a result.
        """
        all_movies = []

        # create an instance of the IMDb class
        ia = IMDb()

        for query in movies:
            if not ids:
                # better to search by title + year : get first result
                search = ia.search_movie(query) # Dune case (multiple movies)
                temp_title = query.split(' (')[0]

                # empty result, or first hit does not start with the movie name
                if (search == []) or (not search[0]['title'].startswith(temp_title)):
                    search = ia.search_movie(temp_title)

                if not search:
                    raise IndexError('IMDb search returned no results for %r' % (query,))

                # Select first hit and fetch its main info into the object
                temp = search[0]
                ia.update(temp, info='main', override=1)
            else:
                # searching by imdbID: verification is not needed
                temp = ia.get_movie(query, info='main')

            # gather data to export
            all_movies.append({key: temp.get(key) for key in cols_imdb})

        return all_movies

## General

In [54]:
# Convert column of lists into column of strings
def list_to_string(df, cols):
    """Join every list cell of the given columns into a ', '-separated string.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place.
    cols : sequence of str
        Columns whose cells are iterables (each item is passed through str()).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for col in cols:
        # map() is explicitly element-wise. Series.agg with a non-reducing
        # callable is deprecated and is slated to call the function on the whole
        # column, which would merge every row into one single string.
        df[col] = df[col].map(lambda values: ', '.join(map(str, values)))

    return df

In [55]:
# Get total movies watched by Year
def totals_by_year(df, docs=False, add_total=False):
    """Count watched movies per year of ``Date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``Title``, ``Genre`` and a datetime ``Date`` column. Not modified.
    docs : bool, default False
        True keeps only rows whose genre contains 'Documentary'; False keeps
        only the other rows (rows without a genre count as non-documentary).
    add_total : bool, default False
        Append an 'All' row with the column sums to the second result.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``df_by_year``: non-null count of every column per year;
        ``df_by_year_uc``: unique and total number of titles per year.
    """
    # na=False: a missing genre would otherwise put NaN into the boolean mask
    is_doc = df.Genre.str.contains('Documentary', na=False)
    selected = df[is_doc] if docs else df[~is_doc]

    # assign()/drop() return new frames, so neither the caller's frame nor a
    # filtered view of it is mutated.
    df_ = selected.assign(Year=selected.Date.dt.year).drop(columns='Date')

    # Group by Year
    df_by_year = df_.groupby(by='Year').count()
    # Unique and count all movies watched
    df_by_year_uc = df_.groupby(by='Year').agg({'Title':['nunique','count']})

    # Add total row. The original called DataFrame.append and discarded the
    # result, so the row never made it into the output; it is therefore opt-in.
    if add_total:
        rowtotal = df_by_year_uc.sum()
        rowtotal.name = 'All'
        df_by_year_uc = pd.concat([df_by_year_uc, rowtotal.to_frame().T]).rename_axis('Year')

    return df_by_year, df_by_year_uc

In [56]:
# Check if size is different (new movies viewed)
# last exported file size (new json file from zenobase)
def check_new_movies(foo, df):
    """Work out which Trakt rows still have to be requested from OMDb.

    Parameters
    ----------
    foo : pandas.DataFrame or anything else
        Previously stored result (needs a ``Title`` column); any
        non-DataFrame value means "nothing stored yet".
    df : pandas.DataFrame
        Current Trakt data with ``title`` and ``year`` columns.

    Returns
    -------
    (data, newMovies)
        ``data`` is ``df[['title', 'year']]`` restricted to the rows to query
        (all rows when nothing is stored), or 0 when both frames have the same
        length. ``newMovies`` is ``len(df) - len(foo)`` (0 in the other cases).
    """
    newMovies = 0
    cols = ['title','year']

    if not isinstance(foo, pd.DataFrame):
        print('Get everything!')
        ## Select data to search in OMDB
        return df[cols], newMovies

    if len(foo) == len(df):
        print('Do nothing.')
        return 0, newMovies

    newMovies = len(df.title) - len(foo.Title)

    # titles (lower case) present in the new export but not in the stored data
    titles_lower = df['title'].str.lower()
    diffMovies = set(titles_lower) - set(foo.Title.str.lower())

    # Titles are escaped because they are data, not patterns: characters such
    # as '?', '(' or '+' would otherwise change or break the match.
    # NOTE: with no differing title (e.g. only re-watches) the pattern is empty
    # and matches every row, exactly as before.
    pattern = '|'.join(re.escape(t) for t in sorted(t for t in diffMovies if isinstance(t, str)))
    is_new = titles_lower.str.contains(pattern, regex=True, na=False)

    ## Select data to search in OMDB
    data = df.loc[is_new, cols]

    return data, newMovies

In [57]:
# Check new data with existing df from pickle file
def _add_date_and_time(frame):
    """Derive the 'Date' (midnight datetime) and 'Time' columns from 'Timestamp'."""
    frame['Date'] = pd.to_datetime(frame.Timestamp.dt.date)
    # extracting time from timestamp
    frame['Time'] = frame.Timestamp.dt.time
    return frame


def look_iama_pickle(foo, df_trakt, movies, newMovies, index, forceRequest):
    """Combine freshly requested movies with the stored (pickled) DataFrame.

    Parameters
    ----------
    foo : pandas.DataFrame or anything else
        Stored data; a non-DataFrame value means there is none.
    df_trakt : pandas.DataFrame
        Trakt data providing the ``timestamp`` column.
    movies : list of dict
        OMDb responses.
    newMovies : int
        Number of movies added since ``foo`` was stored.
    index : sequence
        Positions into ``df_trakt``; only the last ``newMovies`` are used.
    forceRequest : bool
        True rebuilds everything from ``movies`` and ignores ``foo``.

    Returns
    -------
    pandas.DataFrame
        * no stored data or ``forceRequest``: ``movies`` plus Timestamp
          (UTC-aware), Date and Time;
        * stored data, nothing new: a copy of ``foo`` with refreshed Timestamp;
        * stored data, new movies: ``foo`` followed by the last ``newMovies``
          entries of ``movies`` (original row labels are kept).
    """
    has_pickle = isinstance(foo, pd.DataFrame)

    if not has_pickle or forceRequest:
        # use OMDB query result and convert to dataframe
        df = pd.DataFrame(movies)
        df['Timestamp'] = pd.Series(df_trakt['timestamp'])
        df['Timestamp'] = pd.to_datetime(df.Timestamp, utc=True)
        return _add_date_and_time(df)

    if newMovies == 0:
        # copy foo if pickle exists and there's no new movies
        df = foo.copy()
        df['Timestamp'] = pd.Series(df_trakt['timestamp'])
        return df

    index = index[-newMovies:]
    df_movies = pd.DataFrame(movies).tail(newMovies).reset_index(drop=True)
    df_movies['Timestamp'] = pd.Series(df_trakt.iloc[index].reset_index()['timestamp'])
    df_movies = _add_date_and_time(df_movies)

    # DataFrame.append was removed in pandas 2.0; concat is the replacement
    return pd.concat([foo, df_movies])

In [58]:
# Update movie information on pickle file
def correct_pickle(df, cols, pair2change, timeout=30):
    """Re-download one movie from OMDb by IMDb id and overwrite its stored row(s).

    Parameters
    ----------
    df : pandas.DataFrame
        Stored movie data; the matching rows are updated in place.
    cols : list of str
        Output columns. All but the last three are treated as OMDb columns
        and are the ones overwritten.
    pair2change : sequence
        ``(title, imdb_id)``: rows whose ``Title`` contains ``title`` (a
        ``Series.str.contains`` pattern) are replaced by the data of ``imdb_id``.
    timeout : float, default 30
        Seconds to wait for the OMDb response.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to ``cols``.

    Raises
    ------
    ValueError
        If OMDb answers with an error payload.
    """
    title2change = pair2change[0]
    id2change = pair2change[1]

    movieCols = cols[:-3]
    movie_change = df[df.Title.str.contains(title2change)][movieCols]
    movieId = movie_change.index

    # output: rows before the change
    show_all(movie_change)

    ## Connect to OMDB
    # single movie request
    params = {
        #'t':title2change,
        'type':'movie',
        'i':id2change
    }

    responseTest = requests.get(data_URL, params=params, timeout=timeout).json()
    if responseTest.get('Response') == 'False':
        raise ValueError('OMDb request for %r failed: %s' % (id2change, responseTest.get('Error')))

    # replace movie
    # Wrapping the response in a list always yields exactly one row. Passing
    # the dict directly creates one row per entry of its 'Ratings' list, which
    # is zero rows when that list is empty.
    add_movie = pd.DataFrame([responseTest])
    show_all(add_movie)
    df.loc[movieId, movieCols] = add_movie[movieCols].values

    df = df[cols]

    # output: rows after the change
    show_all(df.loc[movieId, movieCols])

    return df

In [59]:
# Example of preparation for "exploding"
# Convert string with multiple values to array of values
# df_explode_genre.Genre = df_explode_genre.Genre.str.split(',').apply(lambda x: [e.strip() for e in x])

# https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows
def explode(df, lst_cols, fill_value='', preserve_index=False):
    """Give every element of the list cells in ``lst_cols`` its own row.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame (not modified).
    lst_cols : str or list-like of str
        Column(s) holding lists. With several columns, the lists of one row
        must have the same length; the first column determines the lengths.
    fill_value : scalar, default ''
        Value used in ``lst_cols`` for rows whose list is empty.
    preserve_index : bool, default False
        Keep the (repeated) original index instead of a fresh 0..n-1 index.

    Returns
    -------
    pandas.DataFrame
        Exploded frame; the other columns are repeated for each element.
    """
    # make sure `lst_cols` is list-alike
    if (lst_cols is not None
        and len(lst_cols) > 0
        and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)
    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()
    # preserve original index values
    idx = np.repeat(df.index.values, lens)

    # flatten the non-empty lists of every list column
    exploded = {}
    for col in lst_cols:
        cells = df.loc[lens > 0, col].values
        # np.concatenate raises on an empty sequence (every list empty)
        exploded[col] = np.concatenate(cells) if len(cells) else np.array([], dtype=object)

    # create "exploded" DF
    res = (pd.DataFrame({col: np.repeat(df[col].values, lens) for col in idx_cols},
                        index=idx)
             .assign(**exploded))
    # append those rows that have empty lists
    if (lens == 0).any():
        # DataFrame.append was removed in pandas 2.0; concat is the replacement
        res = (pd.concat([res, df.loc[lens == 0, idx_cols]], sort=False)
                 .fillna(fill_value))
    # revert the original index order; a stable sort keeps the elements of one
    # list in their original order (the index is duplicated for those rows)
    res = res.sort_index(kind='mergesort')
    # reset index if requested
    if not preserve_index:
        res = res.reset_index(drop=True)

    return res

In [60]:
# Explode date into multiple columns
def explode_date(df):
    """Add the calendar parts of ``df.Date`` as columns (in place) and return ``df``.

    New columns, in this order: DateYear, Month, MonthName, Weekday,
    WeekdayName, Day. The Date column itself is kept.
    """
    when = df.Date.dt
    parts = {
        'DateYear': when.year,
        'Month': when.month,
        'MonthName': when.month_name(),
        'Weekday': when.weekday,
        'WeekdayName': when.strftime("%A"),
        'Day': when.day,
    }
    for column, values in parts.items():
        df[column] = values

    return df

In [61]:
# Print how many days of a year had no movie
def skip_days(df, calendar, year):
    """Print, per month and in total, the number of days without a movie.

    Parameters
    ----------
    df : pandas.DataFrame
        Watched movies with a datetime ``Date`` column. The date-part columns
        of ``explode_date`` are added to it in place.
    calendar : pandas.DataFrame
        One row per month with a datetime ``date`` column and the month
        length in ``daysinmonths``.
    year : int
        Year to report.

    Prints the year, the list of movie-free days per calendar row, their
    total and the percentage of movie-free days. Returns None.
    """
    # explode df
    df_explode = explode_date(df)
    df_year = df_explode[df_explode.Date.dt.year == year]

    calyear = calendar[calendar.date.dt.year == year].reset_index(drop=True)
    daysinmonths = calyear.daysinmonths

    # Distinct days with at least one movie, keyed by month number. Looking the
    # months up by number (instead of by position) keeps months without any
    # movie from shifting every following month or running past the last one.
    moviedays = (df_year.groupby('Month')['Day'].nunique()
                        .reindex(calyear.date.dt.month.to_numpy(), fill_value=0))

    no_moviedays = [int(total - seen) for total, seen in zip(daysinmonths, moviedays)]

    if daysinmonths.sum() != 0:
        pct_no_moviedays = int((sum(no_moviedays) / daysinmonths.sum()) * 100)
    else:
        pct_no_moviedays = 0

    print(str(year))
    print('By month: ' + str(no_moviedays))
    print('Total: ' + str(sum(no_moviedays)))
    print(str(pct_no_moviedays) + ' %')
    print('\n')

In [62]:
# Create crosstab filtering by year
def crosstab_by_year(df, index=None, year=None):
    """Cross-tabulate watched movies by date parts.

    ``year=None`` uses every year, otherwise only rows watched in ``year``.
    ``index`` selects the rows of the table:
    None -> watch year x month (with margins),
    'Weekday' -> weekday x month (with margins),
    'Month' -> month x day of month.
    """
    wanted = ['Title','Date']

    # work on a copy so the date-part columns are not added to the caller's frame
    if year is None:
        subset = df[wanted].copy()
    else:
        subset = df[df.Date.dt.year == year][wanted].copy()
    subset = explode_date(subset)

    if index is None:
        # Movies by month/year
        rows = subset['DateYear']
    else:
        rows = [subset[index], subset[index+'Name']]

    if index == 'Month':
        return pd.crosstab(rows, subset['Day'])

    return pd.crosstab(rows, [subset['Month'], subset['MonthName']], margins=True)

In [63]:
# Movie ratio by movies watched on weekends vs movies watched on weekdays
def ratio_weekend(df, year=None):    
    """Print the share of movies that were watched on the weekend.

    ``df`` and ``year`` are passed straight to ``create_weekday_df``.
    NOTE(review): that helper is not visible here; the code below assumes it
    returns a frame with a 'Weekday' column holding the seven day names in
    order Monday..Sunday and a 'Movies' count column — TODO confirm.

    Prints '<pct>% of the movies were watched on the weekend!' and returns None.
    """
    df_ = create_weekday_df(df, year)

    # Create Weekdays df with weekdays' names
    # (all-zero 'Movies' frame with one row per entry of df_.Weekday)
    allweek = df_.Weekday.values #['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']    
    df_week = pd.DataFrame(0, index=allweek, columns=['Movies'])    
    df_week.reset_index(inplace=True)
    df_week.rename(columns={"index": "Weekday"}, inplace=True)

    # Concatenate dfs
    # Summing per weekday merges the zero rows with the real counts; .loc[allweek]
    # restores the order of `allweek` (groupby sorts the names alphabetically).
    df_concat = pd.concat([df_week, df_]).groupby(['Weekday']).sum().reset_index()
    df_concat = df_concat.set_index('Weekday').loc[allweek]

    # Weekdays vs Weekend
    # The first row ('Monday') accumulates the weekday total and the last row
    # ('Sunday') the weekend total; the rows in between are dropped afterwards.
    dropdays = allweek[1:-1] # ['Tuesday','Wednesday','Thursday','Friday','Saturday']
    # weekdays sum: add positions 1-4 to Monday
    df_concat.loc['Monday'] += df_concat.iloc[1:5].sum()
    # weekend sum: add position 5 to Sunday
    df_concat.loc['Sunday'] += df_concat.iloc[5].sum()
    # drop the rows that were folded into Monday / Sunday
    df_concat.drop(dropdays, inplace=True)
    # rename the two remaining rows
    df_concat.rename(index={'Monday': 'Weekdays'}, inplace=True)
    df_concat.rename(index={'Sunday': 'Weekend'}, inplace=True)
    
    # Get ratio
    # position 0 is 'Weekdays', position 1 is 'Weekend'; .astype(int) truncates
    ratio = df_concat.reset_index()['Movies']
    pct_ratio = ((ratio.iloc[1] / (ratio.iloc[0] + ratio.iloc[1])) * 100).astype(int)    
    print(str(pct_ratio) + '% of the movies were watched on the weekend!')

In [None]:
# Count occurrences of string
def counter_display(df, by, col, top=None):
    """Count how often each value occurs in column ``by``.

    String cells are split on ', ' and every part is counted separately;
    any other kind of value is counted as-is with ``value_counts``.

    Parameters
    ----------
    df : pandas.DataFrame
    by : str
        Column to count.
    col : str
        Name of the count column in the result.
    top : int, optional
        Keep only the ``top`` largest counts (ties included); None keeps
        everything, sorted by descending count.

    Returns
    -------
    pandas.DataFrame
        Indexed by value, with the single column ``col``. Empty when ``by``
        holds no non-missing value.
    """
    values = df[by].dropna()
    if values.empty:
        return pd.DataFrame(columns=[col])

    # Check type on the first real value: a random sample made the chosen branch
    # depend on chance whenever the column mixes strings and missing values.
    if isinstance(values.iloc[0], str):
        series = pd.Series(Counter(part.strip()
                                   for cell in values
                                   for part in cell.split(', ')))
    else:
        series = df[by].value_counts()

    # Organize df to export. to_frame(name=...) renames the counts; passing a
    # *named* Series together with `columns=` selects by name instead and
    # produced an all-NaN column for value_counts() results.
    df_export = series.to_frame(name=col)

    if top is None:
        # Everything
        df_export = df_export.sort_values(by=col, ascending=False)
    else:
        # Show top
        df_export = df_export.nlargest(top, col, keep='all')

    return df_export

In [None]:
# Average movie rating by column
# If column values have multiple substrings in a string (examples: 'Genre', 'Actors', etc) use explode() to separate all values
def get_mean_value(df, by, get_avg, top=None):
    """Average ``get_avg`` for every individual value found in column ``by``.

    Each ``by`` cell is a ', '-separated string; it is split, de-duplicated
    within the row, and the frame is exploded so every value has its own row.

    Returns a DataFrame indexed by value with the columns 'Titles' (list of
    titles), 'Total' (number of titles) and ``get_avg`` (mean, rounded to two
    decimals when the name contains 'Rating', truncated to int otherwise).
    ``top=None`` returns everything sorted by 'Total'; otherwise only the
    ``top`` largest totals (ties included).
    """
    # one list of unique, stripped values per row
    work = df.copy()
    work[by] = (work[by].str.split(', ')
                        .apply(lambda parts: [part.strip() for part in parts])
                        .map(lambda parts: list(set(parts))))
    # one row per value
    work = explode(work, [by])

    summary = work.groupby(by).agg({'Title':[list,'count'], get_avg:'mean'})
    summary = summary.droplevel(0, axis=1)
    summary = summary.rename(columns={'list':'Titles', 'count':'Total', 'mean':get_avg})

    if 'Rating' in get_avg:
        # 'imdbRating'
        summary[get_avg] = summary[get_avg].round(2)
    else:
        # 'Runtime' or 'imdbVotes'
        summary[get_avg] = summary[get_avg].astype(int)

    if top is None:
        # Everything
        return summary.sort_values('Total', ascending=False)

    # Show top
    return summary.nlargest(top, 'Total', keep='all')

In [66]:
# Movies by actor
def count_actor(series, top):
    """For every entry of ``series`` return its ``top`` most frequent names.

    Each entry is a list of strings; the strings are joined, split on ','
    and stripped before counting. The result is a Series holding, per entry,
    a list of ``(name, count)`` tuples.
    """
    tallies = [
        Counter(name.strip() for name in ', '.join(entry).split(',')).most_common(top)
        for entry in series
    ]

    return pd.Series(tallies)

In [67]:
# Pair director - actor
def director_actors(data, col, top):
    """Collect, for each value counted in ``col``, the 'Actors' of its movies.

    The values come from ``counter_display`` (most frequent first); a movie
    belongs to a value when its ``col`` string contains that value. Returns
    the first ``top`` values as a DataFrame with one 'Actors' cell (an array
    of actor strings) per value.
    """
    names = counter_display(data, col, 'Actors', None).reset_index()['index']

    actors_by_name = {
        name: [data[data[col].str.contains(name)]['Actors'].values]
        for name in names
    }

    return pd.DataFrame.from_dict(actors_by_name, orient='index', columns=['Actors']).head(top)

In [68]:
# Average number of movies watched per day of a year
def movies_per_day(df, year):
    """Print the movies-per-day average of ``year``.

    ``df`` is indexed by year and has a 'Title' column with the number of
    movies. For the current year the days up to today are counted, for any
    other year the days up to 31 December (1 January included).
    """
    watched = df.loc[year]['Title']

    is_current_year = year == pd.Timestamp('now').year
    last_day = date.today() if is_current_year else date(year, 12, 31)

    # +1 so that the 1st of January itself is counted
    elapsed_days = (last_day - date(year, 1, 1)).days + 1

    moviesPerDay = watched / elapsed_days
    print(moviesPerDay.round(2), 'movies per day in', year)

In [69]:
# Character-based similarity of two strings
def getRatio(a, b):
    """Return how similar two strings are, from 0 (no shared character) to 100.

    The comparison is case-insensitive and looks only at which characters
    occur how often, not at their order, so anagrams score 100.

    Two empty strings are identical and score 100.0 (previously this case
    divided by zero).
    """
    a = a.lower()
    b = b.lower()
    total = len(a) + len(b)
    if total == 0:
        return 100.0

    count_a, count_b = Counter(a), Counter(b)
    # characters that appear more often in one string than in the other
    counts = (count_a - count_b) + (count_b - count_a)
    return 100 - 100 * sum(counts.values()) / total

In [70]:
# https://stackoverflow.com/questions/56603572/grouping-similar-strings-together-from-a-list

#data = ['MONTREAL EDUCATION BOARD', 'Ile de Montreal', 'Montreal',
#       'Ville de Montreal', 'MONTREAL CITY', 'Monrteal', 'Mont-real',
#       'Toronto', 'Toronto city', 'Tornoto', 'What is this', 'Bananasplit',
#       'Banana', 'StLouis', 'St Louis', 'Saint Louis']

def pair_groups(data, threshold, minGroupSize):
    """Group strings that are all similar to one another.

    Two strings are paired when ``getRatio`` is at least ``threshold``. The
    largest set of still-ungrouped strings that are pairwise paired is taken
    out repeatedly until the best remaining set is smaller than
    ``minGroupSize``.

    Parameters
    ----------
    data : sequence of str
    threshold : number
        Minimum ``getRatio`` score for two strings to be paired.
    minGroupSize : int
        Stop as soon as the best remaining group is smaller than this.

    Returns
    -------
    list of set
        Groups, largest first. Ties are resolved in favour of the string that
        comes first in ``data``, so equal input always yields equal output.
    """
    paired = {c: {c} for c in data}
    for a, b in combinations(data, 2):
        if getRatio(a, b) < threshold:
            continue
        paired[a].add(b)
        paired[b].add(a)

    groups = []
    ungrouped = set(data)
    while ungrouped:
        bestGroup = set()
        # `paired` keeps the order of `data`; iterating the `ungrouped` set
        # directly would make the winner among equally large groups depend on
        # string hashing, which changes from run to run.
        for candidate in paired:
            if candidate not in ungrouped:
                continue
            g = paired[candidate] & ungrouped
            # keep only members that are paired with every other member
            for c in g.copy():
                g &= paired[c]
            if len(g) > len(bestGroup):
                bestGroup = g
        if len(bestGroup) < minGroupSize:
            break  # to terminate grouping early change minGroupSize to 3
        ungrouped -= bestGroup
        groups.append(bestGroup)

    return groups

In [71]:
# Function that computes the weighted rating of each movie
def weighted_rating(v, R, m, C):
    """Compute the IMDb-style weighted rating.

    ``WR = v / (v + m) * R + m / (v + m) * C``

    Parameters
    ----------
    v : number or array-like
        Number of votes.
    R : number or array-like
        Rating.
    m : number
        Minimum number of votes.
    C : number
        Mean rating.

    Returns
    -------
    Same kind as the inputs (NumPy scalar, array or pandas Series)
        The weighted rating, rounded to 3 decimals.
    """
    score = (v / (v + m) * R) + (m / (m + v) * C)

    # Round the complete score (not just one term); np.round also accepts
    # plain Python numbers, which have no .round() method.
    return np.round(score, 3)

In [72]:
#
def most_watched_genre(df, top, year):
    """Display and return per-genre statistics for the movies watched in ``year``.

    The comma-separated ``Genre`` string of every movie is split so that each
    movie contributes one row per individual genre; the rows are then handed
    to ``get_mean_value`` (grouping on ``Genre``, value ``imdbRating``).

    The ``top`` rows by ``Total`` (ties included) are displayed with
    ``show_all``; the complete statistics frame is returned.
    """
    wanted = ['Genre', 'Title', 'imdbRating', 'Date']
    watched = df[df.Date.dt.year == year][wanted].copy()

    # 'Action, Drama' -> ['Action', 'Drama'] (surrounding blanks removed)
    watched.Genre = watched.Genre.str.split(',').apply(
        lambda parts: [part.strip() for part in parts]
    )
    # One row per (movie, genre)
    per_genre = explode(watched, ['Genre'])

    genre_stats = get_mean_value(per_genre, 'Genre', 'imdbRating')

    show_all(genre_stats.nlargest(top, 'Total', keep='all'))

    return genre_stats

In [73]:
#
def most_watched_genre_combo(df_, top, year):
    """Display and return rating statistics per exact ``Genre`` value for ``year``.

    Unlike a per-genre breakdown, the ``Genre`` column is grouped as-is, so
    every distinct value (presumably a combination such as "Action, Drama" -
    TODO confirm against the data) forms its own group.

    For every group the result holds the min / max / mean ``imdbRating``, the
    number of titles, and the titles of the worst and best rated movie.
    Numeric columns are rounded to one decimal.

    The ``top`` groups by title count (ties included) are displayed with
    ``show_all``; the complete aggregated frame, indexed by ``Genre``, is
    returned.
    """
    cols = ['Genre','Title','imdbRating','Date']
    # Only the movies watched in the requested year
    df_genre = df_[df_.Date.dt.year == year][cols].copy()
    
    gb_genre = df_genre.groupby(by='Genre')
    # Genre combination with most movies, showing worst and best rated movies
    df_genre_agg = gb_genre.agg({'imdbRating': ['min','max','mean'], 
                                 'Title': 'count'})
    # Genre becomes the ('Genre', '') column and rows get a 0..n-1 index,
    # which the positional alignment below relies on.
    df_genre_agg.reset_index(inplace=True)
    
    # Index label (in df_genre) of the lowest rated movie of every group.
    # NOTE(review): fillna(0)/astype(int) assume integer index labels; a group
    # without any rating would be pointed at label 0 - confirm this is intended.
    df_genre_agg_worst = gb_genre['imdbRating'].idxmin().fillna(0).astype(int)
    # Look the rows up by label, then drop the labels so the titles line up
    # row-by-row with df_genre_agg (both follow the group order).
    df_genre_agg[('Title','worst')] = df_genre.reindex(df_genre_agg_worst).reset_index(drop=True)['Title']
    
    # Same lookup for the highest rated movie of every group.
    df_genre_agg_best = gb_genre['imdbRating'].idxmax().fillna(0).astype(int)
    df_genre_agg[('Title','best')] = df_genre.reindex(df_genre_agg_best).reset_index(drop=True)['Title']
    
    # Back to a Genre index with a plain (non-tuple) name; round numeric columns
    df_genre_agg = df_genre_agg.set_index([('Genre','')]).round(1)
    df_genre_agg.index.rename('Genre', inplace=True)
    
    show_all(df_genre_agg.nlargest(top, [('Title','count')], keep='all'))
    
    return df_genre_agg

## Plotting

In [74]:
# Display everything
def show_all(df):
    """Render ``df`` as an HTML table in the notebook output.

    Relies on ``display`` being available in the namespace, as it is inside
    an IPython/Jupyter session.
    """
    table_markup = df.to_html()
    display(HTML(table_markup))

In [75]:
# Map colors

#'Accent', 'Accent_r', 'Blues', 'Blues_r', 'BrBG', 'BrBG_r', 'BuGn', 'BuGn_r', 'BuPu', 'BuPu_r', 
#'CMRmap', 'CMRmap_r', 'Dark2', 'Dark2_r', 'GnBu', 'GnBu_r', 'Greens', 'Greens_r', 'Greys', 'Greys_r', 
#'OrRd', 'OrRd_r', 'Oranges', 'Oranges_r', 'PRGn', 'PRGn_r', 'Paired', 'Paired_r', 'Pastel1', 'Pastel1_r', 'Pastel2', 'Pastel2_r', 
#'PiYG', 'PiYG_r', 'PuBu', 'PuBuGn', 'PuBuGn_r', 'PuBu_r', 'PuOr', 'PuOr_r', 'PuRd', 'PuRd_r', 'Purples', 'Purples_r', 
#'RdBu', 'RdBu_r', 'RdGy', 'RdGy_r', 'RdPu', 'RdPu_r', 'RdYlBu', 'RdYlBu_r', 'RdYlGn', 'RdYlGn_r', 'Reds', 'Reds_r', 
#'Set1', 'Set1_r', 'Set2', 'Set2_r', 'Set3', 'Set3_r', 'Spectral', 'Spectral_r', 'Wistia', 'Wistia_r', 
#'YlGn', 'YlGnBu', 'YlGnBu_r', 'YlGn_r', 'YlOrBr', 'YlOrBr_r', 'YlOrRd', 'YlOrRd_r', 
#'afmhot', 'afmhot_r', 'autumn', 'autumn_r', 'binary', 'binary_r', 'bone', 'bone_r', 'brg', 'brg_r', 'bwr', 'bwr_r', 
#'cividis', 'cividis_r', 'cool', 'cool_r', 'coolwarm', 'coolwarm_r', 'copper', 'copper_r', 'cubehelix', 'cubehelix_r', 
#'flag', 'flag_r', 'gist_earth', 'gist_earth_r', 'gist_gray', 'gist_gray_r', 'gist_heat', 'gist_heat_r', 'gist_ncar', 'gist_ncar_r', 
#'gist_rainbow', 'gist_rainbow_r', 'gist_stern', 'gist_stern_r', 'gist_yarg', 'gist_yarg_r', 'gnuplot', 'gnuplot2', 'gnuplot2_r', 'gnuplot_r', 
#'gray', 'gray_r', 'hot', 'hot_r', 'hsv', 'hsv_r', 'icefire', 'icefire_r', 'inferno', 'inferno_r', 'jet', 'jet_r', 
#'magma', 'magma_r', 'mako', 'mako_r', 'nipy_spectral', 'nipy_spectral_r', 'ocean', 'ocean_r', 'pink', 'pink_r', 
#'plasma', 'plasma_r', 'prism', 'prism_r', 'rainbow', 'rainbow_r', 'rocket', 'rocket_r', 'seismic', 'seismic_r', 
#'spring', 'spring_r', 'summer', 'summer_r', 'tab10', 'tab10_r', 'tab20', 'tab20_r', 'tab20b', 'tab20b_r', 'tab20c', 'tab20c_r', 
#'terrain', 'terrain_r', 'twilight', 'twilight_r', 'twilight_shifted', 'twilight_shifted_r', 'viridis', 'viridis_r', 'vlag', 'vlag_r', 
#'winter', 'winter_r'

In [76]:
# function to add value labels
def add_labels(x,y):
    """Write each value of ``y`` as a centred text label on the current plot.

    The label for position ``i`` is placed at ``(i, y[i])``; ``x`` is only
    used for its length. ``y`` is indexed with ``0 .. len(x) - 1``.
    """
    for position in range(len(x)):
        value = y[position]
        plt.text(position, value, value, ha = 'center')

In [77]:
def add_ticks_vbarplot(x, y, ax):
    """Draw a vertical bar chart on ``ax`` with value labels and one tick per x unit.

    Parameters
    ----------
    x : sequence of int
        Bar positions (e.g. release years). Must be non-empty.
    y : sequence of number
        Bar heights.
    ax : matplotlib.axes.Axes
        Axes to draw on.

    Returns
    -------
    matplotlib.axes.Axes
        The same ``ax``, for chaining.
    """
    bars = ax.bar(x, y, width=0.8)

    # Write the height of every bar centred just above it.
    for bar in bars:
        height = bar.get_height()
        label_x_pos = bar.get_x() + bar.get_width() / 2
        ax.text(label_x_pos, height, s=f'{height}', ha='center', va='bottom')

    # One tick for every integer between the smallest and largest x, so gaps
    # (positions without a bar) are labelled as well.
    start = min(x)
    end = max(x) + 1
    tick_positions = np.arange(start, end)

    ax.xaxis.set_ticks(tick_positions)
    # Label size is set here only; the former per-tick `tick.label` loop was
    # overridden by this call and `Tick.label` no longer exists in current
    # Matplotlib releases.
    ax.tick_params(axis='x', labelsize=10)
    ax.set_xticklabels(tick_positions, rotation=90)

    return ax

In [78]:
## Draw a heatmap with the numeric values in each cell
def plot_ct_heatmap(df, index, year):
    """Draw an annotated heatmap of the movie counts returned by ``crosstab_by_year``.

    Parameters
    ----------
    df : pandas.DataFrame
        Watch history passed through to ``crosstab_by_year``.
    index : {'Weekday', 'Month'}
        ``'Weekday'`` plots weekday (rows) against month (columns);
        ``'Month'`` plots the transposed month crosstab, labelled day
        (rows) against month (columns).
    year : int
        Year passed through to ``crosstab_by_year``.

    Raises
    ------
    ValueError
        If ``index`` is not one of the supported values.
    """
    # Fail early and clearly instead of hitting unbound `ax` / `ylabel` later.
    if index not in ('Weekday', 'Month'):
        raise ValueError("index must be 'Weekday' or 'Month', got " + repr(index))

    plotme = crosstab_by_year(df, index, year)
    # Drop row level used to organize data
    plotme = plotme.droplevel(0, axis=0)

    if index == 'Weekday':
        # Keep the first seven rows and drop the last column
        # (presumably the 'All' margins - TODO confirm against crosstab_by_year).
        plotme = plotme.iloc[0:7, :-1]
        # Drop column level used to organize data
        plotme = plotme.droplevel(0, axis=1)
        ylabel = index

        _, ax = plt.subplots(figsize=(9, 5))
        plt.title('Movie ' + index + ' count by Month')

    else:  # index == 'Month'
        # this uses days instead of weekdays
        plotme = plotme.T
        ylabel = 'Day'

        _, ax = plt.subplots(figsize=(10, 15))
        plt.title('Movie day count by ' + index)

    # Plot
    sns.heatmap(plotme, annot=True, fmt="d", linewidths=.5, ax=ax)
    plt.xlabel('Month')
    plt.ylabel(ylabel)

In [79]:
# Create week dataframe with total movies watched by weekday
def create_weekday_df(df, year):
    """Return the total number of movies watched per weekday in ``year``.

    The totals are taken from the ``('All', '')`` column of the weekday
    crosstab built by ``crosstab_by_year``; its last row is discarded.

    Returns a DataFrame with the columns ``Weekday`` (the weekday name) and
    ``Movies`` (the total).
    """
    weekday_ct = crosstab_by_year(df, 'Weekday', year)

    # Select 'All' column and remove 'All' row
    totals = pd.DataFrame(weekday_ct[('All','')]).reset_index().iloc[:-1, :]
    # Flatten the column labels to their first level ('Weekday', 'WeekdayName', 'All')
    totals.columns = totals.columns.droplevel(1)

    # Select and rename columns
    return totals[['WeekdayName','All']].rename(
        columns={"WeekdayName": "Weekday", "All": "Movies"}
    )


# Movies by Weekday
def plot_weekday_bar(df, year):
    """Plot a bar chart of the number of movies watched per weekday in ``year``.

    The per-weekday totals come from ``create_weekday_df``. Bars are coloured
    along the "rocket" colormap according to their height and annotated with
    their value.
    """
    df_weekday = create_weekday_df(df, year)

    # Prepare plot: map every count onto the colormap range
    norm = plt.Normalize(df_weekday.Movies.min(), df_weekday.Movies.max())
    cmap = plt.get_cmap("rocket")
    values = df_weekday.Movies.values

    # Plotting
    plt.figure(figsize=(10,8))
    # x / y must be passed by keyword: recent seaborn releases reject them
    # as positional arguments.
    sns.barplot(x='Weekday', y='Movies', data=df_weekday, palette=cmap(norm(values)))
    plt.title('# Movies watched by Weekday')
    plt.ylabel('# Movies')

    # Value label slightly above every bar
    for i, n in enumerate(df_weekday['Movies']):
        plt.text(i, n+0.3, n)

In [80]:
# Movies by Week of Year
def plot_week_bar(df_, year):
    """Plot and return the number of movies watched per ISO week of ``year``.

    Weeks follow ISO 8601 (they start on Monday), so a date is attributed to
    the ISO year its week belongs to rather than to its calendar year:

    * early-January dates that still fall in week 52/53 of the previous ISO
      year are excluded;
    * early-January dates of ``year + 1`` that fall in the last week of
      ``year`` are included;
    * late-December dates that already fall in week 1 of the next ISO year
      are excluded (they show up in the next year's plot).

    Parameters
    ----------
    df_ : pandas.DataFrame
        Needs a ``Title`` column and a datetime ``Date`` column.
    year : int
        ISO year to plot.

    Returns
    -------
    pandas.DataFrame
        Columns ``Weekofyear`` and ``Title`` (movie count), one row per week
        with at least one movie. Empty (and 'No data!' is printed) when
        nothing was watched in that year.
    """
    df_weekofyear = df_[['Title','Date']].copy()

    # ISO calendar gives both the week number and the year that week belongs
    # to; this replaces the removed `Series.dt.weekofyear` accessor.
    iso = df_weekofyear['Date'].dt.isocalendar()
    in_iso_year = (iso['year'] == year).fillna(False).astype(bool)

    df_weekofyear = df_weekofyear[in_iso_year].assign(
        Weekofyear=iso.loc[in_iso_year, 'week'].astype(int)
    )
    gb_df_weekofyear = df_weekofyear.groupby('Weekofyear').agg({'Title':'count'}).reset_index()

    if not gb_df_weekofyear.empty:
        # Prepare plot: map every count onto the colormap range
        norm = plt.Normalize(gb_df_weekofyear.Title.min(), gb_df_weekofyear.Title.max())
        cmap = plt.get_cmap("magma")
        values = gb_df_weekofyear.Title.values

        # Plotting
        ax = plt.figure(figsize=(20,8)).gca()
        sns.barplot(x='Weekofyear', y='Title', data=gb_df_weekofyear, palette=cmap(norm(values)))
        ax.yaxis.set_major_locator(ticker.MaxNLocator(integer = True))

        plt.title('# Movies watched by Week of Year (in ' + str(year) +')')
        plt.xlabel('Week of Year')
        plt.ylabel('# Movies')

        # Value label slightly above every bar
        for i, n in enumerate(gb_df_weekofyear['Title']):
            plt.text(i, n+0.3, n)

        plt.show()
    else:
        print('No data!')

    return gb_df_weekofyear

In [81]:
# Plot how many hours spent watching movies by month
def plot_month_bar(df, year):
    """Plot a bar chart of the hours of movies watched per month of ``year``.

    ``Runtime`` is summed per month, divided by 60 and truncated to whole
    numbers (so it is presumably stored in minutes - TODO confirm). Bars are
    coloured along the "magma" colormap according to their height and
    annotated with their value.
    """
    # Month number first so groups keep calendar order; the month name is the
    # level that is kept and shown on the axis.
    month_keys = [df.Date.dt.month, df.Date.dt.month_name()]

    watched = df[df.Date.dt.year==year]
    hours_by_month = watched.groupby(month_keys).agg({'Runtime':'sum'})
    hours_by_month = hours_by_month.droplevel(0, axis=0).reset_index()
    hours_by_month['Runtime'] = (hours_by_month['Runtime'] / 60).astype(int)
    hours_by_month = hours_by_month.rename(columns={'Date':'Month'})

    # Colour scale spanning the smallest to the largest monthly total
    colour_scale = plt.Normalize(hours_by_month.Runtime.min(), hours_by_month.Runtime.max())
    colour_map = plt.get_cmap("magma")
    hours = hours_by_month.Runtime.values

    axes = plt.figure(figsize=(15,8)).gca()
    sns.barplot(x='Month',y='Runtime',data=hours_by_month, palette=colour_map(colour_scale(hours)))
    axes.yaxis.set_major_locator(ticker.MaxNLocator(integer = True))
    plt.title('# Hours of movies watched by Month (in ' + str(year) +')')
    plt.ylabel('# Hours')

    # Value label on top of every bar
    for position, total in enumerate(hours):
        plt.text(position, total, total)

    plt.show()

In [82]:
# Plot swarm box plot with x based on Release Year or Decade
def plot_box_swarm(x, y, data, order):
    """Draw a box plot with an overlaid swarm plot of ``y`` grouped by ``x``.

    Parameters
    ----------
    x : str
        Column used for the categories (e.g. ``'Year'`` or a decade column).
        ``'Year'`` is labelled "Release Year" on the axis and in the title.
    y : str
        Column holding the plotted values.
    data : pandas.DataFrame
        Source data.
    order : list
        Order in which the categories are drawn.
    """
    _, ax = plt.subplots(figsize=(30, 8))
    # x / y are passed by keyword (recent seaborn releases reject them as
    # positional arguments) and both layers are drawn on the same axes.
    sns.boxplot(x=x, y=y, data=data, order=order, ax=ax)
    sns.swarmplot(x=x, y=y, data=data, order=order, color=".4", ax=ax)

    # add grid lines
    ax.yaxis.grid(True)
    ax.xaxis.grid(True)

    if x == 'Year':
        x = 'Release Year'
        ax.set_xlabel(x)

    title = y + ' by ' + x
    ax.set_title(title)

    ax.set_xticklabels(ax.get_xticklabels(), rotation = 90)

In [83]:
# Plot number of Movies by Release Year
def plot_movie_trend(df, year):
    """Plot how many of the movies watched in ``year`` were released in each year.

    Movies watched in ``year`` are counted per release year (``Year`` column)
    and drawn as a labelled vertical bar chart via ``add_ticks_vbarplot``.
    """
    watched = df[df.Date.dt.year == year]
    per_release_year = watched.groupby(by='Year').agg({'Title': 'count'}).reset_index()

    # https://towardsdatascience.com/how-to-make-bar-and-hbar-charts-with-labels-using-matplotlib-b701ce70ba9c
    _, ax = plt.subplots(figsize=(20, 8))
    add_ticks_vbarplot(per_release_year.Year, per_release_year.Title, ax)

    plt.title('# Movies watched by Release Year (in ' + str(year) +')')
    plt.xlabel('Release Year')
    plt.ylabel('# Movies')

    plt.show()