# Explore Fantasy Football Data using Pandas (Python Data Analysis Library) Utility Notebook <a id="return"></a>

This notebook contains all the functions used in the 01-explore_ff_league_data.ipynb notebook.
<br><br/>

**Notebook Sections:**
1. [Import Packages](#section1)
2. [Function for Reading in Data](#section2)
3. [Functions for Data Exploration](#section3)
4. [Function to Plot Histograms](#section4)
5. [Function to Compute Correlations](#section5)
6. [Functions to Compute Categorical Relationships](#section6)
7. [Functions to Compute Numerical/Categorical Relationships](#section7)
8. [Function to Plot Dates](#section8)

## Import Packages <a id="section1"></a>

[Return to Top](#return)

In [None]:
# import packages to create/manipulate dataframes
import pandas as pd
import numpy as np
import itertools

# import package to use today's date
from datetime import date

# import package to calculate categorical relationships
import scipy.stats as ss
from collections import Counter
import math
from scipy import stats
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import warnings

# import packages to create visualizations
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
from datetime import datetime, timedelta
from matplotlib import dates as mpl_dates

## Function for Reading in Data <a id="section2"></a>

[Return to Top](#return)

In [None]:
# create function to read in data from flat file, pickle object, or database
def read_data(file_path: str = None, db_path: str = None, db_conn: object = None):
    
    ''' 
        Function to read data from a flat file, a pickle object, or a database table.
        
        If a file path is passed, the path string is checked (case-insensitively) for 'csv' or 'pkl' and the file is
        read with pandas accordingly. Any other file is opened as plain text and returned as a one-column dataframe
        (column 'contents') holding one row per line of the file.
        
        If a table name is passed, the whole table is queried. When both file_path and db_path are passed, the
        file is read first and the table result is the one returned.
        
        param file_path: string containing the full file path where data resides
        param db_path: string containing the database table name to select from
        param db_conn: database connection object. needed to read data from database
        
        returns: pandas dataframe
        
        raises: ValueError if neither file_path nor db_path is provided
                FileNotFoundError if the plain-text file does not exist (a message is printed first)
    '''
    
    # a data source is required; without one there is nothing to return
    if not file_path and not db_path:
        raise ValueError('Either file_path or db_path must be provided.')
    
    # check for file path
    if file_path:
        
        # lower-case once so both extension checks are case-insensitive
        lowered_path = file_path.lower()
    
        # check for csv file
        if 'csv' in lowered_path:
            
            # read csv
            df = pd.read_csv(file_path)
            
        # check for pickle file
        elif 'pkl' in lowered_path:
            
            # read pickle
            # NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source
            df = pd.read_pickle(file_path)
           
        # if no csv or pickle file then try opening/reading file
        else:
            
            # try opening file with python's read
            try:

                # open file and read its full text
                with open(file_path) as f:
                    contents = f.read()

            except FileNotFoundError:

                # print file not found message, then let the caller see the failure
                msg = "Sorry, the file " + file_path + " does not exist."
                print(msg)
                raise
            
            # a raw string cannot be passed to the DataFrame constructor, so store one line per row
            df = pd.DataFrame({'contents': contents.splitlines()})
          
    # check for table name
    if db_path:
        
        # read table data
        # NOTE: a table name cannot be bound as a query parameter; db_path must come from a trusted source
        df = pd.read_sql(f'SELECT * FROM {db_path}', db_conn)
    
    # initially explore data (info() prints its own report and returns None, so it is not wrapped in display)
    df.info(verbose = True, show_counts = True)
    
    # return dataframe
    return df

## Functions for Data Exploration <a id="section3"></a>

[Return to Top](#return)

Create function to fully print a pandas dataframe.

In [None]:
# create function to define how to print pandas dataframe
def print_full(pd_df: object):
    
    '''
    Function to display a pandas dataframe with widened display limits
    
    param pd_df: pandas dataframe (or any object the notebook's display function accepts)
    '''
    
    # option name / value pairs applied only while the dataframe is being displayed
    display_settings = ('display.max_rows', 100
                       ,'display.max_columns', 500
                       ,'display.precision', 3
                       ,'display.colheader_justify', 'center'
                       )
    
    # temporarily apply the options and display the dataframe
    with pd.option_context(*display_settings):
        display(pd_df)

Create function to explore overall data.

In [None]:
# create function to explore overall data
def explore_all_data(data: object
                    ,print_flag: bool = None
                    ,file_name_flag: bool = None
                    ,data_name_var: str = None
                    ,output_dir: str = None
                    ):
    '''
    Function to summarise a whole dataframe: shape, columns grouped by data type, and the share of
    null / empty / zero / negative / positive cells. Every percentage is relative to the total number
    of cells (rows x columns). The summary is either printed or written to a csv file.
    
    param data: pandas dataframe
          print_flag: boolean flag; determines whether or not to print output within notebook
          file_name_flag: boolean flag; determines whether or not to remove last period and all subsequent characters from a string (i.e. '.csv')
          data_name_var: string containing data source name.  typically the file name or database table name
          output_dir: string containing output data directory
    '''
    
    # helper: names of the columns whose dtype is one of the given dtypes
    def columns_of(*dtypes):
        return [name for name in data.select_dtypes(include=list(dtypes)).columns]
    
    # row and column counts
    row_count, column_count = data.shape[0], data.shape[1]
    
    # group the columns by kind
    num_col = columns_of('float64', 'int64')
    cat_col = columns_of('string', 'object', 'category')
    date_col = columns_of('datetime64[ns]')
    bool_col = columns_of('boolean')
    
    # whatever is left over has an unrecognised data type
    unk_col = list(set(list(data.columns)) - set(num_col) - set(cat_col) - set(date_col) - set(bool_col))
    
    # total number of cells; the denominator for every percentage below
    cell_total = len(data) * len(list(data.columns))
    
    # helper: a cell count expressed as a percentage of all cells, rounded to 2 decimals
    def percent_of_cells(hits):
        return round(100 * hits / cell_total, 2)
    
    # percentage null and percentage empty across all columns
    perc_null = percent_of_cells(data.isnull().sum().sum())
    perc_empty = percent_of_cells(data.eq('').sum().sum())
    
    # percentage zero / negative / positive, counted over the numerical columns only
    perc_zero = percent_of_cells(data[num_col].eq(0).sum().sum())
    perc_neg = percent_of_cells(data[num_col].lt(0).sum().sum())
    perc_pos = percent_of_cells(data[num_col].gt(0).sum().sum())
    
    # data types and their respective counts, shown without an index
    data_type_df = data.dtypes.value_counts().reset_index()
    data_type_df.columns = ['Data Type', 'Count']
    data_type_df.index = [''] * len(data_type_df)
    
    # print the summary inside the notebook and stop
    if print_flag:
        print(f'Number of Rows: {row_count}\nNumber of Columns: {column_count}')
        print(f'\nNumerical Columns: {num_col}')
        print(f'\nCategorical Columns: {cat_col}')
        print(f'\nDate Columns: {date_col}')
        print(f'\nBoolean Columns: {bool_col}')
        print(f'\nUnknown Columns: {unk_col}')
        print(f'\nTotal Percentage of Null Values: {perc_null}')
        print(f'\nTotal Percentage of Empty Values: {perc_empty}')
        print(f'\nTotal Percentage of Zero Values: {perc_zero}')
        print(f'\nTotal Percentage of Negative Values: {perc_neg}')
        print(f'\nTotal Percentage of Positive Values: {perc_pos}\n')
        print_full(data_type_df)
        return
    
    # otherwise collect the summary, one single-element list per label, so each label becomes a row once transposed
    summary = {'Row Count': [row_count]
              ,'Column Count': [column_count]
              ,'Numerical Columns': [num_col]
              ,'Categorical Columns': [cat_col]
              ,'Date Columns': [date_col]
              ,'Boolean Columns': [bool_col]
              ,'Unknown Columns': [unk_col]
              ,'Percentage Null': [perc_null]
              ,'Percentage Empty': [perc_empty]
              ,'Percentage Zero': [perc_zero]
              ,'Percentage Negative': [perc_neg]
              ,'Percentage Positive': [perc_pos]
              ,'Data Type Counts': [data_type_df]
              }
    df = pd.DataFrame.from_dict(summary).T
    
    # strip the extension (last period and everything after it) when the source is a flat file
    if '.' in data_name_var and file_name_flag:
        file_title = data_name_var.rpartition('.')[0]
    else:
        file_title = data_name_var
    
    # save to csv
    df.to_csv(output_dir + f'01-{file_title}-overall_data_summary_{date.today()}.csv', header = False)

Create function to run data exploration functions below and then save or print the output.

In [None]:
# create function to run the data exploration functions below and save or print the output
def run_explore_func(data: object
                    ,func: object
                    ,print_flag: bool = None
                    ,file_name_flag: bool = None
                    ,data_name_var: str = None
                    ,output_dir: str = None
                    ,n: int = 5
                    ):
    
    '''
    Function to run the data exploration functions created in this notebook and print or save the output
    
    param data: pandas dataframe
          func: data exploration function object; called as func(data, n) and expected to return either None
                (nothing to explore) or a tuple of (summary dataframe, most frequent values list,
                first values list, last values list)
          print_flag: boolean flag; determines whether or not to print output within notebook
          file_name_flag: boolean flag; determines whether or not to remove last period and all subsequent characters from a string (i.e. '.csv')
          data_name_var: string containing data source name.  typically the file name or database table name
          output_dir: string containing output data directory
          n: number of most frequent / first / last values requested from func (defaults to 5)
    '''
    
    # run data exploration function
    result = func(data, n)
    
    # the exploration functions return None when there are no matching columns; nothing to print or save
    if result is None:
        return
    
    df, freq_df_list, first_values_df_list, last_values_df_list = result

    # print output or save it to csv
    if print_flag:
        
        # print data summary
        print("Summary:\n")
        print_full(df)
        
        # print most frequent values for each variable
        print("\nMost Frequent n Values:\n")
        for freq_df in freq_df_list:
            print_full(freq_df)
            
        # print first values for each variable
        print("\nFirst n Values:\n")
        for first_values_df in first_values_df_list:
            print_full(first_values_df)
            
        # print last values for each variable
        print("\nLast n Values:\n")
        for last_values_df in last_values_df_list:
            print_full(last_values_df)
            
        print('\n') 
            
    else:
        
        # create most frequent values, first values, and last values columns
        df['most_freq'] = freq_df_list
        df['first_values'] = first_values_df_list
        df['last_values'] = last_values_df_list
        
        # if there's a period in the data_name_var and if it's a flat file then remove the last period and all subsequent characters (example: '.pkl' or '.csv)
        file_title = data_name_var.rpartition('.')[0] if '.' in data_name_var and file_name_flag else data_name_var
        
        # the function name decides the file prefix: names containing 'num' are numerical, anything else is categorical
        if 'num' in func.__name__:
            data_explore_type = 'num'
            file_num = '02'
        else:
            data_explore_type = 'cat'
            file_num = '03'
        
        # save data summary to csv
        df.to_csv(output_dir + f'{file_num}-{file_title}-{data_explore_type}_data_summary_{date.today()}.csv')

Create function to explore numerical data.

In [None]:
# create function to explore numerical data
def explore_num_data(data: object, n: int):
    '''
    Function to explore numerical (float64 / int64) data
    
    param data: pandas dataframe
    param n: integer to determine the number of most frequent, first, and last values to return (when sorted) for each numerical variable
    
    returns: None (after printing a message) when there are no numerical columns, otherwise a tuple of
    
             a pandas dataframe with one row per numerical variable holding row count, number of distinct values,
             5-number summary, mean, standard deviation, sum, percentage null, percentage zero,
             percentage positive, and percentage negative
             
             three lists of pandas dataframes (one dataframe per numerical variable) holding the most frequent
             values, the first n values, and the last n values, each with its count and frequency
    '''
    
    # numerical column names
    numeric_cols = list(data.select_dtypes(include=['float64', 'int64']).columns)
    
    # nothing to explore without numerical columns
    if not numeric_cols:
        print('No numerical data.')
        return
    
    # keep the numerical columns only
    data = data[numeric_cols]
    
    # count, mean, std, and 5-number summary
    described = data.describe()
    
    # total number of rows; the denominator for every percentage and frequency below
    total_count = len(data)
    
    # helper: percentage of rows for which the boolean mask is True
    def percent_true(mask):
        return 100.0 * int(mask.sum()) / total_count
    
    # per-column statistics that describe() does not provide
    extra_stats = pd.DataFrame({'dist_num': [len(data[col].dropna().unique()) for col in numeric_cols]
                               ,'total_sum': [data[col].sum() for col in numeric_cols]
                               ,'null_perc': [100.0 * data[col].isna().sum() / total_count for col in numeric_cols]
                               ,'zero_perc': [percent_true(data[col] == 0) for col in numeric_cols]
                               ,'pos_perc': [percent_true(data[col] > 0) for col in numeric_cols]
                               ,'neg_perc': [percent_true(data[col] < 0) for col in numeric_cols]
                               }).transpose()
    extra_stats.columns = numeric_cols
    
    # stack both sets of statistics, then flip so each variable becomes a row
    summary = pd.concat([described, extra_stats], sort = False)
    summary.index.set_names('summary', inplace = True)
    summary = summary.transpose()
    
    # put the statistics in reporting order and rename count to row_count
    ordered_stats = ['count', 'dist_num', 'min', '25%', '50%', '75%', 'max', 'mean', 'std'
                    ,'total_sum', 'null_perc', 'zero_perc', 'pos_perc', 'neg_perc']
    summary = summary[ordered_stats].rename(columns={'count': 'row_count'})
    
    # helper: top n distinct values of a column with their count and frequency, ordered by the given key
    def value_table(col, order_by, ascending):
        counts = data.groupby(col).agg(count = (col, 'count'))
        table = counts.sort_values(order_by, ascending = ascending).reset_index().head(n)
        table['freq'] = table['count'] / total_count
        
        # remove index
        table.index = [''] * len(table)
        return table
    
    # most frequent values, then the first and last n values when sorted by value
    freq = [value_table(col, 'count', False) for col in numeric_cols]
    first_values = [value_table(col, col, True) for col in numeric_cols]
    last_values = [value_table(col, col, False) for col in numeric_cols]
    
    return summary, freq, first_values, last_values

Create function to explore categorical data.

In [None]:
# create function to explore categorical data
def explore_cat_data(data: object, n: int):
    '''
    Function to explore categorical (string / object / category / boolean) data
    
    param data: pandas dataframe
    param n: integer to determine the number of most frequent, first, and last values to return (when sorted) for each categorical variable
    
    returns: None (after printing a message) when there are no categorical columns, otherwise a tuple of
    
             a pandas dataframe with one row per categorical variable holding row count (non-null values),
             number of distinct values, minimum string length, maximum string length, percentage null,
             and percentage empty
             
             three lists of pandas dataframes (one dataframe per categorical variable) holding the most frequent
             values, the first n values, and the last n values, each with its count and frequency
    '''
    
    # categorical column names
    cat_cols = list(data.select_dtypes(include=['string', 'object', 'category', 'boolean']).columns)
    
    # nothing to explore without categorical columns
    if not cat_cols:
        print('No categorical data.')
        return
    
    # keep the categorical columns only
    data = data[cat_cols]
    
    # total number of rows; the denominator for every percentage and frequency below
    total_count = len(data)
    
    # collect each statistic column by column
    stats = {'row_count': []
            ,'dist_num': []
            ,'min_length': []
            ,'max_length': []
            ,'null_perc': []
            ,'empty_perc': []
            }
    for col in cat_cols:
        non_null = data[col].dropna()
        
        # lengths are measured on the string form of every value in the column
        text_lengths = data[col].astype(str).str.len()
        
        stats['row_count'].append(non_null.count())
        stats['dist_num'].append(non_null.value_counts().count())
        stats['min_length'].append(text_lengths.min())
        stats['max_length'].append(text_lengths.max())
        stats['null_perc'].append(100.0 * data[col].isna().sum() / total_count)
        stats['empty_perc'].append(100.0 * len(data.loc[data[col] == '', col]) / total_count)
    
    # one row per statistic, one column per variable
    summary = pd.DataFrame(stats).transpose()
    summary.columns = cat_cols
    summary.index.set_names('summary', inplace = True)
    
    # flip so each variable becomes a row
    summary = summary.transpose()
    
    # helper: top n distinct values of a column with their count and frequency, ordered by the given key
    def value_table(col, order_by, ascending):
        counts = data.groupby(col).agg(count = (col, 'count'))
        table = counts.sort_values(order_by, ascending = ascending).reset_index().head(n)
        table['freq'] = table['count'] / total_count
        
        # remove index
        table.index = [''] * len(table)
        return table
    
    # most frequent values, then the first and last n values when sorted by value
    freq = [value_table(col, 'count', False) for col in cat_cols]
    first_values = [value_table(col, col, True) for col in cat_cols]
    last_values = [value_table(col, col, False) for col in cat_cols]
    
    return summary, freq, first_values, last_values

## Function to Plot Histograms <a id="section4"></a>

[Return to Top](#return)

In [None]:
# create function to plot histograms for each numerical variable
def plot_hist(data: object
             ,print_flag: bool = None
             ,file_name_flag: bool = None
             ,data_name_var: str = None
             ,output_dir: str = None
             ):
    
    '''
    Function to plot a 50-bin histogram for each numerical (float64 / int64) variable, either shown in the
    notebook or saved as one page per variable in a single pdf file
    
    param data: pandas dataframe
          print_flag: boolean flag; determines whether or not to print output within notebook
          file_name_flag: boolean flag; determines whether or not to remove last period and all subsequent characters from a string (i.e. '.csv')
          data_name_var: string containing data source name.  typically the file name or database table name
          output_dir: string containing output data directory
    '''
    
    # create list of numerical columns
    columns = list(data.select_dtypes(include=['float64', 'int64']).columns)
    
    # check if numerical columns exist
    if not columns:
        print('No numerical data.')
        return
    
    # check for print flag
    if print_flag:
        
        # create histogram for each numerical variable
        for col in columns:
            plt.figure()
            
            # index with the original label; converting it to a string breaks non-string column names
            plt.hist(data[col].dropna(), bins = 50, color = 'green')
            plt.title(f'{col}')
            plt.show()
            
        return
        
    # if there's a period in the data_name_var and if it's a flat file then remove the last period and all subsequent characters (example: '.pkl' or '.csv)
    file_title = data_name_var.rpartition('.')[0] if '.' in data_name_var and file_name_flag else data_name_var
    
    # the context manager closes the pdf even when a plot fails part-way through
    with PdfPages(output_dir + f'02-{file_title}-histograms_{date.today()}.pdf') as pdf_obj:

        # create histogram for each numerical variable
        for col in columns:
            fig = plt.figure()
            try:
                plt.hist(data[col].dropna(), bins = 50, color = 'green')
                plt.title(f'{col}')
                pdf_obj.savefig(fig)
            finally:
                # always release the figure so failed plots do not accumulate in memory
                plt.close(fig)

## Function to Compute Correlations <a id="section5"></a>

[Return to Top](#return)

In [None]:
# create function to plot correlation matrix
def corr_matrix(data: object
               ,print_flag: bool = None
               ,file_name_flag: bool = None
               ,data_name_var: str = None
               ,output_dir: str = None
               ):
    
    '''
    Function to compute the correlation matrix of the numerical (float64 / int64) columns and either show it
    as a heatmap in the notebook or save it (csv always, pdf heatmap when there are at most 15 columns)
    
    param data: pandas dataframe
          print_flag: boolean flag; determines whether or not to print output within notebook
          file_name_flag: boolean flag; determines whether or not to remove last period and all subsequent characters from a string (i.e. '.csv')
          data_name_var: string containing data source name.  typically the file name or database table name
          output_dir: string containing output data directory
    '''
    
    # create list of numerical columns
    columns = list(data.select_dtypes(include=['float64', 'int64']).columns)
    
    # check if numerical columns exist
    if not columns:
        print('No numerical data.')
        return
    
    # correlate the numerical columns only; passing the whole dataframe raises on non-numerical columns in pandas 2.0+
    corr = data[columns].corr()
    
    # check for print flag and display the heatmap within the notebook
    if print_flag:
        
        # create heatmap
        fig, ax = plt.subplots(figsize = (8, 8))
        sns.heatmap(corr, annot = True, fmt = '.2f', cmap = plt.get_cmap('coolwarm'), cbar = False, ax = ax)
        plt.xticks(rotation=45)
        plt.yticks(rotation=45)
        plt.show()
        
    else:
        
        # if there's a period in the data_name_var and if it's a flat file then remove the last period and all subsequent characters (example: '.pkl' or '.csv)
        file_title = data_name_var.rpartition('.')[0] if '.' in data_name_var and file_name_flag else data_name_var
        
        # only export the heatmap for 15 columns or fewer; beyond that it is too big to read easily
        if len(columns) <= 15:

            # create a correlation matrix with a heatmap and export to pdf
            fig, ax = plt.subplots(figsize = (8, 8))
            sns.heatmap(corr, annot = True, fmt = '.2f', cmap = plt.get_cmap('coolwarm'), cbar = False, ax = ax)
            plt.xticks(rotation=45)
            plt.yticks(rotation=45)
            plt.savefig(output_dir + f'02-{file_title}-correlation_analysis_{date.today()}.pdf')
            plt.close(fig)  
            
        else:
            print('Too many variables to effectively visualize plot with a notebook.')
          
        # save to csv
        corr.to_csv(output_dir + f'02-{file_title}-correlation_analysis_{date.today()}.csv', index = True)

## Functions to Compute Categorical Relationships <a id="section6"></a>

[Return to Top](#return)

Create function to run the categorical relationships functions below and then save or print the output.

In [None]:
# create function to run the categorical relationships functions below
def run_cat_rel_func(data: object
                    ,print_flag: bool = None
                    ,file_name_flag: bool = None
                    ,data_name_var: str = None
                    ,output_dir: str = None
                    ):
    
    '''
    Function to run the categorical relationships functions created in this notebook (Chi-Squared, Cramer's V,
    and Theil's U) on every unique pair of categorical columns, then print or save the results
    
    Nothing is computed when there are no categorical columns or more than 15 of them.
    
    param data: pandas dataframe
          print_flag: boolean flag; determines whether or not to print output within notebook
          file_name_flag: boolean flag; determines whether or not to remove last period and all subsequent characters from a string (i.e. '.csv')
          data_name_var: string containing data source name.  typically the file name or database table name
          output_dir: string containing output data directory
    '''
    
    # create list of categorical columns
    columns = list(data.select_dtypes(include=['string', 'object', 'category', 'boolean']).columns)
    
    # check number of categorical columns
    if not columns:
        print('No categorical data.')
        return
    
    if len(columns) > 15:
        print('Too many variables to efficiently the run categorical relationships functions.')
        return
    
    # create empty list to store all results of the categorical relationships functions
    results = []
    
    # create dataframes to store Cramer's V and Theil's U results in order to create heatmaps
    # start from an identity matrix: each variable is 100% associated with itself, even when
    # there is only one categorical column and the pair loop below never runs
    cramers_df = pd.DataFrame(np.eye(len(columns)), columns = columns, index = columns)
    theils_df = pd.DataFrame(np.eye(len(columns)), columns = columns, index = columns)
    
    # for each unique categorical column pair run the Chi-Squared, Cramer's V, and Theil's U statistical tests
    for first_col, second_col in itertools.combinations(columns, 2):
        
        # run chi-squared function
        test_stat, p_value, dof, crosstab, expected_df, assumption_flag, dependent_var = chi_squared(data[first_col], data[second_col])
        
        # create chi-squared result variable; the dependence verdict is only reported when the assumptions are met
        if not assumption_flag:
            chi_result = 'Chi-Squared Assumptions Not Met'
        elif dependent_var:
            chi_result = 'Dependent (reject H0)'
        else:
            chi_result = 'Independent (fail to reject H0)'
        
        # run Cramer's V function and store the value in both triangles
        cramers = cramers_v(data[first_col], data[second_col])
        cramers_df.at[first_col, second_col] = cramers
        cramers_df.at[second_col, first_col] = cramers
        
        # run Theil's U function in both directions due to the test's asymmetrical nature
        theils = theils_u(data[first_col], data[second_col])
        rev_theils = theils_u(data[second_col], data[first_col])
        theils_df.at[first_col, second_col] = theils
        theils_df.at[second_col, first_col] = rev_theils
        
        # append results of the categorical relationships functions
        results.append((first_col, second_col, test_stat, p_value, dof, chi_result, cramers, theils, rev_theils))
        
    # create dataframe
    df = pd.DataFrame(results, columns = ['x1', 'x2', 'test_stat', 'p_value', 'dof', 'chi_result', 'cramers', 'theils (x2 predicts x1)', 'rev_theils (x1 predicts x2)'])
    
    # check for print flag
    if print_flag:
        
        # print dataframe
        print_full(df)
        print('\n')
        
        # create Cramer's V heatmap
        _association_heatmap(cramers_df, "Cramér's V")
        plt.show()
        print('\n')
        
        # create Theil's U heatmap
        _association_heatmap(theils_df, "Theil's U")
        plt.show()
        
    else:
        
        # if there's a period in the data_name_var and if it's a flat file then remove the last period and all subsequent characters (example: '.pkl' or '.csv)
        file_title = data_name_var.rpartition('.')[0] if '.' in data_name_var and file_name_flag else data_name_var

        # save to csv
        df.to_csv(output_dir + f'03-{file_title}-categorical_relationships_{date.today()}.csv', index = False)
        
        # create the Cramer's V heatmap and export to pdf
        fig = _association_heatmap(cramers_df, "Cramér's V")
        plt.savefig(output_dir + f'03-{file_title}-cramers_v_{date.today()}.pdf', bbox_inches='tight', pad_inches=0.0)
        plt.close(fig)  
           
        # create the Theil's U heatmap and export to pdf
        fig = _association_heatmap(theils_df, "Theil's U")
        plt.savefig(output_dir + f'03-{file_title}-theils_u_{date.today()}.pdf', bbox_inches='tight', pad_inches=0.0)
        plt.close(fig)  


# helper to draw an annotated heatmap of an association matrix and return its figure
def _association_heatmap(matrix: object, title: str):
    fig, ax = plt.subplots(figsize = (8, 8))
    sns.heatmap(matrix, annot = True, fmt = '.2f', cmap = plt.get_cmap('coolwarm'), cbar = False, ax = ax)
    ax.set_title(title)
    plt.xticks(rotation=45)
    plt.yticks(rotation=45)
    return fig

Create function to compute Chi-Squared Test of Independence.

**Sources:** 
* https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
* https://github.com/shakedzy/dython/blob/master/dython/nominal.py

In [None]:
# create function to compute chi-squared test of independence
def chi_squared(x: object, y: object):
    
    '''
    Function to calculate chi-squared test of independence using both SciPy's chi2_contingency() and pandas's crosstab()
    
    The null hypothesis (H0) and alternative hypothesis (H1) of the Chi-Square Test of Independence can be expressed in two different but equivalent ways:

    1. H0: "[Variable 1] is independent of [Variable 2]"
    2. H1: "[Variable 1] is not independent of [Variable 2]"

    OR

    1. H0: "[Variable 1] is not associated with [Variable 2]"
    2. H1: "[Variable 1] is associated with [Variable 2]"
    
    The test is only interpreted when its assumptions hold:
    * the cross tabulation table is at least 2x2
    * every individual expected count is 1 or greater
    * no more than 20% of the expected counts are less than 5
    
    param x: list / NumPy ndarray / Pandas Series / A sequence of measurements
          y: list / NumPy ndarray / Pandas Series / A sequence of measurements
          
    returns test statistic: float
            p-value: float
            degrees of freedom: int
            observed counts (cross tabulation): dataframe
            expected counts: dataframe
            chi-squared assumptions flag: boolean (True = assumptions met; False = assumptions not met)
            dependent variable flag: boolean (True = dependent (reject H0); False = independent (fail to reject H0));
                                     None when the assumptions are not met
    '''
    
    # create cross tabulation dataframe
    crosstab = pd.crosstab(x, y)
    
    # run chi-squared test
    test_stat, p_value, dof, expected_arr = chi2_contingency(crosstab)
    
    # create dataframe from the array of expected frequencies
    expected_df = pd.DataFrame(expected_arr, index = crosstab.index, columns = crosstab.columns)
    
    # check whether any individual expected count is less than 1
    any_count_below_1 = bool((expected_arr < 1).any())
    
    # calculate share of individual expected counts that are less than 5
    share_below_5 = (expected_arr < 5).sum() / expected_arr.size
    
    # check whether the cross tabulation table is smaller than 2x2 (either dimension being too small is enough)
    table_too_small = len(crosstab.columns) < 2 or len(crosstab.index) < 2
    
    # a single violated assumption is enough to make the test result unreliable, so the checks are combined with 'or'
    if table_too_small or any_count_below_1 or share_below_5 > 0.2:
        
        # return chi-squared test results with assumptions flag = False and no dependent variable flag
        return test_stat, p_value, dof, crosstab, expected_df, False, None
    
    # set probability of 95%
    prob = 0.95
    
    # calculate critical value
    critical = chi2.ppf(prob, dof)
    
    # calculate alpha value
    alpha = 1.0 - prob
    
    # interpret test-statistic and p-value (True = dependent (reject H0); False = independent (fail to reject H0))
    dependent_var = bool(abs(test_stat) >= critical or p_value <= alpha)
    
    # return chi-squared test results, chi-squared assumptions flag, and dependent variable flag
    return test_stat, p_value, dof, crosstab, expected_df, True, dependent_var

Create functions to drop or replace null values.

**Sources:** 
* https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
* https://github.com/shakedzy/dython/blob/master/dython/nominal.py

In [None]:
# create function to drop null values
def remove_incomplete_samples(x: object, y: object):
    
    '''
    Function to drop every (x, y) pair in which either value is missing (None / NaN / pandas NA)
    
    Works for numeric and non-numeric (e.g. string) data alike because missing values are detected element by element with pd.isna
    
    param x: list / NumPy ndarray / Pandas Series / A sequence of measurements
          y: list / NumPy ndarray / Pandas Series / A sequence of measurements (paired with x by position)
          
    returns x and y without the incomplete pairs: two lists if x was passed as a list; otherwise two numpy arrays
    '''
    
    # remember the input type before anything is rebound so the return type can follow it
    return_as_list = isinstance(x, list)
    
    # keep only the pairs where both values are present
    complete_pairs = [(x_val, y_val) for x_val, y_val in zip(x, y) if not (pd.isna(x_val) or pd.isna(y_val))]
    
    # split the pairs back into two sequences
    kept_x = [x_val for x_val, _ in complete_pairs]
    kept_y = [y_val for _, y_val in complete_pairs]
    
    # if x is a list then return lists; else return numpy arrays
    if return_as_list:
        return kept_x, kept_y
    else:
        return np.array(kept_x), np.array(kept_y)

# create function to replace null values with value n
def replace_nan_with_value(x: object, y: object, value: int):
    
    '''
    Function to replace missing values (None / NaN / pandas NA) in x and y with a given value
    
    param x: list / NumPy ndarray / Pandas Series / A sequence of measurements
          y: list / NumPy ndarray / Pandas Series / A sequence of measurements
          value: replacement used for every missing entry
          
    returns x and y as numpy arrays with missing values replaced (NumPy picks a common dtype for each array)
    '''
    
    # pd.isna is used instead of the 'v == v' trick because comparing pd.NA with itself cannot be evaluated as a boolean
    x = np.array([value if pd.isna(v) else v for v in x])
    
    # replace null values with value n
    y = np.array([value if pd.isna(v) else v for v in y])
    
    return x, y

Create function to compute Cramer's V.

**Sources:** 
* https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
* https://github.com/shakedzy/dython/blob/master/dython/nominal.py

In [None]:
# create function to calculate Cramer's V
def cramers_v(x: object,
              y: object,
              bias_correction=True,
              nan_strategy = 'replace',
              nan_replace_value = 0.0):
    """
    Computes Cramer's V, a symmetric measure of association between two
    categorical sequences: V(x,y) = V(y,x)
    Original function taken from: https://stackoverflow.com/a/46498792/5863503
    Wikipedia: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V
    
    Parameters:
    -----------
    x : list / NumPy ndarray / Pandas Series / A sequence of categorical measurements
    y : list / NumPy ndarray / Pandas Series / A sequence of categorical measurements
    bias_correction : Boolean, default = True
        Apply the bias correction from Bergsma and Wicher,
        Journal of the Korean Statistical Society 42 (2013): 323-328.
    nan_strategy : string, default = 'replace'
        Missing value handling: 'drop' removes samples with missing values,
        'replace' substitutes nan_replace_value for them. Any other string
        leaves x and y untouched. Missing values are None and np.nan.
    nan_replace_value : any, default = 0.0
        Substitute for missing values. Only used when nan_strategy is
        'replace'.
        
    Returns:
    --------
    float in the range of [0,1]; np.nan (with a RuntimeWarning) when the
    bias-corrected denominator is zero
    """
    
    # apply the requested null handling strategy
    if nan_strategy == 'replace':
        x, y = replace_nan_with_value(x, y, nan_replace_value)
    elif nan_strategy == 'drop':
        x, y = remove_incomplete_samples(x, y)
        
    # cross tabulate the two sequences
    contingency = pd.crosstab(x, y)
    
    # chi-squared test statistic of the contingency table
    chi2_stat = ss.chi2_contingency(contingency)[0]
    
    # total number of observations in the table
    n_obs = contingency.sum().sum()
    
    # phi squared
    phi_sq = chi2_stat / n_obs
    
    # table dimensions
    n_rows, n_cols = contingency.shape
    
    if not bias_correction:
        
        # plain Cramer's V
        v = np.sqrt(phi_sq / min(n_cols - 1, n_rows - 1))
        
    else:
        
        # bias-corrected phi squared and table dimensions
        phi_sq_corr = max(0, phi_sq - ((n_cols - 1) * (n_rows - 1)) / (n_obs - 1))
        rows_corr = n_rows - ((n_rows - 1) ** 2) / (n_obs - 1)
        cols_corr = n_cols - ((n_cols - 1) ** 2) / (n_obs - 1)
        smaller_dim = min((cols_corr - 1), (rows_corr - 1))
        
        # a zero denominator means the corrected statistic is undefined
        if smaller_dim == 0:
            warnings.warn(
                "Unable to calculate Cramer's V using bias correction. Consider using bias_correction=False",
                RuntimeWarning)
            return np.nan
        
        v = np.sqrt(phi_sq_corr / smaller_dim)
        
    # values just outside [0, 1] are snapped back onto the nearest bound
    slightly_out_of_range = -1e-13 <= v < 0. or 1. < v <= 1. + 1e-13
    
    if not slightly_out_of_range:
        
        # return Cramer's V unchanged
        return v
    
    # pick the bound to round to
    rounded_v = 0. if v < 0 else 1.
    
    # warn about the rounding and return the rounded Cramer's V
    warnings.warn(f'Rounded V = {v} to {rounded_v}. This is probably due to floating point precision issues.', RuntimeWarning)
    return rounded_v

Create function to compute Conditional Entropy.

**Sources:** 
* https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
* https://github.com/shakedzy/dython/blob/master/dython/nominal.py

In [None]:
# create function to calculate conditional entropy
def conditional_entropy(x: object,
                        y: object,
                        nan_strategy = 'replace',
                        nan_replace_value=0.0,
                        log_base: float = math.e):
    
    """
    Computes S(x|y), the conditional entropy of x given y
    Wikipedia: https://en.wikipedia.org/wiki/Conditional_entropy
    
    Parameters:
    -----------
    x : list / NumPy ndarray / Pandas Series / A sequence of measurements
    y : list / NumPy ndarray / Pandas Series / A sequence of measurements
    nan_strategy : string, default = 'replace'
        Missing value handling: 'drop' removes samples with missing values,
        'replace' substitutes nan_replace_value for them. Any other string
        leaves x and y untouched. Missing values are None and np.nan.
    nan_replace_value : any, default = 0.0
        Substitute for missing values. Only used when nan_strategy is
        'replace'.
    log_base: float, default = e
        Base of the logarithm used for the entropy.
        
    Returns: 
    --------
    float
    """
    
    # apply the requested null handling strategy
    if nan_strategy == 'replace':
        x, y = replace_nan_with_value(x, y, nan_replace_value)
    elif nan_strategy == 'drop':
        x, y = remove_incomplete_samples(x, y)
        
    # count how often each class of y occurs
    class_counts = Counter(y)
    
    # count how often each (x class, y class) pair occurs
    pair_counts = Counter(list(zip(x, y)))
    
    # number of observations, taken from the y counts
    n_obs = sum(class_counts.values())
    
    # accumulate p(x,y) * log(p(y) / p(x,y)) over every observed pair
    result = 0.0
    for (_, y_class), pair_count in pair_counts.items():
        joint_prob = pair_count / n_obs
        marginal_prob = class_counts[y_class] / n_obs
        result += joint_prob * math.log(marginal_prob / joint_prob, log_base)
        
    # return conditional entropy
    return result

Create function to compute Theil's U.

**Sources:** 
* https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
* https://github.com/shakedzy/dython/blob/master/dython/nominal.py

In [None]:
# create function to compute Theil's U
def theils_u(x: object,
             y: object,
             nan_strategy = 'replace',
             nan_replace_value = 0.0):
    
    '''
    Computes Theil's U (the Uncertainty coefficient) for categorical-
    categorical association, i.e. the uncertainty of x given y. The value
    lies in [0,1]: 0 means y tells nothing about x, 1 means y fully
    determines x.
    The coefficient is asymmetric: U(x,y) != U(y,x)
    Wikipedia: https://en.wikipedia.org/wiki/Uncertainty_coefficient
    
    Parameters:
    -----------
    x : list / NumPy ndarray / Pandas Series / A sequence of categorical measurements
    y : list / NumPy ndarray / Pandas Series / A sequence of categorical measurements
    nan_strategy : string, default = 'replace'
        Missing value handling: 'drop' removes samples with missing values,
        'replace' substitutes nan_replace_value for them. Any other string
        leaves x and y untouched. Missing values are None and np.nan.
    nan_replace_value : any, default = 0.0
        Substitute for missing values. Only used when nan_strategy is
        'replace'.
        
    Returns:
    --------
    float in the range of [0,1]
    '''
    
    # apply the requested null handling strategy
    if nan_strategy == 'replace':
        x, y = replace_nan_with_value(x, y, nan_replace_value)
    elif nan_strategy == 'drop':
        x, y = remove_incomplete_samples(x, y)
        
    # conditional entropy of x given y
    cond_entropy = conditional_entropy(x, y)
    
    # class frequencies of x
    x_counts = Counter(x)
    
    # number of observations in x
    n_obs = sum(x_counts.values())
    
    # relative frequency of each class of x
    x_probs = [count / n_obs for count in x_counts.values()]
    
    # entropy of x
    x_entropy = ss.entropy(x_probs)
    
    # return 1 if the entropy of x is 0
    if x_entropy == 0:
        return 1.
    
    # calculate Theil's U
    u = (x_entropy - cond_entropy) / x_entropy
    
    # values just outside [0, 1] are snapped back onto the nearest bound
    slightly_out_of_range = -1e-13 <= u < 0. or 1. < u <= 1.+1e-13
    
    if not slightly_out_of_range:
        
        # return Theil's U unchanged
        return u
    
    # pick the bound to round to
    rounded_u = 0. if u < 0 else 1.
    
    # warn about the rounding and return the rounded Theil's U
    warnings.warn(f'Rounded U = {u} to {rounded_u}. This is probably due to floating point precision issues.',RuntimeWarning)
    return rounded_u

## Functions to Compute Numerical/Categorical Relationships <a id="section7"></a>

**Sources:** 
* https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
* https://github.com/shakedzy/dython/blob/master/dython/nominal.py

[Return to Top](#return)

Create a function that runs the correlation ratio function defined below.

In [None]:
# create function to run the correlation ratio function below
def run_corr_ratio_func(data: object
                       ,print_flag: bool = None
                       ,file_name_flag: bool = None
                       ,data_name_var: str = None
                       ,output_dir: str = None
                       ):
    
    '''
    Function to compute the correlation ratio for every categorical/numerical column pair and either print or save the results
    
    param data: pandas dataframe
          print_flag: boolean flag; when truthy the results are printed within the notebook, otherwise they are written to a csv file
          file_name_flag: boolean flag; determines whether or not to remove last period and all subsequent characters from a string (i.e. '.csv')
          data_name_var: string containing data source name.  typically the file name or database table name
          output_dir: string containing output data directory
    '''
    
    # collect the categorical column names
    cat_col = list(data.select_dtypes(include=['string', 'object', 'category', 'boolean']).columns)

    # collect the numerical column names
    num_col = list(data.select_dtypes(include=['float64', 'int64']).columns)
    
    # nothing to compute unless both kinds of columns are present
    if not cat_col or not num_col:
        print('Need both categorical and numerical data to compute correlation ratio.')
        return
    
    # run the correlation ratio function on every (categorical, numerical) column pair
    results = [
        (cat_name, num_name, correlation_ratio(data[cat_name], data[num_name]))
        for cat_name, num_name in itertools.product(cat_col, num_col)
    ]
        
    # create dataframe
    df = pd.DataFrame(results, columns = ['x1', 'x2', 'corr_ratio'])
    
    # print the dataframe and stop when the print flag is set
    if print_flag:
        print_full(df)
        return
        
    # strip the last period and everything after it (example: '.pkl' or '.csv') when the name comes from a flat file
    if '.' in data_name_var and file_name_flag:
        file_title = data_name_var.rpartition('.')[0]
    else:
        file_title = data_name_var

    # save to csv
    df.to_csv(output_dir + f'04-{file_title}-cat_num_relationships_{date.today()}.csv', index = False)

Create function to convert data into useable format for the correlation ratio function.

**Sources:** 
* https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
* https://github.com/shakedzy/dython/blob/master/dython/nominal.py

In [None]:
# create function to convert data into useable format for the correlation_ratio ratio
def convert(data: object, to: str, copy: bool = True):
    
    '''
    Function to convert data into the container type requested by the correlation ratio function
    
    Supported conversions:
        to = 'array':     numpy array / pandas series / list / pandas dataframe -> numpy array
        to = 'list':      list / pandas series / numpy array -> list
        to = 'dataframe': pandas dataframe / numpy array -> pandas dataframe
    
    param data: list / NumPy ndarray / Pandas Series / Pandas DataFrame
          to: string naming the target type ('array', 'list' or 'dataframe')
          copy: boolean flag; when data already has the target type, return a copy (True) or the same object (False)
          
    returns the converted data
    
    raises ValueError: if to is not one of the supported target types
           TypeError: if data's type cannot be converted to the requested target type
    '''
    
    # set converted to None
    converted = None
    
    # check if to string is 'array'
    if to == 'array':
        
        # check for type of data and convert it to a numpy array
        if isinstance(data, np.ndarray):
            converted = data.copy() if copy else data
        elif isinstance(data, pd.Series):
            converted = data.values
        elif isinstance(data, list):
            converted = np.array(data)
        elif isinstance(data, pd.DataFrame):
            # .values is a property, not a method; calling it would raise a TypeError
            converted = data.values
            
    # check if to string is 'list'
    elif to == 'list':
        
        # check for type of data and convert it to a list
        if isinstance(data, list):
            converted = data.copy() if copy else data
        elif isinstance(data, pd.Series):
            converted = data.values.tolist()
        elif isinstance(data, np.ndarray):
            converted = data.tolist()
            
    # check if to string is 'dataframe'
    elif to == 'dataframe':
        
        # check for type of data and convert it to a dataframe
        if isinstance(data, pd.DataFrame):
            converted = data.copy(deep=True) if copy else data
        elif isinstance(data, np.ndarray):
            converted = pd.DataFrame(data)
            
    # raise ValueError
    else:
        raise ValueError("Unknown data conversion: {}".format(to))
        
    # check if converted is None and raise TypeError; otherwise return converted
    if converted is None:
        raise TypeError('cannot handle data conversion of type: {} to {}'.format(type(data), to))
    else:
        return converted

Create function to compute Correlation Ratio.

**Sources:** 
* https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
* https://github.com/shakedzy/dython/blob/master/dython/nominal.py

In [None]:
# create function to compute correlation ratio
def correlation_ratio(categories: object
                      ,measurements: object
                      ,nan_strategy: str = 'replace'
                      ,nan_replace_value: int = 0.0):
    
    '''
    Computes the Correlation Ratio (sometimes written with the greek letter
    Eta) for categorical-continuous association.
    It answers the question: given a continuous measurement, can the
    category it belongs to be identified?
    The value lies in [0,1]: 0 means the category cannot be determined from
    the measurement, 1 means it can be determined with absolute certainty.
    Wikipedia: https://en.wikipedia.org/wiki/Correlation_ratio
    
    Parameters:
    -----------
    categories : list / NumPy ndarray / Pandas Series / A sequence of categorical measurements
    measurements : list / NumPy ndarray / Pandas Series / A sequence of continuous measurements
    nan_strategy : string, default = 'replace'
        Missing value handling: 'drop' removes samples with missing values,
        'replace' substitutes nan_replace_value for them. Any other string
        leaves the inputs untouched. Missing values are None and np.nan.
    nan_replace_value : any, default = 0.0
        Substitute for missing values. Only used when nan_strategy is
        'replace'.
        
    Returns:
    --------
    float in the range of [0,1]
    '''
    
    # apply the requested null handling strategy
    if nan_strategy == 'replace':
        categories, measurements = replace_nan_with_value(categories, measurements, nan_replace_value)
    elif nan_strategy == 'drop':
        categories, measurements = remove_incomplete_samples(categories, measurements)
        
    # make sure both inputs are numpy arrays
    categories = convert(categories, 'array')
    measurements = convert(measurements, 'array')
    
    # encode each category as an integer code
    codes, _ = pd.factorize(categories)
    
    # number of distinct categories (highest code plus 1)
    n_groups = np.max(codes) + 1
    
    # per-category mean and size, filled in below
    group_means = np.zeros(n_groups)
    group_sizes = np.zeros(n_groups)
    
    # collect the size and the mean of the measurements belonging to each category
    for group in range(n_groups):
        
        members = measurements[np.argwhere(codes == group).flatten()]
        group_sizes[group] = len(members)
        group_means[group] = np.average(members)
        
    # overall mean, weighted by category size
    grand_mean = np.sum(group_means * group_sizes) / np.sum(group_sizes)
    
    # between-category sum of squares (numerator) and total sum of squares (denominator)
    between_ss = np.sum(group_sizes * np.power(group_means - grand_mean, 2))
    total_ss = np.sum(np.power(measurements - grand_mean, 2))
    
    # no between-category variation means no association
    if between_ss == 0:
        return 0.
    
    # compute correlation ratio
    eta = np.sqrt(between_ss / total_ss)
    
    # a correlation ratio just above 1 is snapped back to 1
    if 1. < eta <= 1.+1e-13:
        warnings.warn(f'Rounded eta = {eta} to 1. This is probably due to floating point precision issues.', RuntimeWarning)
        return 1.
    
    # return correlation ratio
    return eta

## Function to Plot Dates <a id="section8"></a>

[Return to Top](#return)

In [None]:
# create function to plot dates
def _draw_date_counts(data: object, column: str):
    
    '''
    Helper to draw the number of rows per date of one datetime column on the current matplotlib figure
    
    param data: pandas dataframe
          column: name of the datetime column to plot
    '''
    
    # group by date and create counts
    date_df = data.groupby(column).agg(count = (column, 'count')).sort_values(column).reset_index()
    
    # create date plot; plt.plot with the 'o' marker format replaces the deprecated plt.plot_date, whose default format was 'o'
    plt.plot(date_df[column], date_df['count'], 'o', linestyle='solid')
    
    # format dates and layout
    plt.gcf().autofmt_xdate()
    date_format = mpl_dates.DateFormatter('%d-%m-%Y')
    plt.gca().xaxis.set_major_formatter(date_format)
    plt.tight_layout()
    
    # create plot title and x/y labels
    plt.title(f'{column} Distributions')
    plt.xlabel('Date')
    plt.ylabel('Count')


def plot_dates(data: object
              ,print_flag: bool = None
              ,file_name_flag: bool = None
              ,data_name_var: str = None
              ,output_dir: str = None
              ):
    
    '''
    Function to plot, for every datetime column, the number of rows per date
    
    The plots are either shown within the notebook or written to a single pdf file (one page per datetime column)
    
    param data: pandas dataframe
          print_flag: boolean flag; determines whether or not to print output within notebook
          file_name_flag: boolean flag; determines whether or not to remove last period and all subsequent characters from a string (i.e. '.csv')
          data_name_var: string containing data source name.  typically the file name or database table name
          output_dir: string containing output data directory
    '''
    
    # create list of date columns
    columns = list(data.select_dtypes(include=['datetime64[ns]']).columns)
    
    # check if date columns exist
    if not columns:
        print('No dates.')
        return
    
    # check for print flag
    if print_flag:
        
        # loop through each datetime column and show its distribution
        for column in columns:
            _draw_date_counts(data, column)
            plt.show()
            
        return
    
    # if there's a period in the data_name_var and if it's a flat file then remove the last period and all subsequent characters (example: '.pkl' or '.csv)
    file_title = data_name_var.rpartition('.')[0] if '.' in data_name_var and file_name_flag else data_name_var
    
    # the context manager closes the pdf object even if plotting one of the columns fails
    with PdfPages(output_dir + f'05-{file_title}-date_plots_{date.today()}.pdf') as date_pdf_obj:
        
        # create plot for each date variable
        for column in columns:
            
            # create figure object
            date_fig = plt.figure()
            
            try:
                
                # draw the plot and save the figure as a new page
                _draw_date_counts(data, column)
                date_pdf_obj.savefig(date_fig, bbox_inches='tight', pad_inches=0.0)
                
            finally:
                
                # close plot so figures do not accumulate
                plt.close(date_fig)