## Basic EDA using pandas built-in functions

In [1]:
# only run once!
# Move the working directory up one level, guarded so that re-running this
# cell does not keep climbing the directory tree.
import os
# `hasrun` is undefined on the first execution, so evaluating it raises
# NameError and the except branch runs; afterwards the name exists and the
# chdir is skipped.
try: hasrun
except NameError:
    os.chdir("../")
    hasrun = True

# Bare expression: evaluates to the current working directory (presumably so
# the notebook displays it as the cell output).
os.getcwd()

In [2]:
import mkdocs_gen_files
import io
import pandas as pd

# Format floats in pandas output with thousands separators and two decimals.
pd.set_option("display.float_format", "{:,.2f}".format)

In [56]:
# Each helper below returns a Markdown-compatible string for one DataFrame summary.
def get_dfinfo(df: pd.DataFrame) -> str:
    """Return the ``df.info()`` report as a fenced Markdown section.

    ``DataFrame.info`` writes to a stream instead of returning text, so the
    report is captured in an in-memory buffer and then wrapped in a code
    fence under an "Info" heading.
    """
    with io.StringIO() as sink:
        df.info(buf=sink)
        summary = sink.getvalue()
    return f"## Info \n\n```\n{summary}\n```\n"

def get_head(df: pd.DataFrame) -> str:
    """Return the first rows of ``df`` (``df.head()``) as a fenced Markdown section."""
    preview = df.head().to_string()
    return f"## Table Head \n\n``` \n{preview}\n```\n\n"

def get_tail(df: pd.DataFrame) -> str:
    """Return the last rows of ``df`` (``df.tail()``) as a fenced Markdown section."""
    pieces = ("## Table Tail \n\n``` \n", df.tail().to_string(), "\n```\n\n")
    return "".join(pieces)

def get_info(df: pd.DataFrame) -> str:
    """Return the ``df.info()`` report as a fenced Markdown section.

    The previous body called ``get_info`` itself, so every call recursed
    until ``RecursionError``. The report is now captured directly from
    ``DataFrame.info`` and laid out like ``get_nan`` / ``get_unique``:
    heading, fenced block, two trailing blank lines.

    Args:
        df: DataFrame to summarise.

    Returns:
        Markdown text: an "Info" heading followed by the fenced info report.
    """
    # DataFrame.info prints to a stream rather than returning a string, so
    # capture it in an in-memory buffer.
    buffer = io.StringIO()
    df.info(buf=buffer)

    doc = "## Info \n"
    doc += "```\n" + buffer.getvalue() + "\n```\n"
    doc += "\n\n"
    return doc

def get_describe(df: pd.DataFrame) -> str:
    """Return the ``df.describe()`` statistics table as a fenced Markdown section."""
    stats_table = df.describe().to_string()
    return "## Describe \n\n```\n" + stats_table + "\n```\n\n"
    
def get_nan(df: pd.DataFrame) -> str:
    """Return the per-column missing-value counts as a fenced Markdown section."""
    missing_per_column = df.isna().sum()
    return f"## NaN counts \n```\n{missing_per_column.to_string()}\n```\n\n\n"
    
def get_unique(df: pd.DataFrame) -> str:
    """Return the per-column distinct-value counts as a fenced Markdown section."""
    distinct_per_column = df.nunique().to_string()
    return "".join(
        ["## Unique Values \n", "```\n", distinct_per_column, "\n```\n", "\n\n"]
    )


In [57]:
def create_basic_data_doc(dataset_name: str):
    """Build the Markdown EDA report for one dataset and write it to a docs file.

    The dataset is loaded by name from ``catalog``, which is not defined in
    this file (presumably injected by the notebook environment; confirm).
    The report is a title line followed by the info, head, tail, describe,
    NaN-count and unique-count sections, in that order. It is written to
    ``data/<dataset_name>.md`` through ``mkdocs_gen_files.open``.
    """
    frame = catalog.load(dataset_name)

    # Section order determines the layout of the generated page.
    section_builders = (
        get_dfinfo,
        get_head,
        get_tail,
        get_describe,
        get_nan,
        get_unique,
    )
    title = f"# pandas.DataFrame: {dataset_name} \n"
    report = title + "".join(build(frame) for build in section_builders)

    target = f"data/{dataset_name}.md"
    with mkdocs_gen_files.open(target, "w") as out:
        print(report, file=out)
    

In [58]:
# Generate the Markdown EDA page for each dataset name passed below.
create_basic_data_doc("companies")

In [59]:
create_basic_data_doc("shuttles")

In [60]:
create_basic_data_doc("reviews")

In [61]:
# NOTE(review): "companies" was already processed by the first call above;
# this repeat targets the same data/companies.md path. Confirm whether the
# re-run is intentional or a leftover.
create_basic_data_doc("companies")

In [62]:
create_basic_data_doc("model_input_table")