In [1]:
import os, sys
import numpy as np
import pandas as pd
import scipy as sp
from datetime import datetime
import matplotlib.pyplot as plt
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import train_test_split

In [2]:
# Directory holding the input CSV files, relative to this notebook.
# The read_csv calls below join a file name onto this path.
main_root = "../../data/main/"

In [3]:
# Load the variable-description table, keep only rows that have an `order`
# value, index the result by variable name, and discard the `order` column.
caption = (
    pd.read_csv(os.path.join(main_root, "var_des.csv"))
    .loc[lambda frame: frame['order'].notna()]
    .set_index('var_name')
    .drop(columns='order')
)

In [4]:
# Full sample: one row per (id, date) pair.
main = pd.read_csv(os.path.join(main_root, "pred.csv")).set_index(['id', 'date'])

# Per-variable count / mean / std from describe(), plus the percentage of rows
# for which describe() counted no value.
n_rows = main.shape[0]
des = (
    main.describe()
    .T[['count', 'mean', 'std']]
    .assign(missing=lambda stats: 100 * (1 - stats['count'] / n_rows))
)

# Attach the statistics to the description table; rows are matched on the index.
caption = caption.join(des)

In [5]:
# Distinct stations (id) and days (date) in the index of `main`.
station_count = len(main.index.get_level_values('id').unique())
day_count = len(main.index.get_level_values('date').unique())

# LaTeX for the "Full Sample" header row. Doubled braces are literal braces
# for str.format; the two {:d} fields receive the counts above.
full_sample_header = "\\multicolumn{{7}}{{l}}{{\\textbf{{Full Sample, {:d} Stations, {:d} Days}}}}\\\\ \\midrule &"
caption.loc['full-sample', 'Description'] = full_sample_header.format(station_count, day_count)

In [6]:
# Training sample: replaces `main` with the train/test file, indexed by (id, date).
main = pd.read_csv(os.path.join(main_root, "train_test.csv")).set_index(['id', 'date'])

# Only the columns whose name starts with "target_" are summarised here.
is_target = main.columns.str.startswith('target_')
n_obs = main.shape[0]
des = (
    main.loc[:, is_target]
    .describe()
    .T[['count', 'mean', 'std']]
    .assign(missing=lambda stats: 100 * (1 - stats['count'] / n_obs))
)

# Overwrite the matching rows/columns of `caption` in place with these values.
caption.update(des)

In [7]:
# Distinct stations (id) and days (date) in the index of `main`.
train_station_count = len(main.index.get_level_values('id').unique())
train_day_count = len(main.index.get_level_values('date').unique())

# LaTeX for the "Training Sample" header row. Doubled braces are literal
# braces for str.format; the two {:d} fields receive the counts above.
train_sample_header = "\\midrule \\multicolumn{{7}}{{l}}{{\\textbf{{Training Sample, {:d} Stations, {:d} Days}}}}\\\\ \\midrule &"
caption.loc['train-sample', 'Description'] = train_sample_header.format(train_station_count, train_day_count)

In [8]:
# Skip the first two rows of the table and remove the helper `count` column.
# NOTE(review): which two rows these are depends on the ordering in var_des.csv - confirm.
leading_rows_to_skip = 2
caption = caption[leading_rows_to_skip:].drop(columns=['count'])

In [9]:
# Round the numeric summary columns to two decimals.
for stat_col in ('mean', 'std', 'missing'):
    caption[stat_col] = caption[stat_col].round(2)

# Turn std into "(x.xx)" and missing into "x.xx\%" strings; null entries are
# left untouched so they can be blanked out when the table is written.
# Fixed-point formatting is used instead of str(): str() drops trailing zeros
# (0.5 -> "0.5", 3.0 -> "3.0"), which gave cells in the same column differing
# numbers of decimals.
caption['std'] = caption['std'].apply(
    lambda x: x if pd.isnull(x) else "({:.2f})".format(x))
caption['missing'] = caption['missing'].apply(
    lambda x: x if pd.isnull(x) else "{:.2f}\\%".format(x))

In [10]:
# Column headings for the printed table, assigned by position.
# The list must have exactly one entry per column of `caption`.
table_headers = [
    'Description',
    'Kernel',
    'Moving Average',
    'Adjustment',
    'Mean',
    '(Std. Dev.)',
    'Missing',
]
caption.columns = table_headers

In [11]:
# Raise the global column-width display limit before exporting.
# NOTE(review): presumably this keeps the long \multicolumn strings from being truncated - confirm.
pd.set_option('display.max_colwidth', 1000)

# Write the table as LaTeX: no index column, cell contents emitted unescaped
# (they already contain LaTeX markup), nulls as empty cells, one left-aligned
# column followed by six centred ones.
latex_path = "../../draft/descriptives_output.tex"
latex_options = dict(index=False, escape=False, na_rep="", column_format="lcccccc")
caption.to_latex(latex_path, **latex_options)

In [13]:
# Edit the exported LaTeX file in place with sed: every run of seven "&"
# separators, each followed by whitespace, that ends in the LaTeX row
# terminator (two backslashes) is replaced by a single space.
# NOTE(review): this appears intended to strip the empty trailing cells after the \multicolumn header rows - confirm.
# NOTE(review): "\+" in a basic regex is a GNU sed extension, so this likely requires GNU sed - confirm on the target platform.
! sed -i -e 's/\&[[:space:]]\+\&[[:space:]]\+\&[[:space:]]\+\&[[:space:]]\+\&[[:space:]]\+\&[[:space:]]\+\&[[:space:]]\+\\\\/ /g' ../../draft/descriptives_output.tex

In [14]:
# Edit the exported LaTeX file in place with sed: every run of six "&"
# separators, each followed by whitespace, that ends in the LaTeX row
# terminator (two backslashes) is replaced by a single space.
# NOTE(review): presumably this catches rows whose six trailing cells are all empty - confirm.
# NOTE(review): "\+" in a basic regex is a GNU sed extension, so this likely requires GNU sed - confirm on the target platform.
! sed -i -e 's/\&[[:space:]]\+\&[[:space:]]\+\&[[:space:]]\+\&[[:space:]]\+\&[[:space:]]\+\&[[:space:]]\+\\\\/ /g' ../../draft/descriptives_output.tex