# Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np
import pickle

import seaborn as sns
import matplotlib.pyplot as plt
sns.set(palette='husl',
        rc={'figure.figsize':(11.7,8.27)})

%matplotlib inline

In [None]:
# Location of the merged Excel workbook, relative to the notebook's working directory.
file_path = '../data/MergedData2014-2016.xlsx'

# Map the verbose spreadsheet headers to short, code-friendly column names.
short_column_names = {
    '# items demanded': 'demand',
    'Avg temp in 0.1oC': 'temperature',
    'Rainfall in 24h in 0.1mm': 'precipitation',
}

# Read the sheet with its first column as the index and 'Date' parsed as
# datetimes, then apply the renames in the same chain (no in-place mutation).
data = (
    pd.read_excel(file_path, parse_dates=['Date'], index_col=0)
    .rename(columns=short_column_names)
)

In [None]:
# Lagged demand features: the demand value 7 and 14 rows earlier.
# NOTE(review): shift() moves by rows, so these are "7/14 days ago" only if
# the index holds exactly one row per calendar day -- confirm.
data['t-7'] = data['demand'].shift(7)
data['t-14'] = data['demand'].shift(14)

# Weekday name (e.g. 'Monday') as an extra string column.
# DatetimeIndex.weekday_name was removed in pandas 1.0; day_name() is its
# replacement (available since pandas 0.23).
data['day of week'] = data.index.day_name()

## Average demand (with standard deviation) per weekday

In [None]:
# Plot mean demand per weekday with standard-deviation error bars
with sns.axes_style("white"):
    # Set up the matplotlib figure
    f, g = plt.subplots(figsize=(15, 12))
    # By default seaborn draws a bootstrapped 95% confidence interval of the
    # mean; request the standard deviation explicitly so the error bars match
    # the title. `ci='sd'` is accepted by every seaborn release that has
    # scatterplot (used later in this notebook); on seaborn >= 0.12 it emits a
    # deprecation warning and the equivalent spelling is `errorbar='sd'`.
    sns.barplot(x=data['day of week'], y=data['demand'], ci='sd', ax=g)
    g.set_title("Mean and std dev demand per weekday")
    plt.xticks(rotation=30)
#     g.figure.savefig('../figs/avgdemand-per-weekday.svg', 
#                      format='svg', dpi=250);

## Boxplot per variable

In [None]:
# Scale temperature and precipitation so they fit on a similar range as demand.
# The divisor 25 is an ad-hoc display scaling; the scaled columns are added to
# `data` next to the raw ones.
data['scaled_temperature'] = data['temperature'] / 25
data['scaled_precipitation'] = data['precipitation'] / 25

# Columns to draw boxplots for
cols_boxplot = [
    'demand',
    'scaled_temperature',
    'scaled_precipitation',
]

# Plot (and optionally save) the figure
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(15, 12))
    # Draw on the axes created above and keep `ax` bound to that Axes object;
    # chaining .set_title() onto the seaborn call would rebind it to the
    # title's Text artist instead.
    sns.boxplot(data=data.loc[:, cols_boxplot], palette='husl', ax=ax)
    # Single-line literal: a backslash continuation inside the string would
    # embed the next line's indentation in the title.
    ax.set_title("Boxplot for demand, temperature and precipitation")
    plt.show()
#     ax.figure.savefig('../figs/boxplots.png', format='png', dpi=250);

## Correlation plot between variables

In [None]:
# Columns to include in the correlation plot
cols_correlation = [
    'demand',
    'temperature',
    'precipitation',
    't-7',
    't-14',
]

# Pairwise correlation matrix of the selected columns
corr = data.loc[:, cols_correlation].corr()

# Boolean mask that hides the upper triangle and the diagonal, so each pair
# of variables is shown exactly once.
mask = np.triu(np.ones_like(corr, dtype=bool))

with sns.axes_style("white"):
    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(15, 12))
    # Draw on the axes created above and keep `ax` bound to that Axes object;
    # chaining .set_title() onto the seaborn call would rebind it to the
    # title's Text artist instead.
    # vmax=.3 caps the colour scale: correlations above 0.3 share one colour.
    sns.heatmap(corr, mask=mask, vmax=.3, square=True, cmap="YlGnBu", ax=ax)
    ax.set_title("Correlation heatmap between variables")
    plt.show()
#     ax.figure.savefig('../figs/corr_heatmap.png', format='png', dpi=250);

## Inspect highest absolute correlation in scatter plot

In [None]:
# Scatter plot of demand against temperature
with sns.axes_style("white"):
    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(15, 12))
    # Draw on the axes created above and keep `ax` bound to that Axes object;
    # chaining .set_title() onto the seaborn call would rebind it to the
    # title's Text artist instead.
    sns.scatterplot(x=data['temperature'], y=data['demand'], ax=ax)
    ax.set_title("Scatterplot demand and temperature")
    plt.show()
#     ax.figure.savefig('../figs/scatter_dem_temp.png', format='png');

## Visualize the search space for _Z_

In [None]:
# Plot behaviour of cost for different values of z
# Load the stored results through a context manager so the file handle is
# closed deterministically.
# NOTE: unpickling can execute arbitrary code -- only load a file that was
# produced by a trusted source.
with open('./z_results_df.p', 'rb') as results_file:
    z_results = pickle.load(results_file)

with sns.axes_style("white"):
    # Set up the matplotlib figure
    f, g = plt.subplots(figsize=(15, 12))
    # The x values are hard-coded as z = 0.5, 1.0, ..., 5.5 (11 values).
    # NOTE(review): assumes z_results['Cost'] holds exactly one row per value,
    # in this order -- confirm against the code that wrote the pickle.
    sns.barplot(y=z_results['Cost'], x=list(np.arange(0.5, 6, 0.5)), ax=g)
    g.set_title("Cost function for different values of Z")
    g.set_xlabel('Value for safety factor (z)')
    plt.xticks(rotation=30)
#     g.figure.savefig('../figs/cost_against_z.png', 
#                      format='png', dpi=200);