# Project Report
Notebook that generates the images used in the report.

In [None]:
from days_statistics import DaysStatistics
from data_helper import DataHelper
import pickle
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
from matplotlib.ticker import (AutoMinorLocator, MultipleLocator)
import matplotlib._color_data as mcd
%matplotlib inline
# Shared DataHelper instance; later cells read its csv_path attribute and
# ask it for the full / training / testing / validation day lists.
dh = DataHelper()

In [None]:
def prepare_plot(size=(16,9), ax=None, y_max=250, every_hour=True, no_y_change=False):
    """Create (or configure) an axes with the common layout of the report plots.

    Args:
        size: Figure size handed to ``plt.subplots``. Only used when ``ax``
            is None; it is ignored when an existing axes is supplied.
        ax: Existing axes to configure. When None a new figure and axes
            are created.
        y_max: Upper limit of the y-axis; the lower limit is always 0.
        every_hour: When true, major x ticks are placed every 12 positions
            with 4 minor subdivisions; otherwise every 24 positions with
            9 minor subdivisions.
        no_y_change: When true, the y-axis tick locators are left as they
            are; otherwise major ticks every 50 with 5 minor subdivisions.

    Returns:
        The configured axes (the one passed in, or the newly created one).
    """
    if ax is None:
        _, ax = plt.subplots(figsize=size)

    # Fixed x window. With the 5-minute slot indexing used elsewhere in this
    # notebook (12 slots per hour) positions 60..276 span 05:00 to 23:00.
    ax.set_xlim(60, 276)
    ax.set_ylim(0, y_max)

    if not no_y_change:
        ax.yaxis.set_major_locator(MultipleLocator(50))
        ax.yaxis.set_minor_locator(AutoMinorLocator(5))

    # Only the tick spacing differs between the two x-axis modes.
    major_step, minor_divisions = (12, 4) if every_hour else (24, 9)
    ax.xaxis.set_major_locator(MultipleLocator(major_step))
    ax.xaxis.set_minor_locator(AutoMinorLocator(minor_divisions))

    # Grid on both major and minor ticks; minor lines are dotted so they
    # stay visually lighter than the dashed major lines.
    ax.grid(which='major', color='#CCCCCC', linestyle='--')
    ax.grid(which='minor', color='#CCCCCC', linestyle=':')
    ax.set_xlabel('Time [hour:minute]', fontsize=14)
    ax.set_ylabel('Pool attendance', fontsize=14)
    return ax

In [None]:
def plot_attandace_vs(data, column):
    """Average the ``pool`` value per 5-minute slot for each value of ``column``.

    Despite its name this function does not draw anything; it only builds
    the averages that a later cell plots.

    Args:
        data: DataFrame with at least the columns ``time`` (string in
            ``'%Y-%m-%d %H:%M:%S'`` format), ``day_of_week``, ``pool`` and
            ``column``.
        column: Name of the column whose unique values define the groups.

    Returns:
        A dict keyed by the unique values of ``column`` (inserted in
        ascending order). Each entry is a list of 288 numbers, one per
        5-minute slot of a day, holding the mean ``pool`` of the rows that
        fell into that slot; slots without any counted row stay 0.
        Returns None, after printing a message, when ``column`` has more
        than 10 unique values.
    """
    values = data[column].unique()
    n_values = len(values)
    if n_values > 10:
        print('Too many values to plot. Consider clustering. Number of unique values for %s is %d'%(column, n_values))
        return

    # Dates that are left out of the averages.
    bad_dates = ['2018-02-20','2018-06-05','2018-06-06','2018-06-07','2018-06-08','2018-06-11',
                 '2018-06-12','2018-06-13','2018-06-14','2018-09-05','2018-03-17','2018-05-05',
                 '2018-06-10','2018-12-01']
    values.sort()

    # Per group: running sum of pool values and number of rows, per slot.
    histogram = {value: [0]*288 for value in values}
    counts = {value: [0]*288 for value in values}

    for _, row in data.iterrows():
        ts = datetime.strptime(row['time'], '%Y-%m-%d %H:%M:%S')
        slot_id = ts.hour*12 + ts.minute//5
        if not (row['day_of_week'] < 9 and row['pool'] > 0):
            continue
        if ts.strftime('%Y-%m-%d') in bad_dates:
            continue
        group = row[column]
        histogram[group][slot_id] += row['pool']
        counts[group][slot_id] += 1

    # Turn the sums into means; slots that never received a row keep their 0.
    for value in values:
        histogram[value] = [
            total / hits if hits > 0 else total
            for total, hits in zip(histogram[value], counts[value])
        ]

    return histogram

# Load the CSV referenced by the data helper and build the per-slot
# averages grouped by the 'day_of_week' column.
df = pd.read_csv(dh.csv_path)
histogram = plot_attandace_vs(data=df, column='day_of_week')

In [None]:
# x-axis labels: one 'HH:MM' string per 5-minute slot of a day (288 in
# total). Starting five minutes before midnight makes the first label
# '00:00'; the calendar date itself is irrelevant.
a = datetime(2000, 1, 1, 23, 55)
y = []
for _ in range(288):
    a = a + timedelta(minutes=5)
    y.append(a.strftime('%H:%M'))

days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# NOTE(review): n_start / n_stop are not used by any cell visible here.
n_start = 60
n_stop = 275
ax = prepare_plot()
# Labels are assigned by position, so this relies on `histogram` holding
# exactly the seven weekdays in ascending order, starting with Monday.
for i, averages in enumerate(histogram.values()):
    ax.plot(y, averages, label=days[i])

ax.legend(prop={'size': 18})
plt.savefig('averages.png', dpi=300, bbox_inches='tight')

In [None]:
# One panel per month in a 6x2 grid, filled row by row. Each panel shows
# the two curves returned by get_average_for_month for the flag values
# False and True (presumably weekday vs. weekend averages -- confirm
# against DaysStatistics).
ds = DaysStatistics()
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
fig, axs = plt.subplots(6, 2, figsize=(16,25), gridspec_kw={'hspace': 0.4})
for i, month_name in enumerate(months):
    # Two panels per grid row: the index maps to (row, column).
    row, column = divmod(i, 2)
    panel = axs[row, column]
    data = ds.get_average_for_month(i, False)
    data_weekend = ds.get_average_for_month(i, True)
    prepare_plot((8,3), panel, 300, False)
    panel.set_title(month_name, fontsize=14)
    l1 = panel.plot(y, data)
    l2 = panel.plot(y, data_weekend)
plt.savefig('monthly_averages.png', dpi=300, bbox_inches='tight')

In [None]:
# All twelve months overlaid in two stacked panels: the upper one gets the
# curves fetched with flag False, the lower one those fetched with flag
# True. Only the lower panel carries the month legend; the colors are
# shared between both panels.
ds = DaysStatistics()
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
colors = ['#1f77b4','#ff7f0e','#2ca02c','#d62728','#9467bd','#8c564b','#e377c2','#7f7f7f','#bcbd22','#17becf','#000000','#FFD700']

fig, axs = plt.subplots(2, 1, figsize=(16,16))
ax_weekday = prepare_plot(size=(16,7), ax=axs[0])
ax_weekday.set_title('Weekday monthly average attendance', fontsize=14)
ax_weekend = prepare_plot(size=(16,9), ax=axs[1], y_max=300)
ax_weekend.set_title('Weekend monthly average attendance', fontsize=14)

for i, (month_name, month_color) in enumerate(zip(months, colors)):
    data = ds.get_average_for_month(i, False)
    data_weekend = ds.get_average_for_month(i, True)

    ax_weekday.plot(y, data, color=month_color)
    ax_weekend.plot(y, data_weekend, color=month_color, label=month_name)

ax_weekend.legend(prop={'size': 18})
plt.savefig('monthly_averages_together.png', dpi=300, bbox_inches='tight')

In [None]:
# Fetch the day collections from the data helper. Note that `days` is
# rebound here: from this cell on it holds the helper's day objects, not
# the weekday names used by the earlier plotting cell.
days = dh.get_all_days_list()
# NOTE(review): the meaning of the False flag is defined in DataHelper and
# is not visible in this notebook.
train_days = dh.get_training_days(False)
test_days = dh.get_testing_days()
valid_days = dh.get_validation_days()

In [None]:
# Preview the data of one arbitrarily chosen day (index 17).
days[17].data.head()

In [None]:

# Summary statistics for the same sample day (index 17).
days[17].data.describe()

In [None]:
# Totals over every day in the dataset: number of rows, sum of the 'pool'
# column and the number of 'pool' entries (used for the overall average).
n_samples = pool = pool_n = 0

for day in days:
    frame = day.data
    pool_values = frame['pool']
    n_samples = n_samples + len(frame)
    pool = pool + sum(pool_values)
    pool_n = pool_n + len(pool_values)
    

In [None]:
# Print the dataset summary. The split counts are passed in the order the
# message names them: training, testing, validation.
print('There are %d days in complete dataset. %d training days, %d testing days and %d validation days'%(len(days), len(train_days), len(test_days), len(valid_days)))
print('There are %d data samples'%(n_samples))
# %d truncates the average to a whole number of people.
print('Average pool attendance is %d people'%(pool/pool_n))

In [None]:
# Stack the per-day frames into a single DataFrame.
all_df = [day.data for day in days]
big_df = pd.concat(all_df)

In [None]:
# Names of all columns carrying the 'reserved_' prefix, in column order.
reserved = [name for name in big_df.columns if name.startswith('reserved_')]
print(reserved)

In [None]:
# Collapse the individual 'reserved_*' columns into one 'reserved' total,
# then remove the originals from the frame.
big_df['reserved'] = 0
for name in reserved:
    big_df['reserved'] = big_df['reserved'] + big_df[name]
big_df.drop(labels=reserved, axis=1, inplace=True)
print(big_df.columns)

In [None]:
# Summary statistics of the combined frame.
big_df.describe()

In [None]:
# Average of 'lines_reserved' per time slot, kept separately for rows with
# day_of_week < 5 (weekday) and all remaining rows (weekend).
ds = DaysStatistics()
n_weekday = [0]*288
sums_weekday = [0]*288
n_weekend = [0]*288
sums_weekend = [0]*288

for _, row in big_df.iterrows():
    day_id = ds.get_list_id(row['hour'], row['minute'])
    # Choose which pair of accumulators this row contributes to.
    if row['day_of_week'] < 5:
        sums, counts = sums_weekday, n_weekday
    else:
        sums, counts = sums_weekend, n_weekend
    sums[day_id] += int(row['lines_reserved'])
    counts[day_id] += 1

# A slot whose sum is 0 stays at 0; a positive sum implies a positive
# count, so the division is safe.
avg_weekday = [s / c if s > 0 else 0 for s, c in zip(sums_weekday, n_weekday)]
avg_weekend = [s / c if s > 0 else 0 for s, c in zip(sums_weekend, n_weekend)]

In [None]:
# Plot the average number of reserved lines per slot, weekdays against
# weekend days. The y-axis tops out at 3 and keeps matplotlib's default
# tick placement, because the 50-step locator would be useless here.
ax = prepare_plot(size=(16,9), ax=None, y_max=3, no_y_change=True)
for series, series_label in ((avg_weekday, 'Weekdays'), (avg_weekend, 'Weekend days')):
    ax.plot(y, series, label=series_label)
ax.set_ylabel('Reserved lines', fontsize=14)
ax.legend(prop={'size': 18})
plt.savefig('avg_lines.png', dpi=300, bbox_inches='tight')