In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pylab import *
# Imported explicitly, and after the star import, so that `datetime` always
# refers to the standard-library module (used as `datetime.time` when plotting).
import datetime
import os
import seaborn as sns
from scipy import stats
import locale
locale.setlocale(locale.LC_ALL, '')

from sklearn import model_selection
from sklearn import preprocessing
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


#Functions we created:
from modules import dem_fx
from modules import transaction_fx as trns
from modules import plot_functions as plt_fx
from modules import ml_functions as ml_fx
from modules import time_fx as tm_fx
os.getcwd()

#Allows reload of modules:
%load_ext autoreload
%autoreload

#### Load the data

In [None]:
# Tab-separated tables read from the local `saved_structures` directory.
weekly_cart_df_filtered_labels = pd.read_csv(
    "saved_structures/weekly_cart_df_filtered_labels.csv",
    sep = '\t',
)
products = pd.read_csv(
    "saved_structures/updated_prod.csv",
    sep = "\t",
)

# Comma-separated tables read from the dunnhumby data directory.
hh_demographic = pd.read_csv('../data/dunnhumby_complete_csv/hh_demographic.csv', sep=',')
transaction_data = pd.read_csv('../data/dunnhumby_complete_csv/transaction_data.csv', sep=',')


In [None]:
# Preview the first five rows of the weekly cart table.
weekly_cart_df_filtered_labels.head(n=5)

In [None]:
# Preview the first five rows of the raw transaction table.
transaction_data.head(n=5)

#### Extract the time information for the transaction table

In [None]:
"""This cell is just here to get an idea of what the column 'TRANS_TIME' looks like.
We understand that it is an integer which represents the time of the day as follows:
2248 = 22h48
623 = 06h23
9 = 00h09
We thus have to transform these integers into meaningful time points."""

# Distinct raw TRANS_TIME values as a sorted Python list
# (print it to inspect the encoding of the column).
time = sorted(transaction_data.TRANS_TIME.unique().tolist())

In [None]:
# Zero-pad the raw TRANS_TIME values to four characters ("HHMM") so that they can
# be parsed with the '%H%M' format, e.g. 623 -> "0623" and 9 -> "0009".
# `str.zfill` pads the whole column at once; it replaces a Python loop over every
# row that took several minutes and relied on the index being 0..n-1.
# Values that already have four characters are left unchanged.
temp = transaction_data["TRANS_TIME"].astype("str").str.zfill(4)


In [None]:
# Keep the zero-padded string form next to the raw column.
transaction_data["TRANS_TIME_STR"] = temp

# Parse the "HHMM" strings, then keep only the time-of-day component.
parsed_timestamps = pd.to_datetime(transaction_data["TRANS_TIME_STR"], format='%H%M')
time = parsed_timestamps.dt.time
transaction_data["transaction_time_datetime"] = time

transaction_data.head()

In [None]:
"We plot the overall number of transactions depending on the time of the day."

# Histogram of every transaction's time of day, using 24 bins.
# The Axes returned by `hist` is kept so labels are set on that figure explicitly
# rather than through pyplot's implicit "current axes".
ax = transaction_data["transaction_time_datetime"].hist(bins = 24)
ax.set_title("Total number of transactions according to the time of the day")
ax.set_xlabel("Time of the day")
ax.set_ylabel("Number of transactions")

# One tick per hour, expressed as datetime.time values like the plotted column.
hours = [datetime.time(hour) for hour in range(24)]
plt.xticks(hours, rotation = 90)

# Dashed vertical line at the value returned by tm_fx.avg_time for the whole column.
plt.axvline(tm_fx.avg_time(transaction_data["transaction_time_datetime"]),
            color='red',
            linestyle='dashed',
            linewidth=1,
            label = "average transaction time")

# Trailing semicolon keeps the Legend object's repr out of the cell output.
plt.legend(loc = "upper left");

We can observe in the figure above that transactions occur mostly during the afternoon, on average around 17h. The time of day with the fewest transactions is around 5h in the morning.

#### Average transaction time per household

In [None]:
# For each household, collect all of its transaction times into one list.
time_per_hh = (
    transaction_data
    .groupby(transaction_data["household_key"])["transaction_time_datetime"]
    .apply(list)
)

# Single-column frame indexed by household_key.
time_per_hh_df = pd.DataFrame({"transaction_time_per_hh": time_per_hh})

# Placeholder columns; the real values are filled in by the next cell.
time_per_hh_df["avg_transaction_time"] = 'todo'
time_per_hh_df["avg_transaction_time_in_seconds"] = 'todo'

In [None]:
# Average transaction time of each household, and the same value in seconds.
# Whole columns are computed over every household present in the index.
# The previous element-by-element chained assignment (`df.col[i] = ...`) over the
# hard-coded key range 1..2500 is not guaranteed to write back to the frame, fails
# with KeyError if a key is missing, and leaves keys outside that range unfilled.
time_per_hh_df["avg_transaction_time"] = (
    time_per_hh_df["transaction_time_per_hh"].apply(tm_fx.avg_time)
)
time_per_hh_df["avg_transaction_time_in_seconds"] = (
    time_per_hh_df["avg_transaction_time"].apply(tm_fx.time_to_seconds)
)
time_per_hh_df.head(10)

In [None]:
# Move the household_key index into a regular column (the frame is later written
# to CSV with index=False, so the key would otherwise be lost).
time_per_hh = time_per_hh_df.reset_index(drop=False)

In [None]:
# Save to csv (tab-separated, without the positional index).
# exist_ok=True creates the output directory only when it is missing, without a
# separate check-then-create step.
os.makedirs("saved_structures", exist_ok=True)

time_per_hh.to_csv("saved_structures/time_per_hh.csv", sep ='\t', index = False)

In [None]:
# Read the exported file back to check that it can be loaded.
test = pd.read_csv(
    "saved_structures/time_per_hh.csv",
    sep = '\t',
)

In [None]:
# Preview the first five rows of the reloaded table.
test.head(n=5)