In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Switch on seaborn's default plot styling for the figures in this notebook.
sns.set()
# IPython magic: draw matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [None]:
# Build a reproducible synthetic data set: daily spend in 2017 for every
# combination of 10 customers (IDs 1-10) and 4 products (IDs 0-3).
np.random.seed(2355478)
tmp = np.random.poisson(lam=57.3, size=365*10*4)
# Zero out 365*32 of the 365*40 cells, picked at random without replacement,
# so that most customer/product/day combinations carry no purchase.
zero_positions = np.random.choice(np.arange(tmp.shape[0]), size=365*32, replace=False)
tmp[zero_positions] = 0
# Rows: one per (Customer, Product) pair. Columns: one per calendar day of 2017.
row_index = pd.MultiIndex.from_product([np.arange(1,11), np.arange(4)],
                                       names=['Customer','Product'])
days_2017 = pd.date_range(start='2017-01-01', end='2017-12-31')
df = pd.DataFrame(tmp.reshape((40,365)), index=row_index, columns=days_2017)

# Assignment 6.1
The DataFrame `df` holds each customer's daily spend on a number of products. The column names are the individual dates in 2017, while the index values are the combinations of customer IDs and product IDs.

Find the average amount spent by a customer between 2017-04-02 and 2017-08-17, both dates included.
<details><summary>Hint 1</summary>
    <p>
        Entering a start and end date separated by `:` in the `loc` method can be used to select a range of dates. Each date should be a string in the `YYYY-MM-DD` format
    </p>
</details>
<details><summary>Hint 2</summary>
    <p>
        Grouping by `Customer` and then summing twice, once over products and once over dates, gives the total amount spent by each customer in the time period
    </p>
</details>

In [None]:
# Keep only the dates from 2017-04-02 through 2017-08-17 (label slicing includes both ends).
spend_in_window = df.loc[:, '2017-04-02':'2017-08-17']
# Add up the days for every (Customer, Product) row, then the products of each customer.
total_per_customer = spend_in_window.sum(axis=1).groupby(level='Customer').sum()
# Average of the per-customer totals.
total_per_customer.mean()

# Assignment 6.2
Using the DataFrame `df` again, find the total monthly spend per product. Return a DataFrame where the index is the product ID and the columns are the month numbers.
<details><summary>Hint 1</summary>
    <p>
        A specific aggregation function can be set for the DataFrame method `pivot_table`.
    </p>
</details>
<details><summary>Hint 2</summary>
    <p>
        The `melt` method can transform a DataFrame from a wide format to a long format, making it easier to manipulate multiple columns at the same time
    </p>
</details>
<details><summary>Hint 3</summary>
    <p>
        The `id_vars` parameter can be set to prevent some columns from being melted
    </p>
</details>
<details><summary>Hint 4</summary>
    <p>
        The `dt.month` attribute can be used to extract the month from a column containing datetime values
    </p>
</details>

In [None]:
# Long format: one row per (Customer, Product, date). `melt` stores the former
# column labels (the dates) in a column called 'variable' and the spend in 'value'.
tmp = (df.reset_index()
         .melt(id_vars=df.index.names)
         .assign(month=lambda long_df: long_df['variable'].dt.month))
# Total spend for every (Product, month) pair, with the month numbers as columns.
tmp.pivot_table(values='value', index='Product', columns='month', aggfunc='sum')

# Assignment 6.3
Find the average spend of the first three transactions (days with spend larger than zero) on a product for each customer. Create a DataFrame with the result, where the Customer ID is the index and the Product ID is the column name
<details><summary>Hint 1</summary>
    <p>
        The `head` method can be applied to a GroupBy object to get the first records for every unique combination of the grouping variables
    </p>
</details>
<details><summary>Hint 2</summary>
    <p>
        After melting the DataFrame the days without transactions can be removed by only taking the rows with `value` > 0
    </p>
</details>
<details><summary>Hint 3</summary>
    <p>
        The default aggregation function for the `pivot_table` method is a mean over the values
    </p>
</details>

In [None]:
# Long format, ordered by date so "first" means chronologically first.
tmp = df.reset_index().melt(id_vars=df.index.names).sort_values('variable')
# A transaction is a day with a spend above zero.
purchases = tmp.loc[tmp['value'] > 0]
# Earliest three transactions for every (Customer, Product) pair.
first_three = purchases.groupby(['Customer', 'Product'], as_index=False).head(3)
# `pivot_table` averages by default, giving the mean spend of those transactions.
first_three.pivot_table(index='Customer', columns='Product', values='value')

# Assignment 6.4
Find the average days between purchases of a product for each customer. Return the result as a DataFrame with the Customer ID as index and the Product ID as column name
<details><summary>Hint 1</summary>
    <p>
        The `shift` method for a pandas Series moves all values up or down according to the parameter passed to it
    </p>
</details>
<details><summary>Hint 2</summary>
    <p>
        Using the `apply` method on a GroupBy object makes it possible to apply a function to the DataFrame made up of the records in each of the groups
    </p>
</details>
<details><summary>Hint 3</summary>
    <p>
        The DataFrame method `pivot` is a short-hand version of `pivot_table` that does not support aggregation. It therefore serves as a check that the assumption of uniqueness in the combination of row and column index is valid
    </p>
</details>

In [None]:
# Long format: one row per (Customer, Product, date); dates are in 'variable'.
tmp = df.reset_index().melt(id_vars=df.index.names)
(tmp[tmp['value']>0]                      # keep purchase days only
 # Make the chronological order explicit instead of relying on the row order
 # that `melt` happens to produce; the gaps below assume ascending dates.
 .sort_values('variable')
 # Select the date column before `apply`, so the function receives a Series
 # per group. Applying to the whole frame (grouping columns included) raises a
 # deprecation warning in recent pandas versions.
 .groupby(['Customer', 'Product'])['variable']
 # Gap to the next purchase in whole days; the last purchase of a group has no
 # successor, yields NaN and is skipped by `mean`.
 .apply(lambda dates: (dates.shift(-1) - dates).dt.days.mean())
 .rename('Days between purchases')
 .reset_index()
 # One value per (Customer, Product), so the non-aggregating `pivot` suffices.
 .pivot(index='Customer',
        columns='Product',
        values='Days between purchases'))

# Assignment 6.5
Create a DataFrame showing, for each customer and each month, the ID of the product on which the customer spent the greatest amount of money. Set the Customer IDs as index and the month numbers as columns.
<details><summary>Hint 1</summary>
    <p>
        Calculate how much each customer has spent on a product in a given month
    </p>
</details>
<details><summary>Hint 2</summary>
    <p>
        Find the maximum spend for a customer in a month and join it onto the result from the previous hint
    </p>
</details>
<details><summary>Hint 3</summary>
    <p>
        Use the maximum spend to filter the DataFrame to one row for every unique combination of `Customer` and `month`, before applying the `pivot` method
    </p>
</details>

In [None]:
# Long format with a month number for every (Customer, Product, date) row.
tmp = df.reset_index().melt(id_vars=df.index.names)
tmp['month'] = tmp['variable'].dt.month
# Total spend per customer, product and month; the rows come out sorted by
# Customer, then Product, then month.
cust_prod_spend_per_month = tmp.groupby(['Customer', 'Product', 'month'])['value'].sum().reset_index()
# Highest product spend of each customer in each month, aligned to every row.
# This is equivalent to joining the per-(Customer, month) maximum back on.
cust_prod_spend_per_month['max_value'] = (cust_prod_spend_per_month
                                          .groupby(['Customer', 'month'])['value']
                                          .transform('max'))
(cust_prod_spend_per_month[cust_prod_spend_per_month['value'] == cust_prod_spend_per_month['max_value']]
 # Several products can tie for the maximum (e.g. a month without any spend).
 # Keep the first match, i.e. the lowest Product ID, so that `pivot` never
 # fails on duplicate (Customer, month) entries.
 .drop_duplicates(subset=['Customer', 'month'])
 .pivot(index='Customer',
        columns='month',
        values='Product'))