In [1]:
import pandas as pd

# Assignment 1: Streamlined Data Ingestion

#### IMPORTING DATA - Streamlined Transaction Workflow (_Marketing Analytics_)

Now that we have a good idea of what we want the data prep on transactions looks like,
let's push that to the read_csv function. 

Keep an eye on the memory usage before and after. 

* Change the column names to 'Date', 'Store_Number', and 'Transaction_Count'.
* Skip the first row of data.
* Convert columns to the appropriate datatypes. 

Then create the columns we created in the assign assignment in Section 3, by chaining assign with read_csv. 

Some starter code has been provided for you below. Because the dataframe object returned by read_csv doesn't have a name, we need to use a lambda function to refer to the dataframe.

```python
transactions.assign(
    target_pct=transactions["transactions"] / 2500,
    met_target=(transactions["transactions"] / 2500) >= 1,
    bonus_payable=((transactions["transactions"] / 2500) >= 1) * 100,
    month=transactions["date"].dt.month,
    day_of_week=transactions["date"].dt.dayofweek,
)
```

The first one should look like:

```python
target_pct = lambda x: (x["Transaction_Count"] / 2500)
```


In [2]:
# Get a quick glance at the data types and memory usage 
# Our DataFrame is taking up 6.6 Megabytes, if we just read in as is

pd.read_csv("../retail/transactions.csv").info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83488 entries, 0 to 83487
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          83488 non-null  object
 1   store_nbr     83488 non-null  int64 
 2   transactions  83488 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 6.6 MB


In [3]:
# preprossing on the data to reduce the memory usage

transactions = pd.read_csv(
    "../retail/transactions.csv",                                          
    header=0,                                                              # Suppress header (skip the header row) to allow custom names
    names=["Date", "Store_Number", "Transaction_Count"],                   # Specify new column names
    skiprows=[0],                                                          # Skip the first row of data
    parse_dates=["Date"],                                                  # parse date column
    dtype={"Store_Number": "Int8", "Transaction_Count": "Int16"}).assign(  # Downcast two integer columns
    target_pct = lambda x: (x["Transaction_Count"] / 2500),
    met_target = lambda x: (x["Transaction_Count"] / 2500 >= 1),
    bonus_payable = lambda x: (x["Transaction_Count"] / 2500 >= 1 * 100),
    month = lambda x: x["Date"].dt.month,
    day_of_week = lambda x: x["Date"].dt.dayofweek
).astype({                                                                 # Cast new columns to correct dtypes.
    "target_pct": "Float32",                                               # Note this could also be done in assign
    "month": "Int8",                                                      
    "day_of_week": "Int8"
})

In [4]:
# Df is significantly reduced in size! 

transactions.info(memory_usage="deep") 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83487 entries, 0 to 83486
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               83487 non-null  datetime64[ns]
 1   Store_Number       83487 non-null  Int8          
 2   Transaction_Count  83487 non-null  Int16         
 3   target_pct         83487 non-null  Float32       
 4   met_target         83487 non-null  boolean       
 5   bonus_payable      83487 non-null  boolean       
 6   month              83487 non-null  Int8          
 7   day_of_week        83487 non-null  Int8          
dtypes: Float32(1), Int16(1), Int8(3), boolean(2), datetime64[ns](1)
memory usage: 2.1 MB


In [5]:
transactions.head()

Unnamed: 0,Date,Store_Number,Transaction_Count,target_pct,met_target,bonus_payable,month,day_of_week
0,2013-01-02,1,2111,0.8444,False,False,1,2
1,2013-01-02,2,2358,0.9432,False,False,1,2
2,2013-01-02,3,3487,1.3948,True,False,1,2
3,2013-01-02,4,1922,0.7688,False,False,1,2
4,2013-01-02,5,1903,0.7612,False,False,1,2


In [9]:
transactions[["Transaction_Count"]].describe().round(2)

Unnamed: 0,Transaction_Count
count,83487.0
mean,1694.61
std,963.29
min,5.0
25%,1046.0
50%,1393.0
75%,2079.0
max,8359.0


# Assignment 2: Write to Excel Sheets

#### Export Transactions to Excel (_Accounting_)

Write the data in the transactions dataframe you created above into an Excel workbook.

Write out a separate sheet for each year of the data.

If you prefer, you can write each year of data to a separate csv file.

In [10]:
transactions.head()


Unnamed: 0,Date,Store_Number,Transaction_Count,target_pct,met_target,bonus_payable,month,day_of_week
0,2013-01-02,1,2111,0.8444,False,False,1,2
1,2013-01-02,2,2358,0.9432,False,False,1,2
2,2013-01-02,3,3487,1.3948,True,False,1,2
3,2013-01-02,4,1922,0.7688,False,False,1,2
4,2013-01-02,5,1903,0.7612,False,False,1,2


In [11]:
transactions.tail()

Unnamed: 0,Date,Store_Number,Transaction_Count,target_pct,met_target,bonus_payable,month,day_of_week
83482,2017-08-15,50,2804,1.1216,True,False,8,1
83483,2017-08-15,51,1573,0.6292,False,False,8,1
83484,2017-08-15,52,2255,0.902,False,False,8,1
83485,2017-08-15,53,932,0.3728,False,False,8,1
83486,2017-08-15,54,802,0.3208,False,False,8,1


In [14]:
# Open ExcelWriter to write multiple sheets

with pd.ExcelWriter("../data/DataForChandler.xlsx") as writer:
    for year in range(2013, 2018):                 # Specify years to filter by for each sheet and loop through them
       (transactions
        .loc[transactions["Date"].dt.year == year] # Filter DF to year in current iteration of loop
        .to_excel(writer, sheet_name=str(year)))   # Write each year's DF to sheet named for that year

In [16]:
for year in range(2013, 2018):                     # Specify years to filter by for each sheet and loop through them
    (transactions
     .loc[transactions["Date"].dt.year == year]    # Filter DF to year in current iteration of loop
     .to_csv(f"../data/transactions_{year}.csv")           # Write each year's DF to sheet named for that year
    )