In [1]:
import pandas as pd
import numpy as np

# Assignment 1: DataFrame Basics

Hi there!

Can you read in the transactions dataset and report on:

* The number of rows and columns
* The names of the columns
* The datatypes of each column

In [2]:
# A common practice is to create a path variable to pass to read_csv
path = "../retail/transactions.csv"

# read in trasactions csv and create DataFrame
transactions = pd.read_csv(path)

# return DataFrame (df)
transactions

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
...,...,...,...
83483,2017-08-15,50,2804
83484,2017-08-15,51,1573
83485,2017-08-15,52,2255
83486,2017-08-15,53,932


In [3]:
# shape of df: 84388 rows, 3 columns (not including index)
transactions.shape

(83488, 3)

In [4]:
# shape of df: 84388 rows. Add 1 to max default index value to get row count
transactions.index.max() + 1

83488

In [5]:
# dtypes attribute returns column names and their data types
transactions.dtypes

date            object
store_nbr        int64
transactions     int64
dtype: object

# Assignment 2: Exploring DataFrames

Hello!

* Can you quickly inspect the first 5 rows of the transactions data? 

* Then, dive a bit more deeply into the data and check if there are any missing values.
* What about the number of unique dates? I want to make sure we didn’t leave any out.
* Finally, can you report the mean, median, min and max of “transactions”?  I want to check for any anomalies in our data.


In [6]:
# Look at top 5 rows with .head()
transactions.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [7]:
# use .isna().sum() on df to get missing counts for all columns

transactions.isna().sum()

date            0
store_nbr       0
transactions    0
dtype: int64

In [8]:
# Use info to determine missing counts - none are missing!
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83488 entries, 0 to 83487
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          83488 non-null  object
 1   store_nbr     83488 non-null  int64 
 2   transactions  83488 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.9+ MB


In [9]:
# Use describe to return the specified aggregations (and more!)
transactions.describe()

Unnamed: 0,store_nbr,transactions
count,83488.0,83488.0
mean,26.939237,1694.602158
std,15.608204,963.286644
min,1.0,5.0
25%,13.0,1046.0
50%,27.0,1393.0
75%,40.0,2079.0
max,54.0,8359.0


In [10]:
# Method 1 for unique dates - once you know how to access columns :D
transactions["date"].nunique()

1682

In [11]:
# Method 2 for unique dates - speecify include="all" to get stats on text columns.

transactions.describe(include="all")

Unnamed: 0,date,store_nbr,transactions
count,83488,83488.0,83488.0
unique,1682,,
top,2017-08-15,,
freq,54,,
mean,,26.939237,1694.602158
std,,15.608204,963.286644
min,,1.0,5.0
25%,,13.0,1046.0
50%,,27.0,1393.0
75%,,40.0,2079.0


# Exercise 3 - Accessing DataFrames

Hi, starting to dive deeper into this data.

I noticed that the first row is the only one from 2013-01-01.

* Can you get me a copy of the DataFrame that excludes that row, and only includes “store_nbr” and “transactions”?
* Also, can you report the number of unique store numbers?
* Finally, return the total number of transactions in millions


In [12]:
transactions.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [13]:
# skip the first row via slicing

transactions.loc[1:, "store_nbr":"transactions"]

Unnamed: 0,store_nbr,transactions
1,1,2111
2,2,2358
3,3,3487
4,4,1922
5,5,1903
...,...,...
83483,50,2804
83484,51,1573
83485,52,2255
83486,53,932


In [14]:
# calculate sum of store number column

transactions.loc[:, "store_nbr"].nunique()

54

In [15]:
# Divide sum of transactions by millions to get in units of millions

transactions.loc[:, "transactions"].sum() / 1000000

141.478945

# Assignment 4: Dropping Data and Duplicates

Hi there!

Can you:

1. Drop the first row of data? We want it permanently removed. 
2. Drop the date column but not in place
3. Return a dataframe that only includes the last row for each of the stores.

Thanks!

In [16]:
transactions.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [17]:
# drop the first row of data in place

transactions.drop(0, axis=0, inplace=True)

transactions

Unnamed: 0,date,store_nbr,transactions
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
5,2013-01-02,5,1903
...,...,...,...
83483,2017-08-15,50,2804
83484,2017-08-15,51,1573
83485,2017-08-15,52,2255
83486,2017-08-15,53,932


In [18]:
# drop date but not in place

transactions.drop("date", axis=1)

Unnamed: 0,store_nbr,transactions
1,1,2111
2,2,2358
3,3,3487
4,4,1922
5,5,1903
...,...,...
83483,50,2804
83484,51,1573
83485,52,2255
83486,53,932


In [19]:
# drop duplicate rows subsetting by store number. keep last entry for each store

transactions.drop_duplicates(subset="store_nbr", keep="last").head()

Unnamed: 0,date,store_nbr,transactions
83434,2017-08-15,1,1693
83435,2017-08-15,2,1737
83436,2017-08-15,3,2956
83437,2017-08-15,4,1283
83438,2017-08-15,5,1310


# Assignment 5: Missing Data

Hello, 

Can you tell if any dates or prices are missing in the oil dataset?

Then compare the mean of the oil series when filling in with mean vs. filling in with 0.

Thanks!

In [20]:
oil = pd.read_csv("../retail/oil.csv")

In [21]:
# info can be used to infer missing counts 1218 - 1175 = 43

oil.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        1218 non-null   object 
 1   dcoilwtico  1175 non-null   float64
dtypes: float64(1), object(1)
memory usage: 19.2+ KB


In [22]:
# But usually easier to use isna().sum() - let the computer count for you :D

oil.isna().sum()

date           0
dcoilwtico    43
dtype: int64

In [23]:
# calculate mean of oil series after filling missing values with 0

oil.loc[:, 'dcoilwtico'].fillna(0).mean()

65.32379310344835

In [24]:
# calculate mean of oil series after filling missing values with the mean of oil price

oil.loc[:, 'dcoilwtico'].fillna(oil.loc[:, 'dcoilwtico'].mean()).mean()

67.71436595744696

# Assignment 6: Filtering DataFrames

I need some quick research on store 25:

* First, calculate the percentage of times ALL stores had more than 2000 transactions
* Then, calculate the percentage of times store 25 had more than 2000 transactions, and calculate the sum of transactions on these days
* Finally, sum the transactions for stores 25 and 3, that occurred in May or June, and had less than 2000 transactions


In [25]:
transactions.head()

Unnamed: 0,date,store_nbr,transactions
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
5,2013-01-02,5,1903


In [26]:
# all stores > 2500 percentage occurence

(transactions['transactions'] > 2000).mean()

0.266808006036868

In [27]:
# Number of times store 25 had > 2000 divided by total days for store 25 to get percent of time it happened

mask = (transactions['transactions'] > 2000) & (transactions['store_nbr'] == 25)

(transactions.loc[mask, 'transactions'].count() 
 / transactions.loc[(transactions['store_nbr'] == 25), 'transactions'].count())

0.03469640644361834

In [28]:
# Sum of transactions where store 25 had > 2000 transactions

transactions.loc[mask, 'transactions'].sum()

144903

In [29]:
# sum of transactions for stores 25 and 31 in months May and June on days they had less than 2000 transactions

(transactions.query(
    "store_nbr in [25, 31] & date.str[6] in ['5', '6'] & transactions < 2000")
 .loc[:, "transactions"]
 .sum())

644910

# Assignment 7: Sorting DataFrames

Hi there,
* Can you get me a dataset that includes the 5 days with the highest transactions counts? Any similarities between them?
* Then, can you get me a dataset sorted by date from earliest to most recent, but with the highest transactions first and the lowest transactions last for each day?
* Finally, sort the columns in reverse alphabetical order. 

Thanks!


In [30]:
transactions.head()

Unnamed: 0,date,store_nbr,transactions
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
5,2013-01-02,5,1903


In [31]:
# sort dataframe by values in transaction column in descending order (ascending = False)
# then grab first 5 rows to retrieve 5 highest days

transactions.sort_values('transactions', ascending=False).iloc[:5, :]

Unnamed: 0,date,store_nbr,transactions
52011,2015-12-23,44,8359
71010,2016-12-23,44,8307
16570,2013-12-23,44,8256
33700,2014-12-23,44,8120
16572,2013-12-23,46,8001


In [32]:
# Sort dataframe by date in ascending order, and trasactions in descending order

transactions.sort_values(['date', 'transactions'], ascending=[True, False])

Unnamed: 0,date,store_nbr,transactions
40,2013-01-02,46,4886
38,2013-01-02,44,4821
39,2013-01-02,45,4208
41,2013-01-02,47,4161
11,2013-01-02,11,3547
...,...,...,...
83455,2017-08-15,22,766
83449,2017-08-15,16,742
83465,2017-08-15,32,615
83468,2017-08-15,35,612


In [33]:
# sort columns in reverse alphabetical order using sort_index on the column axis (1), in descending order

transactions.sort_index(axis=1, ascending=False)

Unnamed: 0,transactions,store_nbr,date
1,2111,1,2013-01-02
2,2358,2,2013-01-02
3,3487,3,2013-01-02
4,1922,4,2013-01-02
5,1903,5,2013-01-02
...,...,...,...
83483,2804,50,2017-08-15
83484,1573,51,2017-08-15
83485,2255,52,2017-08-15
83486,932,53,2017-08-15


# Assignment 8: Modifying Columns

Just some quick work, but can you send me the transaction data with the columns renamed?

* Rename `transactions` to `transaction_count` and `store_nbr` to `store_number`.
* Reorder the columns so date is first, then store number, then transaction count.

Thanks!


In [34]:
transactions.head()

Unnamed: 0,date,store_nbr,transactions
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
5,2013-01-02,5,1903


In [35]:
# Use rename to change column names
# Use reindex, axis=1, to reorder columns (this can also be done by assignment FYI)

transactions =(transactions
 .rename(
    columns={"transactions": "transaction_count", "store_nbr": "store_number"})
 .reindex(labels=["date", "transaction_count", "store_number"], axis=1)
)

transactions.head()

Unnamed: 0,date,transaction_count,store_number
1,2013-01-02,2111,1
2,2013-01-02,2358,2
3,2013-01-02,3487,3
4,2013-01-02,1922,4
5,2013-01-02,1903,5


# Assignment 9: Column Creation

Just some quick work, but can you send me the transaction data with the columns renamed?

* Create a `pct_to_target` column that divides transactions by 2500.
* Then, create a `met_target` column that returns True if `pct_to_target` is greater than or equal to 1.
* Next, create a `bonus_payable` column that equals 100 if `met_target` is True, and 0 if not. Then sum the bonus payable column.
* Finally, create columns for month and day of week as integers. There is some helper code for these dateparts below.



Thanks!



In [36]:
transactions.head()

Unnamed: 0,date,transaction_count,store_number
1,2013-01-02,2111,1
2,2013-01-02,2358,2
3,2013-01-02,3487,3
4,2013-01-02,1922,4
5,2013-01-02,1903,5


In [37]:
# target based columns
transactions["pct_to_target"] = transactions.loc[:, "transaction_count"] / 2500
transactions["met_target"] = transactions.loc[:, "pct_to_target"] >= 1
transactions["bonus_payable"] = 100 * transactions["met_target"]

# Date Columns
transactions["date"] = transactions["date"].astype("Datetime64")
transactions["month"] = transactions["date"].dt.month
transactions["day_of_week"] = transactions["date"].dt.dayofweek

transactions.head()

Unnamed: 0,date,transaction_count,store_number,pct_to_target,met_target,bonus_payable,month,day_of_week
1,2013-01-02,2111,1,0.8444,False,0,1,2
2,2013-01-02,2358,2,0.9432,False,0,1,2
3,2013-01-02,3487,3,1.3948,True,100,1,2
4,2013-01-02,1922,4,0.7688,False,0,1,2
5,2013-01-02,1903,5,0.7612,False,0,1,2


In [38]:
# Call sum on "bonus_payable" column to get total sum of bonus paid

transactions.loc[:, "bonus_payable"].sum()

1448300

# Assignment 10: np.select

Hi there! I need a few columns created.

1. Create a ‘seasonal_bonus’ column that applies to these dates: 
    * All days in December (month = 12)
    * Sundays (day_of_week = 6) in May (month = 5)
    * Mondays (day_of_week = 0) in July (month = 7)
2. Call the December bonus ‘Holiday Bonus’, the May bonus ‘Corporate Month’, and the July bonus ‘Summer Special’. If no bonus applies, the column should display ‘None’. 
3. Finally, calculate the total bonus owed at $100 per day.

Thanks!

In [39]:
transactions.head()

Unnamed: 0,date,transaction_count,store_number,pct_to_target,met_target,bonus_payable,month,day_of_week
1,2013-01-02,2111,1,0.8444,False,0,1,2
2,2013-01-02,2358,2,0.9432,False,0,1,2
3,2013-01-02,3487,3,1.3948,True,100,1,2
4,2013-01-02,1922,4,0.7688,False,0,1,2
5,2013-01-02,1903,5,0.7612,False,0,1,2


In [40]:
# set up the three conditions
conditions = [
    transactions["month"] == 12,
    (transactions["month"] == 5) & (transactions["day_of_week"] == 6),
    (transactions["month"] == 7) & (transactions["day_of_week"] == 0)
]

# specify outcomes for each condition
choices = ["Holiday Bonus", "Corporate Month", "Summer Special"]

# Call the select method, passing in conditions, choices, and a default value of 'None' if no condition met
transactions["seasonal_bonus"] = np.select(conditions, choices, default="None")

transactions.head()

Unnamed: 0,date,transaction_count,store_number,pct_to_target,met_target,bonus_payable,month,day_of_week,seasonal_bonus
1,2013-01-02,2111,1,0.8444,False,0,1,2,
2,2013-01-02,2358,2,0.9432,False,0,1,2,
3,2013-01-02,3487,3,1.3948,True,100,1,2,
4,2013-01-02,1922,4,0.7688,False,0,1,2,
5,2013-01-02,1903,5,0.7612,False,0,1,2,


In [41]:
# look at frequency for each 
transactions["seasonal_bonus"].value_counts()

None               75258
Holiday Bonus       6028
Summer Special      1103
Corporate Month     1098
Name: seasonal_bonus, dtype: int64

In [42]:
# Use value counts to extract counts of each holiday, 
# slice the series returned by value counts, and sum relevant values before multiply by 100 to get bonus owed

transactions.loc[:, "seasonal_bonus"].value_counts().iloc[1:].sum() * 100

822900

# Assignment 11: Assign 

* Drop the columns that have been created so far (keep only date, store_number, and transaction count), and recreate them using the assign method.
* Then sum the seasonal bonus owed once again to make sure the numbers are correct.


In [43]:
# Drop columns we created in prior exercises
transactions = transactions.drop(
    [
        "pct_to_target",
        "met_target",
        "bonus_payable",
        "month",
        "day_of_week",
        "seasonal_bonus",
    ],
    axis=1,
)

transactions.head()

Unnamed: 0,date,transaction_count,store_number
1,2013-01-02,2111,1
2,2013-01-02,2358,2
3,2013-01-02,3487,3
4,2013-01-02,1922,4
5,2013-01-02,1903,5


In [44]:
# Create same columns with assign

transactions = transactions.assign(
    target_pct = transactions["transaction_count"] / 2500,
    met_target = (transactions["transaction_count"] / 2500) >= 1,
    bonus_payable = ((transactions["transaction_count"] / 2500) >= 1) * 100,
    month = transactions.date.dt.month,
    day_of_week = transactions.date.dt.dayofweek,
    seasonal_bonus = np.select(conditions, choices, default="None"),
)



In [45]:
# Same method as assignment 10

transactions.loc[:, "seasonal_bonus"].value_counts().iloc[1:].sum() * 100

822900

# Assignment 12: Memory Optimization

Reduce the memory usage of the transactions DataFrame to below 5MB.

In [46]:
transactions.head()

Unnamed: 0,date,transaction_count,store_number,target_pct,met_target,bonus_payable,month,day_of_week,seasonal_bonus
1,2013-01-02,2111,1,0.8444,False,0,1,2,
2,2013-01-02,2358,2,0.9432,False,0,1,2,
3,2013-01-02,3487,3,1.3948,True,100,1,2,
4,2013-01-02,1922,4,0.7688,False,0,1,2,
5,2013-01-02,1903,5,0.7612,False,0,1,2,


In [47]:
transactions.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 83487 entries, 1 to 83487
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               83487 non-null  datetime64[ns]
 1   transaction_count  83487 non-null  int64         
 2   store_number       83487 non-null  int64         
 3   target_pct         83487 non-null  float64       
 4   met_target         83487 non-null  bool          
 5   bonus_payable      83487 non-null  int64         
 6   month              83487 non-null  int64         
 7   day_of_week        83487 non-null  int64         
 8   seasonal_bonus     83487 non-null  object        
dtypes: bool(1), datetime64[ns](1), float64(1), int64(5), object(1)
memory usage: 10.1 MB


In [48]:
# Note - you may have been more conservative here, and that's OK.
# The main point is reducing memory consumption. Playing it safe is often wise.

transactions = transactions.astype(
    {
        "store_number": "Int8",
        "transaction_count": "Int16",
        "bonus_payable": "Int8",
        "month": "Int8",
        "day_of_week": "Int8",
        "seasonal_bonus": "category",
    }
)

In [49]:
transactions.head()

Unnamed: 0,date,transaction_count,store_number,target_pct,met_target,bonus_payable,month,day_of_week,seasonal_bonus
1,2013-01-02,2111,1,0.8444,False,0,1,2,
2,2013-01-02,2358,2,0.9432,False,0,1,2,
3,2013-01-02,3487,3,1.3948,True,100,1,2,
4,2013-01-02,1922,4,0.7688,False,0,1,2,
5,2013-01-02,1903,5,0.7612,False,0,1,2,


In [50]:
# down to 2.9MB vs. 10.1 to start. Will work on Chandler's grandparents computer!

transactions.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 83487 entries, 1 to 83487
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               83487 non-null  datetime64[ns]
 1   transaction_count  83487 non-null  Int16         
 2   store_number       83487 non-null  Int8          
 3   target_pct         83487 non-null  float64       
 4   met_target         83487 non-null  bool          
 5   bonus_payable      83487 non-null  Int8          
 6   month              83487 non-null  Int8          
 7   day_of_week        83487 non-null  Int8          
 8   seasonal_bonus     83487 non-null  category      
dtypes: Int16(1), Int8(4), bool(1), category(1), datetime64[ns](1), float64(1)
memory usage: 2.9 MB
