# Import software libraries and load the datasets

In [15]:
import sys
import numpy as np
import pandas as pd

# Report the interpreter and library versions this notebook ran with.
print('Libraries used in this project:',
      '- Python {}'.format(sys.version),
      '- NumPy {}'.format(np.__version__),
      '- pandas {}'.format(pd.__version__),
      sep='\n')

# Read the three CSV inputs; the first column of each file becomes the index.
stores_df = pd.read_csv('../data/stores_data_reindex.csv', index_col=0)
initial_df = pd.read_csv('../data/initial_invoices.csv', index_col=0)
ratings_df = pd.read_csv('../data/ratings_more.csv', index_col=0)
print('\nLoaded datasets.')

# Display floats with two decimals and a comma as the thousands separator.
pd.set_option('display.float_format', '{:,.2f}'.format)

Libraries used in this project:
- Python 3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]
- NumPy 1.22.3
- pandas 1.4.3

Loaded datasets.


# Append the initial invoices to `stores_df`

In [16]:
# Preview the initial invoices before they are appended to stores_df.
initial_df

Unnamed: 0_level_0,Date,City,CustomerType,Gender,ProductLine,UnitPrice,Quantity,Tax,TotalPrice,Revenue,COGS
InvoiceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
CAR-HBE-001,1/5/2019,Carbon Creek,Member,Female,Health and beauty,74.69,7,26.14,548.97,522.83,500.24
OLI-ELE-001,3/8/2019,Olinger,Normal,Female,Electronics,15.28,5,3.82,80.22,76.40,73.21
CAR-HML-001,3/3/2019,Carbon Creek,Normal,Male,Home and lifestyle,46.33,7,16.22,340.53,324.31,321.12
CAR-HBE-002,1/27/2019,Carbon Creek,Member,Male,Health and beauty,58.22,8,23.29,489.05,465.76,430.98
CAR-STR-001,2/8/2019,Carbon Creek,Normal,Male,Sports and travel,86.31,7,30.21,634.38,604.17,578.90
...,...,...,...,...,...,...,...,...,...,...,...
CAR-ELE-005,3/8/2019,Carbon Creek,Normal,Male,Electronics,97.16,1,4.86,102.02,97.16,92.68
GRC-HBE-005,3/29/2019,Greene City,Normal,Male,Health and beauty,87.87,10,43.94,922.64,878.70,848.40
OLI-ELE-008,2/9/2019,Olinger,Normal,Female,Electronics,12.45,6,3.74,78.44,74.70,72.06
CAR-FBV-005,3/23/2019,Carbon Creek,Normal,Male,Food and beverages,52.75,3,7.91,166.16,158.25,152.99


In [17]:
# Report the sizes of both frames, stack them, then report the combined size.
print('Number of rows BEFORE append: {}.'.format(len(stores_df)))
print('Number of rows to append: {}.'.format(len(initial_df)))

# pd.concat stands in for the DataFrame.append call this cell used to make
# (DataFrame.append is deprecated in pandas).
stores_df = pd.concat([stores_df, initial_df], axis=0)

print('Number of rows AFTER append: {}.'.format(len(stores_df)))

Number of rows BEFORE append: 900.
Number of rows to append: 96.
Number of rows AFTER append: 996.


In [18]:
# Show the last five rows to confirm the appended invoices are present.
stores_df.tail(n=5)

Unnamed: 0_level_0,Date,City,CustomerType,Gender,ProductLine,UnitPrice,Quantity,Tax,TotalPrice,Revenue,COGS
InvoiceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
CAR-ELE-005,3/8/2019,Carbon Creek,Normal,Male,Electronics,97.16,1.0,4.86,102.02,97.16,92.68
GRC-HBE-005,3/29/2019,Greene City,Normal,Male,Health and beauty,87.87,10.0,43.94,922.64,878.7,848.4
OLI-ELE-008,2/9/2019,Olinger,Normal,Female,Electronics,12.45,6.0,3.74,78.44,74.7,72.06
CAR-FBV-005,3/23/2019,Carbon Creek,Normal,Male,Food and beverages,52.75,3.0,7.91,166.16,158.25,152.99
GRC-HML-006,3/5/2019,Greene City,Normal,Male,Home and lifestyle,82.7,6.0,24.81,521.01,496.2,477.84


# Merge the customer ratings into `stores_df`

In [19]:
# Preview the customer ratings before they are merged into stores_df.
ratings_df

Unnamed: 0_level_0,CustomerRating
InvoiceID,Unnamed: 1_level_1
GRC-FBV-004,4.00
OLI-STR-004,4.00
GRC-HBE-010,4.00
GRC-HBE-016,4.00
GRC-ELE-019,4.00
...,...
OLI-STR-002,10.00
GRC-STR-005,10.00
GRC-STR-011,10.00
CAR-HBE-020,10.00


In [20]:
# Left-join the ratings onto the invoices using the index both frames share.
# The full (rows, columns) shape is reported, and labelled as such, because
# the join should add one column while leaving the row count unchanged.
print('Shape BEFORE merge: {}.'.format(stores_df.shape))

merged_df = stores_df.join(ratings_df, how='left')

# Repeated index labels in ratings_df would silently duplicate invoices, so
# check the row count before replacing stores_df with the merged frame.
assert len(merged_df) == len(stores_df), 'join changed the number of rows'
stores_df = merged_df

print('Shape AFTER merge: {}.'.format(stores_df.shape))

Number of rows BEFORE merge: (996, 11).
Number of rows AFTER merge: (996, 12).


In [21]:
# Show the last five rows to check the newly joined CustomerRating column.
stores_df.tail(n=5)

Unnamed: 0_level_0,Date,City,CustomerType,Gender,ProductLine,UnitPrice,Quantity,Tax,TotalPrice,Revenue,COGS,CustomerRating
InvoiceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
CAR-ELE-005,3/8/2019,Carbon Creek,Normal,Male,Electronics,97.16,1.0,4.86,102.02,97.16,92.68,7.2
GRC-HBE-005,3/29/2019,Greene City,Normal,Male,Health and beauty,87.87,10.0,43.94,922.64,878.7,848.4,5.1
OLI-ELE-008,2/9/2019,Olinger,Normal,Female,Electronics,12.45,6.0,3.74,78.44,74.7,72.06,4.1
CAR-FBV-005,3/23/2019,Carbon Creek,Normal,Male,Food and beverages,52.75,3.0,7.91,166.16,158.25,152.99,9.3
GRC-HML-006,3/5/2019,Greene City,Normal,Male,Home and lifestyle,82.7,6.0,24.81,521.01,496.2,477.84,7.4


# Sort the store data

In [22]:
# Order the rows by their index labels and keep the sorted frame under the
# same name, then display it.
stores_df = stores_df.sort_index(axis=0)
stores_df

Unnamed: 0_level_0,Date,City,CustomerType,Gender,ProductLine,UnitPrice,Quantity,Tax,TotalPrice,Revenue,COGS,CustomerRating
InvoiceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
CAR-CLO-001,3/10/2019,Carbon Creek,Normal,Female,Clothing,87.67,2.00,8.77,184.11,175.34,170.12,7.70
CAR-CLO-002,1/12/2019,Carbon Creek,Member,Female,Clothing,20.01,9.00,9.00,189.09,180.09,174.55,5.70
CAR-CLO-003,2/10/2019,Carbon Creek,Member,Female,Clothing,30.14,10.00,15.07,316.47,301.40,284.76,9.20
CAR-CLO-004,1/29/2019,Carbon Creek,Normal,Male,Clothing,83.24,9.00,37.46,786.62,749.16,709.60,7.40
CAR-CLO-005,2/8/2019,Carbon Creek,Normal,Male,Clothing,98.98,10.00,49.49,1039.29,989.80,955.90,8.70
...,...,...,...,...,...,...,...,...,...,...,...,...
OLI-STR-041,1/1/2019,Olinger,Member,Female,Sports and travel,29.22,6.00,8.77,184.09,175.32,168.27,5.00
OLI-STR-042,1/30/2019,Olinger,Normal,Female,Sports and travel,22.38,1.00,1.12,23.50,22.38,21.53,8.60
OLI-STR-043,3/14/2019,Olinger,Member,Male,Sports and travel,42.85,1.00,2.14,44.99,42.85,41.93,9.30
OLI-STR-044,1/10/2019,Olinger,Normal,Female,Sports and travel,83.14,7.00,29.10,611.08,581.98,569.84,6.60


In [23]:
# Ascending sort by city, then product line, then customer type. The sorted
# frame is only displayed; stores_df itself is left as it was.
stores_df.sort_values(
    by=['City', 'ProductLine', 'CustomerType'],
    ascending=True,
)

Unnamed: 0_level_0,Date,City,CustomerType,Gender,ProductLine,UnitPrice,Quantity,Tax,TotalPrice,Revenue,COGS,CustomerRating
InvoiceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
CAR-CLO-002,1/12/2019,Carbon Creek,Member,Female,Clothing,20.01,9.00,9.00,189.09,180.09,174.55,5.70
CAR-CLO-003,2/10/2019,Carbon Creek,Member,Female,Clothing,30.14,10.00,15.07,316.47,301.40,284.76,9.20
CAR-CLO-007,1/23/2019,Carbon Creek,Member,Male,Clothing,17.94,5.00,4.49,94.19,89.70,85.88,6.80
CAR-CLO-013,1/14/2019,Carbon Creek,Member,Female,Clothing,96.70,5.00,24.18,507.68,483.50,468.58,7.00
CAR-CLO-014,2/2/2019,Carbon Creek,Member,Male,Clothing,43.13,10.00,21.57,452.87,431.30,419.05,5.50
...,...,...,...,...,...,...,...,...,...,...,...,...
OLI-STR-027,3/2/2019,Olinger,Normal,Female,Sports and travel,73.98,7.00,25.89,543.75,517.86,495.93,4.10
OLI-STR-028,2/17/2019,Olinger,Normal,Female,Sports and travel,46.66,9.00,21.00,440.94,419.94,409.27,5.30
OLI-STR-032,2/21/2019,Olinger,Normal,Female,Sports and travel,98.80,2.00,9.88,207.48,197.60,187.39,7.70
OLI-STR-042,1/30/2019,Olinger,Normal,Female,Sports and travel,22.38,1.00,1.12,23.50,22.38,21.53,8.60


In [24]:
# Highest customer ratings first. The result is displayed only; stores_df is
# not reassigned.
stores_df.sort_values(by='CustomerRating', ascending=False)

Unnamed: 0_level_0,Date,City,CustomerType,Gender,ProductLine,UnitPrice,Quantity,Tax,TotalPrice,Revenue,COGS,CustomerRating
InvoiceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
GRC-STR-005,2/3/2019,Greene City,Member,Female,Sports and travel,55.07,9.00,24.78,520.41,495.63,469.75,10.00
OLI-STR-002,2/15/2019,Olinger,Member,Female,Sports and travel,24.74,3.00,3.71,77.93,74.22,70.11,10.00
GRC-STR-011,3/27/2019,Greene City,Normal,Male,Sports and travel,93.39,6.00,28.02,588.36,560.34,538.89,10.00
CAR-HBE-020,2/20/2019,Carbon Creek,Normal,Female,Health and beauty,32.32,10.00,16.16,339.36,323.20,306.21,10.00
GRC-ELE-041,2/25/2019,Greene City,Normal,Female,Electronics,52.79,10.00,26.40,554.30,527.90,502.63,10.00
...,...,...,...,...,...,...,...,...,...,...,...,...
OLI-FBV-060,1/8/2019,Olinger,Member,Female,Food and beverages,72.88,9.00,32.80,688.72,655.92,632.26,4.00
OLI-CLO-056,2/12/2019,Olinger,Member,Male,Clothing,52.35,1.00,2.62,54.97,52.35,50.99,4.00
GRC-HBE-010,1/26/2019,Greene City,Member,Male,Health and beauty,69.37,9.00,31.22,655.55,624.33,591.93,4.00
CAR-CLO-037,3/3/2019,Carbon Creek,Normal,Male,Clothing,46.41,1.00,2.32,48.73,46.41,44.83,4.00


# Pivot the data so you can analyze it from different perspectives

In [25]:
# Mean Quantity for every Gender (rows) x ProductLine (columns) combination,
# rounded to two decimals. 'mean' is pivot_table's default aggregation and is
# written out here only for readability.
(
    stores_df
    .pivot_table(values='Quantity',
                 index='Gender',
                 columns='ProductLine',
                 aggfunc='mean')
    .round(2)
)

ProductLine,Clothing,Electronics,Food and beverages,Health and beauty,Home and lifestyle,Sports and travel
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,5.52,5.81,5.6,5.35,6.26,5.64
Male,4.48,5.62,5.23,5.86,5.1,5.47


In [26]:
# Total Revenue for every ProductLine (rows) x City (columns) combination.
# The aggregation is named by its string alias: newer pandas releases emit a
# FutureWarning when the NumPy callable np.sum is passed and recommend 'sum',
# which produces the same totals.
stores_df.pivot_table(index='ProductLine',
                      columns='City',
                      values='Revenue',
                      aggfunc='sum')

City,Carbon Creek,Greene City,Olinger
ProductLine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Clothing,15406.17,15631.73,20092.04
Electronics,17328.81,16239.47,18065.69
Food and beverages,16345.81,14490.37,22635.1
Health and beauty,11972.86,18567.26,15793.38
Home and lifestyle,20626.21,16713.49,12711.78
Sports and travel,18450.19,19036.38,15011.36


# Use grouping to summarize categories of data

In [27]:
# Per-city totals of every numeric column.
# numeric_only=True is spelled out because pandas 2.0 changed the default to
# False; without it the text columns (Date, Gender, ...) would be "summed" by
# string concatenation instead of being left out of the result.
stores_df.groupby('City').sum(numeric_only=True)

Unnamed: 0_level_0,UnitPrice,Quantity,Tax,TotalPrice,Revenue,COGS,CustomerRating
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Carbon Creek,18553.14,1842.0,5021.18,105440.89,100130.05,96226.04,2383.8
Greene City,18401.89,1814.0,5034.26,105712.96,100678.7,96824.58,2257.5
Olinger,18503.35,1810.0,5241.72,109780.01,104309.35,100287.37,2304.1


In [28]:
# Total Revenue and COGS for each city: keep only the grouping key and the two
# columns of interest, then sum within each City.
stores_df[['City', 'Revenue', 'COGS']].groupby('City').sum()

Unnamed: 0_level_0,Revenue,COGS
City,Unnamed: 1_level_1,Unnamed: 2_level_1
Carbon Creek,100130.05,96226.04
Greene City,100678.7,96824.58
Olinger,104309.35,100287.37


In [29]:
# Mean CustomerRating for each Gender, computed by grouping the rating column
# on the Gender column.
stores_df['CustomerRating'].groupby(stores_df['Gender']).mean()

Gender
Female   6.96
Male     6.97
Name: CustomerRating, dtype: float64