# Import software libraries and load the dataset

In [1]:
import sys
import numpy as np
import pandas as pd

# Report the interpreter and library versions this notebook runs on.
version_lines = [
    '- Python {}'.format(sys.version),
    '- NumPy {}'.format(np.__version__),
    '- pandas {}'.format(pd.__version__),
]
print('Libraries used in this project:')
print('\n'.join(version_lines))

# Read the store records; the first CSV column becomes the row index.
stores_df = pd.read_csv('../data/stores_data_full.csv', index_col = 0)
print('\nLoaded dataset.')

Libraries used in this project:
- Python 3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]
- NumPy 1.22.3
- pandas 1.4.3

Loaded dataset.


# Rename some of the columns 

In [2]:
# Display the current column labels so the ones to rename can be identified.
stores_df.columns

Index(['Date', 'City', 'CustomerType', 'Gender', 'ProductLine', 'UnitPrice',
       'Quantity', 'Tax', 'TotalPrice', 'Revenue', 'COGS', 'CustomerRating'],
      dtype='object')

In [None]:
# Old label -> new label; columns not listed here keep their names.
new_cols = {
    'City': 'Branch',
    'Tax': 'TaxPrice',
}
stores_df = stores_df.rename(new_cols, axis = 'columns')

# Preview the first rows to confirm the new labels.
stores_df.head()

# Convert the `Date` column to datetime format

In [None]:
# Parse the Date column into datetime values and write the parsed Series
# back over the original column.
# NOTE(review): no explicit `format=` is given, so pandas infers the date
# layout — confirm the inferred day/month order matches the source file.
converted_dates = stores_df['Date'].pipe(pd.to_datetime)
stores_df = stores_df.assign(Date = converted_dates)
stores_df.head()

In [None]:
# Boolean mask that is True for rows dated in February (month number 2).
feb_cond = stores_df['Date'].dt.month.eq(2)

# Preview the first February rows; this also demonstrates that the
# datetime accessor works on the converted column.
stores_df.loc[feb_cond].head()

# Handle missing values

In [None]:
# Build the element-wise missing-value mask once, then reduce it per axis.
missing = stores_df.isna()
rows = missing.any(axis = 'columns')  # True for rows holding at least one NaN
cols = missing.any(axis = 'index')    # True for columns holding at least one NaN

# Show only the cells where a row with gaps meets a column with gaps.
stores_df.loc[rows, cols]

In [None]:
# Column -> constant used to replace every missing value in that column.
# NOTE(review): the notebook does not show how 'Male' and 293.14 were
# chosen — confirm their origin before reusing them.
fill_vals = {
    'Gender': 'Male',
    'TotalPrice': 293.14,
}
stores_df = stores_df.fillna(value = fill_vals)

# Re-display the previously incomplete cells to check the fill.
stores_df.loc[rows, cols]

In [None]:
# Replacement quantities keyed by row label; fillna aligns on the index,
# so only missing Quantity values in these two rows are filled.
quant_fill = pd.Series({'CAR-FBV-054': 4, 'CAR-STR-027': 3})
stores_df = stores_df.assign(
    Quantity = stores_df['Quantity'].fillna(value = quant_fill)
)

# Re-display the previously incomplete cells to check the fill.
stores_df.loc[rows, cols]

In [None]:
# Candidate revenue for every row: total price with the tax amount removed.
revenue = stores_df['TotalPrice'].sub(stores_df['TaxPrice'])

# Use the candidate only where Revenue is missing; existing values are kept.
stores_df = stores_df.assign(Revenue = stores_df['Revenue'].fillna(revenue))

# Re-display the previously incomplete cells to check the fill.
stores_df.loc[rows, cols]

# Use arithmetic to impute missing COGS values

In [None]:
# Average relative drop from revenue to COGS, i.e. the mean over rows of
# (Revenue - COGS) / Revenue. Rows where the ratio is NaN (for example
# because COGS is missing) are skipped by mean()'s default NaN handling.
margin = stores_df['Revenue'] - stores_df['COGS']
perc_decr = margin.div(stores_df['Revenue']).mean()

perc_decr

In [None]:
# Row labels whose COGS is to be imputed.
ind = ['CAR-CLO-015', 'CAR-ELE-060', 'CAR-HBE-025', 'OLI-HML-039']

# perc_decr is the mean of (Revenue - COGS) / Revenue, a fraction of
# *revenue*. Solving that relation for COGS gives
#     COGS = Revenue * (1 - perc_decr).
# Dividing by (1 + perc_decr) would instead undo a markup expressed as a
# fraction of COGS, which overstates the imputed cost.
impute_vals = (stores_df.loc[ind, 'Revenue'] * (1 - perc_decr)).round(2)
impute_vals

In [None]:
# Fill the gaps in COGS with the imputed values, matched by row label;
# COGS values that are already present are left unchanged.
stores_df = stores_df.assign(COGS = stores_df['COGS'].fillna(value = impute_vals))

# Re-display the previously incomplete cells to check the fill.
stores_df.loc[rows, cols]

# Create a new gross income column

In [None]:
# Gross income is revenue less the cost of goods sold.
gross_income = stores_df['Revenue'].sub(stores_df['COGS'])
stores_df = stores_df.assign(GrossIncome = gross_income)

# Preview the first ten rows, restricted to the last eight columns.
stores_df.head(10).iloc[:, -8:]

# Identify and drop rows with erroneous quantities

In [None]:
# List every row whose Quantity is below 1.
stores_df.loc[stores_df['Quantity'].lt(1)]

In [None]:
print('Number of rows BEFORE drop: {}.'.format(len(stores_df)))

# Collect the index labels of rows with Quantity below 1, then drop
# those labels from the frame.
bad_quantity = stores_df['Quantity'] < 1
rows_drop = stores_df.loc[bad_quantity].index
stores_df = stores_df.drop(rows_drop, axis = 'index')

print('Number of rows AFTER drop: {}.'.format(len(stores_df)))