# Exploratory Data Analysis - Online Retail Data

## Step 1: Importing necessary Libraries

In [30]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt

## Step 2: Loading the Dataset.

In [50]:
url = "https://raw.githubusercontent.com/nyangweso-rodgers/Data_Analytics/main/Datasets/Online_Retail.csv"
online_retail_data = pd.read_csv(url, encoding= 'unicode_escape', parse_dates=['InvoiceDate'])

## Step 3: Viewing the dataset.
We can quickly find out how many rows and columns there are in our dataset by using the shape method. This returns a tuple which contains the number of rows and columns.

Shape of the data

In [51]:
# number of columns and rows
online_retail_data.shape

(541909, 8)

Previewing the Dataset

In [52]:
online_retail_data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


## Additional Columns
* Add some extra columns from the preview above.
  
  1. Add __TotalAmount = Quantity * UnitPrice__
  2. Add __Date__ extracted from __InvoiceDate__ datetime column

In [53]:
# add TotalAmount
online_retail_data['TotalAmount'] = online_retail_data['Quantity'] * online_retail_data['UnitPrice']
# add Date
online_retail_data['Date'] = online_retail_data['InvoiceDate'].dt.date

In [54]:
online_retail_data.head(3)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalAmount,Date
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3,2010-12-01
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34,2010-12-01
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0,2010-12-01


Columns names

In [55]:
online_retail_data.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country', 'TotalAmount', 'Date'],
      dtype='object')

Concise info of dataset

In [56]:
# Basic Information about the data
online_retail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
 8   TotalAmount  541909 non-null  float64       
 9   Date         541909 non-null  object        
dtypes: datetime64[ns](1), float64(3), int64(1), object(5)
memory usage: 41.3+ MB


Data Types

In [57]:
online_retail_data.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
TotalAmount           float64
Date                   object
dtype: object

### Change the CustomerID column from float64 to object

In [58]:
## using dictionary to convert specific columns
convert_dic = {
    'CustomerID': str
}
online_retail_data = online_retail_data.astype(convert_dic)

Descriptive statistics.

In [59]:
# Describe the data - descriptive statistics
online_retail_data.describe(include='all')

  online_retail_data.describe(include='all')


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalAmount,Date
count,541909.0,541909,540455,541909.0,541909,541909.0,541909.0,541909,541909.0,541909
unique,25900.0,4070,4223,,23260,,4373.0,38,,305
top,573585.0,85123A,WHITE HANGING HEART T-LIGHT HOLDER,,2011-10-31 14:41:00,,,United Kingdom,,2011-12-05
freq,1114.0,2313,2369,,1114,,135080.0,495478,,5331
first,,,,,2010-12-01 08:26:00,,,,,
last,,,,,2011-12-09 12:50:00,,,,,
mean,,,,9.55225,,4.611114,,,17.987795,
std,,,,218.081158,,96.759853,,,378.810824,
min,,,,-80995.0,,-11062.06,,,-168469.6,
25%,,,,1.0,,1.25,,,3.4,


Checking Duplicates
* The pandas.DataFrame.duplicated.sum() function returns a Series or DataFrame containing the sum of the boolean values (True or False) in the duplicate rows of the DataFrame.

In [60]:
online_retail_data.duplicated().sum()

5268

### Checking Unique Values

In [61]:

print(online_retail_data['Country'].nunique())
online_retail_data['Country'].unique()

38


array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

### DATA CLEANING

#### Missing Values
* We check for missing values by utilizing the __isna()__ method, which returns a dataframe of boolean values indicating whether or not a field is null. We can use the sum() method to group all missing values by column.

In [62]:
# Handling Missing Values
online_retail_data.isnull().sum()

InvoiceNo         0
StockCode         0
Description    1454
Quantity          0
InvoiceDate       0
UnitPrice         0
CustomerID        0
Country           0
TotalAmount       0
Date              0
dtype: int64

In [63]:
# Getting proportion of missing values
online_retail_data.isnull().sum() * 100/online_retail_data.shape[0]

InvoiceNo      0.000000
StockCode      0.000000
Description    0.268311
Quantity       0.000000
InvoiceDate    0.000000
UnitPrice      0.000000
CustomerID     0.000000
Country        0.000000
TotalAmount    0.000000
Date           0.000000
dtype: float64

#### Dropping Rows with Null Values
* The include rows whose __CustomerID__ have null values.

In [65]:
# drop rows with NaN values
# axis=0 is used to drop the row with NaN values.
updated_online_retail_data = online_retail_data.dropna()
updated_online_retail_data.shape

(540455, 10)

#### Dropping Rows whose TotalAmount columns have Negative Values
* Since no sales can be done with negative amounts not unless we treat them as credit sales

In [66]:
# Count Rows with Negative UnitPrice values
print("Rows with Negative UnitPrice values: ", updated_online_retail_data[updated_online_retail_data['UnitPrice'] < 0].shape)

# Count Rows with Negative TotalAmount 
print("Rows with Negative TotalAmount values: ", updated_online_retail_data[updated_online_retail_data['TotalAmount'] < 0].shape)

Rows with Negative UnitPrice values:  (2, 10)
Rows with Negative TotalAmount values:  (9290, 10)


In [67]:
# Remove rows with negative TotalAmount
updated_online_retail_data = updated_online_retail_data[updated_online_retail_data['TotalAmount'] > 0]
updated_online_retail_data.shape

(530104, 10)

### Dropping Rows whose CustomerIDs have nan values
* Some of the __CustomerID__'s have __nan__ values.

In [69]:
# check for rows that have nan CustomerID
updated_online_retail_data[updated_online_retail_data['CustomerID'].isin(['nan'])].shape

# you can save the results in a csv file
## updated_online_retail_data[updated_online_retail_data['CustomerID'].isin(['nan'])].to_csv('nan_customer_id.csv')

(132220, 10)

In [73]:
updated_online_retail_data = updated_online_retail_data[~updated_online_retail_data['CustomerID'].isin(['nan'])]
updated_online_retail_data.shape

(397884, 10)

### Preview the New Data Frame

In [74]:
updated_online_retail_data.describe(include='all')

  updated_online_retail_data.describe(include='all')


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalAmount,Date
count,397884.0,397884,397884,397884.0,397884,397884.0,397884.0,397884,397884.0,397884
unique,18532.0,3665,3877,,17282,,4338.0,37,,305
top,576339.0,85123A,WHITE HANGING HEART T-LIGHT HOLDER,,2011-11-14 15:27:00,,17841.0,United Kingdom,,2011-11-06
freq,542.0,2035,2028,,542,,7847.0,354321,,3423
first,,,,,2010-12-01 08:26:00,,,,,
last,,,,,2011-12-09 12:50:00,,,,,
mean,,,,12.988238,,3.116488,,,22.397,
std,,,,179.331775,,22.097877,,,309.071041,
min,,,,1.0,,0.001,,,0.001,
25%,,,,2.0,,1.25,,,4.68,


## Grouping the DataFrame

#### Grouping Data 1: Group By CustomerID

In [83]:
grouped_daily_customer_data = updated_online_retail_data.groupby(['CustomerID', 'Date', 'Country']).agg(
    TotalAmount = pd.NamedAgg(column='TotalAmount', aggfunc='sum'),
    CountOfUniqueInvoices = pd.NamedAgg(column='InvoiceNo', aggfunc='nunique')
).reset_index()

In [88]:
# save results in a csv file
grouped_daily_customer_data.to_csv("grouped_daily_customer_data.csv")

#### Grouping Data 2: Group Data By Country
* __Goals__:
  * Calculate __Average Sales Value__ per Country.

In [86]:
grouped_country_data = updated_online_retail_data.groupby( 'Country').agg(
    CountOfUniqueCustomers = pd.NamedAgg(column = 'CustomerID', aggfunc = 'nunique'),
    TotalAmount = pd.NamedAgg(column='TotalAmount', aggfunc='sum'),
    CountOfUniqueInvoices = pd.NamedAgg(column='InvoiceNo', aggfunc='nunique')
).reset_index()

In [87]:
# calculate the Average Basket Value 
grouped_country_data['AverageSalesValue'] = grouped_country_data['TotalAmount'] / grouped_country_data['CountOfUniqueCustomers']

# save the output in a csv file
##grouped_country_data.to_csv("grouped_country_data.csv")