In [9]:
import pandas as pd
import numpy as np

import re


In [10]:
def import_and_concat(file, sheet_1, sheet_2):

    df1 = pd.read_excel(file, sheet_name = sheet_1)
    display(df1.shape)
    df2 = pd.read_excel(file, sheet_name = sheet_2)
    display(df2.shape)
    df = pd.concat([df1, df2], axis = 0, ignore_index= True)
    display(df.shape)
    return df

df = import_and_concat('Data/online_retail_II.xlsx', 'Year 2009-2010', 'Year 2010-2011')

(525461, 8)

(541910, 8)

(1067371, 8)

In [30]:
df.to_csv("Data/combined_total_data", index = False)

In [27]:
df = pd.read_csv("Data/combined_total_data")

In [12]:
df[df.Description.str.contains("wrongly", na = False)]

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
47635,493821,17129D,missing (wrongly coded?),-2127,2010-01-07 12:43:00,0.0,,United Kingdom
639983,546018,85172,wrongly sold as sets,-600,2011-03-08 17:23:00,0.0,,United Kingdom
639999,546023,85175,wrongly sold sets,-975,2011-03-08 17:29:00,0.0,,United Kingdom
643356,546408,22467,wrongly sold (22719) barcode,170,2011-03-11 16:25:00,0.0,,United Kingdom
789345,560039,20713,wrongly marked. 23343 in box,-3100,2011-07-14 14:27:00,0.0,,United Kingdom
802806,561103,85103,stock creditted wrongly,-32,2011-07-25 11:23:00,0.0,,United Kingdom
906148,569830,23343,wrongly coded 20713,800,2011-10-06 12:38:00,0.0,,United Kingdom
906149,569831,20713,wrongly coded-23343,-800,2011-10-06 12:38:00,0.0,,United Kingdom
941043,572547,20713,wrongly marked 23343,200,2011-10-24 17:01:00,0.0,,United Kingdom
941044,572548,23343,20713 wrongly marked,-200,2011-10-24 17:01:00,0.0,,United Kingdom


It looks like there are negative values in both the quanity and price columns. With some initial exploration, we can see that the stock code indicates if the transaction was a sale, or a shipping cost. 

We will split the dataset into two DataFrames, one with numberic stock codes which indicates a sale, and shipping dataframe which also includes refunds, gifts and other miscellaneous transactions. 

### Data Cleaning

The following rows are not useful for our project in marketing, customer segmentation and forcasting insights. We will drop them from our dataset going forward.  

In [3]:
def drop_rows(mask):                           #Establishing a function to drop rows based on a dataframe filter.
    df.drop(mask.index, inplace = True)  

drop_rows(df.loc[(df["Description"] == "This is a test product.") | (df.StockCode == "ADJUST")])
drop_rows(df[df.Description.str.contains("wrongly", na = False)])
drop_rows(df.loc[df.StockCode == "ADJUST"]) 
drop_rows(df.loc[df.StockCode == "B"])
df["Country"].replace({"EIRE": "Ireland", "USA": "United States of America", "RSA": "South Africa"}, inplace = True)
df["Description"].replace({"PACK OF 72 RETRO SPOT CAKE CASES": "PACK OF 72 RETROSPOT CAKE CASES"}, inplace = True)

df["Revenue"] = df["Quantity"] * df["Price"]

df.shape

df.to_csv("Data/cleaned_total_data", index = False)

In [29]:
#Adding a Revenue feature

df

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,Revenue
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,83.40
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.00
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.00
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.10,13085.0,United Kingdom,100.80
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,30.00
...,...,...,...,...,...,...,...,...,...
1067366,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France,12.60
1067367,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France,16.60
1067368,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France,16.60
1067369,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France,14.85


In [4]:
#Extracting shipping-related or miscellaneous instances

shipping_df = df.copy()
shipping_df["StockCode"] = df["StockCode"].str.extract('(^\D+)', expand = False)

shipping_df.dropna(subset = ["StockCode"], inplace = True)  #Dropping rows without shipping information 
shipping_df.to_csv('Data/cleaned_shipping_data', index = False)

shipping_df

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,Revenue
89,489439,POST,POSTAGE,3,2009-12-01 09:28:00,18.00,12682.0,France,54.00
126,489444,POST,POSTAGE,1,2009-12-01 09:55:00,141.00,12636.0,United States of America,141.00
173,489447,POST,POSTAGE,1,2009-12-01 10:10:00,130.00,12362.0,Belgium,130.00
625,489526,POST,POSTAGE,6,2009-12-01 11:50:00,18.00,12533.0,Germany,108.00
735,C489535,D,Discount,-1,2009-12-01 12:11:00,9.00,15299.0,United Kingdom,-9.00
...,...,...,...,...,...,...,...,...,...
1067002,C581499,M,Manual,-1,2011-12-09 10:28:00,224.69,15498.0,United Kingdom,-224.69
1067191,581570,POST,POSTAGE,1,2011-12-09 11:59:00,18.00,12662.0,Germany,18.00
1067228,581574,POST,POSTAGE,2,2011-12-09 12:09:00,18.00,12526.0,Germany,36.00
1067229,581578,POST,POSTAGE,3,2011-12-09 12:16:00,18.00,12713.0,Germany,54.00


In [5]:
shipping_df.StockCode.value_counts()

POST            2122
DOT             1446
M               1421
C                283
D                177
DCGS             108
S                104
BANK CHARGES     102
gift_            100
AMAZONFEE         43
DCGSSGIRL         25
DCGSSBOY          23
PADS              19
CRUK              16
m                  5
SP                 3
ADJUST             3
TEST               1
DCGSLGIRL          1
GIFT               1
DCGSLBOY           1
Name: StockCode, dtype: int64

In [6]:
df.drop(shipping_df.index, inplace = True, axis = 0)
df.to_csv("Data/sales_and_returns_data", index = False)

In [7]:
df["refund"] = df['Invoice'].str.extract("(^\D+[0-9]+)")                                #Identifying the rows which are returns by the invoice number
drop_rows(df.loc[df.refund.notnull()])
df.Invoice = df.Invoice.astype(int)
df.to_csv("Data/cleaned_sales_data", index = False)

In [60]:
df.to_csv("Data/cleaned_sales_data", index = False)

In [8]:
df['StockCode'].replace('\D+', '', regex=True, inplace = True)   #Removing any trailing letters
drop_rows(df[df["Quantity"]<=0])
df.to_csv("Data/cleaned_sales_data.csv", index = False)

In [61]:
df.to_csv("Data/cleaned_sales_data.csv", index = False)

In [24]:

df[df.StockCode.str.contains("35004", na = False)][["StockCode", "Description"]]

Unnamed: 0,StockCode,Description
23,35004,SET OF 3 BLACK FLYING DUCKS
1086,35004,SET OF 3 PINK FLYING DUCKS
1160,35004,SET OF 3 SILVER FLYING DUCKS
2203,35004,SET OF 3 BLACK FLYING DUCKS
2204,35004,SET OF 3 SILVER FLYING DUCKS
...,...,...
1059818,35004,SET OF 3 BLACK FLYING DUCKS
1060749,35004,SET OF 3 GOLD FLYING DUCKS
1063269,35004,SET OF 3 BLACK FLYING DUCKS
1067135,35004,SET OF 3 BLACK FLYING DUCKS


In [59]:
colors = [BLACK, PINK, SILVER, GOLD, CREAM]

NameError: name 'BLACK' is not defined