In [163]:
# Imoports
import pandas as pd
import sqlite3

In [164]:
# Connect SQLite database.
db_conn = sqlite3.connect("SuperstoreDB/superstore.db")

In [165]:
# Load all the data from the db into a pandas df.
df = pd.read_sql(
    """
    SELECT *
    FROM OrdersDetails
    JOIN Products
        ON Products.ProductID = OrdersDetails.ProductID
    JOIN Orders
        ON Orders.OrderID = OrdersDetails.OrderID
    JOIN Customers
        ON Customers.CustomerID = Orders.CustomerID
    JOIN Addresses
        ON Addresses.AddressID = Orders.AddressID
    """, db_conn)

In [166]:
# Since we have done 4 JOINs loading the data, we now have duplicated columns. Let's drop them.
# Drop columns with duplicated names
df = df.loc[:, ~df.columns.duplicated()]

In [167]:
# Converting Sales and Profit from Cents to Dollars.
df['Sales'] = df['Sales'] / 100
df['Profit'] = df['Profit'] / 100

In [168]:
df.head()

Unnamed: 0,OrderID,ProductID,Sales,Quantity,Discount,Profit,ProductName,Category,SubCategory,OrderDate,...,ShipMode,CustomerID,AddressID,CustomerName,Segment,PostalCode,City,State,Region,Country
0,CA-2016-152156,FUR-BO-10001798,261.95,2,0.0,41.91,Bush Somerset Collection Bookcase,Furniture,Bookcases,2016-11-08,...,Second Class,CG-12520,42420-000001,Claire Gute,Consumer,42420,Henderson,Kentucky,South,United States
1,CA-2016-152156,FUR-CH-10000454,731.94,3,0.0,219.58,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",Furniture,Chairs,2016-11-08,...,Second Class,CG-12520,42420-000001,Claire Gute,Consumer,42420,Henderson,Kentucky,South,United States
2,CA-2016-138688,OFF-LA-10000240,14.62,2,0.0,6.87,Self-Adhesive Address Labels for Typewriters b...,Office Supplies,Labels,2016-06-12,...,Second Class,DV-13045,90036-000001,Darrin Van Huff,Corporate,90036,Los Angeles,California,West,United States
3,US-2015-108966,FUR-TA-10000577,957.57,5,0.45,-383.03,Bretford CR4500 Series Slim Rectangular Table,Furniture,Tables,2015-10-11,...,Standard Class,SO-20335,33311-000001,Sean O'Donnell,Consumer,33311,Fort Lauderdale,Florida,South,United States
4,US-2015-108966,OFF-ST-10000760,22.36,2,0.2,2.51,Eldon Fold 'N Roll Cart System,Office Supplies,Storage,2015-10-11,...,Standard Class,SO-20335,33311-000001,Sean O'Donnell,Consumer,33311,Fort Lauderdale,Florida,South,United States


When working with sales data, a common thing to do is to understand the customers' buying patterns using 3 parameters: Monetary (amount of money spent), Frequency (how frequently the customer bought something) Recency (how many days has passed from today to their last purchase).

These parameters are not given to us; we will have to calculate them.

In [169]:
# First, we will create a new DataFrame by grouping the customers based on their ID and summing the Sales column.
df_monetary = df.groupby('CustomerID', as_index=False)['Sales'].sum()
df_monetary.head()

Unnamed: 0,CustomerID,Sales
0,AA-10315,5563.54
1,AA-10375,1056.36
2,AA-10480,1790.51
3,AA-10645,5086.9
4,AB-10015,886.15


Do the same with Frequency (How many unique Order IDs are there per customer?) and the Recency (Calculate it from the last order date in the dataset) and then merge all the 3 DataFrames.

In [170]:
# Calculate Frequency (How many unique Order IDs are there per customer?)
df_frequency = df.groupby('CustomerID', as_index=False)['OrderID'].count()
df_frequency.head()

Unnamed: 0,CustomerID,OrderID
0,AA-10315,11
1,AA-10375,15
2,AA-10480,12
3,AA-10645,18
4,AB-10015,6


In [171]:
# Calculate Recency (today (2017-12-30) - the last order date in the dataset)
df_recency = df.groupby('CustomerID', as_index=False)['OrderDate'].max()
df_recency['OrderDate'] = pd.to_datetime(df_recency['OrderDate'])
df_recency['Recency'] = pd.to_datetime('2017-12-30') - df_recency['OrderDate']
df_recency.drop(['OrderDate'], axis=1, inplace=True)
df_recency.head()

Unnamed: 0,CustomerID,Recency
0,AA-10315,184 days
1,AA-10375,19 days
2,AA-10480,259 days
3,AA-10645,55 days
4,AB-10015,415 days


In [172]:
# Merge all the 3 DataFrames and give the columns better names.
rfm_df = pd.merge(pd.merge(df_monetary,df_frequency,on='CustomerID'),df_recency,on='CustomerID')
rfm_df.rename(columns={'Sales': 'Amount',
                       'OrderID': 'Frequency'},
                       inplace=True)
rfm_df

Unnamed: 0,CustomerID,Amount,Frequency,Recency
0,AA-10315,5563.54,11,184 days
1,AA-10375,1056.36,15,19 days
2,AA-10480,1790.51,12,259 days
3,AA-10645,5086.90,18,55 days
4,AB-10015,886.15,6,415 days
...,...,...,...,...
788,XP-21865,2374.62,28,43 days
789,YC-21895,5454.34,8,4 days
790,YS-21880,6720.43,12,9 days
791,ZC-21910,8025.63,31,54 days


In [173]:
# Rescaling the attributes
rfm_df = rfm[['Amount', 'Frequency', 'Recency']]

# Instantiate
scaler = StandardScaler()

# fit_transform
rfm_df_scaled = scaler.fit_transform(rfm_df)
rfm_df_scaled.shape

rfm_df_scaled = pd.DataFrame(rfm_df_scaled)
rfm_df_scaled.columns = ['Amount', 'Frequency', 'Recency']
rfm_df_scaled.head()

NameError: name 'rfm' is not defined