# Returns Predictor

In this notebook, we explore the returns in the dataset. The only indication that a transaction is a return is that the invoice code starts with a 'C', and the quantity, price and Customer ID match an earlier transaction in the dataset. 

In order to develop a machine learning algorithm to predict future returns, we will need to identify which purchases were later returned. To properly label the returned purchases, each returned stock code was cross checked with each purchase, by Customer ID in the notebook below.  

From there, we developed a Logistic Regressior and AdaBoosted Decision Tree to predict returns based on Customer ID, Stockcode, quantity, price, revenue and country of purchaser. Both models score above 90% accuracy, proving that there are patterns in customers' behaviours which can indicate likelihood of a return. 

Both models were saved, and the logistic regressor is used in the web app dashboard. 

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

import joblib

In [3]:
#Importing all of the data

def import_data(filename):
    df = pd.read_csv(filename)
    return df

df = import_data("Data/combined_total_data")
df

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.10,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom
...,...,...,...,...,...,...,...,...
1067366,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
1067367,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
1067368,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France
1067369,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France


In [4]:
"""
Dropping the bad debt, test sales and returns, and manual adjustments.
Also preparing the DataFrame for analysis and labelling returned purchases. 
"""

def process_data(df):
    df["refund"] = df['Invoice'].str.extract("(^\D+[0-9]+)")          #Identifying the rows which are returns by the invoice number
    df.fillna(0, inplace = True)                                      #Filliing the NaN rows with 0  
    df["was_refunded"] = 0                                            #Creating a new column to be our target: 
    print(df.shape)                                                   #whether something was returned or not
    return df
df = process_data(df)

df

(1067371, 10)


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,refund,was_refunded
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,0,0
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,0,0
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,0,0
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.10,13085.0,United Kingdom,0,0
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,0,0
...,...,...,...,...,...,...,...,...,...,...
1067366,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France,0,0
1067367,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France,0,0
1067368,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France,0,0
1067369,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France,0,0


In [5]:
#Creating a Dataframe and Dictionary to represent all returned items(identified by Stock Code), and mapping them to
#the respective Customer ID

returned_products_df = df.loc[df.refund != 0]                            #Filtering by all identified returns
df.drop(returned_products_df.index, inplace = True, axis = 0)            #Dropping these returns from the main dataframe

returned_products = pd.DataFrame(returned_products_df.groupby("Customer ID")["StockCode"].apply(list))   #grouping returned item codes by Customer ID
#returned_products.drop([0], axis = 0, inplace = True)                                                 #Dropping all missing Customer ID's as these are not going to be helpful to our predictive model.
returns_dict = dict(zip(returned_products.index, returned_products["StockCode"]))                    #Creating a dictionary for quicker lookup times


prices = pd.DataFrame(returned_products_df.groupby("Customer ID")["Price"].apply(list))
prices = dict(zip(prices.index, prices["Price"]))
prices

{0.0: [69.57,
  8.5,
  2.55,
  4.25,
  8.97,
  1.59,
  0.42,
  398.05,
  848.43,
  906.26,
  2.55,
  12.95,
  21.0,
  3.75,
  2.55,
  0.85,
  0.42,
  3.75,
  2.55,
  1.95,
  4.25,
  1.45,
  1.45,
  0.42,
  767.99,
  69.56,
  1747.62,
  16.98,
  4.45,
  8.47,
  503.19,
  59.1,
  5.15,
  14.38,
  8.5,
  14.95,
  3.75,
  16.95,
  16.95,
  9.58,
  11.93,
  373.57,
  4.95,
  0.42,
  0.61,
  7.22,
  503.9,
  0.85,
  1.25,
  0.85,
  0.85,
  2.1,
  1.25,
  1.25,
  2.95,
  1.25,
  1.25,
  0.29,
  2.95,
  0.42,
  0.85,
  0.85,
  1.25,
  1.95,
  1.95,
  0.65,
  0.85,
  1.25,
  0.85,
  0.85,
  1.45,
  0.55,
  1.65,
  1.65,
  2.55,
  0.85,
  9.95,
  467.54,
  664.23,
  1466.23,
  73.8,
  32.03,
  0.96,
  11.29,
  373.57,
  40.68,
  774.04,
  372.3,
  255.24,
  1.85,
  11.02,
  0.85,
  0.85,
  0.85,
  170.37,
  259.59,
  7.06,
  605.18,
  94.19,
  39.24,
  241.38,
  4.25,
  25.0,
  361.11,
  9.95,
  25.89,
  31.19,
  188.55,
  8.22,
  4.21,
  3.38,
  11.02,
  6.95,
  3.75,
  8.22,
  34.0,
  4.21,
  

In [6]:
#Mapping all instances of a match between Customer ID, Stock Code and price from the returns dictionary to the larger 
#Dataframe to indicate if a purchase was later returned. 


for i in returns_dict.keys():
    returned_prices = prices.get(i)
    for v in returns_dict[i]:
        try: 
            refund_index = df.loc[(df["Customer ID"] == i) & (df["StockCode"] == v)].index
            for p in refund_index:
                if df.iloc[p]["Price"] in returned_prices:
                    df.at[refund_index, "was_refunded"] = 1
            else:
                pass
        except:
            pass


In [7]:
df.drop(["Invoice","InvoiceDate"], axis = 1, inplace = True)
df.to_csv("Data/cleaned_data_with_labels")


In [11]:
def data_processing_for_modeling():
    df['StockCode'].replace('\D+', '', regex=True, inplace = True)   #Removing any trailing letters from the StockCode
    df.drop(df.loc[df.StockCode == ''].index, axis = 0, inplace = True)
    df.StockCode = df.StockCode.astype(int)
    df["Customer ID"] = df["Customer ID"].astype(int)
    return df

df = data_processing_for_modeling()

df

Unnamed: 0,StockCode,Description,Quantity,Price,Customer ID,Country,refund,was_refunded,Revenue
0,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,6.95,13085,40,0,0,83.40
1,79323,PINK CHERRY LIGHTS,12,6.75,13085,40,0,0,81.00
2,79323,WHITE CHERRY LIGHTS,12,6.75,13085,40,0,0,81.00
3,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2.10,13085,40,0,0,100.80
4,21232,STRAWBERRY CERAMIC TRINKET BOX,24,1.25,13085,40,0,0,30.00
...,...,...,...,...,...,...,...,...,...
1067365,22613,PACK OF 20 SPACEBOY NAPKINS,12,0.85,12680,14,0,0,10.20
1067366,22899,CHILDREN'S APRON DOLLY GIRL,6,2.10,12680,14,0,0,12.60
1067367,23254,CHILDRENS CUTLERY DOLLY GIRL,4,4.15,12680,14,0,0,16.60
1067368,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,4.15,12680,14,0,0,16.60


In [12]:
df.was_refunded.value_counts(normalize = True)

0    0.93706
1    0.06294
Name: was_refunded, dtype: float64

In [13]:
le = LabelEncoder()
df["Country"] = le.fit_transform(df["Country"])
joblib.dump(le, "encoder")

X = df[["Customer ID", "Quantity", "Country", "Revenue", "StockCode", "Price"]]
y = df["was_refunded"]

X

Unnamed: 0,Customer ID,Quantity,Country,Revenue,StockCode,Price
0,13085,12,40,83.40,85048,6.95
1,13085,12,40,81.00,79323,6.75
2,13085,12,40,81.00,79323,6.75
3,13085,48,40,100.80,22041,2.10
4,13085,24,40,30.00,21232,1.25
...,...,...,...,...,...,...
1067365,12680,12,14,10.20,22613,0.85
1067366,12680,6,14,12.60,22899,2.10
1067367,12680,4,14,16.60,23254,4.15
1067368,12680,4,14,16.60,23255,4.15


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y)
ss = StandardScaler()
ss.fit_transform(X_train)
ss.transform(X_test)
joblib.dump(ss, "scaler")



['scaler']

In [15]:
#training an Adaboost Model
ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators = 75)
ada.fit(X_train, y_train)




AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=75)

In [16]:
print(f'Adaboost score on Training: {ada.score(X_train, y_train)}')
print(f'Adaboost score on Test: {ada.score(X_test, y_test)}')
joblib.dump(ada, "adaboost-model")

Adaboost score on Training: 0.9978533670831731
Adaboost score on Test: 0.9857900749020599


In [19]:
#Training a basic logistic regression model 
logreg = LogisticRegression()
ss = StandardScaler()
X = ss.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y)

logreg.fit(X_train, y_train)


LogisticRegression()

In [20]:
print(f'Logistic Regression score on Training: {logreg.score(X_train, y_train)}')
print(f'Logistic Regression score on Testing: {logreg.score(X_test, y_test)}')
joblib.dump(logreg, "logistic-regression-model")


Logistic Regression score on Training: 0.9368723813953369
Logistic Regression score on Testing: 0.9368852396175932


['logistic-regression-model']