In [2]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

import joblib

In [3]:
#Importing all of the data

def import_data(filename):
    df = pd.read_csv(filename)
    return df

df = import_data("Data/cleaned_total_data")

df

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,Revenue
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,83.40
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.00
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.00
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.10,13085.0,United Kingdom,100.80
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,30.00
...,...,...,...,...,...,...,...,...,...
1067263,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France,12.60
1067264,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France,16.60
1067265,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France,16.60
1067266,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France,14.85


In [4]:
"""
Dropping the bad debt, test sales and returns, and manual adjustments.
Also preparing the DataFrame for analysis and labelling returned purchases. 
"""

def process_data(df):
    df["refund"] = df['Invoice'].str.extract("(^\D+[0-9]+)")                  #Identifying the rows which are returns by the invoice number
    df.fillna(0, inplace = True)                                             #Filliing the NaN rows with 0  
    df["was_refunded"] = 0                                                      #Creating a new column to be our target: 
    print(df.shape)                                                            #whether something was returned or not
    return df
df = process_data(df)

df

(1067268, 11)


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,Revenue,refund,was_refunded
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,83.40,0,0
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.00,0,0
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.00,0,0
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.10,13085.0,United Kingdom,100.80,0,0
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,30.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1067263,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France,12.60,0,0
1067264,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France,16.60,0,0
1067265,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France,16.60,0,0
1067266,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France,14.85,0,0


In [5]:
#Creating a Dataframe and Dictionary to represent all returned items(identified by Stock Code), and mapping them to
#the respective Customer ID

returned_products_df = df.loc[df.refund != 0]                            #Filtering by all identified returns
df.drop(returned_products_df.index, inplace = True, axis = 0)            #Dropping these returns from the main dataframe

returned_products = pd.DataFrame(returned_products_df.groupby("Customer ID")["StockCode"].apply(list))   #grouping returned item codes by Customer ID
#returned_products.drop([0], axis = 0, inplace = True)                                                 #Dropping all missing Customer ID's as these are not going to be helpful to our predictive model.
returns_dict = dict(zip(returned_products.index, returned_products["StockCode"]))                    #Creating a dictionary for quicker lookup times


prices = pd.DataFrame(returned_products_df.groupby("Customer ID")["Price"].apply(list))
#prices.drop([0], axis = 0, inplace = True)
prices = dict(zip(prices.index, prices["Price"]))
prices

{0.0: [69.57,
  8.5,
  2.55,
  4.25,
  8.97,
  1.59,
  0.42,
  398.05,
  848.43,
  906.26,
  2.55,
  12.95,
  21.0,
  3.75,
  2.55,
  0.85,
  0.42,
  3.75,
  2.55,
  1.95,
  4.25,
  1.45,
  1.45,
  0.42,
  767.99,
  69.56,
  1747.62,
  16.98,
  4.45,
  8.47,
  5.15,
  14.38,
  8.5,
  14.95,
  3.75,
  16.95,
  16.95,
  9.58,
  11.93,
  373.57,
  4.95,
  0.42,
  0.61,
  7.22,
  503.9,
  0.85,
  1.25,
  0.85,
  0.85,
  2.1,
  1.25,
  1.25,
  2.95,
  1.25,
  1.25,
  0.29,
  2.95,
  0.42,
  0.85,
  0.85,
  1.25,
  1.95,
  1.95,
  0.65,
  0.85,
  1.25,
  0.85,
  0.85,
  1.45,
  0.55,
  1.65,
  1.65,
  2.55,
  0.85,
  9.95,
  467.54,
  664.23,
  1466.23,
  73.8,
  32.03,
  0.96,
  11.29,
  373.57,
  40.68,
  774.04,
  372.3,
  255.24,
  1.85,
  11.02,
  0.85,
  0.85,
  0.85,
  170.37,
  259.59,
  7.06,
  605.18,
  94.19,
  39.24,
  241.38,
  4.25,
  25.0,
  361.11,
  9.95,
  25.89,
  31.19,
  188.55,
  8.22,
  4.21,
  3.38,
  11.02,
  6.95,
  3.75,
  8.22,
  34.0,
  4.21,
  4.95,
  1075.63,
 

In [None]:
#Mapping all instances of a match between Customer ID, Stock Code and price from the returns dictionary to the larger 
#Dataframe to indicate if a purchase was later returned. 


for i in returns_dict.keys():
    returned_prices = prices.get(i)
    for v in returns_dict[i]:
        try: 
            refund_index = df.loc[(df["Customer ID"] == i) & (df["StockCode"] == v)].index
            for p in refund_index:
                if df.iloc[p]["Price"] in returned_prices:
                    df.at[refund_index, "was_refunded"] = 1
            else:
                pass
        except:
            pass


In [94]:
df[df.was_refunded != 0] 



Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,Revenue,refund,was_refunded
20,489436,22109,FULL ENGLISH BREAKFAST PLATE,16,2009-12-01 09:06:00,3.39,13078.0,United Kingdom,54.24,0,1
21,489436,22107,PIZZA PLATE IN BOX,4,2009-12-01 09:06:00,3.75,13078.0,United Kingdom,15.00,0,1
22,489436,22194,BLACK DINER WALL CLOCK,2,2009-12-01 09:06:00,8.50,13078.0,United Kingdom,17.00,0,1
23,489436,35004B,SET OF 3 BLACK FLYING DUCKS,12,2009-12-01 09:06:00,4.65,13078.0,United Kingdom,55.80,0,1
30,489436,22111,SCOTTIE DOG HOT WATER BOTTLE,24,2009-12-01 09:06:00,4.25,13078.0,United Kingdom,102.00,0,1
...,...,...,...,...,...,...,...,...,...,...,...
1067193,581579,22083,PAPER CHAIN KIT RETROSPOT,6,2011-12-09 12:19:00,2.95,17581.0,United Kingdom,17.70,0,1
1067197,581579,23343,JUMBO BAG VINTAGE CHRISTMAS,30,2011-12-09 12:19:00,1.79,17581.0,United Kingdom,53.70,0,1
1067235,581585,22178,VICTORIAN GLASS HANGING T-LIGHT,12,2011-12-09 12:31:00,1.95,15804.0,United Kingdom,23.40,0,1
1067240,581585,84945,MULTI COLOUR SILVER T-LIGHT HOLDER,24,2011-12-09 12:31:00,0.85,15804.0,United Kingdom,20.40,0,1


In [66]:
df.to_csv("Data/cleaned_data_with_labels")
#df = pd.read_csv("Data/cleaned_data_with_labels")


In [95]:
def data_processing_for_modeling():
    df['StockCode'].replace('\D+', '', regex=True, inplace = True)   #Removing any trailing letters from the StockCode
    df.drop(df.loc[df.StockCode == ''].index, axis = 0, inplace = True)
    df.StockCode = df.StockCode.astype(int)
    #le  = LabelEncoder()
    #df["Country_encoded"] = le.fit_transform(df.Country)
    #country_encoded = df[["Country", "Country_encoded"]]
    #country_encoded.to_csv("Data/encoded_countries", index = False)
    df["Customer ID"] = df["Customer ID"].astype(int)
    return df

df = data_processing_for_modeling()

df

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,Revenue,refund,was_refunded
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,83.40,0,0
1,489434,79323,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.00,0,0
2,489434,79323,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.00,0,0
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.10,13085.0,United Kingdom,100.80,0,0
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,30.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1067268,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France,10.20,0,0
1067269,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France,12.60,0,0
1067270,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France,16.60,0,0
1067271,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France,16.60,0,0


In [96]:
df.was_refunded.value_counts(normalize = True)

0    0.93682
1    0.06318
Name: was_refunded, dtype: float64

In [120]:
le = LabelEncoder()
df["Country"] = le.fit_transform(df["Country"])
joblib.dump(le, "encoder")

X = df[["Customer ID", "Quantity", "Country", "Revenue", "StockCode", "Price"]]
y = df["was_refunded"]

X

Unnamed: 0,Customer ID,Quantity,Country,Revenue,StockCode,Price
0,13085.0,12,39,83.40,85048,6.95
1,13085.0,12,39,81.00,79323,6.75
2,13085.0,12,39,81.00,79323,6.75
3,13085.0,48,39,100.80,22041,2.10
4,13085.0,24,39,30.00,21232,1.25
...,...,...,...,...,...,...
1067268,12680.0,12,13,10.20,22613,0.85
1067269,12680.0,6,13,12.60,22899,2.10
1067270,12680.0,4,13,16.60,23254,4.15
1067271,12680.0,4,13,16.60,23255,4.15


In [121]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y)
ss = StandardScaler()
ss.fit_transform(X_train)
ss.transform(X_test)
joblib.dump(ss, "scaler")



['scaler']

In [122]:
ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators = 75)
ada.fit(X_train, y_train)




AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=75)

In [123]:
ada.score(X_train, y_train)

0.9979044238379455

In [124]:
ada.score(X_test, y_test)

0.9860427495898308

In [None]:
ada.predict

In [126]:
ada.predict_proba(X_test)

array([[9.98068580e-01, 1.93141984e-03],
       [9.99648024e-01, 3.51976413e-04],
       [9.98780089e-01, 1.21991102e-03],
       ...,
       [9.99994795e-01, 5.20501369e-06],
       [9.97419547e-01, 2.58045341e-03],
       [9.99987346e-01, 1.26535768e-05]])

In [None]:
joblib.dump(ada, "adaboost-model")

In [74]:
ss = StandardScaler()
X = ss.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y)

logreg.fit(X_train, y_train)
logreg.score(X_train, y_train)
joblib.dump(logreg, "logistic-regression-model")


['logistic-regression-model']

In [77]:
X_test

array([[ 0.35508687,  0.01022697, -3.14457135, -0.02182635, -0.43986591,
        -0.4327745 ],
       [ 0.43217999, -0.06356001,  0.26473488, -0.0477911 , -0.51713421,
         0.33557856],
       [ 0.58500844,  0.01022697,  0.26473488, -0.02182635, -0.50535473,
        -0.4327745 ],
       ...,
       [ 0.45239619, -0.01190912,  0.26473488, -0.08490542, -0.47180234,
        -0.63213097],
       [-1.78058881, -0.06356001,  0.26473488,  0.0226701 , -0.50769361,
         1.77260644],
       [-1.78058881, -0.07093871,  0.26473488, -0.08144345, -0.50259059,
        -0.009142  ]])

In [75]:
logreg.score(X_test, y_test)

0.9679611762135308

In [76]:
logreg.score(X_train, y_train)


0.9679573019580584

In [76]:
logreg.predict_proba(X_test)

array([[0.9709734 , 0.0290266 ],
       [0.97108519, 0.02891481],
       [0.96321659, 0.03678341],
       ...,
       [0.99729237, 0.00270763],
       [0.95561132, 0.04438868],
       [0.96916323, 0.03083677]])

In [86]:
[x[1] for x in logreg.predict_proba(X_test)]

[0.02902659893525342,
 0.028914808723402158,
 0.03678340657871519,
 0.026832662882498422,
 0.027872340123366767,
 0.0248443622771671,
 0.0333658015309531,
 0.039050256966731645,
 0.0360329720631315,
 0.024812139973085876,
 0.031111407528059666,
 0.0274634011598535,
 0.03255063416975485,
 0.03295214749991278,
 0.03907524011265123,
 0.03213075105747487,
 0.010761675486348047,
 0.02873014279435345,
 0.03506434382677986,
 0.002696064264058546,
 0.03828316872230697,
 0.023069212262609672,
 0.11542635130880098,
 0.0026959380767950106,
 0.03597696203281132,
 0.03573791200883194,
 0.04793370247072748,
 0.035088804280358515,
 0.030944746759304288,
 0.002697419156028873,
 0.03759301328363022,
 0.05102049139443088,
 0.03829541751693851,
 0.03161826653870836,
 0.054797230546125314,
 0.13815427859528062,
 0.002736456644744254,
 0.0421514146068974,
 0.0498190245781084,
 0.04019526455686205,
 0.04047939754792614,
 0.03892751223580605,
 0.03799622553275967,
 0.02613224636031389,
 0.03885183768582637,


In [88]:
predictions = pd.DataFrame()
predictions["Actual"] = y_test
predictions["preds"] =  logreg.predict(X_test)
predictions["probability_of return"]=[x[1] for x in logreg.predict_proba(X_test)]
predictions

Unnamed: 0,Actual,preds,probability_of return
25167,0,0,0.029027
268322,0,0,0.028915
348519,0,0,0.036783
489731,0,0,0.026833
708531,0,0,0.027872
...,...,...,...
780398,0,0,0.002719
1036,0,0,0.104520
120417,0,0,0.002708
262307,0,0,0.044389


In [95]:
incorrect_predictions = predictions.loc[predictions.Actual != predictions.preds].index

df.iloc[[300594]]

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,refund,was_refunded,revenue
308606,519514,20682,RED SPOTTY CHILDS UMBRELLA,60,2010-08-18 09:19:00,2.75,14156.0,11,0,0,165.0
269204,515423,47556,TEA TIME TEA TOWELS,3,2010-07-12 13:26:00,2.95,17143.0,40,0,0,8.85
