In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
orders = pd.read_csv('./drive/MyDrive/dataset/orders_FE.csv', encoding= 'unicode_escape')
orders = orders.drop('Unnamed: 0',axis=1)
orders

Unnamed: 0,InvoiceDate,InvoiceNo,StockCode,Description,UnitPrice,CustomerID,Country,Quantity,amount_spent,date,time,month,month_name,is_weekend,hour,part_of_day,day,IsCancelled
0,2010-12-01 08:26:00,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,4.25,17850.0,United Kingdom,6,25.50,2010-12-01,08:26:00,12,Dec,0,8,3,1,False
1,2010-12-01 08:26:00,536365,22752,SET 7 BABUSHKA NESTING BOXES,7.65,17850.0,United Kingdom,2,15.30,2010-12-01,08:26:00,12,Dec,0,8,3,1,False
2,2010-12-01 08:26:00,536365,71053,WHITE METAL LANTERN,3.39,17850.0,United Kingdom,6,20.34,2010-12-01,08:26:00,12,Dec,0,8,3,1,False
3,2010-12-01 08:26:00,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,3.39,17850.0,United Kingdom,6,20.34,2010-12-01,08:26:00,12,Dec,0,8,3,1,False
4,2010-12-01 08:26:00,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,3.39,17850.0,United Kingdom,6,20.34,2010-12-01,08:26:00,12,Dec,0,8,3,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
531803,2011-12-09 12:50:00,581587,22730,ALARM CLOCK BAKELIKE IVORY,3.75,12680.0,France,4,15.00,2011-12-09,12:50:00,12,Dec,0,12,4,9,False
531804,2011-12-09 12:50:00,581587,22899,CHILDREN'S APRON DOLLY GIRL,2.10,12680.0,France,6,12.60,2011-12-09,12:50:00,12,Dec,0,12,4,9,False
531805,2011-12-09 12:50:00,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4.15,12680.0,France,4,16.60,2011-12-09,12:50:00,12,Dec,0,12,4,9,False
531806,2011-12-09 12:50:00,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4.15,12680.0,France,4,16.60,2011-12-09,12:50:00,12,Dec,0,12,4,9,False


In [None]:
orders['InvoiceDate'] = pd.to_datetime(orders['InvoiceDate'])

# Classification Model
### Will the user make an order in the next 3 month?

> ## Working with data before the last 3 months to avoid data leakage

The approach is as follows:
* We exclude the last 3 months from the data and build the rfm model for user and every other feature only on the data 3 months before the last day 
* We create a label for each user whether or not that user made a purchase in the last 3 month 
* We represent each user by relevant features
* Predictions is done and the accuracy of the model is calculated


In [None]:
day_model = orders["InvoiceDate"].max() - pd.Timedelta(value=91, unit='days')

orders_9 = orders[orders['InvoiceDate']< day_model]
orders_3 = orders[orders['InvoiceDate']>= day_model]

> ## RFM for classification model

In [None]:
# Customer ID to object
# orders['CustomerID'] = orders['CustomerID'].astype(np.object)

In [None]:
orders_9['TotalPrice'] = orders_9['UnitPrice'] * orders_9['Quantity']
orders_9

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,InvoiceDate,InvoiceNo,StockCode,Description,UnitPrice,CustomerID,Country,Quantity,amount_spent,date,time,month,month_name,is_weekend,hour,part_of_day,day,IsCancelled,TotalPrice
0,2010-12-01 08:26:00,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,4.25,17850.0,United Kingdom,6,25.50,2010-12-01,08:26:00,12,Dec,0,8,3,1,False,25.50
1,2010-12-01 08:26:00,536365,22752,SET 7 BABUSHKA NESTING BOXES,7.65,17850.0,United Kingdom,2,15.30,2010-12-01,08:26:00,12,Dec,0,8,3,1,False,15.30
2,2010-12-01 08:26:00,536365,71053,WHITE METAL LANTERN,3.39,17850.0,United Kingdom,6,20.34,2010-12-01,08:26:00,12,Dec,0,8,3,1,False,20.34
3,2010-12-01 08:26:00,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,3.39,17850.0,United Kingdom,6,20.34,2010-12-01,08:26:00,12,Dec,0,8,3,1,False,20.34
4,2010-12-01 08:26:00,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,3.39,17850.0,United Kingdom,6,20.34,2010-12-01,08:26:00,12,Dec,0,8,3,1,False,20.34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328455,2011-09-09 12:42:00,566179,23236,STORAGE TIN VINTAGE DOILY,2.89,0.0,Israel,2,5.78,2011-09-09,12:42:00,9,Sep,0,12,4,9,False,5.78
328456,2011-09-09 12:42:00,566179,23240,SET OF 4 KNICK KNACK TINS DOILY,4.15,0.0,Israel,5,20.75,2011-09-09,12:42:00,9,Sep,0,12,4,9,False,20.75
328457,2011-09-09 12:42:00,566179,23433,HANGING QUILTED PATCHWORK APPLES,0.83,0.0,Israel,36,29.88,2011-09-09,12:42:00,9,Sep,0,12,4,9,False,29.88
328458,2011-09-09 12:42:00,566179,47574A,ENGLISH ROSE SCENTED HANGING FLOWER,0.75,0.0,Israel,6,4.50,2011-09-09,12:42:00,9,Sep,0,12,4,9,False,4.50


In [None]:
# working only with purchases and not returns
orders_9 = orders_9[orders_9['TotalPrice'] > 0]
orders_9

Unnamed: 0,InvoiceDate,InvoiceNo,StockCode,Description,UnitPrice,CustomerID,Country,Quantity,amount_spent,date,time,month,month_name,is_weekend,hour,part_of_day,day,IsCancelled,TotalPrice
0,2010-12-01 08:26:00,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,4.25,17850.0,United Kingdom,6,25.50,2010-12-01,08:26:00,12,Dec,0,8,3,1,False,25.50
1,2010-12-01 08:26:00,536365,22752,SET 7 BABUSHKA NESTING BOXES,7.65,17850.0,United Kingdom,2,15.30,2010-12-01,08:26:00,12,Dec,0,8,3,1,False,15.30
2,2010-12-01 08:26:00,536365,71053,WHITE METAL LANTERN,3.39,17850.0,United Kingdom,6,20.34,2010-12-01,08:26:00,12,Dec,0,8,3,1,False,20.34
3,2010-12-01 08:26:00,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,3.39,17850.0,United Kingdom,6,20.34,2010-12-01,08:26:00,12,Dec,0,8,3,1,False,20.34
4,2010-12-01 08:26:00,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,3.39,17850.0,United Kingdom,6,20.34,2010-12-01,08:26:00,12,Dec,0,8,3,1,False,20.34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328455,2011-09-09 12:42:00,566179,23236,STORAGE TIN VINTAGE DOILY,2.89,0.0,Israel,2,5.78,2011-09-09,12:42:00,9,Sep,0,12,4,9,False,5.78
328456,2011-09-09 12:42:00,566179,23240,SET OF 4 KNICK KNACK TINS DOILY,4.15,0.0,Israel,5,20.75,2011-09-09,12:42:00,9,Sep,0,12,4,9,False,20.75
328457,2011-09-09 12:42:00,566179,23433,HANGING QUILTED PATCHWORK APPLES,0.83,0.0,Israel,36,29.88,2011-09-09,12:42:00,9,Sep,0,12,4,9,False,29.88
328458,2011-09-09 12:42:00,566179,47574A,ENGLISH ROSE SCENTED HANGING FLOWER,0.75,0.0,Israel,6,4.50,2011-09-09,12:42:00,9,Sep,0,12,4,9,False,4.50


In [None]:
# the last date of purchase
orders_9["InvoiceDate"].max()

Timestamp('2011-09-09 12:42:00')

In [None]:
import datetime as dt
today_date = dt.datetime(2011, 12, 11)

In [None]:
rfm = orders_9.groupby('CustomerID').agg({'InvoiceDate': lambda InvoiceDate: (today_date - InvoiceDate.max()).days,
                                     'InvoiceNo': lambda Invoice: Invoice.nunique(),
                                     'TotalPrice': lambda TotalPrice: TotalPrice.sum()})
rfm

Unnamed: 0_level_0,InvoiceDate,InvoiceNo,TotalPrice
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,92,1109,1101757.36
12346.0,326,1,77183.60
12347.0,130,5,2790.86
12348.0,249,3,1487.24
12350.0,311,1,334.40
...,...,...,...
18280.0,278,1,180.60
18281.0,181,1,80.82
18282.0,127,1,100.21
18283.0,96,10,1120.67


In [None]:
rfm.columns = ['recency', 'frequency', 'monetary']
rfm

Unnamed: 0_level_0,recency,frequency,monetary
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,92,1109,1101757.36
12346.0,326,1,77183.60
12347.0,130,5,2790.86
12348.0,249,3,1487.24
12350.0,311,1,334.40
...,...,...,...
18280.0,278,1,180.60
18281.0,181,1,80.82
18282.0,127,1,100.21
18283.0,96,10,1120.67


In [None]:
rfm = rfm.reset_index()
rfm

Unnamed: 0,CustomerID,recency,frequency,monetary
0,0.0,92,1109,1101757.36
1,12346.0,326,1,77183.60
2,12347.0,130,5,2790.86
3,12348.0,249,3,1487.24
4,12350.0,311,1,334.40
...,...,...,...,...
3361,18280.0,278,1,180.60
3362,18281.0,181,1,80.82
3363,18282.0,127,1,100.21
3364,18283.0,96,10,1120.67


> ## Merging both rfm and orders

In [None]:
## Vectorizing Description
# from sklearn.feature_extraction.text import TfidfVectorizer
# v = TfidfVectorizer(max_features=500)
# v.fit_transform(orders['Description']).toarray()

In [None]:
# first we drop UnitPrice ,country, and stockcode
orders_9.drop(['UnitPrice','Country','StockCode','Description','InvoiceDate'],axis=1,inplace=True)
orders_9

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,InvoiceNo,CustomerID,Quantity,amount_spent,date,time,month,month_name,is_weekend,hour,part_of_day,day,IsCancelled,TotalPrice
0,536365,17850.0,6,25.50,2010-12-01,08:26:00,12,Dec,0,8,3,1,False,25.50
1,536365,17850.0,2,15.30,2010-12-01,08:26:00,12,Dec,0,8,3,1,False,15.30
2,536365,17850.0,6,20.34,2010-12-01,08:26:00,12,Dec,0,8,3,1,False,20.34
3,536365,17850.0,6,20.34,2010-12-01,08:26:00,12,Dec,0,8,3,1,False,20.34
4,536365,17850.0,6,20.34,2010-12-01,08:26:00,12,Dec,0,8,3,1,False,20.34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328455,566179,0.0,2,5.78,2011-09-09,12:42:00,9,Sep,0,12,4,9,False,5.78
328456,566179,0.0,5,20.75,2011-09-09,12:42:00,9,Sep,0,12,4,9,False,20.75
328457,566179,0.0,36,29.88,2011-09-09,12:42:00,9,Sep,0,12,4,9,False,29.88
328458,566179,0.0,6,4.50,2011-09-09,12:42:00,9,Sep,0,12,4,9,False,4.50


In [None]:
orders_cust = orders_9.groupby('InvoiceNo').agg({
    'CustomerID':'min','Quantity':'sum'
    ,'TotalPrice':'sum', 'is_weekend':'min'}).reset_index().drop('InvoiceNo',axis=1)
orders_cust

Unnamed: 0,CustomerID,Quantity,TotalPrice,is_weekend
0,17850.0,40,139.12,0
1,17850.0,12,22.20,0
2,13047.0,83,278.73,0
3,13047.0,15,70.05,0
4,13047.0,3,17.85,0
...,...,...,...,...
12982,12787.0,145,186.56,0
12983,18237.0,104,158.96,0
12984,16353.0,49,141.90,0
12985,0.0,233,423.73,0


In [None]:
orders_cust = pd.merge(orders_cust,rfm,left_on='CustomerID',right_on='CustomerID',how='outer')
orders_cust

Unnamed: 0,CustomerID,Quantity,TotalPrice,is_weekend,recency,frequency,monetary
0,17850.0,40,139.12,0,373,34,5391.21
1,17850.0,12,22.20,0,373,34,5391.21
2,17850.0,12,22.20,0,373,34,5391.21
3,17850.0,88,259.86,0,373,34,5391.21
4,17850.0,88,259.86,0,373,34,5391.21
...,...,...,...,...,...,...,...
12982,13244.0,38,121.74,0,93,1,121.74
12983,17004.0,602,1006.40,0,93,1,1006.40
12984,13441.0,204,296.64,0,92,1,296.64
12985,13823.0,325,650.42,0,92,1,650.42


In [None]:
# Already represented in monetary
orders_cust.drop('TotalPrice',axis=1,inplace=True)

In [None]:
# Average in is_weekend flag will represent the percentage of weekend orders to total orders
orders_cust = orders_cust.groupby('CustomerID').agg({
    'is_weekend':'mean','frequency':'min'
    ,'monetary':'min', 'recency':'min',
    'Quantity':'sum'
})
orders_cust

Unnamed: 0_level_0,is_weekend,frequency,monetary,recency,Quantity
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,0.018936,1109,1101757.36,92,262018
12346.0,0.000000,1,77183.60,326,74215
12347.0,0.000000,5,2790.86,130,1590
12348.0,0.000000,3,1487.24,249,2124
12350.0,0.000000,1,334.40,311,197
...,...,...,...,...,...
18280.0,0.000000,1,180.60,278,45
18281.0,1.000000,1,80.82,181,54
18282.0,0.000000,1,100.21,127,75
18283.0,0.100000,10,1120.67,96,742


In [None]:
orders_cust = orders_cust.reset_index()
orders_cust

Unnamed: 0,CustomerID,is_weekend,frequency,monetary,recency,Quantity
0,0.0,0.018936,1109,1101757.36,92,262018
1,12346.0,0.000000,1,77183.60,326,74215
2,12347.0,0.000000,5,2790.86,130,1590
3,12348.0,0.000000,3,1487.24,249,2124
4,12350.0,0.000000,1,334.40,311,197
...,...,...,...,...,...,...
3361,18280.0,0.000000,1,180.60,278,45
3362,18281.0,1.000000,1,80.82,181,54
3363,18282.0,0.000000,1,100.21,127,75
3364,18283.0,0.100000,10,1120.67,96,742


> ## Labelling

In [None]:
orders_3 = pd.DataFrame(orders_3['CustomerID'].unique())
orders_3['label'] = 1
orders_3.columns = ['CustomerID','label']
orders_3

Unnamed: 0,CustomerID,label
0,16173.0,1
1,14243.0,1
2,14210.0,1
3,14837.0,1
4,16723.0,1
...,...,...
2918,13404.0,1
2919,13077.0,1
2920,16446.0,1
2921,12423.0,1


In [None]:
pd.merge(orders_cust,orders_3,left_on ='CustomerID',right_on='CustomerID',how='left')['label'].fillna(0).value_counts()

1.0    1946
0.0    1420
Name: label, dtype: int64

In [None]:
orders_cust = pd.merge(orders_cust,orders_3,left_on ='CustomerID',right_on='CustomerID',how='left').fillna(0)
orders_cust

Unnamed: 0,CustomerID,is_weekend,frequency,monetary,recency,Quantity,label
0,0.0,0.018936,1109,1101757.36,92,262018,1.0
1,12346.0,0.000000,1,77183.60,326,74215,0.0
2,12347.0,0.000000,5,2790.86,130,1590,1.0
3,12348.0,0.000000,3,1487.24,249,2124,1.0
4,12350.0,0.000000,1,334.40,311,197,0.0
...,...,...,...,...,...,...,...
3361,18280.0,0.000000,1,180.60,278,45,0.0
3362,18281.0,1.000000,1,80.82,181,54,0.0
3363,18282.0,0.000000,1,100.21,127,75,1.0
3364,18283.0,0.100000,10,1120.67,96,742,1.0


In [None]:
orders_cust['label'].value_counts()

1.0    1946
0.0    1420
Name: label, dtype: int64

## Step 3: Modelling

### Splitting Data

In [None]:
orders_cust.columns

Index(['CustomerID', 'is_weekend', 'frequency', 'monetary', 'recency',
       'Quantity', 'label'],
      dtype='object')

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
     orders_cust[['CustomerID', 'is_weekend', 'frequency', 'monetary', 'recency',
       'Quantity']], orders_cust['label'], test_size=0.33, random_state=42)

###Normalization

In [None]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
x_train[['frequency', 'monetary', 'recency', 'Quantity']] = min_max_scaler.fit_transform(x_train[['frequency', 'monetary', 'recency', 'Quantity']])
x_test[['frequency', 'monetary', 'recency', 'Quantity']] = min_max_scaler.transform(x_test[['frequency', 'monetary', 'recency', 'Quantity']])

## Step 3: Modelling

### Splitting Data

In [None]:
orders_cust.columns

Index(['CustomerID', 'is_weekend', 'frequency', 'monetary', 'recency',
       'Quantity', 'label'],
      dtype='object')

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
     orders_cust[['CustomerID', 'is_weekend', 'frequency', 'monetary', 'recency',
       'Quantity']], orders_cust['label'], test_size=0.33, random_state=42)

###Normalization

In [None]:
x_test

Unnamed: 0,CustomerID,is_weekend,frequency,monetary,recency,Quantity
1116,14259.0,0.000000,1,120.00,140,68
2358,16469.0,0.000000,2,276.88,245,114
807,13726.0,0.000000,1,292.68,99,384
2700,17085.0,0.600000,5,1566.65,122,786
196,12664.0,0.000000,3,1957.48,142,505
...,...,...,...,...,...,...
1805,15449.0,0.000000,2,647.70,184,516
1151,14321.0,0.000000,3,900.65,115,1516
340,12902.0,1.000000,1,138.68,263,66
1563,15046.0,0.454545,11,3984.53,95,2561


In [None]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
x_train[['frequency', 'monetary', 'recency', 'Quantity']] = min_max_scaler.fit_transform(x_train[['frequency', 'monetary', 'recency', 'Quantity']])
x_test[['frequency', 'monetary', 'recency', 'Quantity']] = min_max_scaler.transform(x_test[['frequency', 'monetary', 'recency', 'Quantity']])

In [None]:
x_train

Unnamed: 0,CustomerID,is_weekend,frequency,monetary,recency,Quantity
1356,14660.0,0.000000,0.000000,0.000702,0.031915,0.000806
480,13139.0,0.333333,0.015385,0.009575,0.063830,0.005675
2880,17423.0,0.000000,0.000000,0.002500,0.446809,0.002450
2609,16918.0,0.333333,0.015385,0.004593,0.014184,0.004392
1446,14821.0,0.000000,0.000000,0.000253,0.975177,0.000149
...,...,...,...,...,...,...
1095,14227.0,0.000000,0.046154,0.013437,0.372340,0.010004
1130,14286.0,0.000000,0.000000,0.011477,0.535461,0.015633
1294,14548.0,0.000000,0.007692,0.003640,0.209220,0.001996
860,13814.0,0.000000,0.007692,0.008836,0.003546,0.006756


In [None]:
x_train.drop('CustomerID', axis=1, inplace=True)

In [None]:
cus_id = x_test['CustomerID']
x_test.drop('CustomerID', axis=1, inplace=True)

###Logistic regression

In [None]:
x_train

Unnamed: 0,is_weekend,frequency,monetary,recency,Quantity
1356,0.000000,0.000000,0.000702,0.031915,0.000806
480,0.333333,0.015385,0.009575,0.063830,0.005675
2880,0.000000,0.000000,0.002500,0.446809,0.002450
2609,0.333333,0.015385,0.004593,0.014184,0.004392
1446,0.000000,0.000000,0.000253,0.975177,0.000149
...,...,...,...,...,...
1095,0.000000,0.046154,0.013437,0.372340,0.010004
1130,0.000000,0.000000,0.011477,0.535461,0.015633
1294,0.000000,0.007692,0.003640,0.209220,0.001996
860,0.000000,0.007692,0.008836,0.003546,0.006756


In [None]:
x_test

Unnamed: 0,is_weekend,frequency,monetary,recency,Quantity
1116,0.000000,0.000000,0.000652,0.177305,0.000524
2358,0.000000,0.007692,0.001532,0.549645,0.000885
807,0.000000,0.000000,0.001620,0.031915,0.002998
2700,0.600000,0.030769,0.008766,0.113475,0.006145
196,0.000000,0.015385,0.010958,0.184397,0.003945
...,...,...,...,...,...
1805,0.000000,0.007692,0.003612,0.333333,0.004032
1151,0.000000,0.015385,0.005030,0.088652,0.011860
340,1.000000,0.000000,0.000757,0.613475,0.000509
1563,0.454545,0.076923,0.022326,0.017730,0.020040


In [None]:
# import the class
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model with data
logreg.fit(x_train,y_train)

#predict on the test data
y_pred=logreg.predict(x_test)

print('Test accuracy is', accuracy_score(y_test, y_pred))

Test accuracy is 0.6552655265526552


###SVC

In [None]:
from sklearn import svm

clf_svc = svm.SVC()
clf_svc.fit(x_train, y_train)

y_pred = clf_svc.predict(x_test)

print('Test accuracy is', accuracy_score(y_test, y_pred))

Test accuracy is 0.6741674167416741


###RandomForestClassifier


In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [None]:
y_pred = clf.predict(x_test)

In [None]:
print('Test accuracy is', accuracy_score(y_test, y_pred))

Test accuracy is 0.6642664266426642


###RandomForestClassifier with GridSearchCV


In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 13.3min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [None]:
y_pred = rf_random.predict(x_test)

print('Test accuracy is', accuracy_score(y_test, y_pred))

Test accuracy is 0.6426642664266426


###XGBoost

In [None]:
import xgboost as xgb

data_dmatrix = xgb.DMatrix(data=x_train,label=y_train)

xg_cls = xgb.XGBClassifier(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

In [None]:
xg_cls.fit(x_train,y_train)

preds = xg_cls.predict(x_test)



In [None]:
print('Test accuracy is', accuracy_score(y_test, preds))

Test accuracy is 0.6462646264626463


In [None]:
x_train

Unnamed: 0,CustomerID,is_weekend,frequency,monetary,recency,Quantity
1356,14660.0,0.000000,0.000000,0.000702,0.031915,0.000806
480,13139.0,0.333333,0.015385,0.009575,0.063830,0.005675
2880,17423.0,0.000000,0.000000,0.002500,0.446809,0.002450
2609,16918.0,0.333333,0.015385,0.004593,0.014184,0.004392
1446,14821.0,0.000000,0.000000,0.000253,0.975177,0.000149
...,...,...,...,...,...,...
1095,14227.0,0.000000,0.046154,0.013437,0.372340,0.010004
1130,14286.0,0.000000,0.000000,0.011477,0.535461,0.015633
1294,14548.0,0.000000,0.007692,0.003640,0.209220,0.001996
860,13814.0,0.000000,0.007692,0.008836,0.003546,0.006756


In [None]:
x_train.drop('CustomerID', axis=1, inplace=True)

In [None]:
cus_id = x_test['CustomerID']
x_test.drop('CustomerID', axis=1, inplace=True)

###Logistic regression

In [None]:
x_train

Unnamed: 0,is_weekend,frequency,monetary,recency,Quantity
1356,0.000000,0.000000,0.000702,0.031915,0.000806
480,0.333333,0.015385,0.009575,0.063830,0.005675
2880,0.000000,0.000000,0.002500,0.446809,0.002450
2609,0.333333,0.015385,0.004593,0.014184,0.004392
1446,0.000000,0.000000,0.000253,0.975177,0.000149
...,...,...,...,...,...
1095,0.000000,0.046154,0.013437,0.372340,0.010004
1130,0.000000,0.000000,0.011477,0.535461,0.015633
1294,0.000000,0.007692,0.003640,0.209220,0.001996
860,0.000000,0.007692,0.008836,0.003546,0.006756


In [None]:
x_test

Unnamed: 0,is_weekend,frequency,monetary,recency,Quantity
1116,0.000000,0.000000,0.000652,0.177305,0.000524
2358,0.000000,0.007692,0.001532,0.549645,0.000885
807,0.000000,0.000000,0.001620,0.031915,0.002998
2700,0.600000,0.030769,0.008766,0.113475,0.006145
196,0.000000,0.015385,0.010958,0.184397,0.003945
...,...,...,...,...,...
1805,0.000000,0.007692,0.003612,0.333333,0.004032
1151,0.000000,0.015385,0.005030,0.088652,0.011860
340,1.000000,0.000000,0.000757,0.613475,0.000509
1563,0.454545,0.076923,0.022326,0.017730,0.020040


In [None]:
# import the class
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model with data
logreg.fit(x_train,y_train)

#predict on the test data
y_pred=logreg.predict(x_test)

print('Test accuracy is', accuracy_score(y_test, y_pred))

Test accuracy is 0.6552655265526552


###SVC

In [None]:
from sklearn import svm

clf_svc = svm.SVC()
clf_svc.fit(x_train, y_train)

y_pred = clf_svc.predict(x_test)

print('Test accuracy is', accuracy_score(y_test, y_pred))

Test accuracy is 0.6741674167416741


###RandomForestClassifier


In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [None]:
y_pred = clf.predict(x_test)

In [None]:
print('Test accuracy is', accuracy_score(y_test, y_pred))

Test accuracy is 0.6642664266426642


###RandomForestClassifier with GridSearchCV


In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 13.3min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [None]:
y_pred = rf_random.predict(x_test)

print('Test accuracy is', accuracy_score(y_test, y_pred))

Test accuracy is 0.6426642664266426


###XGBoost

In [None]:
import xgboost as xgb

data_dmatrix = xgb.DMatrix(data=x_train,label=y_train)

xg_cls = xgb.XGBClassifier(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

In [None]:
xg_cls.fit(x_train,y_train)

preds = xg_cls.predict(x_test)



In [None]:
print('Test accuracy is', accuracy_score(y_test, preds))

Test accuracy is 0.6462646264626463
