In [1]:
import pandas as pd
import numpy as np
import time
from datetime import datetime as dt
from sklearn.neural_network import MLPRegressor
pd.set_option('display.max_rows', 30)
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.svm import SVC

# Data Preparation

In [2]:
# Location of the source workbook; kept in one variable so the path only has
# to be changed in a single place.
fileName = "Online Retail.xlsx"
data_raw = pd.read_excel(fileName)


In [3]:
# Work on a copy so the frame loaded into data_raw stays untouched.
data = data_raw.copy()

In [4]:
# Datetime transformation: derive calendar features from the invoice timestamp
# with the vectorized .dt accessor instead of row-wise apply/lambda calls.
invoice_ts = data['InvoiceDate']
data['date'] = invoice_ts.dt.normalize()  # midnight of the invoice day
data['time'] = invoice_ts.dt.time
data['hour'] = invoice_ts.dt.hour.astype('int64')
data['weekend'] = invoice_ts.dt.dayofweek.isin([5, 6])  # 5 = Saturday, 6 = Sunday
data['dayofweek'] = invoice_ts.dt.dayofweek.astype('int64')

# Line-item revenue; negative whenever Quantity or UnitPrice is negative.
data['Revenue'] = data['Quantity'] * data['UnitPrice']

# errors="ignore" keeps the cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
data.drop(columns=["StockCode", "Description", "Country"], inplace=True, errors="ignore")
data.dropna(inplace=True)

In [5]:
# Summarise column dtypes, non-null counts and memory usage of the working frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406829 entries, 0 to 541908
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    406829 non-null  object        
 1   Quantity     406829 non-null  int64         
 2   InvoiceDate  406829 non-null  datetime64[ns]
 3   UnitPrice    406829 non-null  float64       
 4   CustomerID   406829 non-null  float64       
 5   date         406829 non-null  datetime64[ns]
 6   time         406829 non-null  object        
 7   hour         406829 non-null  int64         
 8   weekend      406829 non-null  bool          
 9   dayofweek    406829 non-null  int64         
 10  Revenue      406829 non-null  float64       
dtypes: bool(1), datetime64[ns](2), float64(3), int64(3), object(2)
memory usage: 34.5+ MB


In [6]:
# Preview the first 30 rows (matches the display.max_rows option set in the imports cell).
data.head(30)

Unnamed: 0,InvoiceNo,Quantity,InvoiceDate,UnitPrice,CustomerID,date,time,hour,weekend,dayofweek,Revenue
0,536365,6,2010-12-01 08:26:00,2.55,17850.0,2010-12-01,08:26:00,8,False,2,15.3
1,536365,6,2010-12-01 08:26:00,3.39,17850.0,2010-12-01,08:26:00,8,False,2,20.34
2,536365,8,2010-12-01 08:26:00,2.75,17850.0,2010-12-01,08:26:00,8,False,2,22.0
3,536365,6,2010-12-01 08:26:00,3.39,17850.0,2010-12-01,08:26:00,8,False,2,20.34
4,536365,6,2010-12-01 08:26:00,3.39,17850.0,2010-12-01,08:26:00,8,False,2,20.34
5,536365,2,2010-12-01 08:26:00,7.65,17850.0,2010-12-01,08:26:00,8,False,2,15.3
6,536365,6,2010-12-01 08:26:00,4.25,17850.0,2010-12-01,08:26:00,8,False,2,25.5
7,536366,6,2010-12-01 08:28:00,1.85,17850.0,2010-12-01,08:28:00,8,False,2,11.1
8,536366,6,2010-12-01 08:28:00,1.85,17850.0,2010-12-01,08:28:00,8,False,2,11.1
9,536367,32,2010-12-01 08:34:00,1.69,13047.0,2010-12-01,08:34:00,8,False,2,54.08


In [7]:
def prep_data(data, feature_start, feature_end, target_start, target_end):
    """Build per-customer features from one date window and a revenue target from another.

    Rows of ``data`` whose ``date`` lies in ``[feature_start, feature_end]`` are
    aggregated per ``CustomerID`` into feature columns; rows whose ``date`` lies
    in ``[target_start, target_end]`` are summed into ``target_rev``. Both
    windows are inclusive at both ends.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``CustomerID``, ``InvoiceNo``, ``Quantity``,
        ``Revenue``, ``date``, ``hour``, ``dayofweek`` and ``weekend``.
    feature_start, feature_end, target_start, target_end : str or datetime-like
        Window boundaries; anything ``pd.to_datetime`` accepts
        (e.g. ``'2011-01-01'``).

    Returns
    -------
    pandas.DataFrame
        One row per customer that has at least one row in the feature window,
        indexed by ``CustomerID``, with the columns ``total_rev``, ``recency``,
        ``frequency``, ``t``, ``time_between``, ``avg_basket_value``,
        ``avg_basket_size``, ``returns``, ``purchase_hour_med``,
        ``purchase_dow_med``, ``purchase_weekend_prop`` and ``target_rev``.
        Missing values (no returns, no revenue in the target window) are 0.
    """
    feature_start_ts = pd.to_datetime(feature_start)
    feature_end_ts = pd.to_datetime(feature_end)
    target_start_ts = pd.to_datetime(target_start)
    target_end_ts = pd.to_datetime(target_end)

    print(f'Using data from {(feature_end_ts - feature_start_ts).days} days')
    print(f'To predict {(target_end_ts - target_start_ts).days} days')

    # Series.between is inclusive on both ends, matching `start <= date <= end`.
    feature_data = data.loc[data["date"].between(feature_start_ts, feature_end_ts)]
    target_data = data.loc[data["date"].between(target_start_ts, target_end_ts)]

    # Group once and reuse the grouper for every aggregate below.
    by_customer = feature_data.groupby("CustomerID")
    first_purchase = by_customer["date"].min()
    last_purchase = by_customer["date"].max()

    # total_rev: total revenue brought in by the customer
    total_rev = by_customer["Revenue"].sum().rename("total_rev")
    # recency: days between the customer's first and last purchase
    recency = (last_purchase - first_purchase).dt.days.rename("recency")
    # frequency: number of rows (invoice lines) recorded for the customer
    frequency = by_customer["InvoiceNo"].count().rename("frequency")
    # t: days from the customer's first purchase to the end of the feature window
    t = (feature_end_ts - first_purchase).dt.days.rename("t")
    # time_between: average time between purchases (t spread over frequency)
    time_between = (t / frequency).rename("time_between")
    # avg_basket_value: average revenue per recorded row
    avg_basket_value = (total_rev / frequency).rename("avg_basket_value")
    # avg_basket_size: average quantity per recorded row
    avg_basket_size = (by_customer["Quantity"].sum() / frequency).rename("avg_basket_size")
    # returns: number of rows with negative revenue. Customers without any such
    # row are absent here; they become NaN in the concat below and are set to 0
    # by the final fillna(0).
    returns = (
        feature_data.loc[feature_data["Revenue"] < 0]
        .groupby("CustomerID")["InvoiceNo"]
        .count()
        .rename("returns")
    )
    # hour: median hour of day of the customer's purchases
    hour = by_customer["hour"].median().rename("purchase_hour_med")
    # dow: median day of week of the customer's purchases
    dow = by_customer["dayofweek"].median().rename("purchase_dow_med")
    # weekend: share of the customer's purchases made on a weekend
    weekend = by_customer["weekend"].mean().rename("purchase_weekend_prop")

    final_data = pd.concat(
        [total_rev, recency, frequency, t, time_between, avg_basket_value,
         avg_basket_size, returns, hour, dow, weekend],
        axis=1,
    )

    # Left join: customers with no revenue in the target window get target_rev = 0.
    target_rev = target_data.groupby("CustomerID")["Revenue"].sum().rename("target_rev")
    final_data = final_data.join(target_rev).fillna(0)

    return final_data

In [8]:
def evaluate(model,X,Y):
    """Print the score and mean squared error of ``model`` on ``(X, Y)``.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``score(X, Y)`` and ``predict(X)``.
    X : array-like
        Inputs passed to ``model.score`` and ``model.predict``.
    Y : array-like
        True target values for ``X``.

    Returns
    -------
    None
        The two metrics are printed, not returned.
    """
    # Score the estimator that was passed in, not a global left over from
    # another cell.
    R2_score = model.score(X,Y)
    # mean_squared_error takes (y_true, y_pred) in that order.
    MSE = mean_squared_error(Y, model.predict(X))
    print("R2 score is:", R2_score)
    print("MSE is:", MSE)

In [9]:
def inverse_scaling_target(scaler,arr):
    """Map scaled values of the last column back through ``scaler.inverse_transform``.

    ``arr`` is reshaped to a single column and padded on the left with zero
    columns so the matrix has ``scaler.n_features_in_`` columns; only the last
    column of the inverse-transformed matrix is returned.

    Parameters
    ----------
    scaler : object
        Provides ``n_features_in_`` and ``inverse_transform``.
    arr : numpy.ndarray
        Scaled values belonging to the last column the scaler was fitted on.

    Returns
    -------
    numpy.ndarray
        One-dimensional array of inverse-transformed values.
    """
    target_column = arr.reshape((-1, 1))
    n_rows = len(target_column)
    # Placeholder zeros for every column except the last one.
    filler = np.zeros((n_rows, scaler.n_features_in_ - 1))
    padded = np.concatenate((filler, target_column), axis=1)
    restored = scaler.inverse_transform(padded)
    return restored[:, -1]

In [10]:
# Training set: features from one window, target revenue from the window right after it.
train_data = prep_data(
    data,
    feature_start='2011-01-01',
    feature_end='2011-06-11',
    target_start='2011-06-12',
    target_end='2011-09-09',
)
# Test set: both windows shifted later in the year.
test_data = prep_data(
    data,
    feature_start='2011-04-02',
    feature_end='2011-09-10',
    target_start='2011-09-11',
    target_end='2011-12-09',
)

Using data from 161 days
To predict 89 days
Using data from 161 days
To predict 89 days


In [11]:
# Fit the scaler on the training frame only, then push both frames through the
# same fitted transformation (every column of the frames is scaled).
scaler = RobustScaler().fit(train_data)
scaled_train_data, scaled_test_data = (
    scaler.transform(frame) for frame in (train_data, test_data)
)

# Neural Net Implementation

In [None]:
# Train one MLP per learning rate and collect train/test metrics for each run.
# The last column of the scaled arrays is the target; the rest are features.
learning_rates = [1e-3, 1e-4]
results = {"MSE_train":[] ,"MSE_test":[], "R2_train":[], "R2_test":[], "run_time":[], "n_iterations":[], "learning_rate":[]}
for learning_rate in learning_rates:
    # perf_counter is a monotonic clock meant for measuring elapsed time.
    t1 = time.perf_counter()
    neural_net = MLPRegressor(
        shuffle=True,
        learning_rate_init=learning_rate,
        max_iter=2000,
        hidden_layer_sizes=(128, 128, 128, 128, 128),
        solver="adam",
        tol=9e-6,
        batch_size=512,
        # Fixed seed so weight initialisation and batch shuffling repeat across runs.
        random_state=42,
    ).fit(scaled_train_data[:, :-1], scaled_train_data[:, -1])
    t2 = time.perf_counter()
    run_time = t2 - t1

    train_actual = scaled_train_data[:, -1]
    train_predict = neural_net.predict(scaled_train_data[:, :-1])
    test_actual = scaled_test_data[:, -1]
    test_predict = neural_net.predict(scaled_test_data[:, :-1])

    # Metrics are computed on the scaled target.
    MSE_train = mean_squared_error(train_actual, train_predict)
    MSE_test = mean_squared_error(test_actual, test_predict)
    R2_train = r2_score(train_actual, train_predict)
    R2_test = r2_score(test_actual, test_predict)

    results["MSE_train"].append(MSE_train)
    results["MSE_test"].append(MSE_test)
    results["R2_train"].append(R2_train)
    results["R2_test"].append(R2_test)
    results["run_time"].append(run_time)
    results["n_iterations"].append(neural_net.n_iter_)
    results["learning_rate"].append(learning_rate)
    print("Neural Net trained.")
    
                                       

Neural Net trained.
Neural Net trained.


In [126]:
# Tabulate the metrics currently held in `results`, one row per run.
df = pd.DataFrame(results)
df

Unnamed: 0,MSE_train,MSE_test,R2_train,R2_test,run_time,n_iterations,learning_rate
0,6.398988,36.232613,0.768451,0.547512,1.277357,25,0.001
1,5.828865,29.420755,0.789081,0.632582,2.628742,53,0.0001


In [166]:
# Undo the scaling on the test targets and predictions before scoring them.
# NOTE(review): test_actual / test_predict are whatever the most recently
# executed training loop left behind — confirm which model this cell scores.
inverse_scaled_test_actual, inverse_scaled_test_predict = (
    inverse_scaling_target(scaler, values) for values in (test_actual, test_predict)
)
test_mse_unscaled = mean_squared_error(inverse_scaled_test_actual, inverse_scaled_test_predict)
print("MSE:", test_mse_unscaled)
test_r2_unscaled = r2_score(inverse_scaled_test_actual, inverse_scaled_test_predict)
print("R2:", test_r2_unscaled)

MSE: 7293534.902382953
R2: 0.6325815528713058


# SVM Implementation

In [13]:
# Fit an SVR for every (kernel, C) combination. MSE is reported on the
# inverse-scaled target, R2 on the scaled target.
results = {"MSE_train":[] ,"MSE_test":[], "R2_train":[], "R2_test":[], "run_time":[], "C_val":[], "kernel_val":[]}
C_val = [0.01 ,0.1, 1, 5, 10]
kernel_val = ["linear"]

# Loop-invariant pieces: the feature/target splits and the inverse-scaled
# actuals do not depend on the model, so they are computed once.
X_train, train_actual = scaled_train_data[:, :-1], scaled_train_data[:, -1]
X_test, test_actual = scaled_test_data[:, :-1], scaled_test_data[:, -1]
inverse_scaled_train_actual = inverse_scaling_target(scaler, train_actual)
inverse_scaled_test_actual = inverse_scaling_target(scaler, test_actual)

counter = 0
for kernel in kernel_val:
    for c in C_val:
        # perf_counter is a monotonic clock meant for measuring elapsed time.
        t1 = time.perf_counter()
        regr = SVR(kernel=kernel, C=c, tol=0.0001).fit(X_train, train_actual)
        t2 = time.perf_counter()
        run_time = t2 - t1

        train_predict = regr.predict(X_train)
        test_predict = regr.predict(X_test)

        inverse_scaled_train_predict = inverse_scaling_target(scaler, train_predict)
        inverse_scaled_test_predict = inverse_scaling_target(scaler, test_predict)

        MSE_train = mean_squared_error(inverse_scaled_train_actual, inverse_scaled_train_predict)
        MSE_test = mean_squared_error(inverse_scaled_test_actual, inverse_scaled_test_predict)
        R2_train = r2_score(train_actual, train_predict)
        R2_test = r2_score(test_actual, test_predict)

        results["MSE_train"].append(MSE_train)
        results["MSE_test"].append(MSE_test)
        results["R2_train"].append(R2_train)
        results["R2_test"].append(R2_test)
        results["run_time"].append(run_time)
        results["C_val"].append(c)
        results["kernel_val"].append(kernel)
        counter += 1
        print("Model-{} trained".format(counter))

Model-1 trained
Model-2 trained
Model-3 trained
Model-4 trained
Model-5 trained


In [14]:
# Tabulate the runs stored in `results`, largest test MSE first.
df = pd.DataFrame(results).sort_values("MSE_test", ascending=False)
df

Unnamed: 0,MSE_train,MSE_test,R2_train,R2_test,run_time,C_val,kernel_val
0,1644802.0,8633533.0,0.759917,0.565078,0.340969,0.01,linear
1,1621690.0,8510149.0,0.763291,0.571294,0.796036,0.1,linear
2,1619977.0,8500917.0,0.763541,0.571759,4.209861,1.0,linear
3,1619939.0,8500665.0,0.763546,0.571771,16.729045,5.0,linear
4,1619817.0,8500020.0,0.763564,0.571804,38.677852,10.0,linear


In [None]:
# Display the raw metrics dictionary (contents depend on which training loop ran last).
results

In [23]:
# NOTE(review): bare literal with no visible derivation — presumably a scratch
# value; confirm whether this cell is still needed.
5.948097e+06

5948097.0