In [1]:
# Importing basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Removing warnings at output
import warnings
warnings.filterwarnings('ignore')

In [3]:
file=pd.read_csv('Segmentdata.csv',parse_dates=['InvoiceDate']) # Parsing 'InvoiceDate' as date format
file['UnitPrice'] = file['UnitPrice'].apply(np.int64) # Changing 'UnitPrice' to 'int' data type
file.head() # Checking 1st 5 rows of the segmented data

Unnamed: 0,InvoiceDate,Invoice Time,CustomerID,InvoiceNo,StockCode,Description,Country,Quantity,UnitPrice,Revenue,Items availability,revenue_buckets,price_buckets,final_revenue
0,2017-12-14,6:00,AVpgMuGwLJeJML43KY_c,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,United Kingdom,6,2,15.3,In Stock,Very Good,Medium,20
1,2017-08-09,5:00,AVpgMuGwLJeJML43KY_c,536365,71053,WHITE METAL LANTERN,United Kingdom,6,3,20.34,In Stock,Excellent,High,30
2,2017-10-10,5:00,AVpgMuGwLJeJML43KY_c,536365,84406B,CREAM CUPID HEARTS COAT HANGER,United Kingdom,8,2,22.0,In Stock,Excellent,High,30
3,2017-08-28,7:00,AVpgMuGwLJeJML43KY_c,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,United Kingdom,6,3,20.34,In Stock,Excellent,High,30
4,2017-10-24,4:00,AVpgMuGwLJeJML43KY_c,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,United Kingdom,6,3,20.34,In Stock,Excellent,High,30


In [4]:
# Calculating unique items and countries in segmented data
print("number of unique items :",file.Description.nunique()) 
print("number of unique country:",file.Country.nunique())

number of unique items : 2439
number of unique country: 5


In [5]:
# Assigning numerical values for 'Country' column
Labels = {'United Kingdom': 1,'Greece': 2,'Malta':3,'Canada':4,'United Arab Emirates':5} 

file.Country = [Labels[item] for item in file.Country] 

In [6]:
# Filtering rows in file with Top10 'Description' values and assigning it to new variable
f1=file.loc[file['Description'].isin(['REGENCY CAKESTAND 3 TIER','WHITE HANGING HEART T-LIGHT HOLDER','PARTY BUNTING','ASSORTED COLOUR BIRD ORNAMENT',
   'JUMBO BAG RED RETROSPOT','SMALL POPCORN HOLDER','PLEASE ONE PERSON METAL SIGN','WOODEN PICTURE FRAME WHITE FINISH',
   'LUNCH BAG  BLACK SKULL.','REX CASH+CARRY JUMBO SHOPPER'])]

# Assigning numerical values for Top10 'Description'
Mask = {'REGENCY CAKESTAND 3 TIER':1,'WHITE HANGING HEART T-LIGHT HOLDER':2,'PARTY BUNTING':3,
       'ASSORTED COLOUR BIRD ORNAMENT':4,'JUMBO BAG RED RETROSPOT':5,'SMALL POPCORN HOLDER':6,
       'PLEASE ONE PERSON METAL SIGN':7,'WOODEN PICTURE FRAME WHITE FINISH':8,'LUNCH BAG  BLACK SKULL.':9,
       'REX CASH+CARRY JUMBO SHOPPER':10}

f1.Description = [Mask[item] for item in f1.Description]

In [7]:
# Assigning numerical values to 'price_buckets'
M1= {'Critical':1,'Low':2,'Medium':3,'High':4,'Very High':5}
f1.price_buckets = [M1[item] for item in f1.price_buckets] 


In [8]:
# Assigning numerical values to 'revenue_buckets'
M2= {'Very Low':1,'Low':2,'Good':3,'Very Good':4,'Excellent':5}
f1.revenue_buckets = [M2[item] for item in f1.revenue_buckets] 

In [9]:
f1.head() # Checking the new Top10 'Description' wise filtered data

Unnamed: 0,InvoiceDate,Invoice Time,CustomerID,InvoiceNo,StockCode,Description,Country,Quantity,UnitPrice,Revenue,Items availability,revenue_buckets,price_buckets,final_revenue
0,2017-12-14,6:00,AVpgMuGwLJeJML43KY_c,536365,85123A,2,1,6,2,15.3,In Stock,4,3,20
11,2018-05-26,16:00,AVpe9FXeLJeJML43zHrq,536373,85123A,2,1,6,2,15.3,In Stock,4,3,20
21,2018-05-26,7:00,AVpi6aLv1cnluZ0-Rv8A,536373,82482,8,1,6,2,12.6,In Stock,3,3,20
27,2017-09-28,4:00,AV2Z1Efc-jtxr-f39lm6,536375,85123A,2,1,6,2,15.3,In Stock,4,3,20
37,2018-05-26,15:00,AWACBprXKZqtpbFMVBZo,536375,82482,8,1,6,2,12.6,In Stock,3,3,20


In [10]:
np.random.seed(0) # Making random function to generate pseudo random numbers
msk = np.random.rand(len(f1)) < 0.7 # Random splitting for dataset with 7:3 
train = f1[msk] # 70% data
test = f1[~msk] # 30% data
train.shape, test.shape # Displaying their rows and columns count

((612, 14), (240, 14))

In [11]:
# Creating a new columns for 'InvoiceDate' to make easier calculations
train['dayofmonth'] = train.InvoiceDate.dt.day
train['dayofyear'] = train.InvoiceDate.dt.dayofyear
train['dayofweek'] = train.InvoiceDate.dt.dayofweek
train['month'] = train.InvoiceDate.dt.month
train['year'] = train.InvoiceDate.dt.year
train['weekofyear'] = train.InvoiceDate.dt.weekofyear
train.head() # Checking new train columns

Unnamed: 0,InvoiceDate,Invoice Time,CustomerID,InvoiceNo,StockCode,Description,Country,Quantity,UnitPrice,Revenue,Items availability,revenue_buckets,price_buckets,final_revenue,dayofmonth,dayofyear,dayofweek,month,year,weekofyear
0,2017-12-14,6:00,AVpgMuGwLJeJML43KY_c,536365,85123A,2,1,6,2,15.3,In Stock,4,3,20,14,348,3,12,2017,50
21,2018-05-26,7:00,AVpi6aLv1cnluZ0-Rv8A,536373,82482,8,1,6,2,12.6,In Stock,3,3,20,26,146,5,5,2018,21
27,2017-09-28,4:00,AV2Z1Efc-jtxr-f39lm6,536375,85123A,2,1,6,2,15.3,In Stock,4,3,20,28,271,3,9,2017,39
37,2018-05-26,15:00,AWACBprXKZqtpbFMVBZo,536375,82482,8,1,6,2,12.6,In Stock,3,3,20,26,146,5,5,2018,21
68,2017-06-12,3:00,AVpfLsb-ilAPnD_xWtDE,536390,85123A,2,1,64,2,163.2,In Stock,5,3,170,12,163,0,6,2017,24


In [12]:
# Creating a new columns for 'InvoiceDate' to make easier calculations
test['dayofmonth'] = test.InvoiceDate.dt.day
test['dayofyear'] = test.InvoiceDate.dt.dayofyear
test['dayofweek'] = test.InvoiceDate.dt.dayofweek
test['month'] = test.InvoiceDate.dt.month
test['year'] = test.InvoiceDate.dt.year
test['weekofyear'] = test.InvoiceDate.dt.weekofyear
test.head() # Checking new test columns


Unnamed: 0,InvoiceDate,Invoice Time,CustomerID,InvoiceNo,StockCode,Description,Country,Quantity,UnitPrice,Revenue,Items availability,revenue_buckets,price_buckets,final_revenue,dayofmonth,dayofyear,dayofweek,month,year,weekofyear
11,2018-05-26,16:00,AVpe9FXeLJeJML43zHrq,536373,85123A,2,1,6,2,15.3,In Stock,4,3,20,26,146,5,5,2018,21
82,2017-03-30,10:00,AVpf3txeLJeJML43FN82,536390,85099B,5,1,100,1,165.0,In Stock,5,2,170,30,89,3,3,2017,13
83,2018-04-24,15:00,AWIm0C3TYSSHbkXwx3S6,536396,85123A,2,1,6,2,15.3,In Stock,4,3,20,24,114,1,4,2018,17
105,2018-04-26,17:00,AV13iAUYGV-KLJ3aka9M,536406,85123A,2,1,8,2,20.4,In Stock,5,3,30,26,116,3,4,2018,17
174,2018-05-03,10:00,AVphBFxt1cnluZ0-9PuF,536409,85099B,5,1,2,1,3.9,In Stock,2,3,10,3,123,3,5,2018,18


In [13]:
# Factorising numerical columns for train (an alternative for get_dummy)
train['Country']=pd.factorize(train['Country'])[0]
train['Description']=pd.factorize(train['Description'])[0]
train['final_revenue']=pd.factorize(train['final_revenue'])[0]
train['price_buckets']=pd.factorize(train['price_buckets'])[0]
train['revenue_buckets']=pd.factorize(train['revenue_buckets'])[0]
train['dayofmonth']=pd.factorize(train['dayofmonth'])[0]
train['dayofyear']=pd.factorize(train['dayofyear'])[0]
train['dayofweek']=pd.factorize(train['dayofweek'])[0]
train['month']=pd.factorize(train['month'])[0]
train['year']=pd.factorize(train['year'])[0]
train['weekofyear']=pd.factorize(train['weekofyear'])[0]
train['UnitPrice']=pd.factorize(train['UnitPrice'])[0]

# Dropping categorical columns
train.drop(['InvoiceDate','Invoice Time','CustomerID','InvoiceNo','StockCode','Items availability','Revenue'],axis=1,inplace=True)
train.shape # Checking rows and columns count

(612, 13)

In [14]:
# Factorising numerical columns for test (an alternative for get_dummy)
test['Country']=pd.factorize(test['Country'])[0]
test['Description']=pd.factorize(test['Description'])[0]
test['final_revenue']=pd.factorize(test['final_revenue'])[0]
test['price_buckets']=pd.factorize(test['price_buckets'])[0]
test['revenue_buckets']=pd.factorize(test['revenue_buckets'])[0]
test['dayofmonth']=pd.factorize(test['dayofmonth'])[0]
test['dayofyear']=pd.factorize(test['dayofyear'])[0]
test['dayofweek']=pd.factorize(test['dayofweek'])[0]
test['month']=pd.factorize(test['month'])[0]
test['year']=pd.factorize(test['year'])[0]
test['weekofyear']=pd.factorize(test['weekofyear'])[0]
test['UnitPrice']=pd.factorize(test['UnitPrice'])[0]

# Dropping categorical columns
test.drop(['InvoiceDate','Invoice Time','CustomerID','InvoiceNo','StockCode','Items availability','Revenue'],axis=1,inplace=True)
test.shape # Checking rows and columns count

(240, 13)

In [15]:
X_train = train.drop('Quantity', axis=1).values # Drop the dependent variable
X_test = test.drop('Quantity', axis=1).values   # Drop the dependent variable
y_train = train['Quantity'].values              # Find the dependent variable
y_test = test['Quantity'].values                # Find the dependent variable

In [16]:
import lightgbm as lgb # Using LightGBM as predictive model 
from sklearn.metrics import mean_squared_error # MSE for accuracy

lgb_train = lgb.Dataset(X_train, y_train) # Sending train date
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) # Sending test data

params = {'task':'train', 'boosting_type':'gbdt', 'objective':'regression', 
              'metric': {'rmse'}, 'num_leaves': 10, 'learning_rate': 0.05, 
              'feature_fraction': 0.8, 'max_depth': 5, 'verbose': 0, 
              'num_boost_round':10000, 'early_stopping_rounds':1000, 'nthread':-1} # setting model parameters

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval,
                early_stopping_rounds=5) # sending values to models

# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) # predict using test
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) # RMSE for accuracy

[1]	valid_0's rmse: 34.8688
Training until validation scores don't improve for 1000 rounds
[2]	valid_0's rmse: 34.2974
[3]	valid_0's rmse: 33.7919
[4]	valid_0's rmse: 33.2196
[5]	valid_0's rmse: 32.6775
[6]	valid_0's rmse: 32.1938
[7]	valid_0's rmse: 31.7624
[8]	valid_0's rmse: 31.4211
[9]	valid_0's rmse: 31.0694
[10]	valid_0's rmse: 30.7095
[11]	valid_0's rmse: 30.3958
[12]	valid_0's rmse: 30.1596
[13]	valid_0's rmse: 29.9509
[14]	valid_0's rmse: 29.7456
[15]	valid_0's rmse: 29.5453
[16]	valid_0's rmse: 29.4206
[17]	valid_0's rmse: 29.3233
[18]	valid_0's rmse: 29.1645
[19]	valid_0's rmse: 29.1164
[20]	valid_0's rmse: 29.0409
[21]	valid_0's rmse: 28.993
[22]	valid_0's rmse: 29.0004
[23]	valid_0's rmse: 28.8259
[24]	valid_0's rmse: 28.7577
[25]	valid_0's rmse: 28.7974
[26]	valid_0's rmse: 28.6606
[27]	valid_0's rmse: 28.6032
[28]	valid_0's rmse: 28.5429
[29]	valid_0's rmse: 28.4419
[30]	valid_0's rmse: 28.3574
[31]	valid_0's rmse: 28.2333
[32]	valid_0's rmse: 28.3236
[33]	valid_0's rmse

[468]	valid_0's rmse: 39.0044
[469]	valid_0's rmse: 38.9987
[470]	valid_0's rmse: 38.9886
[471]	valid_0's rmse: 39.0264
[472]	valid_0's rmse: 39.0089
[473]	valid_0's rmse: 38.9419
[474]	valid_0's rmse: 38.9536
[475]	valid_0's rmse: 38.9901
[476]	valid_0's rmse: 38.973
[477]	valid_0's rmse: 38.9502
[478]	valid_0's rmse: 38.9297
[479]	valid_0's rmse: 38.9174
[480]	valid_0's rmse: 38.8965
[481]	valid_0's rmse: 38.9158
[482]	valid_0's rmse: 38.9355
[483]	valid_0's rmse: 38.9633
[484]	valid_0's rmse: 38.9566
[485]	valid_0's rmse: 38.9478
[486]	valid_0's rmse: 38.967
[487]	valid_0's rmse: 38.9655
[488]	valid_0's rmse: 38.9554
[489]	valid_0's rmse: 38.9358
[490]	valid_0's rmse: 38.9682
[491]	valid_0's rmse: 38.9501
[492]	valid_0's rmse: 38.9346
[493]	valid_0's rmse: 38.915
[494]	valid_0's rmse: 38.9028
[495]	valid_0's rmse: 38.8717
[496]	valid_0's rmse: 38.8025
[497]	valid_0's rmse: 38.7862
[498]	valid_0's rmse: 38.7202
[499]	valid_0's rmse: 38.7396
[500]	valid_0's rmse: 38.7518
[501]	valid_0

[956]	valid_0's rmse: 37.6177
[957]	valid_0's rmse: 37.6385
[958]	valid_0's rmse: 37.6283
[959]	valid_0's rmse: 37.6355
[960]	valid_0's rmse: 37.6252
[961]	valid_0's rmse: 37.6277
[962]	valid_0's rmse: 37.627
[963]	valid_0's rmse: 37.6178
[964]	valid_0's rmse: 37.6135
[965]	valid_0's rmse: 37.6039
[966]	valid_0's rmse: 37.602
[967]	valid_0's rmse: 37.6184
[968]	valid_0's rmse: 37.6367
[969]	valid_0's rmse: 37.6275
[970]	valid_0's rmse: 37.6455
[971]	valid_0's rmse: 37.6456
[972]	valid_0's rmse: 37.6327
[973]	valid_0's rmse: 37.6428
[974]	valid_0's rmse: 37.6473
[975]	valid_0's rmse: 37.6384
[976]	valid_0's rmse: 37.6287
[977]	valid_0's rmse: 37.6232
[978]	valid_0's rmse: 37.6246
[979]	valid_0's rmse: 37.6286
[980]	valid_0's rmse: 37.6312
[981]	valid_0's rmse: 37.6378
[982]	valid_0's rmse: 37.6251
[983]	valid_0's rmse: 37.6186
[984]	valid_0's rmse: 37.6164
[985]	valid_0's rmse: 37.6114
[986]	valid_0's rmse: 37.6059
[987]	valid_0's rmse: 37.6062
[988]	valid_0's rmse: 37.5981
[989]	valid_

In [17]:
# Array:Description,Country,UnitPrice,revenue_buckets,price_buckets,final_revenue,dayofmonth,dayofyear,dayofweek,month,year,weekofyear
X_prediction=np.array([[1,2,3,4,5,16,7,8,9,11,2018,30]]) # new values
predictions = gbm.predict(X_prediction) # predictive function
predictions # display 'Quantity'value 

array([113.12487383])