In [1]:
# Importing basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Removing warnings at output
import warnings
warnings.filterwarnings('ignore')

In [3]:
file=pd.read_csv('Segmentdata.csv',parse_dates=['InvoiceDate']) # Parsing 'InvoiceDate' as date format
file['UnitPrice'] = file['UnitPrice'].apply(np.int64) # Changing 'UnitPrice' to 'int' data type
file.head() # Checking 1st 5 rows of the segmented data

Unnamed: 0,InvoiceDate,Invoice Time,CustomerID,InvoiceNo,StockCode,Description,Country,Quantity,UnitPrice,Revenue,Items availability,revenue_buckets,price_buckets,final_revenue
0,2017-12-14,6:00,AVpgMuGwLJeJML43KY_c,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,United Kingdom,6,2,15.3,In Stock,Very Good,Medium,20
1,2017-08-09,5:00,AVpgMuGwLJeJML43KY_c,536365,71053,WHITE METAL LANTERN,United Kingdom,6,3,20.34,In Stock,Excellent,High,30
2,2017-10-10,5:00,AVpgMuGwLJeJML43KY_c,536365,84406B,CREAM CUPID HEARTS COAT HANGER,United Kingdom,8,2,22.0,In Stock,Excellent,High,30
3,2017-08-28,7:00,AVpgMuGwLJeJML43KY_c,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,United Kingdom,6,3,20.34,In Stock,Excellent,High,30
4,2017-10-24,4:00,AVpgMuGwLJeJML43KY_c,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,United Kingdom,6,3,20.34,In Stock,Excellent,High,30


In [4]:
# Calculating unique items and countries in segmented data
print("number of unique items :",file.Description.nunique())
print("number of unique country:",file.Country.nunique())

number of unique items : 2439
number of unique country: 5


In [5]:
# Assigning numerical values for 'Country' column
Labels = {'United Kingdom': 1,'Greece': 2,'Malta':3,'Canada':4,'United Arab Emirates':5} 

file.Country = [Labels[item] for item in file.Country] 


In [6]:
# Filtering rows in file with Top10 'Description' values and assigning it to new variable
f1=file.loc[file['Description'].isin(['REGENCY CAKESTAND 3 TIER','WHITE HANGING HEART T-LIGHT HOLDER','PARTY BUNTING','ASSORTED COLOUR BIRD ORNAMENT',
   'JUMBO BAG RED RETROSPOT','SMALL POPCORN HOLDER','PLEASE ONE PERSON METAL SIGN','WOODEN PICTURE FRAME WHITE FINISH',
   'LUNCH BAG  BLACK SKULL.','REX CASH+CARRY JUMBO SHOPPER'])]

# Assigning numerical values for Top10 'Description'
Mask = {'REGENCY CAKESTAND 3 TIER':1,'WHITE HANGING HEART T-LIGHT HOLDER':2,'PARTY BUNTING':3,
       'ASSORTED COLOUR BIRD ORNAMENT':4,'JUMBO BAG RED RETROSPOT':5,'SMALL POPCORN HOLDER':6,
       'PLEASE ONE PERSON METAL SIGN':7,'WOODEN PICTURE FRAME WHITE FINISH':8,'LUNCH BAG  BLACK SKULL.':9,
       'REX CASH+CARRY JUMBO SHOPPER':10}

f1.Description = [Mask[item] for item in f1.Description] 


In [7]:
# Assigning numerical values to 'price_buckets'
M1= {'Critical':1,'Low':2,'Medium':3,'High':4,'Very High':5}
f1.price_buckets = [M1[item] for item in f1.price_buckets] 



In [8]:
# Assigning numerical values to 'revenue_buckets'
M2= {'Very Low':1,'Low':2,'Good':3,'Very Good':4,'Excellent':5}
f1.revenue_buckets = [M2[item] for item in f1.revenue_buckets] 


In [9]:
f1.head() # Checking the new Top10 'Description' wise filtered data

Unnamed: 0,InvoiceDate,Invoice Time,CustomerID,InvoiceNo,StockCode,Description,Country,Quantity,UnitPrice,Revenue,Items availability,revenue_buckets,price_buckets,final_revenue
0,2017-12-14,6:00,AVpgMuGwLJeJML43KY_c,536365,85123A,2,1,6,2,15.3,In Stock,4,3,20
11,2018-05-26,16:00,AVpe9FXeLJeJML43zHrq,536373,85123A,2,1,6,2,15.3,In Stock,4,3,20
21,2018-05-26,7:00,AVpi6aLv1cnluZ0-Rv8A,536373,82482,8,1,6,2,12.6,In Stock,3,3,20
27,2017-09-28,4:00,AV2Z1Efc-jtxr-f39lm6,536375,85123A,2,1,6,2,15.3,In Stock,4,3,20
37,2018-05-26,15:00,AWACBprXKZqtpbFMVBZo,536375,82482,8,1,6,2,12.6,In Stock,3,3,20


In [10]:
np.random.seed(0) # Making random function to generate pseudo random numbers
msk = np.random.rand(len(f1)) < 0.7  # Random splitting for dataset with 7:3 
train = f1[msk]  # 70% data
test = f1[~msk]  # 30% data
train.shape, test.shape  # Displaying their rows and columns count

((612, 14), (240, 14))

In [11]:
# Creating a new columns for 'InvoiceDate' to make easier calculations
train['dayofmonth'] = train.InvoiceDate.dt.day
train['dayofyear'] = train.InvoiceDate.dt.dayofyear
train['dayofweek'] = train.InvoiceDate.dt.dayofweek
train['month'] = train.InvoiceDate.dt.month
train['year'] = train.InvoiceDate.dt.year
train['weekofyear'] = train.InvoiceDate.dt.weekofyear
train.head() # Checking new train columns

Unnamed: 0,InvoiceDate,Invoice Time,CustomerID,InvoiceNo,StockCode,Description,Country,Quantity,UnitPrice,Revenue,Items availability,revenue_buckets,price_buckets,final_revenue,dayofmonth,dayofyear,dayofweek,month,year,weekofyear
0,2017-12-14,6:00,AVpgMuGwLJeJML43KY_c,536365,85123A,2,1,6,2,15.3,In Stock,4,3,20,14,348,3,12,2017,50
21,2018-05-26,7:00,AVpi6aLv1cnluZ0-Rv8A,536373,82482,8,1,6,2,12.6,In Stock,3,3,20,26,146,5,5,2018,21
27,2017-09-28,4:00,AV2Z1Efc-jtxr-f39lm6,536375,85123A,2,1,6,2,15.3,In Stock,4,3,20,28,271,3,9,2017,39
37,2018-05-26,15:00,AWACBprXKZqtpbFMVBZo,536375,82482,8,1,6,2,12.6,In Stock,3,3,20,26,146,5,5,2018,21
68,2017-06-12,3:00,AVpfLsb-ilAPnD_xWtDE,536390,85123A,2,1,64,2,163.2,In Stock,5,3,170,12,163,0,6,2017,24


In [12]:
# Creating a new columns for 'InvoiceDate' to make easier calculations
test['dayofmonth'] = test.InvoiceDate.dt.day
test['dayofyear'] = test.InvoiceDate.dt.dayofyear
test['dayofweek'] = test.InvoiceDate.dt.dayofweek
test['month'] = test.InvoiceDate.dt.month
test['year'] = test.InvoiceDate.dt.year
test['weekofyear'] = test.InvoiceDate.dt.weekofyear
test.head() # Checking new test columns


Unnamed: 0,InvoiceDate,Invoice Time,CustomerID,InvoiceNo,StockCode,Description,Country,Quantity,UnitPrice,Revenue,Items availability,revenue_buckets,price_buckets,final_revenue,dayofmonth,dayofyear,dayofweek,month,year,weekofyear
11,2018-05-26,16:00,AVpe9FXeLJeJML43zHrq,536373,85123A,2,1,6,2,15.3,In Stock,4,3,20,26,146,5,5,2018,21
82,2017-03-30,10:00,AVpf3txeLJeJML43FN82,536390,85099B,5,1,100,1,165.0,In Stock,5,2,170,30,89,3,3,2017,13
83,2018-04-24,15:00,AWIm0C3TYSSHbkXwx3S6,536396,85123A,2,1,6,2,15.3,In Stock,4,3,20,24,114,1,4,2018,17
105,2018-04-26,17:00,AV13iAUYGV-KLJ3aka9M,536406,85123A,2,1,8,2,20.4,In Stock,5,3,30,26,116,3,4,2018,17
174,2018-05-03,10:00,AVphBFxt1cnluZ0-9PuF,536409,85099B,5,1,2,1,3.9,In Stock,2,3,10,3,123,3,5,2018,18


In [13]:
# Factorising numerical columns for train (an alternative for get_dummy)
train['Country']=pd.factorize(train['Country'])[0]
train['Description']=pd.factorize(train['Description'])[0]
train['final_revenue']=pd.factorize(train['final_revenue'])[0]
train['price_buckets']=pd.factorize(train['price_buckets'])[0]
train['revenue_buckets']=pd.factorize(train['revenue_buckets'])[0]
train['dayofmonth']=pd.factorize(train['dayofmonth'])[0]
train['dayofyear']=pd.factorize(train['dayofyear'])[0]
train['dayofweek']=pd.factorize(train['dayofweek'])[0]
train['month']=pd.factorize(train['month'])[0]
train['year']=pd.factorize(train['year'])[0]
train['weekofyear']=pd.factorize(train['weekofyear'])[0]
train['Quantity']=pd.factorize(train['Quantity'])[0]

# Dropping categorical columns
train.drop(['InvoiceDate','Invoice Time','CustomerID','InvoiceNo','StockCode','Items availability','Revenue'],axis=1,inplace=True)
train.shape # Checking rows and columns count

(612, 13)

In [14]:
train.head()

Unnamed: 0,Description,Country,Quantity,UnitPrice,revenue_buckets,price_buckets,final_revenue,dayofmonth,dayofyear,dayofweek,month,year,weekofyear
0,0,0,0,2,0,0,0,0,0,0,0,0,0
21,1,0,0,2,1,0,0,1,1,1,1,1,1
27,0,0,0,2,0,0,0,2,2,0,2,0,2
37,1,0,0,2,1,0,0,1,1,1,1,1,1
68,0,0,1,2,2,0,1,3,3,2,3,0,3


In [15]:
# Factorising numerical columns for test (an alternative for get_dummy)
test['Country']=pd.factorize(test['Country'])[0]
test['Description']=pd.factorize(test['Description'])[0]
test['final_revenue']=pd.factorize(test['final_revenue'])[0]
test['price_buckets']=pd.factorize(test['price_buckets'])[0]
test['revenue_buckets']=pd.factorize(test['revenue_buckets'])[0]
test['dayofmonth']=pd.factorize(test['dayofmonth'])[0]
test['dayofyear']=pd.factorize(test['dayofyear'])[0]
test['dayofweek']=pd.factorize(test['dayofweek'])[0]
test['month']=pd.factorize(test['month'])[0]
test['year']=pd.factorize(test['year'])[0]
test['weekofyear']=pd.factorize(test['weekofyear'])[0]
test['Quantity']=pd.factorize(test['Quantity'])[0]

# Dropping categorical columns
test.drop(['InvoiceDate','Invoice Time','CustomerID','InvoiceNo','StockCode','Items availability','Revenue'],axis=1,inplace=True)
test.shape # Checking rows and columns count

(240, 13)

In [16]:
X_train = train.drop('UnitPrice', axis=1).values  # Drop the dependent variable
X_test = test.drop('UnitPrice', axis=1).values    # Drop the dependent variable
y_train = train['UnitPrice'].values               # Find the dependent variable
y_test = test['UnitPrice'].values                 # Find the dependent variable

In [17]:
from sklearn.neighbors import KNeighborsClassifier # Importing predictive model function
from sklearn.metrics import mean_squared_error     # MSE for model accuracy

KNN_model= KNeighborsClassifier(n_neighbors=5)     # Setting model parameters
KNN_model.fit(X_train,y_train)


# predict
y_pred = KNN_model.predict(X_test)                  # Predict the test data
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)  # RMSE for accuracy

The rmse of prediction is: 3.598031869416019


In [18]:
# Array format:Description,Country,Quantity,revenue_buckets,price_buckets,final_revenue,dayofmonth,dayofyear,dayofweek,month,year,weekofyear
X_prediction=np.array([[1,1,5,5,4,20,8,9,2,3,2018,12]]) # new data
predictions = KNN_model.predict(X_prediction) # predictive function
predictions # display optimised price

array([2], dtype=int64)