In [1]:
# Importing basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm   # OSM module
import statistics as sts       # Basic statistics

In [2]:
# Read the data set using pandas
file=pd.read_csv('Segmentdata.csv')
# Parsing 'InvoiceDate' as date data type
file['InvoiceDate'] = pd.to_datetime(file['InvoiceDate']) .apply(lambda x: x.strftime('%d/%m/%Y')if not pd.isnull(x) else '')
# Parsing 'InvoiceTime' as time data type
file['Invoice Time'] = pd.to_datetime(file['Invoice Time']).apply(lambda x: x.strftime('%H:%M:%S')if not pd.isnull(x) else '')
file.dropna(inplace=True)      # Drop null values
file.reset_index(inplace=True) # Introduce index col
file.head()                    # Check the data

Unnamed: 0,index,InvoiceDate,Invoice Time,CustomerID,InvoiceNo,StockCode,Description,Country,Quantity,UnitPrice,Revenue,Items availability,revenue_buckets,price_buckets,final_revenue
0,0,14/12/2017,06:00:00,AVpgMuGwLJeJML43KY_c,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,United Kingdom,6,2.55,15.3,In Stock,Very Good,Medium,20
1,1,09/08/2017,05:00:00,AVpgMuGwLJeJML43KY_c,536365,71053,WHITE METAL LANTERN,United Kingdom,6,3.39,20.34,In Stock,Excellent,High,30
2,2,10/10/2017,05:00:00,AVpgMuGwLJeJML43KY_c,536365,84406B,CREAM CUPID HEARTS COAT HANGER,United Kingdom,8,2.75,22.0,In Stock,Excellent,High,30
3,3,28/08/2017,07:00:00,AVpgMuGwLJeJML43KY_c,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,United Kingdom,6,3.39,20.34,In Stock,Excellent,High,30
4,4,24/10/2017,04:00:00,AVpgMuGwLJeJML43KY_c,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,United Kingdom,6,3.39,20.34,In Stock,Excellent,High,30


In [3]:
# int and float variables range
file.describe()

Unnamed: 0,index,InvoiceNo,Quantity,UnitPrice,Revenue,final_revenue
count,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0
mean,9999.0,545115.134307,10.80124,2.985579,20.080151,25.577279
std,5773.358353,5492.606894,46.768777,4.137969,88.068819,87.944806
min,0.0,536365.0,1.0,0.1,0.12,10.0
25%,4999.5,540520.0,1.0,1.25,3.3,10.0
50%,9999.0,544836.0,4.0,1.95,8.95,10.0
75%,14998.5,549947.0,10.0,3.75,17.4,20.0
max,19998.0,554656.0,3906.0,195.0,3828.0,3830.0


In [4]:
# Data frame variables data types
file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index               19999 non-null  int64  
 1   InvoiceDate         19999 non-null  object 
 2   Invoice Time        19999 non-null  object 
 3   CustomerID          19999 non-null  object 
 4   InvoiceNo           19999 non-null  int64  
 5   StockCode           19999 non-null  object 
 6   Description         19999 non-null  object 
 7   Country             19999 non-null  object 
 8   Quantity            19999 non-null  int64  
 9   UnitPrice           19999 non-null  float64
 10  Revenue             19999 non-null  float64
 11  Items availability  19999 non-null  object 
 12  revenue_buckets     19999 non-null  object 
 13  price_buckets       19999 non-null  object 
 14  final_revenue       19999 non-null  int64  
dtypes: float64(2), int64(4), object(9)
memory usage: 2.3+

In [5]:
# Plotting distribution of 'final_revenue','UnitPrice' and 'Quantity'
%matplotlib notebook
h=['final_revenue','UnitPrice','Quantity']
cond=file[h]
sns.set()
sns.boxplot(data=cond,orient='h')
plt.xlabel('Range')
plt.ylabel('Attributes')
plt.title('Quantitative attributes range')
plt.show()

<IPython.core.display.Javascript object>

In [6]:
# Data frame in quartiles
q1=file.quantile(0.25)
q3=file.quantile(0.75)
iqr=q3-q1 
iqr # Inter-quartile range

index            9999.0
InvoiceNo        9427.0
Quantity            9.0
UnitPrice           2.5
Revenue            14.1
final_revenue      10.0
dtype: float64

In [7]:
file_t=file.drop('InvoiceDate',axis=1)   # Drop date data type
file_t=file.drop('index',axis=1)         # Drop index column
file_t=file.drop('Invoice Time',axis=1)  # Drop time data type

file_out = file_t[~((file_t< (q1 - 1.5 * iqr)) |(file_t> (q3 + 1.5 * iqr))).any(axis=1)] # outlier removal

print ("When we have not removed any outliers from the dataset, we have " + str(len(file_t)) + " entries") 
print ("When we have removed outliers from the dataset, we have " + str(len(file_out)) + " entries")
print ("The proportion of outliers which exist when compared to the dataframe are: " + str(len(file_out)/len(file_t)))

When we have not removed any outliers from the dataset, we have 19999 entries
When we have removed outliers from the dataset, we have 15483 entries
The proportion of outliers which exist when compared to the dataframe are: 0.7741887094354718


In [8]:
# 33% of original data was removed when outliers are considered

In [9]:
rollingdf = file[h].rolling(30).std()  # 30-day rolling standard deviation
rollingdf.dropna(inplace=True)         # Drop null values

In [10]:
%matplotlib notebook
#h=['final_revenue','UnitPrice','Quantity']
cor=rollingdf[h].corr()   # correlation for selected columns
sns.set()
sns.heatmap(cor,cmap='RdYlGn',annot=True)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1700b6e2c70>

In [11]:
%matplotlib notebook

h1=['UnitPrice','Quantity']
h2=['final_revenue']
X = rollingdf[h1]       # Independent variables
X = sm.add_constant(X)  # Intercept added by user
y = rollingdf[h2]       # Dependent variable
model = sm.OLS(y, X)    # OLS model for predictive analysis 
train = model.fit()     # Fit the model
train.summary()         # Display statistics

0,1,2,3
Dep. Variable:,final_revenue,R-squared:,0.438
Model:,OLS,Adj. R-squared:,0.438
Method:,Least Squares,F-statistic:,7789.0
Date:,"Sun, 16 Aug 2020",Prob (F-statistic):,0.0
Time:,19:17:32,Log-Likelihood:,-108870.0
No. Observations:,19970,AIC:,217700.0
Df Residuals:,19967,BIC:,217800.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.0695,0.579,3.572,0.000,0.934,3.205
UnitPrice,1.9344,0.140,13.809,0.000,1.660,2.209
Quantity,1.1815,0.010,122.397,0.000,1.163,1.200

0,1,2,3
Omnibus:,23286.822,Durbin-Watson:,0.036
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4368262.708
Skew:,5.966,Prob(JB):,0.0
Kurtosis:,74.466,Cond. No.,65.7


In [12]:
%matplotlib notebook
ypredict = train.predict(X)                      # Predict the dependent values using trained data
plt.plot(ypredict, label='Predicted Revenue')
plt.plot(y, label='Actual Revenue')
plt.title('Regression analysis for revenue')
plt.legend()
plt.show()

<IPython.core.display.Javascript object>