# Import Libraries and packages

In [3]:
%matplotlib inline
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(
    { "figure.figsize": (8, 5) },
    style='ticks',
    color_codes=True,
    font_scale=1
)
%config InlineBackend.figure_format = 'retina'

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

!pip install category_encoders -q

# ML utilities
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn import metrics

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

# Encoding and metrics
from category_encoders import TargetEncoder
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Loading the dataset

In [7]:
# Build the CSV location from the current user's home directory instead of a
# hard-coded absolute path, so the notebook runs on machines other than the
# author's. Change DATA_PATH here if the file is stored elsewhere.
DATA_PATH = Path.home() / "Downloads" / "Dataset_ecommerce.csv"
ecom_data = pd.read_csv(DATA_PATH)

# Initial inspection of the dataset

In [10]:
# Preview the first five rows of the raw frame
ecom_data.head(n=5)

Unnamed: 0,InvoiceNo,InvoiceDate,CustomerID,StockCode,Description,Quantity,UnitPrice,Country
0,536365,2010-12-01 08:26:00,17850.0,SC1734,Electronics,65,10.23,Egypt
1,536365,2010-12-01 08:26:00,17850.0,SC2088,Furniture,95,19.61,Mali
2,536365,2010-12-01 08:26:00,17850.0,SC3463,Books,78,61.49,Mali
3,536365,2010-12-01 08:26:00,17850.0,SC6228,Toys,15,24.73,South Africa
4,536365,2010-12-01 08:26:00,17850.0,SC2149,Toys,50,38.83,Rwanda


In [12]:
# Draw one random row as a spot check; random_state is fixed so that
# Restart & Run All shows the same row every time.
ecom_data.sample(n=1, random_state=42)

Unnamed: 0,InvoiceNo,InvoiceDate,CustomerID,StockCode,Description,Quantity,UnitPrice,Country
100943,544887,2011-02-24 11:54:00,17364.0,SC7317,Sports Equipment,47,72.8,South Africa


In [14]:
# (number of rows, number of columns) of the loaded frame
ecom_data.shape

(541909, 8)

In [16]:
# Column dtypes and non-null counts; used to spot missing values and columns
# that were read as strings (object dtype)
ecom_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   InvoiceDate  541909 non-null  object 
 2   CustomerID   406829 non-null  float64
 3   StockCode    541909 non-null  object 
 4   Description  541909 non-null  object 
 5   Quantity     541909 non-null  int64  
 6   UnitPrice    541909 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


### Observation: The dataset contains 541,909 rows and 8 columns. Five of the eight columns are stored as strings (`object` dtype), including `InvoiceDate`.

In [19]:
# Summary statistics for the numeric columns only (describe() default).
# NOTE(review): CustomerID appears because it is stored as float64; it is
# presumably an identifier, so its mean/std are not meaningful - confirm.
ecom_data.describe()

Unnamed: 0,CustomerID,Quantity,UnitPrice
count,406829.0,541909.0,541909.0
mean,15287.69057,50.534748,50.476354
std,1713.600303,28.849367,28.564775
min,12346.0,1.0,1.0
25%,13953.0,26.0,25.75
50%,15152.0,51.0,50.43
75%,16791.0,76.0,75.18
max,18287.0,100.0,100.0


In [21]:
# Count the distinct values in each identifier-like column.
# nunique() leaves missing values out of the count by default.
label_column_pairs = (
    ("Unique Customers", 'CustomerID'),
    ("Unique Products", 'StockCode'),
    ("Countries", 'Country'),
)
for label, column in label_column_pairs:
    print(f"{label}:", ecom_data[column].nunique())

Unique Customers: 4372
Unique Products: 9000
Countries: 28


In [23]:
# Number of missing values in each column
ecom_data.isna().sum()

InvoiceNo           0
InvoiceDate         0
CustomerID     135080
StockCode           0
Description         0
Quantity            0
UnitPrice           0
Country             0
dtype: int64

### Observation: The `CustomerID` column has 135,080 null values (missing data), about 25% of all rows.

# Splitting the features into qualitative and quantitative