In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [2]:
# Load the data
retail_data = pd.read_excel('../data/raw_data/Online Retail.xlsx')

In [4]:
def preprocess_data(retail_data):
    # Handle missing values
    retail_data = handle_missing_values(retail_data)
    
    # Remove duplicates
    retail_data = remove_duplicates(retail_data)
    
    # Perform additional preprocessing steps as needed
    
    return retail_data

def handle_missing_values(df):
    # Drop rows with missing values in 'InvoiceNo' or 'StockCode'
    df = df.dropna(subset=['InvoiceNo', 'StockCode'])
    
    # Fill missing values in 'Quantity' with 0
    df['Quantity'] = df['Quantity'].fillna(0)
    
    # Fill missing values in 'UnitPrice' with the mean
    df['UnitPrice'] = df['UnitPrice'].fillna(df['UnitPrice'].mean())
    
    return df

def remove_duplicates(df):
    # Drop exact duplicates based on all columns
    df = df.drop_duplicates()
    
    # Drop duplicates based on 'InvoiceNo', 'StockCode', and 'Description'
    df = df.drop_duplicates(subset=['InvoiceNo', 'StockCode', 'Description'])
    
    return df

In [5]:
# Preprocess the data (handle missing values, remove duplicates, etc.)
retail_data = preprocess_data(retail_data)

In [6]:
# Extract relevant features for customer segmentation
customer_data = retail_data.groupby('CustomerID').agg({
    'InvoiceNo': 'count',
    'Quantity': 'sum',
    'UnitPrice': 'mean'
}).reset_index()

In [7]:
# Scale the features using StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(customer_data[['Quantity', 'UnitPrice']])
customer_data[['Quantity', 'UnitPrice']] = X

In [8]:
# Perform customer segmentation using K-Means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
customer_data['segment'] = kmeans.fit_predict(customer_data[['Quantity', 'UnitPrice']])


In [9]:
import pandas as pd

def segment_analysis(customer_data):
    # Get the unique segments
    segments = customer_data['segment'].unique()
    
    # Analyze the characteristics of each segment
    for segment in segments:
        segment_data = customer_data[customer_data['segment'] == segment]
        
        # Calculate the mean and standard deviation for each feature
        quantity_mean = segment_data['Quantity'].mean()
        quantity_std = segment_data['Quantity'].std()
        price_mean = segment_data['UnitPrice'].mean()
        price_std = segment_data['UnitPrice'].std()
        
        # Calculate the total number of customers in the segment
        num_customers = len(segment_data)
        
        # Print the segment analysis
        print(f"Segment {segment}:")
        print(f"Number of Customers: {num_customers}")
        print(f"Average Quantity: {quantity_mean:.2f} (±{quantity_std:.2f})")
        print(f"Average Unit Price: {price_mean:.2f} (±{price_std:.2f})")
        print()

In [10]:
# Analyze the characteristics of each segment
segment_analysis(customer_data)

Segment 0:
Number of Customers: 4326
Average Quantity: -0.07 (±0.26)
Average Unit Price: -0.03 (±0.15)

Segment 3:
Number of Customers: 10
Average Quantity: 16.33 (±9.17)
Average Unit Price: -0.02 (±0.02)

Segment 2:
Number of Customers: 32
Average Quantity: 4.15 (±1.77)
Average Unit Price: -0.02 (±0.11)

Segment 1:
Number of Customers: 1
Average Quantity: -0.23 (±nan)
Average Unit Price: 55.52 (±nan)

Segment 4:
Number of Customers: 3
Average Quantity: -0.24 (±0.00)
Average Unit Price: 18.14 (±9.97)



In [11]:
import joblib

In [12]:
# Save the preprocessed data and segmentation results
customer_data.to_csv('../data/processed/customer_data.csv', index=False)
joblib.dump(kmeans, '../models/segmentation_model.joblib')

['../models/segmentation_model.joblib']