## Importing Data

In [None]:
import pandas as pd
import warnings

# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation and chained-assignment
# warnings emitted by later cells — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw e-commerce CSV (latin1-encoded).
# Forward slashes keep the relative path valid on Windows, macOS and Linux alike;
# a backslash-separated path only resolves on Windows.
df = pd.read_csv('../Datasets/e-commerce_data.csv', encoding='latin1')

In [None]:
# Show the raw frame as the cell output
df

---
## Data Handling

In [None]:
# Number of missing values in each column of the raw data
df.isnull().sum()

In [None]:
# Drop rows without a CustomerID, then remove exact duplicate rows.
# Chaining returns a fresh frame instead of mutating a derived frame in place,
# which avoids pandas' chained-assignment warning and keeps the cell idempotent.
df_clean = df.dropna(subset=['CustomerID']).drop_duplicates()

In [None]:
# Keep only strictly positive quantities (drops negative and zero values)
positive_qty = df_clean[df_clean['Quantity'] > 0]

# Remove rows with extremely high quantities or unit prices:
# both cut-offs are the 99th percentile of the positive-quantity rows.
quantity_threshold = positive_qty['Quantity'].quantile(0.99)
unit_price_threshold = positive_qty['UnitPrice'].quantile(0.99)

within_limits = (
    (positive_qty['Quantity'] <= quantity_threshold)
    & (positive_qty['UnitPrice'] <= unit_price_threshold)
)

# .assign builds a new frame, so the datetime conversion never writes into a
# filtered slice of the previous frame (no SettingWithCopy hazard).
df_clean = positive_qty.loc[within_limits].assign(
    InvoiceDate=lambda frame: pd.to_datetime(frame['InvoiceDate'])
)

df_clean

In [None]:
# Description and StockCode are removed from the working frame
columns_to_drop = ['Description', 'StockCode']
df_final = df_clean.drop(columns=columns_to_drop)

df_final.head()

In [None]:
# Countries that account for less than SMALL_COUNTRY_PCT percent of the rows are
# grouped under 'Other' for easier calculations.
SMALL_COUNTRY_PCT = 0.2

country_counts = df_final['Country'].value_counts(normalize=True) * 100
small_countries = country_counts[country_counts < SMALL_COUNTRY_PCT].index

# Vectorised replacement: rows whose country is in the small set become 'Other',
# everything else (including missing values) is left untouched.
is_small_country = df_final['Country'].isin(small_countries)
df_final['Country'] = df_final['Country'].mask(is_small_country, 'Other')

df_final['Country'].value_counts(normalize=True) * 100

---
## EDA

In [None]:
# Summary statistics of the cleaned working frame
df_final.describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Distribution of per-row quantities: 50-bin histogram with a KDE overlay
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(df_final['Quantity'], bins=50, kde=True, ax=ax)
plt.show()

In [None]:
# Row count per country (after grouping small countries), drawn as a bar chart
fig, ax = plt.subplots(figsize=(15, 5))
rows_per_country = df_final['Country'].value_counts()
rows_per_country.plot.bar(ax=ax)
ax.set_title('Number of Transactions per Country')
plt.show()

In [None]:
# Total quantity per calendar month, indexed by month end.
# An explicit MonthEnd offset replaces the 'M' string alias, which pandas 2.2
# deprecated in favour of 'ME'; the offset object is accepted by old and new versions.
monthly_quantity = (
    df_final.set_index('InvoiceDate')['Quantity']
    .resample(pd.offsets.MonthEnd())
    .sum()
)
monthly_quantity.plot(figsize=(15, 5))
plt.title('Monthly Sales Quantity Over Time')
plt.show()

In [None]:
# Density of (UnitPrice, Quantity) pairs on a 50-cell hexagonal grid
fig, ax = plt.subplots(figsize=(10, 6))
hexes = ax.hexbin(x='UnitPrice', y='Quantity', data=df_final, gridsize=50, cmap='Purples')
fig.colorbar(hexes, ax=ax, label='Count')
ax.set_title('Hexbin Plot of UnitPrice vs Quantity')
ax.set_xlabel('UnitPrice')
ax.set_ylabel('Quantity')
plt.show()


---
## Feature Engineering

In [None]:
import datetime as dt

# Recency: whole days between the reference date (one day after the latest
# invoice in the data) and each customer's most recent invoice.
# A vectorised groupby max replaces the per-customer Python lambda.
date = df_final['InvoiceDate'].max() + dt.timedelta(days=1)
last_purchase = df_final.groupby('CustomerID')['InvoiceDate'].max()
recency = (date - last_purchase).dt.days.rename('Recency').reset_index()

# Frequency: number of distinct invoices per customer
frequency = (
    df_final.groupby('CustomerID')['InvoiceNo']
    .nunique()
    .rename('Frequency')
    .reset_index()
)

# Monetary: sum of Quantity * UnitPrice per customer
df_final['TotalPrice'] = df_final['Quantity'] * df_final['UnitPrice']
monetary = (
    df_final.groupby('CustomerID')['TotalPrice']
    .sum()
    .rename('Monetary')
    .reset_index()
)

# One row per customer with the three RFM columns
rfm = recency.merge(frequency, on='CustomerID').merge(monetary, on='CustomerID')
rfm

---
## Preprocessing for Neural Network

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the three RFM columns with a single scaler and wrap the result
# back into a DataFrame carrying the same column names.
rfm_columns = ['Recency', 'Frequency', 'Monetary']

scaler = StandardScaler()
rfm_scaled = pd.DataFrame(scaler.fit_transform(rfm[rfm_columns]), columns=rfm_columns)

rfm_scaled.head()

In [None]:
from sklearn.model_selection import train_test_split

# Features are the standardized Recency and Frequency columns;
# the target is the standardized Monetary column.
X = rfm_scaled.drop(columns=['Monetary'])
y = rfm_scaled['Monetary']

# rfm_scaled is already standardized, so X is used as-is. Running
# scaler.fit_transform on it a second time would leave the values numerically
# unchanged (up to rounding) while overwriting the scaler fitted on the original
# RFM columns.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=26)
print('X_train shape', X_train.shape)
print('y_train shape', y_train.shape)
print('X_test shape', X_test.shape)
print('y_test shape', y_test.shape)

---
## Neural Network Development

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.regularizers import l2

In [None]:
# Seed the Python, NumPy and TensorFlow RNGs before any weights are created so
# that initialisation (and the shuffling done later by fit) is as repeatable as
# the backend allows, matching the fixed random_state=26 used elsewhere.
tf.keras.utils.set_random_seed(26)

# Define the model: a fully connected regressor with one linear output unit.
# Only the first hidden layer carries an L2 weight penalty.
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(0.02)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='linear'),
])

model.summary()

In [None]:
# Train with the default SGD optimizer on mean squared error;
# the last 20% of the training rows are held out for validation by Keras.
model.compile(loss='mse', optimizer='sgd')
model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
)

---
## Training and Evaluation

In [None]:
# Flatten the model output into a 1-D vector of predictions
preds = model.predict(X_test).ravel()

# Actual vs predicted values side by side, one row per test sample
res = pd.DataFrame({'Actual': y_test.values, 'Preds': preds})

res

In [None]:
# Summary statistics of the actual and predicted columns
res.describe()

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# `preds` and `res` were already built from model.predict(X_test) in the
# prediction cell above, so they are reused here instead of running inference
# and rebuilding the same frame a second time.

# Calculate additional metrics
mse_test = mean_squared_error(y_test, preds)
r2_test = r2_score(y_test, preds)

print(f"Mean Squared Error (Test Data): {mse_test}")
print(f"R-squared (Test Data): {r2_test}")

# Display the results
print(res.head())

# Plot Actual vs Predicted Monetary values; the dashed red diagonal spans the
# range of the actual values and marks where prediction equals actual.
actual_min = res['Actual'].min()
actual_max = res['Actual'].max()

plt.figure(figsize=(8, 6))
plt.scatter(res['Actual'], res['Preds'], alpha=0.5)
plt.plot([actual_min, actual_max], [actual_min, actual_max], 'r--', lw=2)
plt.xlabel('Actual Monetary')
plt.ylabel('Predicted Monetary')
plt.title('Actual vs Predicted Monetary Values')
plt.grid(True)
plt.show()

---
## Insights and Market Segmentation

In [None]:
from sklearn.cluster import KMeans

# KMeans expects a 2-D array, so the 1-D predictions become one feature column
preds_reshaped = preds.reshape(-1, 1)

num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters, random_state=26)
raw_labels = kmeans.fit_predict(preds_reshaped)

# KMeans numbers its clusters arbitrarily. Re-number them by ascending centroid
# so that 0 is the segment with the lowest predicted value and num_clusters - 1
# the highest; the ids then have a fixed meaning regardless of how KMeans
# happened to label the groups.
# Note: kmeans.labels_ and kmeans.cluster_centers_ keep the original numbering.
centroid_order = kmeans.cluster_centers_.ravel().argsort()
label_rank = centroid_order.argsort()  # label_rank[raw_label] -> ordered id
clusters = label_rank[raw_labels]

res['Cluster'] = clusters

# Per-cluster mean/std/min/max of the actual and predicted values
cluster_summary = res.groupby('Cluster').agg({
    'Actual': ['mean', 'std', 'min', 'max'],
    'Preds': ['mean', 'std', 'min', 'max']
}).reset_index()

cluster_summary

In [None]:
import seaborn as sns

# Predicted value of every test sample against its row index, coloured by cluster
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=res.index, y=res['Preds'], hue=res['Cluster'], palette='viridis', ax=ax)
ax.set_title('Market Segmentation based on Predicted Monetary Values')
ax.set_xlabel('Index')
ax.set_ylabel('Predicted Monetary Value')
ax.legend(title='Cluster')
ax.grid(True)
plt.show()

---

## Recommendations

#### Based on the market segmentation, we recommend tailored strategies for each customer segment.

- **High-value customers (cluster 2):** Focus on exclusive deals, personalized offers, and loyalty programs to enhance engagement and satisfaction. Develop premium products and provide dedicated customer support to maintain their loyalty.

- **Medium-value customers:** Offer discounts and bundle deals to increase spending and attract referrals. Develop mid-range products and ensure robust customer support to address their needs efficiently.

- **Low-value customers:** Use broad marketing campaigns and seasonal discounts to drive sales. Focus on cost-effective products and maintain scalable customer support options to handle a larger volume of inquiries effectively.

> These strategies aim to maximize the value of each customer segment and improve their overall experience with the brand. Implementing these recommendations can lead to increased customer satisfaction, loyalty, and business growth.