# Customer Engagement Analysis Using an E-commerce Dataset
This notebook performs data cleaning, exploratory data analysis, feature engineering, and predictive modeling on an e-commerce dataset.

In [None]:
# Step 1: Import Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## Step 2: Load Dataset

In [None]:
# Step 2: Load Dataset
# Read the raw CSV into `data`; every later cell reads or mutates this frame.
data = pd.read_csv('ecommerce_customer_data.csv')

# Show a heading followed by the first rows as a quick sanity check.
print("Dataset Preview:", data.head(), sep="\n")

## Step 3: Data Cleaning

In [1]:
# Step 3: Data Cleaning
# Drop identifier columns; errors='ignore' skips any label that is absent.
data = data.drop(['CustomerID', 'ProductID'], axis=1, errors='ignore')

# Handle missing values: impute every numeric column with its own median.
numeric_data = data.select_dtypes(include=[np.number])  # Select numeric columns
data[numeric_data.columns] = numeric_data.fillna(numeric_data.median())

# Fill missing values in categorical columns with the most frequent value.
# Assign the result back instead of calling fillna(..., inplace=True) on
# data[col]: that chained in-place form operates on a temporary under pandas
# Copy-on-Write and leaves the frame unchanged (FutureWarning on pandas 2.x).
for col in ['Gender', 'Product Category']:
    col_modes = data[col].mode()
    # mode() is empty when the column is entirely missing; nothing to impute then.
    if not col_modes.empty:
        data[col] = data[col].fillna(col_modes.iloc[0])

# Drop rows with missing 'Purchase Date'; the surviving rows keep their
# original index labels, so the index may contain gaps afterwards.
data = data.dropna(subset=['Purchase Date'])

print("Data Cleaning Completed!")

NameError: name 'data' is not defined

## Step 4: Exploratory Data Analysis

In [2]:
# Step 4: Exploratory Data Analysis
# Pairwise correlations of the numeric columns, drawn as an annotated heatmap.
numeric_corr = data.select_dtypes(include=[np.number]).corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# Histogram of Quantity with a KDE overlay, on a fresh default-sized figure.
ax = sns.histplot(data['Quantity'], bins=30, kde=True)
ax.set_title('Quantity Distribution (Proxy for Engagement)')
ax.set_xlabel('Quantity')
plt.show()

NameError: name 'plt' is not defined

## Step 5: Feature Engineering

In [3]:
# Step 5: Feature Engineering
# Engagement score = quantity * unit price; the binary target marks rows whose
# score is strictly above the median score.
data['EngagementScore'] = data['Quantity'] * data['Product Price']
data['HighEngagement'] = (data['EngagementScore'] > data['EngagementScore'].median()).astype(int)

# Days between each row's purchase date and the latest purchase date in the data.
data['PurchaseDate'] = pd.to_datetime(data['Purchase Date'])
data['DaysSinceLastPurchase'] = (data['PurchaseDate'].max() - data['PurchaseDate']).dt.days

# NOTE(review): 'Quantity' and 'Product Price' are not dropped here, yet the
# target is computed directly from them. If both are numeric they stay in X and
# the classifier can reconstruct the label -- confirm whether this is intended.
X = data.drop(['HighEngagement', 'EngagementScore', 'Purchase Date'], axis=1)
y = data['HighEngagement']

# Only numeric columns are scaled and kept as model features.
numeric_features = X.select_dtypes(include=[np.number])
scaler = MinMaxScaler()
# NOTE(review): the scaler is fitted on all rows before the train/test split.
# Reuse the original index so X and y keep identical row labels; a default
# RangeIndex would no longer match y once rows have been dropped from `data`.
X = pd.DataFrame(
    scaler.fit_transform(numeric_features),
    columns=numeric_features.columns,
    index=numeric_features.index,
)

NameError: name 'data' is not defined

## Step 6: Split Data

In [None]:
# Step 6: Split Data
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## Step 7: Random Forest Classifier

In [None]:
# Step 7: Random Forest Classifier
# Fit a seeded forest on the training split and predict the held-out rows.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Per-class precision/recall/F1 followed by overall accuracy.
print(
    "\nRandom Forest Classification Report:",
    classification_report(y_test, y_pred_rf),
    sep="\n",
)
print("Accuracy:", accuracy_score(y_test, y_pred_rf))

# Pair each feature with its importance, most important first.
importance_rf = pd.DataFrame(
    {'Feature': X.columns, 'Importance': rf_model.feature_importances_}
).sort_values(by='Importance', ascending=False)

ax = sns.barplot(data=importance_rf, x='Importance', y='Feature')
ax.set_title('Feature Importance')
plt.show()

## Step 8: Save Results

In [None]:
# Step 8: Save Results
# Persist the processed frame (without its index) next to the notebook.
data.to_csv('processed_ecommerce_data.csv', index=False)

# Write a plain-text summary: accuracy to two decimals, then the importance table.
with open('rf_model_summary.txt', 'w') as report_file:
    report_file.writelines([
        f"Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf):.2f}\n",
        "\nFeature Importances:\n",
        importance_rf.to_string(index=False),
    ])

print("\nAnalysis Complete! Results saved to disk.")

### Additional Data Exploration

In [None]:
# Step 4: Additional Exploratory Data Analysis

# One histogram panel per numeric column.
numeric_cols = data.select_dtypes(include=[np.number]).columns
data.loc[:, numeric_cols].hist(bins=20, figsize=(12, 10))
plt.suptitle('Histograms of Numeric Features', fontsize=16)
plt.show()

# One count plot per object-dtype column, bars ordered by descending frequency.
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    level_order = data[col].value_counts().index
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(data=data, x=col, order=level_order, ax=ax)
    ax.set_title(f'Distribution of {col}')
    plt.xticks(rotation=45)
    plt.show()


### Model Development and Insights

In [None]:
# Step 7: Model Insights

# Confusion Matrix
# Pin labels to [0, 1] (HighEngagement is a 0/1 integer flag) so the matrix is
# always 2x2 and lines up with the two display labels, even if the test split
# or the predictions happen to contain only one class.
cm = confusion_matrix(y_test, y_pred_rf, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Low Engagement', 'High Engagement'])
disp.plot(cmap='Blues', values_format='d')
plt.title('Confusion Matrix')
plt.show()

# Insights based on Feature Importance: show the five highest-ranked features.
print("Top Features Contributing to Engagement:")
print(importance_rf.head(5))