# Customer Purchase Behavior Analysis and Prediction
This project demonstrates an end-to-end data analysis and machine learning workflow for understanding customer purchase behavior, deriving insights, and building predictive models.

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


## Load Dataset

In [None]:

# Read the e-commerce CSV (per the original note, generated in an earlier step)
# into the DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer="ecommerce_data.csv")

# Preview the first five rows; as the cell's last expression the notebook renders it.
df.head(n=5)


## Data Exploration

In [None]:

# Basic info
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() would emit a stray "None" line after the summary.
df.info()

# Summary statistics of the columns describe() covers by default.
print(df.describe())

# Null check: per-column count of missing values (rendered as the cell's output).
df.isnull().sum()


## Data Visualization

In [None]:

# Histogram of purchase amounts (20 bins) with a kernel-density overlay.
plt.figure(figsize=(6, 4))
sns.histplot(data=df['Amount'], kde=True, bins=20)
plt.title(label="Distribution of Purchase Amount")
plt.show()

# Bar chart counting the rows for each RepeatCustomer value.
plt.figure(figsize=(6, 4))
sns.countplot(data=df, x='RepeatCustomer')
plt.title(label="Repeat Customer Distribution")
plt.show()


## Feature Engineering

In [None]:

# Convert categorical variable.
# Series.map() turns every value that is missing from the dict into NaN, so
# re-running this cell on the already-encoded column would wipe it out.
# Encode only while the column is still non-numeric, and fail loudly on
# unexpected labels instead of letting silent NaNs reach the models.
gender_codes = {'Male': 0, 'Female': 1}
if not pd.api.types.is_numeric_dtype(df['Gender']):
    encoded_gender = df['Gender'].map(gender_codes)
    # Values that were present in the data but have no entry in gender_codes.
    unmapped = encoded_gender.isna() & df['Gender'].notna()
    if unmapped.any():
        unexpected = df.loc[unmapped, 'Gender'].unique().tolist()
        raise ValueError(f"Unexpected Gender labels: {unexpected}")
    df['Gender'] = encoded_gender

# Features and target
X = df[['Age','Amount','Frequency','Recency','Gender']]
y = df['RepeatCustomer']

# Split data: 80% train / 20% test, seeded so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling: the scaler is fitted on the training split only and then applied
# unchanged to the test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


## Logistic Regression Model

In [None]:

# Train a default-configured logistic regression on the standardized training
# features (fit() returns the estimator itself), then predict the test split.
log_model = LogisticRegression().fit(X_train_scaled, y_train)
y_pred_log = log_model.predict(X_test_scaled)

# Report overall accuracy followed by the per-class metrics table.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_log))
print(classification_report(y_true=y_test, y_pred=y_pred_log))

# Draw the confusion matrix as a heatmap annotated with integer counts.
sns.heatmap(
    data=confusion_matrix(y_true=y_test, y_pred=y_pred_log),
    fmt='d',
    annot=True,
)
plt.title(label="Confusion Matrix - Logistic Regression")
plt.show()


## Random Forest Model

In [None]:

# Train a 100-tree random forest (seeded) on the unscaled training features
# (fit() returns the estimator itself), then predict the unscaled test split.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Report overall accuracy followed by the per-class metrics table.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_rf))
print(classification_report(y_true=y_test, y_pred=y_pred_rf))

# Draw the confusion matrix as a heatmap annotated with integer counts.
sns.heatmap(
    data=confusion_matrix(y_true=y_test, y_pred=y_pred_rf),
    fmt='d',
    annot=True,
)
plt.title(label="Confusion Matrix - Random Forest")
plt.show()


## Customer Segmentation (KMeans Clustering)

In [None]:

# Segment customers on spend, purchase frequency and recency.
# K-means assigns points by Euclidean distance, so the features are
# standardized first; otherwise whichever column has the largest numeric range
# decides the clusters. A dedicated scaler is used here so the `scaler` fitted
# on the training split for the classifiers is left untouched.
cluster_features = StandardScaler().fit_transform(X[['Amount','Frequency','Recency']])

# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the result reproducible across versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(cluster_features)
df['Cluster'] = clusters

# The plot uses the original (unscaled) columns so the axes stay in real units.
plt.figure(figsize=(6,4))
sns.scatterplot(x='Amount', y='Frequency', hue='Cluster', data=df, palette='Set1')
plt.title("Customer Segments")
plt.show()

# Per-cluster averages. numeric_only=True keeps this working if df also holds
# non-numeric columns, which would make a plain mean() raise on pandas >= 2.0.
df.groupby('Cluster').mean(numeric_only=True)
