<a href="https://colab.research.google.com/github/nikshitagchiliveri/DATA-ANALYSIS-IN-E-COMMERCE/blob/main/AIEC_EXP_3_F2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from mlxtend.frequent_patterns import apriori, association_rules

In [None]:
# Read the Online Retail workbook into a DataFrame and preview its first rows.
data_path = '/content/Online Retail.xlsx'
df = pd.read_excel(data_path)
df.head()

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [None]:
# Per-column count of remaining missing values.
df.isna().sum()

In [None]:
# Keep only rows whose quantity is strictly positive; this removes negative
# quantities (possible returns) as well as any zero-quantity rows.
df = df.loc[df['Quantity'].gt(0)]

In [None]:
# Row-level total: quantity multiplied by unit price.
df['TotalPrice'] = df['Quantity'].mul(df['UnitPrice'])

In [None]:
# Print the default describe() summary of the cleaned frame as plain text.
summary_stats = df.describe()
print(summary_stats)

In [None]:
# Ten countries with the most rows in df. value_counts() counts rows, not
# distinct customers or distinct invoices, so the bar length is a row count.
top_countries = df['Country'].value_counts().head(10)
plt.figure(figsize=(10, 5))
# seaborn >= 0.13 deprecates passing `palette` without `hue`; mapping the y
# variable to `hue` with the legend disabled gives the same colouring without
# the FutureWarning.
sns.barplot(
    x=top_countries.values,
    y=top_countries.index,
    hue=top_countries.index,
    palette="viridis",
    legend=False,
)
plt.title("Top 10 Countries by Number of Transactions")
plt.xlabel("Number of Transactions")
plt.ylabel("Country")
plt.show()

In [None]:
# Build one row of clustering features per customer:
#   TotalPrice      - sum of Quantity * UnitPrice over the customer's rows
#   UniquePurchases - number of distinct invoice numbers
#   Quantity        - total units bought
# TotalPrice is computed inside the chain with assign(), so this cell does not
# mutate df again and does not depend on an earlier cell having added the column.
segmentation_data = (
    df.assign(TotalPrice=df['Quantity'] * df['UnitPrice'])
    .groupby('CustomerID')
    .agg(
        TotalPrice=('TotalPrice', 'sum'),
        UniquePurchases=('InvoiceNo', 'nunique'),
        Quantity=('Quantity', 'sum'),
    )
)

In [None]:
# Standardise the clustering features.
# The columns are selected explicitly because a later cell adds a 'Cluster'
# column to segmentation_data; without the selection, re-running this cell
# after clustering would feed the cluster labels back in as a feature.
feature_columns = ['TotalPrice', 'UniquePurchases', 'Quantity']
scaler = StandardScaler()
scaled_data = scaler.fit_transform(segmentation_data[feature_columns])

In [None]:
# Elbow method: fit KMeans for k = 1..10 on the scaled features and record
# each fitted model's inertia.
k_range = range(1, 11)
inertia = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(scaled_data)
    inertia.append(kmeans.inertia_)

In [None]:
# Draw inertia against the number of clusters to locate the elbow.
fig, ax = plt.subplots(figsize=(9, 5))
ax.plot(k_range, inertia, marker='o', linestyle='-.', color='b')
ax.set_title("Elbow Method for Optimal K")
ax.set_xlabel("Number of Clusters (k)")
ax.set_ylabel("Inertia")
ax.grid(True)
plt.show()

In [None]:
# Final segmentation: fit KMeans with the chosen k (3 is assumed from the
# elbow curve) and attach each customer's cluster label to segmentation_data.
optimal_k = 3
kmeans = KMeans(random_state=42, n_clusters=optimal_k)
segmentation_data['Cluster'] = kmeans.fit(scaled_data).labels_

In [None]:
# Scatter of distinct invoices against total spend, coloured by cluster label.
plt.figure(figsize=(9, 5))
sns.scatterplot(
    data=segmentation_data,
    x='UniquePurchases',
    y='TotalPrice',
    hue='Cluster',
    palette='viridis',
    s=100,
)
plt.title("Customer Segmentation")
plt.xlabel("Unique Purchases")
plt.ylabel("Total Spending (Total Price)")
plt.legend(title="Cluster")
plt.show()

In [None]:
# Market Basket Analysis input: one row per invoice, one column per item
# description, each cell holding the summed quantity (0 where the item does
# not appear on the invoice).
quantity_per_invoice_item = df.groupby(['InvoiceNo', 'Description'])['Quantity'].sum()
basket = quantity_per_invoice_item.unstack().fillna(0)

In [None]:
# Convert quantities to binary purchase flags (0 or 1).
# A vectorised comparison replaces DataFrame.applymap, which is deprecated
# since pandas 2.1 and calls a Python lambda once per cell of the matrix.
basket = (basket > 0).astype(int)

In [None]:
# Mine itemsets that appear in at least 2% of invoices.
# mlxtend warns about (and runs slower on) non-boolean frames, so the 0/1
# matrix is cast to bool for the call; the resulting itemsets are unchanged.
frequent_itemsets = apriori(basket.astype(bool), min_support=0.02, use_colnames=True)

In [None]:
# Generate association rules with lift >= 1.
# mlxtend documents num_itemsets as the number of transactions in the original
# input data, so pass the row count of the basket matrix rather than a
# hard-coded literal.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
    num_itemsets=len(basket),
)

In [None]:
# Order the rules from highest to lowest lift.
rules = rules.sort_values(by=['lift'], ascending=[False])

In [None]:
# Print a heading followed by the first five (highest-lift) rules.
print("\nTop Association Rules:", rules.head(), sep="\n")

In [None]:
 # Plot top 10 rules by lift
top_rules = rules.head(10)


def _format_items(items):
    """Join an itemset's members into one comma-separated, sorted string."""
    return ', '.join(sorted(map(str, items)))


# Label every bar with the whole rule. Labelling by antecedents alone makes
# rules that share an antecedent fall into the same category, which seaborn
# then merges into a single averaged bar, and it hides the consequent.
rule_labels = [
    f"{_format_items(antecedent)} -> {_format_items(consequent)}"
    for antecedent, consequent in zip(top_rules['antecedents'], top_rules['consequents'])
]

plt.figure(figsize=(9, 5))
# Plain lists are passed for both axes so seaborn pairs them by position.
sns.barplot(x=top_rules['lift'].tolist(), y=rule_labels)
plt.title("Top 10 Association Rules by Lift")
plt.xlabel("Lift")
plt.ylabel("Rule (antecedents -> consequents)")
plt.show()