### **Machine Learning for Online Shoppers Intention Dataset**

## **Importing all Libraries**

In [98]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import plotly.graph_objs as go

## **Calling the csv file stored on Google Colab**

In [None]:
#reading the csv file from notebook
df = pd.read_csv('/content/online_shoppers_intention.csv')
df = pd.DataFrame(df)
print(df)

In [100]:
df.groupby(['Month'])['Revenue'].count()
df.groupby(['Month'])['Revenue'].mean()

Month
Aug     0.175520
Dec     0.125072
Feb     0.016304
Jul     0.152778
June    0.100694
Mar     0.100682
May     0.108502
Nov     0.253502
Oct     0.209472
Sep     0.191964
Name: Revenue, dtype: float64

## **Applying mean encoding for month and visitor type and Data Preprocessing**

In [None]:
Mean_encoded_subject = df.groupby(['Month'])['Revenue'].mean().to_dict()
df['Month'] = df['Month'].map(Mean_encoded_subject)

Mean_encoded_subject = df.groupby(['VisitorType'])['Revenue'].mean().to_dict()
df['VisitorType'] = df['VisitorType'].map(Mean_encoded_subject)

df['Weekend'] = df['Weekend'].astype(int)
df['Revenue'] = df['Revenue'].astype(int)

print(df)

## **Deploying K means Clustering Algorithm**

In [102]:
X = df.drop(columns=['Revenue'])
X = preprocessing.normalize(X)

# Initialize KMeans model with 2 clusters
kmeans = KMeans(n_clusters=2, random_state = 0, n_init='auto')

# Fit KMeans model to the data
kmeans.fit(X)

# Predict cluster labels
labels = kmeans.labels_

# Add cluster labels to a copy of the dataframe
df_with_2clusters = df.copy()
df_with_2clusters['Cluster'] = labels

# Filter data based on cluster labels
C1 = df_with_2clusters[df_with_2clusters['Cluster'] == 0]
C2 = df_with_2clusters[df_with_2clusters['Cluster'] == 1]

In [103]:
#Adding the K=2 clustered data in the data table
df['Cluster K=2'] = labels

In [104]:
X = df.drop(columns=['Revenue'])
X = preprocessing.normalize(X)

# Initialize KMeans model with 4 clusters
kmeans = KMeans(n_clusters=4, random_state = 0, n_init='auto')

# Fit KMeans model to the data
kmeans.fit(X)

# Predict cluster labels
labels = kmeans.labels_

# Add cluster labels to a copy of the dataframe
df_with_2clusters = df.copy()
df_with_2clusters['Cluster'] = labels

# Filter data based on all 4 cluster labels
C1 = df_with_2clusters[df_with_2clusters['Cluster'] == 0]
C2 = df_with_2clusters[df_with_2clusters['Cluster'] == 1]
C3 = df_with_2clusters[df_with_2clusters['Cluster'] == 2]
C4 = df_with_2clusters[df_with_2clusters['Cluster'] == 3]

In [105]:
#Adding the K=4 clustered data in the data table
df['Cluster K=4'] = labels

In [106]:
#You can now observe the updated data with K=2 and 4 values
print(df)

       Administrative  Administrative_Duration  Informational  \
0                   0                      0.0              0   
1                   0                      0.0              0   
2                   0                      0.0              0   
3                   0                      0.0              0   
4                   0                      0.0              0   
...               ...                      ...            ...   
12325               3                    145.0              0   
12326               0                      0.0              0   
12327               0                      0.0              0   
12328               4                     75.0              0   
12329               0                      0.0              0   

       Informational_Duration  ProductRelated  ProductRelated_Duration  \
0                         0.0               1                 0.000000   
1                         0.0               2                64.000000 

## **Performance Measure for K Means Clustering**

In [107]:
labels2 = df['Cluster K=2']
labels4 = df['Cluster K=4']

#This value is 2(S+D)
same_cluster_count = sum(labels2 == labels4)

#This value is m(m-1)
switched_labels_count = len(labels2) - same_cluster_count

#Computing Rand Index
RI_Kmeans = same_cluster_count / (same_cluster_count + switched_labels_count)

## **Deploying Agglomerative Clustering Algorithm**

In [108]:
X = df.drop(columns=['Revenue'])
X = preprocessing.normalize(X)

# Initialize Agglomerative Clustering model with 2 clusters
AClus = AgglomerativeClustering(n_clusters=2)

# Fit the model to the data
AClus.fit(X)

# Predict cluster labels
labels = AClus.labels_

# Add cluster labels to a copy of the dataframe
df_with_2clusters = df.copy()
df_with_2clusters['Cluster'] = labels

df['Agglomerative Cluster K=2'] = labels

In [109]:
X = df.drop(columns=['Revenue'])
X = preprocessing.normalize(X)

# Initialize Agglomerative Clustering model with 4 clusters
AClus = AgglomerativeClustering(n_clusters=4)

# Fit the model to the data
AClus.fit(X)

# Predict cluster labels
labels = AClus.labels_

# Add cluster labels to a copy of the dataframe
df_with_2clusters = df.copy()
df_with_2clusters['Cluster'] = labels

df['Agglomerative Cluster K=4'] = labels

## **Performance Measure for Agglomerative Clustering**

In [110]:
#computing Rand Index for Agglomerative Clustering

labels2 = df['Agglomerative Cluster K=2']
labels4 = df['Agglomerative Cluster K=4']

same_cluster_count = sum(labels2 == labels4)
switched_labels_count = len(labels2) - same_cluster_count

RI_Aclus = same_cluster_count / (same_cluster_count + switched_labels_count)

## **Final Comparison**

In [111]:
print("Computed Rand Index for K-means Clustering is: ", RI_Kmeans)
print("Computed Rand Index for Agglomerated Clustering is: ", RI_Aclus)

print("\nSince K-means Clustering provides a much higher accuracy, thus it is a better approach")

Computed Rand Index for K-means Clustering is:  0.8339010543390105
Computed Rand Index for Agglomerated Clustering is:  0.1467964314679643

Since K-means Clustering provides a much higher accuracy, thus it is a better approach
