In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

# Read the TripAdvisor CSV into a DataFrame.
file_path = '/mnt/data/tripadvisor.csv'
tripadvisor_data = pd.read_csv(file_path)

# Restrict the frame to its numeric columns, discard any column that holds
# no values at all, then fill the remaining gaps with each column's mean.
numerical_features = tripadvisor_data.select_dtypes(include=[np.number]).columns
numeric_data = tripadvisor_data.loc[:, numerical_features]
numeric_data = numeric_data.dropna(axis=1, how='all')
column_means = numeric_data.mean()
numeric_data = numeric_data.fillna(column_means)

# Standardize every feature (zero mean, unit variance) so that no single
# column dominates the distance computations used by the clustering below.
scaler = StandardScaler()
scaler.fit(numeric_data)
scaled_data = scaler.transform(numeric_data)

# Elbow Method to determine the optimal number of clusters.
# The candidate K values are defined once so the fitting loop and the plot's
# x-axis cannot drift apart. The upper bound is capped at the number of rows
# because KMeans cannot fit more clusters than there are samples.
max_k = min(10, scaled_data.shape[0])
k_values = range(1, max_k + 1)
wcss = []
for i in k_values:
    # n_init is passed explicitly: scikit-learn changed its default from 10
    # to 'auto' in 1.4 (with a FutureWarning in 1.2-1.3), so relying on the
    # default makes the inertia curve depend on the installed version.
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(scaled_data)
    # inertia_ is the within-cluster sum of squared distances for this K.
    wcss.append(kmeans.inertia_)

# Plotting the Elbow Method: WCSS against K.
plt.figure(figsize=(10, 6))
plt.plot(k_values, wcss, marker='o')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS (Within-Cluster Sum of Squares)')
plt.grid(True)
plt.show()

# Applying K-means clustering with K=4.
# NOTE(review): K is hard-coded here, presumably read off the elbow plot;
# confirm it still matches the curve if the input data changes.
k_optimal = 4
# n_init is passed explicitly: scikit-learn changed its default from 10 to
# 'auto' in 1.4 (with a FutureWarning in 1.2-1.3), so relying on the default
# makes the resulting clusters depend on the installed version.
kmeans = KMeans(n_clusters=k_optimal, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(scaled_data)
# Attach each row's cluster label to the numeric feature frame so later
# steps can group and color by it.
numeric_data['Cluster'] = cluster_labels

# Project the standardized features onto their first two principal
# components so the clusters can be drawn in a plane.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(scaled_data)

# Scatter plot of the projected points, colored by cluster label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x=data_pca[:, 0],
    y=data_pca[:, 1],
    hue=numeric_data['Cluster'],
    palette='viridis',
    edgecolor='k',
    ax=ax,
)
ax.set_title('Cluster Plot (PCA Reduced Dimensions)')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.legend(title='Cluster')
plt.show()

# Count how many rows landed in each cluster, ordered by cluster label.
cluster_counts = numeric_data['Cluster'].value_counts()
cluster_counts = cluster_counts.sort_index()

# Bar chart of the cluster sizes.
fig, ax = plt.subplots(figsize=(8, 5))
cluster_counts.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Number of Entries per Cluster')
ax.set_xlabel('Cluster')
ax.set_ylabel('Number of Entries')
ax.grid(True)
plt.show()
