In [2]:
import pandas as pd
import numpy as np
from sklearn import *

In [8]:
# R reference for this step:
#   R> library("MSA")
#   R> data("mcdonalds", package = "MSA")
#   R> names(mcdonalds)

# Read the survey CSV (presumably an export of the R data set above -- confirm)
# and show its column names as the cell output.
csv_path = r'C:\Users\Dell\Downloads\mcdonalds.csv'
df = pd.read_csv(csv_path)
df.columns


Index(['yummy', 'convenient', 'spicy', 'fattening', 'greasy', 'fast', 'cheap',
       'tasty', 'expensive', 'healthy', 'disgusting', 'Like', 'Age',
       'VisitFrequency', 'Gender'],
      dtype='object')

In [10]:
# R> dim(mcdonalds)
# Python counterpart of the R call above: display (n_rows, n_columns) of df.
df.shape

(1453, 15)

In [11]:
# Preview the first rows of the loaded frame to check values and dtypes by eye.
df.head()

Unnamed: 0,yummy,convenient,spicy,fattening,greasy,fast,cheap,tasty,expensive,healthy,disgusting,Like,Age,VisitFrequency,Gender
0,No,Yes,No,Yes,No,Yes,Yes,No,Yes,No,No,-3,61,Every three months,Female
1,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,No,No,2,51,Every three months,Female
2,No,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,No,1,62,Every three months,Female
3,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,No,No,Yes,4,69,Once a week,Female
4,No,Yes,No,Yes,Yes,Yes,Yes,No,No,Yes,No,2,49,Once a month,Male


In [13]:
# Take the first 11 columns of `df` (the Yes/No items, 'yummy' .. 'disgusting')
# and encode them numerically: 1 where the answer is exactly "Yes", 0 otherwise.
MD_x = df.iloc[:, :11].eq("Yes").astype(int)

# Per-item share of "Yes" answers, rounded to two decimals.
col_means = MD_x.mean().round(2)

print(col_means)

yummy         0.55
convenient    0.91
spicy         0.09
fattening     0.87
greasy        0.53
fast          0.90
cheap         0.60
tasty         0.64
expensive     0.36
healthy       0.20
disgusting    0.24
dtype: float64


In [14]:
from sklearn.decomposition import PCA
# Fit a PCA with default settings (no component limit given) on the 0/1 matrix.
pca = PCA()
MD_pca = pca.fit(MD_x)

# Fitted summaries: share of variance per component, and the component loadings.
explained_variance = pca.explained_variance_ratio_
components = pca.components_

# Each print emits the heading and the array on separate lines.
print("Explained Variance Ratio:", explained_variance, sep="\n")
print("\nPrincipal Components:", components, sep="\n")

Explained Variance Ratio:
[0.29944723 0.19279721 0.13304535 0.08309578 0.05948052 0.05029956
 0.0438491  0.03954779 0.0367609  0.03235329 0.02932326]

Principal Components:
[[-0.47693349 -0.15533159 -0.00635636  0.11623168  0.3044427  -0.10849325
  -0.33718593 -0.47151394  0.32904173 -0.21371062  0.37475293]
 [ 0.36378978  0.016414    0.01880869 -0.03409395 -0.06383884 -0.0869722
  -0.61063276  0.3073178   0.60128596  0.07659344 -0.13965633]
 [-0.30444402 -0.0625153  -0.03701866 -0.32235949 -0.80237317 -0.06464172
  -0.14931026 -0.28726479  0.02439661  0.19205128 -0.08857138]
 [ 0.0551622  -0.14242496  0.19761927 -0.35413876  0.2539601  -0.09736269
   0.11895823 -0.00254696  0.06781599  0.76348804  0.36953871]
 [-0.30753507  0.27760805  0.07062017 -0.07340475  0.36139895  0.10793025
  -0.12897259 -0.21089912 -0.00312457  0.28784553 -0.72920859]
 [ 0.17073819 -0.34783006 -0.3550866  -0.40651542  0.20934711 -0.59463206
  -0.10324067 -0.07691443 -0.26134151 -0.17822612 -0.21087805]
 [-0.2

In [17]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import random


# Normalize the data
# StandardScaler is fit on MD_x and applied in one step; the fitted `scaler`
# stays available as a global alongside the transformed matrix.
scaler = StandardScaler()
MD_x_scaled = scaler.fit_transform(MD_x)

# Set seed for reproducibility
# Seeds both Python's and NumPy's global generators. The KMeans models in this
# notebook are additionally given an explicit random_state=1234.
random.seed(1234)
np.random.seed(1234)

# Define function to perform KMeans clustering and store results
def perform_kmeans(data, k_range, n_init=10, random_state=1234):
    """Fit one KMeans model per candidate cluster count.

    Parameters
    ----------
    data : array-like
        Observations passed unchanged to ``KMeans.fit``.
    k_range : iterable of int
        Candidate numbers of clusters; one model is fit for each value.
    n_init : int, default 10
        Forwarded to ``KMeans(n_init=...)``.
    random_state : int, default 1234
        Seed forwarded to every ``KMeans``. It used to be hard-coded; the
        default keeps the previous behaviour.

    Returns
    -------
    best_kmeans : fitted KMeans or None
        The model with the lowest ``inertia_`` (first one wins on ties), or
        ``None`` when ``k_range`` is empty.
    all_kmeans : dict
        Maps each k to its fitted model.

    Notes
    -----
    Inertia generally keeps shrinking as k grows, so ``best_kmeans`` is in
    practice the model for the largest k. Use ``all_kmeans`` together with a
    proper criterion when the number of clusters has to be chosen.
    """
    all_kmeans = {
        k: KMeans(n_clusters=k, n_init=n_init, random_state=random_state).fit(data)
        for k in k_range
    }

    # min() returns the first minimal element, matching a strict "<" scan.
    best_kmeans = min(all_kmeans.values(), key=lambda model: model.inertia_, default=None)

    return best_kmeans, all_kmeans

# Perform KMeans clustering for 2 to 8 clusters
# `all_kmeans` maps each k to its fitted model; `best_kmeans` is the model with
# the lowest inertia among them.
best_kmeans, all_kmeans = perform_kmeans(MD_x_scaled, range(2, 9))

# Function to relabel clusters for consistency
def relabel_kmeans(kmeans):
    """Renumber cluster labels by decreasing cluster size.

    The most populated cluster becomes 0, the next one 1, and so on. Clusters
    of equal size keep their original relative order.

    Parameters
    ----------
    kmeans : object
        Anything exposing ``labels_`` as a 1-D sequence of non-negative
        integer labels (e.g. a fitted KMeans).

    Returns
    -------
    numpy.ndarray
        Integer array with the same length as ``kmeans.labels_`` holding the
        new labels. Empty input yields an empty array.
    """
    labels = np.asarray(kmeans.labels_)
    if labels.size == 0:
        # Nothing to rank; return an empty integer array instead of failing.
        return labels.astype(np.intp)

    counts = np.bincount(labels)
    # Stable sort so that ties between equally sized clusters are deterministic.
    order = np.argsort(-counts, kind="stable")

    # Invert the permutation: new_label_of[old] is the rank of cluster `old`.
    new_label_of = np.empty_like(order)
    new_label_of[order] = np.arange(order.size)

    # Plain array lookup replaces the per-element Python call of np.vectorize.
    return new_label_of[labels]

# Size-ordered labels for the selected model (0 = largest cluster).
relabeled_labels = relabel_kmeans(best_kmeans)

# Store the segment membership next to the survey answers.
df['cluster'] = relabeled_labels

# Report the chosen model and the size of every relabeled cluster.
print("Best KMeans Model:", best_kmeans, sep="\n")
print("\nRelabeled Clusters:", df['cluster'].value_counts(), sep="\n")


Best KMeans Model:
KMeans(n_init=10, random_state=1234)

Relabeled Clusters:
cluster
0    352
1    228
2    190
3    188
4    166
5    115
6    114
7    100
Name: count, dtype: int64


In [20]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample


# Set seed for reproducibility
# This seeds NumPy's global generator only; the resample() and KMeans calls
# below pass their own explicit random_state values.
np.random.seed(1234)

# Perform KMeans clustering with bootstrapping
def perform_bootstrap_kmeans(data, k_range, n_init=10, n_boot=100):
    """Collect KMeans inertia over bootstrap replicates for several k.

    For each of ``n_boot`` replicates a same-size sample is drawn from
    ``data`` with replacement (seed ``1234 + replicate index``), and a KMeans
    with ``random_state=1234`` is fit on it for every k in ``k_range``.

    Returns a dict mapping each k to the list of ``inertia_`` values, one per
    replicate, in replicate order.
    """
    inertia_by_k = {}
    for n_segments in k_range:
        inertia_by_k[n_segments] = []

    for replicate in range(n_boot):
        # Same-size draw with replacement; the seed varies per replicate.
        boot_sample = resample(
            data,
            replace=True,
            n_samples=len(data),
            random_state=1234 + replicate,
        )

        for n_segments in k_range:
            model = KMeans(n_clusters=n_segments, n_init=n_init, random_state=1234)
            model.fit(boot_sample)
            inertia_by_k[n_segments].append(model.inertia_)

    return inertia_by_k

# Bootstrap the inertia for k = 2..8 using 100 replicates.
bootstrap_results = perform_bootstrap_kmeans(MD_x_scaled, range(2, 9), n_boot=100)

# One summary line per k: mean and standard deviation across replicates.
for k, inertias in bootstrap_results.items():
    print(f"K = {k}: Mean Inertia = {np.mean(inertias)}, Std Inertia = {np.std(inertias)}")


KeyboardInterrupt: 

In [21]:
# R> plot(MD.b28, xlab = "number of segments",
# + ylab = "adjusted Rand index")

In [None]:
from sklearn.utils import resample
from sklearn.metrics import adjusted_rand_score
import matplotlib.pyplot as plt




# Perform KMeans clustering with bootstrapping and calculate ARI
def perform_bootstrap_kmeans_ari(data, k_range, n_init=10, n_boot=100):
    """Bootstrap stability of KMeans solutions via the adjusted Rand index.

    For every k in ``k_range`` a reference KMeans is fit once on ``data``.
    Each bootstrap replicate then fits a KMeans on a same-size resample
    (drawn with replacement, seed ``1234 + replicate index``) and uses that
    model to label the rows of ``data``. The ARI between the reference labels
    and these predicted labels is recorded.

    Parameters
    ----------
    data : array-like
        Observations; passed to ``KMeans.fit`` / ``KMeans.predict``.
    k_range : iterable of int
        Candidate numbers of clusters. It is iterated more than once, so it
        must be re-iterable (e.g. ``range`` or a list).
    n_init : int, default 10
        Forwarded to ``KMeans(n_init=...)``.
    n_boot : int, default 100
        Number of bootstrap replicates.

    Returns
    -------
    dict
        Maps each k to a list of ``n_boot`` ARI values in replicate order.
    """
    bootstrap_results = {k: [] for k in k_range}
    if n_boot <= 0:
        # No replicates requested: skip the reference fits as well.
        return bootstrap_results

    # The reference partition depends only on `data` and the fixed seed, so it
    # is fit once per k instead of once per (replicate, k) pair.
    reference_labels = {
        k: KMeans(n_clusters=k, n_init=n_init, random_state=1234).fit(data).labels_
        for k in k_range
    }

    for i in range(n_boot):
        # Generate a bootstrap sample
        data_sample = resample(data, replace=True, n_samples=len(data), random_state=1234 + i)

        for k in k_range:
            boot_model = KMeans(n_clusters=k, n_init=n_init, random_state=1234)
            boot_model.fit(data_sample)

            # Label the ORIGINAL rows with the bootstrap model. The model's own
            # labels_ describe the resampled rows, which are different
            # observations and cannot be compared with the reference labels.
            labels_boot = boot_model.predict(data)

            ari = adjusted_rand_score(reference_labels[k], labels_boot)
            bootstrap_results[k].append(ari)

    return bootstrap_results

# Bootstrap the adjusted Rand index for k = 2..8 using 100 replicates.
bootstrap_results = perform_bootstrap_kmeans_ari(MD_x_scaled, range(2, 9), n_boot=100)

# Average the ARI over replicates for every k.
mean_ari = {k: np.mean(scores) for k, scores in bootstrap_results.items()}

# Line plot of mean ARI against the number of segments.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(list(mean_ari), list(mean_ari.values()), marker='o')
ax.set_xlabel("Number of segments")
ax.set_ylabel("Adjusted Rand Index")
ax.set_title("Adjusted Rand Index for different number of segments")
ax.grid(True)
plt.show()


In [None]:
# Perform KMeans clustering with bootstrapping and calculate ARI
def perform_bootstrap_kmeans_ari(data, k_range, n_init=10, n_boot=100):
    """Bootstrap stability of KMeans solutions via the adjusted Rand index.

    For every k in ``k_range`` a reference KMeans is fit once on ``data``.
    Each bootstrap replicate then fits a KMeans on a same-size resample
    (drawn with replacement, seed ``1234 + replicate index``) and uses that
    model to label the rows of ``data``. The ARI between the reference labels
    and these predicted labels is recorded.

    Parameters
    ----------
    data : array-like
        Observations; passed to ``KMeans.fit`` / ``KMeans.predict``.
    k_range : iterable of int
        Candidate numbers of clusters. It is iterated more than once, so it
        must be re-iterable (e.g. ``range`` or a list).
    n_init : int, default 10
        Forwarded to ``KMeans(n_init=...)``.
    n_boot : int, default 100
        Number of bootstrap replicates.

    Returns
    -------
    dict
        Maps each k to a list of ``n_boot`` ARI values in replicate order.
    """
    bootstrap_results = {k: [] for k in k_range}
    if n_boot <= 0:
        # No replicates requested: skip the reference fits as well.
        return bootstrap_results

    # The reference partition depends only on `data` and the fixed seed, so it
    # is fit once per k instead of once per (replicate, k) pair.
    reference_labels = {
        k: KMeans(n_clusters=k, n_init=n_init, random_state=1234).fit(data).labels_
        for k in k_range
    }

    for i in range(n_boot):
        # Generate a bootstrap sample
        data_sample = resample(data, replace=True, n_samples=len(data), random_state=1234 + i)

        for k in k_range:
            boot_model = KMeans(n_clusters=k, n_init=n_init, random_state=1234)
            boot_model.fit(data_sample)

            # Label the ORIGINAL rows with the bootstrap model. The model's own
            # labels_ describe the resampled rows, which are different
            # observations and cannot be compared with the reference labels.
            labels_boot = boot_model.predict(data)

            ari = adjusted_rand_score(reference_labels[k], labels_boot)
            bootstrap_results[k].append(ari)

    return bootstrap_results

# Bootstrap the adjusted Rand index for k = 2..8 using 100 replicates.
bootstrap_results = perform_bootstrap_kmeans_ari(MD_x_scaled, range(2, 9), n_boot=100)

# Average the ARI over replicates for every k.
mean_ari = {k: np.mean(scores) for k, scores in bootstrap_results.items()}

# Line plot of mean ARI against the number of segments.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(list(mean_ari), list(mean_ari.values()), marker='o')
ax.set(
    xlabel="Number of segments",
    ylabel="Adjusted Rand Index",
    title="Adjusted Rand Index for different number of segments",
)
ax.grid(True)
plt.show()