In [12]:
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.impute import SimpleImputer

# Load the local data file
file_path = 'auto-mpg.data'
column_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']

# Read the data: whitespace-separated columns, "?" parsed as missing.
# sep=r"\s+" is the documented replacement for the deprecated
# delim_whitespace=True (which emits a FutureWarning on recent pandas).
data = pd.read_csv(file_path, names=column_names, sep=r"\s+", na_values="?")

# Select continuous fields as features
continuous_features = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year']
X = data[continuous_features]

# Impute missing values with the column mean
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
X_imputed_df = pd.DataFrame(X_imputed, columns=continuous_features)

# Perform hierarchical clustering
# Set linkage='average'; the distance metric is left at its default (euclidean).
# NOTE(review): features are clustered unscaled, so large-magnitude columns
# such as weight dominate the distances — confirm this is intended.
clustering = AgglomerativeClustering(n_clusters=3, linkage='average')
clusters = clustering.fit_predict(X_imputed_df)

# Add cluster labels to the DataFrame
X_imputed_df['cluster'] = clusters

# Calculate the mean and variance for each cluster
cluster_stats = X_imputed_df.groupby('cluster')[continuous_features].agg(['mean', 'var'])

# Use "origin" as class labels for comparison
data['origin'] = data['origin'].astype(int)
# Group the *imputed* feature matrix by origin so the class statistics are
# computed on exactly the same values the clusters were built from.
# X_imputed_df and data share the same default RangeIndex, so rows align.
origin_stats = X_imputed_df.groupby(data['origin'])[continuous_features].agg(['mean', 'var'])

# Display the results
print("Cluster statistics (mean and variance):")
print(cluster_stats)
print("\nClass statistics (mean and variance):")
print(origin_stats)

# Evaluate the relationship between cluster assignment and class labels
# Create a crosstab to see the distribution of clusters within class labels.
# Name both axes explicitly; an unnamed array would be labelled "col_0".
cross_tab = pd.crosstab(data['origin'], clusters, rownames=['origin'], colnames=['cluster'])
print("\nCrosstab of class labels and clusters:")
print(cross_tab)

Cluster statistics (mean and variance):
               mpg            cylinders           displacement               \
              mean        var      mean       var         mean          var   
cluster                                                                       
0        27.365414  41.976309  4.443609  0.851525   131.934211  2828.083391   
1        13.889062   3.359085  8.000000  0.000000   358.093750  2138.213294   
2        17.510294   8.829892  7.014706  1.059482   278.985294  2882.492318   

         horsepower                   weight                acceleration  \
               mean         var         mean            var         mean   
cluster                                                                    
0         84.300061  369.143491  2459.511278  182632.099872    16.298120   
1        167.046875  756.521577  4398.593750   74312.340278    13.025000   
2        124.470588  713.088674  3624.838235   37775.809263    15.105882   

                   model_yea

  data = pd.read_csv(file_path, names=column_names, delim_whitespace=True, na_values="?")


In [13]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Load the Boston housing dataset from the URL.
# Rows after the first 22 lines are read as whitespace-separated values; each
# record is then rebuilt from an even row plus the first columns of the
# following odd row, with the odd row's third column used as the target.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

# Convert the data into a Pandas DataFrame
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'
]
boston_df = pd.DataFrame(data, columns=column_names[:-1])
boston_df['MEDV'] = target

# Select continuous features as input for clustering
# (CHAS and the MEDV target are left out)
continuous_features = [
    'CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS',
    'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'
]
X = boston_df[continuous_features]

# Standardize the data
scaler = StandardScaler()
boston_scaled = scaler.fit_transform(X)

# Perform K-Means clustering and calculate silhouette scores for k = 2..6
silhouette_scores = {}
for k in range(2, 7):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(boston_scaled)
    labels = kmeans.labels_
    score = silhouette_score(boston_scaled, labels)
    silhouette_scores[k] = score
    print(f"Silhouette score for k={k}: {score:.3f}")

# Determine the optimal number of clusters (highest silhouette score)
optimal_k = max(silhouette_scores, key=silhouette_scores.get)
print(f"Optimal number of clusters: {optimal_k}")

# Perform K-Means clustering with the optimal number of clusters
optimal_kmeans = KMeans(n_clusters=optimal_k, random_state=42)
optimal_kmeans.fit(boston_scaled)
labels = optimal_kmeans.labels_
boston_df['Cluster'] = labels

# Calculate the mean values of all features for each cluster (original units)
mean_values = boston_df.groupby('Cluster')[continuous_features].mean()
print("Mean values for each cluster:\n", mean_values)

# Get the coordinates of the cluster centers (standardized feature space,
# because the model was fitted on boston_scaled)
centroid_coordinates = optimal_kmeans.cluster_centers_
centroid_coordinates_df = pd.DataFrame(centroid_coordinates, columns=continuous_features)
print("Cluster center coordinates:\n", centroid_coordinates_df)

# The centers above are z-scores while mean_values is in original units, so
# the two cannot be compared directly. Undo the scaling on the centers so both
# sides of the comparison are expressed in the same units.
centroids_original_df = pd.DataFrame(
    scaler.inverse_transform(centroid_coordinates),
    columns=continuous_features,
)

# Compare the mean values with the cluster center coordinates
print("Comparison of mean values and cluster center coordinates:")
for cluster in range(optimal_k):
    cluster_means = mean_values.loc[cluster]
    cluster_center = centroids_original_df.iloc[cluster]
    print(f"\nCluster {cluster}:")
    print("Mean values:")
    print(cluster_means)
    print("Cluster center coordinates (original units):")
    print(cluster_center)
    # Both Series are indexed by feature name, so the subtraction aligns.
    max_gap = (cluster_means - cluster_center).abs().max()
    print(f"Largest absolute difference: {max_gap:.6g}")

Silhouette score for k=2: 0.382
Silhouette score for k=3: 0.268
Silhouette score for k=4: 0.287
Silhouette score for k=5: 0.272
Silhouette score for k=6: 0.264
Optimal number of clusters: 2
Mean values for each cluster:
              CRIM         ZN      INDUS       NOX        RM        AGE  \
Cluster                                                                  
0        0.261172  17.477204   6.885046  0.487011  6.455422  56.339210   
1        9.844730   0.000000  19.039718  0.680503  5.967181  91.318079   

              DIS        RAD         TAX    PTRATIO           B      LSTAT  
Cluster                                                                     
0        4.756868   4.471125  301.917933  17.837386  386.447872   9.468298  
1        2.007242  18.988701  605.858757  19.604520  301.331695  18.572768  
Cluster center coordinates:
        CRIM        ZN     INDUS       NOX        RM       AGE       DIS  \
0 -0.390124  0.262392 -0.620368 -0.584675  0.243315 -0.435108  0.45722

In [None]:
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import homogeneity_score, completeness_score

# Fetch the wine dataset; keep the features as a DataFrame and the class
# labels as the ground truth for scoring
wine = load_wine()
target = wine.target
data = pd.DataFrame(wine.data, columns=wine.feature_names)

# Standardize the features (fit, then transform the same data)
scaler = StandardScaler()
scaler.fit(data)
data_scaled = scaler.transform(data)

# Run K-Means with three clusters and a fixed seed, then read off the labels
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(data_scaled)
clusters = kmeans.labels_

# Score the cluster assignment against the true classes
homogeneity, completeness = (
    metric(target, clusters)
    for metric in (homogeneity_score, completeness_score)
)

# Report both scores
print(f"Homogeneity Score: {homogeneity:.4f}")
print(f"Completeness Score: {completeness:.4f}")

Homogeneity Score: 0.8788
Completeness Score: 0.8730
