<a href="https://colab.research.google.com/github/muhammadhafiz27/Clustering-Teks-dengan-K-Means/blob/main/Clustering_Teks_dengan_K_Means.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances

# Document data: a small toy corpus of whitespace-separated tokens.
documents = [
    "makan ayam enak",
    "makan nasi",
    "ayam goreng enak",
    "python belajar",
    "machine python",
    "belajar python machine"
]

# Vocabulary: every distinct token, in order of first appearance in the corpus.
# It is derived from `documents` (dict.fromkeys de-duplicates while keeping
# insertion order) so it cannot drift out of sync when the corpus is edited.
# For the corpus above this yields:
# ["makan", "ayam", "enak", "nasi", "goreng", "python", "belajar", "machine"]
vocab = list(dict.fromkeys(word for doc in documents for word in doc.split()))

In [None]:
# Binary vector representation
def create_binary_vectors(docs, vocabulary):
    """Encode each document as a binary bag-of-words vector.

    Args:
        docs: Iterable of strings. Each one is tokenized with ``str.split()``,
            i.e. on whitespace, case-sensitively, whole tokens only.
        vocabulary: Iterable of words. Column ``j`` of the result corresponds
            to the ``j``-th vocabulary word.

    Returns:
        Integer ``np.ndarray`` of shape ``(number of docs, number of words)``.
        Entry ``[i, j]`` is 1 when word ``j`` occurs as a token of document
        ``i`` and 0 otherwise. An empty ``docs`` gives shape ``(0, n_words)``.
    """
    # Materialize once so a one-shot iterator is not exhausted by the first doc.
    vocabulary = list(vocabulary)
    vectors = []
    for doc in docs:
        # Tokenize once per document; set membership replaces the repeated
        # split-and-scan that the per-word test would otherwise perform.
        tokens = set(doc.split())
        vectors.append([1 if word in tokens else 0 for word in vocabulary])
    # The explicit dtype and reshape keep the result 2-D and integer-typed even
    # when there are no documents (np.array([]) alone would be 1-D float).
    return np.array(vectors, dtype=int).reshape(len(vectors), len(vocabulary))

# Encode the corpus, then show every document next to its vector
X = create_binary_vectors(documents, vocab)
print("Binary Vectors:")
print(vocab)
for doc_number, (row, text) in enumerate(zip(X, documents), start=1):
    print(f"D{doc_number}: {row} - {text}")

Binary Vectors:
['makan', 'ayam', 'enak', 'nasi', 'goreng', 'python', 'belajar', 'machine']
D1: [1 1 1 0 0 0 0 0] - makan ayam enak
D2: [1 0 0 1 0 0 0 0] - makan nasi
D3: [0 1 1 0 1 0 0 0] - ayam goreng enak
D4: [0 0 0 0 0 1 1 0] - python belajar
D5: [0 0 0 0 0 1 0 1] - machine python
D6: [0 0 0 0 0 1 1 1] - belajar python machine


In [None]:
# Manual K-Means implementation
def manual_kmeans(X, k=2, max_iter=2, init_indices=(0, 3)):
    """Run a fixed number of K-Means iterations, printing every step.

    Each iteration computes Euclidean distances from every row of ``X`` to the
    current centroids, assigns each row to its nearest centroid, and replaces
    each centroid with the mean of the rows assigned to it. There is no early
    stopping: exactly ``max_iter`` iterations are performed.

    Args:
        X: 2-D array with one sample per row.
        k: Number of clusters.
        max_iter: Number of assign/update iterations to run.
        init_indices: Row indices of ``X`` used as the initial centroids; must
            contain exactly ``k`` entries. The default ``(0, 3)`` starts from
            the first and fourth rows (D1 and D4).

    Returns:
        Tuple ``(clusters, centroids)``. ``clusters`` holds the cluster index
        of every row as assigned in the last iteration (that is, relative to
        the centroids before the final update). ``centroids`` holds the
        centroids after the final update. With ``max_iter=0`` the rows are
        assigned to the initial centroids, which are returned unchanged.

    Raises:
        ValueError: If ``init_indices`` does not contain exactly ``k`` entries.
    """
    init_indices = list(init_indices)
    if len(init_indices) != k:
        raise ValueError(
            f"init_indices must contain exactly k={k} row indices, "
            f"got {len(init_indices)}"
        )

    # Initial centroids are copies of the selected rows of X
    centroids = X[init_indices]
    print(f"\nInitial Centroids:")
    for i, centroid in enumerate(centroids):
        print(f"C{i+1}: {centroid}")

    clusters = None
    for iteration in range(max_iter):
        print(f"\n--- Iteration {iteration+1} ---")

        # Calculate distances: one row per sample, one column per centroid
        distances = euclidean_distances(X, centroids)
        print("\nDistances to Centroids:")
        # Iterate over the distance rows themselves so the function depends
        # only on its arguments, not on a notebook-level document list.
        for i, row in enumerate(distances):
            formatted = ", ".join(f"C{j+1}={d:.2f}" for j, d in enumerate(row))
            print(f"D{i+1}: {formatted}")

        # Assign every sample to its nearest centroid
        clusters = np.argmin(distances, axis=1)
        print(f"\nCluster Assignment: {clusters}")

        # Update centroids
        new_centroids = []
        for cluster_idx in range(k):
            cluster_points = X[clusters == cluster_idx]
            if len(cluster_points) > 0:
                new_centroid = cluster_points.mean(axis=0)
            else:
                # An empty cluster keeps its previous centroid
                new_centroid = centroids[cluster_idx]
            new_centroids.append(new_centroid)

        centroids = np.array(new_centroids)
        print(f"\nUpdated Centroids:")
        for i, centroid in enumerate(centroids):
            clean_centroid = [round(float(x), 2) for x in centroid]
            print(f"C{i+1}: {clean_centroid}")

    if clusters is None:
        # No iteration ran (max_iter == 0): still return a valid assignment
        clusters = np.argmin(euclidean_distances(X, centroids), axis=1)

    return clusters, centroids

# Section banner, emitted as three lines by a single print call
print("=" * 50, "MANUAL K-MEANS IMPLEMENTATION", "=" * 50, sep="\n")

# Execute the hand-written K-Means with its default settings
clusters_manual, centroids_manual = manual_kmeans(X)

MANUAL K-MEANS IMPLEMENTATION

Initial Centroids:
C1: [1 1 1 0 0 0 0 0]
C2: [0 0 0 0 0 1 1 0]

--- Iteration 1 ---

Distances to Centroids:
D1: C1=0.00, C2=2.24
D2: C1=1.73, C2=2.00
D3: C1=1.41, C2=2.24
D4: C1=2.24, C2=0.00
D5: C1=2.24, C2=1.41
D6: C1=2.45, C2=1.00

Cluster Assignment: [0 0 0 1 1 1]

Updated Centroids:
C1: [0.67, 0.67, 0.67, 0.33, 0.33, 0.0, 0.0, 0.0]
C2: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.67, 0.67]

--- Iteration 2 ---

Distances to Centroids:
D1: C1=0.75, C2=2.21
D2: C1=1.25, C2=1.97
D3: C1=1.11, C2=2.21
D4: C1=1.89, C2=0.75
D5: C1=1.89, C2=0.75
D6: C1=2.13, C2=0.47

Cluster Assignment: [0 0 0 1 1 1]

Updated Centroids:
C1: [0.67, 0.67, 0.67, 0.33, 0.33, 0.0, 0.0, 0.0]
C2: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.67, 0.67]


In [None]:
# Summarize which documents the manual run placed in each cluster
print("", "=" * 50, "FINAL RESULTS - MANUAL K-MEANS", "=" * 50, sep="\n")
print(
    "Cluster 1 (Food):",
    [f"D{number}" for number, label in enumerate(clusters_manual, start=1) if label == 0],
)
print(
    "Cluster 2 (Programming):",
    [f"D{number}" for number, label in enumerate(clusters_manual, start=1) if label == 1],
)


FINAL RESULTS - MANUAL K-MEANS
Cluster 1 (Food): ['D1', 'D2', 'D3']
Cluster 2 (Programming): ['D4', 'D5', 'D6']


In [None]:
# Same clustering with scikit-learn, started from the same two rows of X
print("", "=" * 50, "SKLEARN K-MEANS", "=" * 50, sep="\n")

kmeans = KMeans(
    n_clusters=2,
    init=X[[0, 3]],
    n_init=1,
    max_iter=2,
    random_state=42,
)
kmeans.fit(X)

print("Cluster Labels:", kmeans.labels_)
print(
    "Cluster 1 (Food):",
    [f"D{number}" for number, label in enumerate(kmeans.labels_, start=1) if label == 0],
)
print(
    "Cluster 2 (Programming):",
    [f"D{number}" for number, label in enumerate(kmeans.labels_, start=1) if label == 1],
)
print("Final Centroids:")
for number, center in enumerate(kmeans.cluster_centers_, start=1):
    rounded_center = [round(float(value), 2) for value in center]
    print(f"C{number}: {rounded_center}")

# FIX: add a comparison against the manual result
print("", "=" * 50, "COMPARISON WITH MANUAL K-MEANS", "=" * 50, sep="\n")
print(
    "Are manual and sklearn results identical?",
    np.array_equal(clusters_manual, kmeans.labels_),
)


SKLEARN K-MEANS
Cluster Labels: [0 0 0 1 1 1]
Cluster 1 (Food): ['D1', 'D2', 'D3']
Cluster 2 (Programming): ['D4', 'D5', 'D6']
Final Centroids:
C1: [0.67, 0.67, 0.67, 0.33, 0.33, 0.0, 0.0, 0.0]
C2: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.67, 0.67]

COMPARISON WITH MANUAL K-MEANS
Are manual and sklearn results identical? True


In [None]:
# Interpretation: which vocabulary words carry weight in each manual centroid
print("", "=" * 50, "INTERPRETATION", "=" * 50, sep="\n")

# A word is listed as dominant when its centroid component is above 0.3
dominant_words_cluster1 = [
    word for word, weight in zip(vocab, centroids_manual[0]) if weight > 0.3
]
dominant_words_cluster2 = [
    word for word, weight in zip(vocab, centroids_manual[1]) if weight > 0.3
]

print("Dominant words in Cluster 1 (Food):", dominant_words_cluster1)
print("Dominant words in Cluster 2 (Programming):", dominant_words_cluster2)
print(
    "Manual and sklearn results are identical:",
    np.array_equal(clusters_manual, kmeans.labels_),
)


INTERPRETATION
Dominant words in Cluster 1 (Food): ['makan', 'ayam', 'enak', 'nasi', 'goreng']
Dominant words in Cluster 2 (Programming): ['python', 'belajar', 'machine']
Manual and sklearn results are identical: True
