In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

In [3]:
# Load the sampled recipe CSV from the Kaggle input directory into a dataframe.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/recipe-sampled-0-25/sampled_dataset.csv',
)

In [None]:
# Keep only the rows that have a title and coerce every remaining value to str.
# The original dataframe index is preserved on `titles`.
titles = df.loc[df['title'].notna(), 'title'].astype(str)
print(titles.shape[0])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [None]:
# Encode each title as a TF-IDF weighted bag of words, dropping English stop words.
vectorizer = TfidfVectorizer(
    stop_words='english',
)
X = vectorizer.fit_transform(raw_documents=titles)

In [None]:
# Elbow method: fit K-Means once for every candidate k and record its inertia
# (within-cluster sum of squares) so the values can be plotted against k.
k_values = list(range(2, 50))
inertia = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto').fit(X)
    inertia.append(kmeans.inertia_)


In [None]:
# Plot inertia against k using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_values, inertia, marker='o')
ax.set_title('Metodo del gomito per determinare k')
ax.set_xlabel('Numero di cluster (k)')
ax.set_ylabel('Inerzia')
ax.set_xticks(k_values)  # one tick per evaluated k
ax.grid(True)
plt.show()

The elbow plot suggests two interesting values for k, 13 and 21; let's try both of them.

In [None]:
n_clusters = 13
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
kmeans.fit(X)

# Attach the cluster labels to the original dataframe.
# X was built from `titles`, which excludes rows whose title is missing, so
# there is one label per entry of `titles`, not per row of `df`. Aligning the
# labels on the titles' index puts each label on the right row; rows without
# a title get NaN instead of triggering a length-mismatch error.
df['cluster'] = pd.Series(kmeans.labels_, index=titles.index)

# Show a few example titles for each cluster.
for i in range(n_clusters):
    print(f"\nCluster {i}:")
    print(df.loc[df['cluster'] == i, 'title'].head(5).to_string(index=False))

In [None]:
# Get the centroid for the first cluster
centroid = kmeans.cluster_centers_[0]

# Sort terms according to their weights
# (argsort goes from lowest to highest, we reverse the order through slicing)
sorted_terms = centroid.argsort()[::-1]

# The centroid holds one weight per vocabulary term, so its indices must be
# looked up in the vectorizer's vocabulary (not in `titles`, which is indexed
# by dataframe row).
feature_names = vectorizer.get_feature_names_out()

# Show the top 20 terms for the cluster
[feature_names[j] for j in sorted_terms[:20]]

In [None]:
n_clusters = 21
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
kmeans.fit(X)

# Attach the cluster labels to the original dataframe.
# X was built from `titles`, which excludes rows whose title is missing, so
# there is one label per entry of `titles`, not per row of `df`. Aligning the
# labels on the titles' index puts each label on the right row; rows without
# a title get NaN instead of triggering a length-mismatch error.
df['cluster'] = pd.Series(kmeans.labels_, index=titles.index)

# Show a few example titles for each cluster.
for i in range(n_clusters):
    print(f"\nCluster {i}:")
    print(df.loc[df['cluster'] == i, 'title'].head(5).to_string(index=False))

In [None]:
# Get the centroid for the first cluster
centroid = kmeans.cluster_centers_[0]

# Sort terms according to their weights
# (argsort goes from lowest to highest, we reverse the order through slicing)
sorted_terms = centroid.argsort()[::-1]

# The centroid holds one weight per vocabulary term, so its indices must be
# looked up in the vectorizer's vocabulary (not in `titles`, which is indexed
# by dataframe row).
feature_names = vectorizer.get_feature_names_out()

# Show the top 20 terms for the cluster
[feature_names[j] for j in sorted_terms[:20]]

In [None]:
print("Top terms per cluster:")
vocab = vectorizer.get_feature_names_out()

# Walk over the centroids directly; each row holds one weight per vocabulary term.
for i, centroid in enumerate(kmeans.cluster_centers_):
    # Indices of the terms ordered from the highest weight to the lowest.
    sorted_terms = np.argsort(centroid)[::-1]
    top_terms = [vocab[j] for j in sorted_terms[:5]]
    print(f"Cluster {i}:\t{top_terms}")

In [None]:
print('Number of docs in: ')

# Count how many documents were assigned to each cluster label in one pass;
# minlength keeps an entry (0) for any cluster that received no documents.
cluster_sizes = np.bincount(kmeans.labels_, minlength=kmeans.n_clusters)
for i, size in enumerate(cluster_sizes):
    print(f"Cluster {i}: {size}")

### Evaluation

To evaluate our clustering we use only intrinsic measures, since we do not have the true labels of the clusters.

In [None]:
from sklearn import metrics

# Report the intrinsic quality measures of the fitted K-Means model.
print("Intrinsic evaluation measures:")
print("Within-cluster sum-of-squares:", kmeans.inertia_)
# Silhouette score is left disabled.
# NOTE(review): presumably too expensive on this dataset — confirm before re-enabling.
#print("Silhouette coefficient:", str(metrics.silhouette_score(X, kmeans.labels_)))

Since the dataset is quite big, even after reducing it to a quarter of the original, we also try Mini-Batch K-Means clustering.

In [None]:
from sklearn.cluster import MiniBatchKMeans

# Mini-batch variant of K-Means with the same number of clusters,
# updating the centroids from batches of 500 samples.
n_clusters = 21
mb_kmeans = MiniBatchKMeans(
    n_clusters=n_clusters,
    batch_size=500,
    random_state=2307,
)
mb_kmeans.fit(X)

In [None]:
# Report the intrinsic quality measures of the fitted Mini-Batch K-Means model.
print("Intrinsic evaluation measures:")
print("Within-cluster sum-of-squares:", mb_kmeans.inertia_)
# Silhouette score is left disabled.
# NOTE(review): presumably too expensive on this dataset — confirm before re-enabling.
#print("Silhouette coefficient:", str(metrics.silhouette_score(X, mb_kmeans.labels_)))

In [None]:
from sklearn.decomposition import TruncatedSVD

# Project the TF-IDF matrix onto 3 components so the clusters can be plotted.
# TruncatedSVD uses a randomized solver by default, so the seed is fixed to
# make the projection (and the resulting plot) reproducible across runs.
svd = TruncatedSVD(n_components=3, random_state=42)
reduced_data = svd.fit_transform(X)

# Split the (n_samples, 3) result into one coordinate array per component.
x, y, z = np.transpose(reduced_data)
[x, y, z]

In [None]:
# 3D scatter of the SVD-reduced documents, coloured by their K-Means cluster label.
fig, ax = plt.subplots(figsize=(15, 10), subplot_kw={'projection': '3d'})
ax.scatter(x, y, z, c=kmeans.labels_, marker='.');

## Clustering su NER

In [4]:
import ast
from itertools import chain

# Drop the rows where NER is NaN; take an explicit copy so the column
# assignment below writes to an independent dataframe.
df = df.dropna(subset=['NER']).copy()

# Parse the string representation of each NER value into a Python object.
# Values that are no longer strings are left untouched, so re-running this
# cell does not fail with a ValueError from ast.literal_eval.
df['NER'] = df['NER'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [6]:
# Remove duplicate entries inside each NER list (element order is not preserved).
df['NER'] = df['NER'].map(lambda entities: list(set(entities)))

In [7]:
# Build one space-separated text document per row from its NER entries.
documents = df['NER'].map(' '.join)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# TF-IDF encoding of the NER documents, dropping English stop words.
# Note: this rebinds `vectorizer` and `X`.
vectorizer = TfidfVectorizer(
    stop_words='english',
)
X = vectorizer.fit_transform(raw_documents=documents)

In [None]:
# Elbow method on the NER representation: fit K-Means once for every candidate k
# and record its inertia (within-cluster sum of squares).
k_values = list(range(50, 100))
inertia = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto').fit(X)
    inertia.append(kmeans.inertia_)

In [None]:
# Plot inertia against k using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_values, inertia, marker='o')
ax.set_title('Metodo del gomito per determinare k')
ax.set_xlabel('Numero di cluster (k)')
ax.set_ylabel('Inerzia')
ax.set_xticks(k_values)  # one tick per evaluated k
ax.grid(True)
plt.show()

In [10]:
k = 57

kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto').fit(X)

# Attach the cluster labels to the dataframe (X was built from this same df,
# so there is exactly one label per row).
df['cluster_ner'] = kmeans.labels_

# Show a few example titles for each cluster.
for i in range(k):
    in_cluster = df['cluster_ner'] == i
    print(f"\nCluster {i}:")
    print(df.loc[in_cluster, 'title'].head(5).to_string(index=False))


Cluster 0:
        Pasta with Anchovies and Tomatoes
                       Herb Fryer Chicken
                              Rogue Sauce
                         Italian Dressing
Savvy Shrimp Saute'  Mediterranean Style 

Cluster 1:
              Savory Wheat Crescents
                      Great Baguette
                      Cornmeal Rolls
                    Cloverleaf Rolls
Pearsauce Or Applesauce Bread Recipe

Cluster 2:
                   7 Minute Frosting
             Vanilla Butter Rollouts
                Pecan Crunch Cookies
Chocolate Chestnut Cake(Serves 20)  
                Best Angel Food Cake

Cluster 3:
Creamy Date & Honey Spread
        Quick Monkey Bread
      Stuffed French Toast
  Basic Pumpkin Pie Recipe
                  G.F.G.S.

Cluster 4:
                   Spinach and Rice Casserole
Spinach & Artichoke Chicken With Creamy Pasta
         Spinach Dip With Homemade Pita Chips
                                Spinach Balls
                           Meat-Lovers' R