In [1]:
# The dataset is read straight from the Kaggle input directory.
import pandas as pd

# Location of the full recipes CSV inside the Kaggle input mount.
RECIPES_CSV = "/kaggle/input/recipes/dataset/full_dataset.csv"
df = pd.read_csv(RECIPES_CSV)

In [2]:
# Pull the "NER" column out of the recipes frame; later cells work on this Series only.
colNER = df["NER"]

<class 'pandas.core.series.Series'>


In [None]:
import random

# Work on 1/100 of the column (integer division, so the size is rounded down).
subsample_size = len(colNER) // 100

# Draw the rows with a fixed seed, then renumber them from 0 so that
# positional and label-based access agree on the subsample.
colNER_subsample = (
    colNER
    .sample(n=subsample_size, random_state=42)
    .reset_index(drop=True)
)

This section attempts to vectorize the documents with TF-IDF and then cluster them. The goal is to group the documents by their ingredients, so the NER column is used as the input for the clustering.

I used TF-IDF even though the ingredients within a single recipe do not repeat in the “NER” field. Later I will develop a mechanism better suited to this situation, using this method as the baseline (the “base case”).

To create the documents for vectorization, I simply concatenated all the ingredients in each NER entry into a single string.

In [9]:
import ast

# Every entry of the subsample is parsed with ast.literal_eval and its items
# are joined with single spaces, giving one text document per entry.
list_of_string = [
    ' '.join(ast.literal_eval(ner_entry))
    for ner_entry in colNER_subsample
]

Count: 2231142
Count: 2231142


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the TF-IDF vocabulary from the joined ingredient strings.
# Terms found in more than 90% of the documents or in fewer than 5 documents
# are discarded, as are English stop words. `fit` returns the estimator itself,
# so construction and fitting can be chained.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=5,
    max_df=0.9,
).fit(list_of_string)

In [16]:
import random

# Terms kept by the fitted vectorizer, indexed by feature column.
vocab = vectorizer.get_feature_names_out()

print(f"Length of vocabulary: {len(vocab)}")

# Peek at 10 vocabulary terms. A dedicated generator seeded with 42 (the seed
# used elsewhere in this notebook) keeps the displayed sample identical across
# re-runs, without touching the state of the global `random` module.
sorted(random.Random(42).sample(vocab.tolist(), 10))

Length of vocabulary: 8727


['drained',
 'exchanges',
 'flavouring',
 'groundbeef',
 'known',
 'maggie',
 'pamesan',
 'sailor',
 'steamfresh',
 'tomtoes']

In [19]:
# Turn every joined ingredient string into its TF-IDF row vector.
vector_documents = vectorizer.transform(list_of_string)

# Inspect the first document: raw sparse row first, then its terms by weight.
first_doc = vector_documents[0]
print(first_doc)

# Column indices of the non-zero weights in that single row.
_, nonzero_cols = first_doc.nonzero()

# (term, weight) pairs, heaviest first; left as the last expression so the
# notebook displays the list.
sorted(
    ((vocab[col], first_doc[0, col]) for col in nonzero_cols),
    key=lambda pair: pair[1],
    reverse=True,
)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 11 stored elements and shape (1, 8727)>
  Coords	Values
  (0, 8254)	0.1670634612016866
  (0, 7557)	0.11449158380393296
  (0, 7078)	0.49005619007964896
  (0, 7017)	0.31765078305402134
  (0, 6465)	0.22525473579068508
  (0, 5270)	0.24571770099236873
  (0, 4855)	0.15354075780863224
  (0, 1035)	0.13376400919540918
  (0, 935)	0.19240771770418102
  (0, 631)	0.5023303314090842
  (0, 622)	0.41950379859053494


[('bite', 0.5023303314090842),
 ('size', 0.49005619007964896),
 ('biscuits', 0.41950379859053494),
 ('shredded', 0.31765078305402134),
 ('nuts', 0.24571770099236873),
 ('rice', 0.22525473579068508),
 ('brown', 0.19240771770418102),
 ('vanilla', 0.1670634612016866),
 ('milk', 0.15354075780863224),
 ('butter', 0.13376400919540918),
 ('sugar', 0.11449158380393296)]

After vectorizing the documents, we try to define the clusters. Since the number of clusters is not known in advance, we need a way to estimate it; here we use the elbow method on the K-means inertia.

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Candidate cluster counts (2 through 19) and the inertia measured for each.
K = range(2, 20)
inertia = []

for k in K:
    # Fixed seed so every k gets a reproducible fit; `fit` returns the estimator.
    kmeans = KMeans(n_clusters=k, random_state=42).fit(vector_documents)
    inertia.append(kmeans.inertia_)

# Elbow plot: inertia as a function of the number of clusters.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(K, inertia, 'bo-')
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method For Optimal k')
ax.grid(True)
plt.show()


