In [None]:
## Libraries
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.lm.preprocessing import flatten
from nltk.util import ngrams
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud
import unicodedata
import stop_words
import spacy
from spacy.lang.en import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [None]:
# Import Corpus
# Read the wine dataset from the working directory and preview its first two rows.
corpus_path = 'wine.csv'
df = pd.read_csv(corpus_path)
print(df.head(n=2))

In [None]:
# Clean Data
# Drop every row that has a missing value in any column, then renumber the
# remaining rows 0..n-1. Later cells build `documents` / `results` from this
# frame with a fresh 0..n-1 index; without the renumbering the gapped labels
# left behind by `dropna` no longer line up with them.
# Rebinding (instead of `inplace=True`) keeps the cell safe to re-run.
df = df.dropna().reset_index(drop=True)

In [None]:
# Preprocessing Data
# Normalise the free-text `notes` column: fold to ASCII, lowercase, strip
# punctuation and digits, then remove English stop words.

# `stop_words` is the module bound by `from spacy.lang.en import stop_words`
# (that import runs after, and therefore shadows, the plain `import stop_words`).
# The word collection gets its own name: rebinding `stop_words` itself made a
# second run of this cell fail, because the rebound object is no longer the
# module and has no `STOP_WORDS` attribute.
english_stop_words = stop_words.STOP_WORDS

df['notes'] = (
    df['notes']
    # NFKD decomposes accented characters; the ASCII encode/decode round trip
    # then discards whatever cannot be represented in ASCII.
    .apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore'))
    .str.lower()
    # Remove punctuation (everything that is neither a word character nor whitespace).
    .str.replace(r'[^\w\s]', '', regex=True)
    # Remove digit runs. Raw string: '\d' is an invalid escape in a plain literal.
    .str.replace(r'\d+', '', regex=True)
    # Keep only the whitespace-separated tokens that are not stop words.
    .apply(lambda x: ' '.join(word for word in x.split() if word not in english_stop_words))
)
df['notes'].head(5)

In [None]:
# Vectorize Text
# Collect the cleaned notes into a plain list and fit a TF-IDF model on them;
# the resulting document-term matrix is displayed as the cell output.
documents = [note for note in df['notes']]
vectorizer = TfidfVectorizer()
vectorized_documents = vectorizer.fit_transform(documents)
vectorized_documents

In [None]:
# Dimension Reduction (Two Dimensions)
# The TF-IDF matrix is converted to a dense array and projected onto its first
# two principal components; the projection is only used for plotting.
dense_documents = vectorized_documents.toarray()
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(dense_documents)

In [None]:
# Cluster using k-means

# Fit k-means on the TF-IDF vectors (not on the 2-D PCA projection).
num_clusters = 3
kmeans = KMeans(
    n_clusters=num_clusters,
    n_init=5,
    max_iter=500,
    random_state=42,
).fit(vectorized_documents)

# Pair every document with the cluster label it was assigned.
results = pd.DataFrame({'document': documents, 'cluster': kmeans.labels_})

# Print a random sample of 50 assignments.
print(results.sample(50))

In [None]:
# Word cloud (Label clusters)
def wordcloud_clusters(model, vectors, features, no_top_words=40):
    """Save one word-cloud image per cluster of a fitted clustering model.

    For every distinct label in ``model.labels_`` the rows of ``vectors``
    carrying that label are summed column-wise. The highest-weighted columns
    are rendered as a word cloud (word size = absolute summed weight) and the
    figure is written to ``cluster<label>.png`` in the current directory.

    Parameters
    ----------
    model : object
        Fitted model exposing ``labels_``, one label per row of ``vectors``.
    vectors : matrix
        Document-term matrix. It must support boolean row selection, and
        ``.sum(axis=0)`` must return an object with an ``.A`` attribute
        (presumably a scipy sparse matrix, as produced by the vectorizer).
    features : sequence
        Maps a column index of ``vectors`` to the term it represents.
    no_top_words : int, default 40
        Maximum number of terms per cloud. It is clamped to the number of
        columns, so a small vocabulary no longer raises ``IndexError``.

    Returns
    -------
    None
    """
    for cluster in np.unique(model.labels_):
        # Column-wise total weight of each term inside this cluster.
        words = vectors[model.labels_ == cluster].sum(axis=0).A[0]
        # Never ask for more terms than exist; a negative request yields none.
        top_n = max(0, min(no_top_words, words.size))
        largest = words.argsort()[::-1][:top_n]  # column indices, heaviest first
        size = {features[idx]: abs(words[idx]) for idx in largest}
        wc = WordCloud(background_color="white", max_words=100, width=960, height=540)
        wc.generate_from_frequencies(size)
        plt.figure(figsize=(12,12))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis("off")
        # if you don't want to save the topic model, comment the next line
        plt.savefig(f'cluster{cluster}.png')
        # Close the figure so repeated calls do not accumulate open figures.
        plt.close()

# Render and save one word cloud per k-means cluster from the TF-IDF weights.
wordcloud_clusters(
    model=kmeans,
    vectors=vectorized_documents,
    features=vectorizer.get_feature_names_out(),
)

In [None]:
# Display the saved clusters
# Iterate over `num_clusters` (set in the k-means cell) rather than a
# hard-coded 3, so this cell keeps matching the images written by
# `wordcloud_clusters` when the cluster count is changed.
for cluster_id in range(num_clusters):
    img = plt.imread(f'cluster{cluster_id}.png')
    plt.figure(figsize=(12, 12))
    plt.imshow(img)
    plt.axis("off")
    plt.title(f'Cluster {cluster_id}', fontsize=20)
    plt.show()

In [None]:
# Plot clusters
# Scatter the 2-D PCA projection, one colour and legend entry per k-means cluster.
colors = ['blue', 'green', 'purple']
cluster = ['notes','name', 'variety'] #You must determine appropriate labels based on the word clouds
for i in range(num_clusters):
    # Boolean mask selecting the documents assigned to cluster i.
    members = kmeans.labels_ == i
    plt.scatter(
        reduced_data[members, 0],
        reduced_data[members, 1],
        s=10,
        color=colors[i],
        label=f' {cluster[i]}',
    )
plt.legend()
plt.show()

In [None]:
# Compare Cluster Label (at least one meta-data variable)

# `results` was built from a plain list and therefore has a fresh 0..n-1 index,
# whereas `df` may still carry the gapped row labels left behind by `dropna`.
# `pd.crosstab` pairs two Series by index label, so passing `df['variety']`
# directly can pair a cluster with the wrong wine and drop rows whose label
# exists on only one side. Renumbering makes the pairing positional.
assert len(results) == len(df), "results and df must describe the same documents"
variety = df['variety'].reset_index(drop=True)
cluster_variety_comparison = pd.crosstab(results['cluster'], variety)
print(cluster_variety_comparison)

# Calculating % distribution of each 'variety' within each cluster
# (each row is divided by its own total, so every row sums to 100).
percentage_distribution = cluster_variety_comparison.div(cluster_variety_comparison.sum(axis=1), axis=0) * 100
print(percentage_distribution)

In [None]:
# Finding the optimal # of Clusters
from sklearn.metrics import silhouette_score

# The silhouette score is only defined for 2 .. n_samples - 1 clusters, so the
# sweep stops early on a small corpus instead of raising.
max_k_exclusive = min(30, vectorized_documents.shape[0])
for k in range(2, max_k_exclusive):
    # Use a separate name: rebinding `kmeans` here would silently replace the
    # fitted model that the plotting and word-cloud cells read.
    candidate_kmeans = KMeans(n_clusters=k, random_state=10, n_init=5)
    cluster_labels = candidate_kmeans.fit_predict(vectorized_documents)
    score = silhouette_score(vectorized_documents, cluster_labels)
    print(f'The silhouette score for {k} clusters is {score}')