In [5]:
# Import required libraries
import re
from collections import Counter

import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from tabulate import tabulate

# Load the dataset
df = pd.read_csv("customer_complaints_1.csv")

# Create the lemmatizer in this cell. preprocess() below calls
# lemmatizer.lemmatize(); without this definition the cell only works when
# some other cell that defines `lemmatizer` was executed first, and fails
# with NameError on Restart Kernel -> Run All.
nltk.download("wordnet")
nltk.download("omw-1.4")
lemmatizer = WordNetLemmatizer()

# Contraction map: lower-case short forms -> expanded text. preprocess()
# applies these as plain substring replacements, in insertion order.
contractions = {"don't": "do not", "can't": "cannot", "won't": "will not", "i'm": "i am", "it's": "it is",
                "i'd": "i would", "w/": "with", "i've": "i have", "isn't": "is not", "haven't": "have not", "they'll": "they will"}


# Preprocessing function
def preprocess(text):
    """Turn one raw document into a list of cleaned, lemmatised tokens.

    Steps, in order: lower-case, expand every entry of the module-level
    ``contractions`` map by substring replacement, remove digits, remove
    punctuation, collapse whitespace, split on spaces, drop tokens that are
    in ``ENGLISH_STOP_WORDS`` or have two characters or fewer, and lemmatise
    the remainder with the module-level ``lemmatizer``.

    Parameters
    ----------
    text : object
        Raw document. Non-string values are converted with ``str``.

    Returns
    -------
    list of str
        The surviving tokens; ``[]`` for a missing value (``None`` or NaN).
    """
    # Missing fields usually reach this function as float NaN. str(NaN) is
    # "nan", a 3-letter string that passes the length filter below and would
    # turn every missing row into the same fake one-word document. NaN is the
    # only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = str(text).lower()
    for short_form, expansion in contractions.items():
        text = text.replace(short_form, expansion)
    text = re.sub(r"\d+", "", text)  # Remove digits
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in ENGLISH_STOP_WORDS and len(word) > 2
    ]

# Apply preprocessing
df["preprocessed_text"] = df["text"].apply(preprocess)

# Join the tokens back into strings for vectorization
df["preprocessed_string"] = df["preprocessed_text"].apply(" ".join)

# Vectorize using TF-IDF on the string version (not the token lists)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["preprocessed_string"])

# Apply KMeans clustering
k = 3
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the clustering the same regardless of version.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
df["cluster"] = kmeans.fit_predict(X)

# Tabulate the document and predicted cluster
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(df["text"], df["cluster"])])
print(tabulate(table_data[:11], headers="firstrow"))  # Header row + first 10 documents

# View top terms per cluster: sort each centroid's TF-IDF weights descending
terms = vectorizer.get_feature_names_out()
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

print("\nTop terms per cluster:")
for i in range(k):
    print(f"\nCluster {i}:")
    for ind in order_centroids[i, :10]:
        print(f" - {terms[ind]}")

# Share of documents that fall in the largest cluster.
# NOTE: this is NOT cluster purity. Purity compares cluster assignments with
# ground-truth classes, and no label column is used anywhere in this cell, so
# the number is reported under what it actually measures. The variable name
# is kept so any later cell that reads `estimated_purity` still works.
cluster_counts = df['cluster'].value_counts()
estimated_purity = cluster_counts.max() / len(df)
print("\nLargest cluster share (not purity; no ground-truth labels used):", round(estimated_purity, 4))

Document                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [3]:
# Import required libraries
import pandas as pd
import re
from sklearn.cluster import KMeans
from tabulate import tabulate
from collections import Counter
import numpy as np
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import nltk
from nltk.stem import WordNetLemmatizer

# WordNet data (plus the Open Multilingual WordNet package) for the lemmatizer
nltk.download("wordnet")
nltk.download("omw-1.4")

# Read the complaints file into a DataFrame
df = pd.read_csv("customer_complaints_1.csv")

# Shared lemmatizer instance used by preprocess()
lemmatizer = WordNetLemmatizer()

# Short form -> expansion, one entry per line (insertion order preserved)
contractions = {
    "don't": "do not",
    "can't": "cannot",
    "won't": "will not",
    "i'm": "i am",
    "it's": "it is",
    "i'd": "i would",
    "w/": "with",
    "i've": "i have",
    "isn't": "is not",
    "haven't": "have not",
    "they'll": "they will",
}

# Preprocessing function
def preprocess(text):
    """Turn one raw document into a list of cleaned, lemmatised tokens.

    Steps, in order: lower-case, expand every entry of the module-level
    ``contractions`` map by substring replacement, remove digits, remove
    punctuation, collapse whitespace, split on spaces, drop tokens that are
    in ``ENGLISH_STOP_WORDS`` or have two characters or fewer, and lemmatise
    the remainder with the module-level ``lemmatizer``.

    Parameters
    ----------
    text : object
        Raw document. Non-string values are converted with ``str``.

    Returns
    -------
    list of str
        The surviving tokens; ``[]`` for a missing value (``None`` or NaN).
    """
    # Missing fields usually reach this function as float NaN. str(NaN) is
    # "nan", a 3-letter string that passes the length filter below and would
    # turn every missing row into the same fake one-word document. NaN is the
    # only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = str(text).lower()
    for short_form, expansion in contractions.items():
        text = text.replace(short_form, expansion)
    text = re.sub(r"\d+", "", text)  # Remove digits
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in ENGLISH_STOP_WORDS and len(word) > 2
    ]

# Apply preprocessing
df["tokens"] = df["text"].apply(preprocess)

# Train Word2Vec model
# workers=1: a fixed `seed` alone does not make training repeatable, because
# with several worker threads the order in which examples are processed
# depends on OS scheduling. gensim documents a single worker as a requirement
# for a deterministic run (PYTHONHASHSEED must also be fixed to get identical
# results across interpreter launches).
# .tolist(): hand gensim a plain, re-iterable list of token lists instead of
# a pandas Series.
w2v_model = Word2Vec(
    sentences=df["tokens"].tolist(),
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)

# Convert each document to a vector
def vectorize(tokens):
    """Average the Word2Vec vectors of a document's tokens.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document; tokens missing from ``w2v_model.wv`` are
        skipped.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``w2v_model.vector_size``: the element-wise mean
        of the token vectors, or all zeros when no token has a vector.
    """
    vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    # Give the fallback the same dtype as the trained vectors. np.zeros
    # defaults to float64, so one empty document would otherwise change the
    # dtype of the whole stacked feature matrix.
    return np.zeros(w2v_model.vector_size, dtype=w2v_model.wv.vectors.dtype)

# One averaged Word2Vec vector per document, stacked into an (n_docs, dim) matrix
df["vector"] = df["tokens"].apply(vectorize)
X = np.stack(df["vector"].values)

# Apply KMeans clustering
k = 3
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the clustering the same regardless of version.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
df["cluster"] = kmeans.fit_predict(X)

# Display tabulated results
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(df["text"], df["cluster"])])
print(tabulate(table_data[:11], headers="firstrow"))  # Header row + first 10 documents

# Sample representative documents
print("\nSample document from each cluster:")
for i in range(k):
    print(f"\nCluster {i}:")
    # Single .loc lookup instead of chained indexing; the emptiness check
    # avoids an IndexError from .iloc[0] if a cluster received no documents.
    cluster_texts = df.loc[df["cluster"] == i, "text"]
    if cluster_texts.empty:
        print(" - (no documents assigned)")
        continue
    print(f" - {cluster_texts.iloc[0]}")

# Share of documents that fall in the largest cluster.
# NOTE: this is NOT cluster purity. Purity compares cluster assignments with
# ground-truth classes, and no label column is used anywhere in this cell, so
# the number is reported under what it actually measures. The variable name
# is kept so any later cell that reads `estimated_purity` still works.
cluster_counts = df["cluster"].value_counts()
estimated_purity = cluster_counts.max() / len(df)
print("\nLargest cluster share (not purity; no ground-truth labels used):", round(estimated_purity, 4))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\naufa\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\naufa\AppData\Roaming\nltk_data...


Document                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

