# In [9]:
import os
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
import nltk
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import warnings
from PIL import Image

warnings.filterwarnings("ignore")

def horizontal_join_images(images):
    """Paste images side by side, left to right, onto one new RGB canvas.

    The canvas is as wide as the sum of the image widths and as tall as the
    tallest image. Every image is pasted with its top edge at y=0, so the
    strip below a shorter image keeps the canvas's initial fill (Pillow's
    ``Image.new`` default, which is black).

    Args:
        images: Iterable of PIL images. It is materialised into a list, so a
            generator or other one-shot iterator is accepted.

    Returns:
        A new ``'RGB'`` image containing all inputs in their given order.

    Raises:
        ValueError: If ``images`` is empty.
    """
    # Sizes are read in one pass and the images pasted in a second one; a
    # one-shot iterator would be exhausted after the first pass and leave the
    # canvas blank, so take a list up front.
    images = list(images)
    if not images:
        raise ValueError("horizontal_join_images() needs at least one image")

    widths, heights = zip(*(img.size for img in images))
    new_image = Image.new('RGB', (sum(widths), max(heights)))

    x_offset = 0
    for img, width in zip(images, widths):
        new_image.paste(img, (x_offset, 0))
        x_offset += width

    return new_image


def vertical_join_images(images):
    """Stack images top to bottom onto one new RGB canvas.

    The canvas is as wide as the widest image and as tall as the sum of the
    image heights. Every image is pasted with its left edge at x=0, so the
    strip to the right of a narrower image keeps the canvas's initial fill
    (Pillow's ``Image.new`` default, which is black).

    Args:
        images: Iterable of PIL images. It is materialised into a list, so a
            generator or other one-shot iterator is accepted.

    Returns:
        A new ``'RGB'`` image containing all inputs in their given order.

    Raises:
        ValueError: If ``images`` is empty.
    """
    # Sizes are read in one pass and the images pasted in a second one; a
    # one-shot iterator would be exhausted after the first pass and leave the
    # canvas blank, so take a list up front.
    images = list(images)
    if not images:
        raise ValueError("vertical_join_images() needs at least one image")

    widths, heights = zip(*(img.size for img in images))
    new_image = Image.new('RGB', (max(widths), sum(heights)))

    y_offset = 0
    for img, height in zip(images, heights):
        new_image.paste(img, (0, y_offset))
        y_offset += height

    return new_image
    

# Make sure the NLTK 'punkt' tokenizer data is available locally
nltk.download('punkt')

# Extra words dropped during preprocessing (in addition to the isalpha() filter)
additional_stopwords = [
    'get', 'live', 'amp', 'program', 'please', 'people', 'back', 'care',
    'way', 'pay', 'use', 'do', 'massie', 'point', 'medicaid', 'increase',
    'let', 've', 'us', 'try', 'be', 'i', 'm', 'call', 'he', 's', 'etc',
    'say', 're', 'can', 'will', 'go', 'ss', 'inp',
]

# Step 1: Load every CSV in the input folder into a DataFrame
input_folder = r"D:\Research\Python\Data\WIP\Spike Data"
csv_files = [entry for entry in os.listdir(input_folder) if entry.endswith('.csv')]

# Each dataset is keyed by its file name without the '.csv' extension
datasets = {
    os.path.splitext(file)[0]: pd.read_csv(os.path.join(input_folder, file))
    for file in csv_files
}

# Step 2: Preprocess the text data (lowercase, tokenize, drop non-alphabetic tokens and extra stopwords)
def preprocess_text(text):
    """Lower-case, tokenize and filter one document into a cleaned string.

    Tokens are kept only if they are purely alphabetic and not listed in the
    module-level ``additional_stopwords``.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            the float NaN pandas produces for an empty CSV cell, or ``None``)
            is treated as an empty document.

    Returns:
        The surviving tokens joined by single spaces; ``''`` when nothing
        survives or when ``text`` is not a string.
    """
    # Missing cells arrive as float NaN when this is applied to a DataFrame
    # column; calling .lower() on them would raise AttributeError.
    if not isinstance(text, str):
        return ''

    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalpha() and word not in additional_stopwords]
    return ' '.join(words)

# Apply the preprocessing to the 'Text_Lemmatized' column of every dataset, in place
for dataset_name, dataset in datasets.items():
    dataset['Text_Lemmatized'] = dataset['Text_Lemmatized'].apply(preprocess_text)

# Step 3: Train one Word2Vec model per dataset on its tokenized documents
word2vec_models = {}
for dataset_name, dataset in datasets.items():
    sentences = list(map(word_tokenize, dataset['Text_Lemmatized']))
    word2vec_models[dataset_name] = Word2Vec(
        sentences, vector_size=100, window=5, min_count=1, workers=4
    )

# Step 4: Cluster each model's word vectors with K-means
num_clusters = 2  # number of clusters fitted for every dataset

kmeans_models = {}
for dataset_name, model in word2vec_models.items():
    word_vectors = model.wv
    # fit() returns the fitted estimator itself, so it can be stored directly
    kmeans_models[dataset_name] = KMeans(
        n_clusters=num_clusters, random_state=42
    ).fit(word_vectors.vectors)

# Step 5: Extract top 10 keywords per cluster
def get_top_keywords_per_cluster(kmeans, word_vectors, top_n=10):
    """Return, for each cluster, the member words closest to its centroid.

    Every word is assigned to the centroid nearest to it (squared Euclidean
    distance), and each cluster's words are then ranked from nearest to
    farthest. A cluster with fewer than ``top_n`` members yields a shorter
    list.

    Args:
        kmeans: Fitted clustering model exposing ``cluster_centers_`` with
            one row per cluster, in the same vector space as ``word_vectors``.
        word_vectors: Object exposing ``vectors`` (one row per word) and
            ``index_to_key`` (the word for each row), e.g. gensim's
            ``model.wv``.
        top_n: Maximum number of keywords to return per cluster.

    Returns:
        dict mapping cluster id (0 .. number of centroids - 1) to a list of
        at most ``top_n`` words, nearest to the centroid first.
    """
    # Function-scope import: the file does not import numpy at module level.
    import numpy as np

    centroids = np.asarray(kmeans.cluster_centers_)
    vectors = np.asarray(word_vectors.vectors)
    terms = word_vectors.index_to_key

    # A centroid is a point in embedding space, so its coordinates are not
    # vocabulary indices (that shortcut only works for bag-of-words/TF-IDF
    # features). Rank words by their distance to the centroid instead.
    # One column per centroid; looping keeps peak memory at n_words x dim.
    distances = np.stack(
        [((vectors - centroid) ** 2).sum(axis=1) for centroid in centroids],
        axis=1,
    )
    nearest_cluster = distances.argmin(axis=1)

    cluster_keywords = {}
    # Take the cluster count from the model rather than a module-level global.
    for cluster_id in range(centroids.shape[0]):
        members = np.flatnonzero(nearest_cluster == cluster_id)
        # Stable sort: equally distant words keep their vocabulary order.
        ranked = members[np.argsort(distances[members, cluster_id], kind="stable")]
        cluster_keywords[cluster_id] = [terms[idx] for idx in ranked[:top_n]]

    return cluster_keywords

# Collect the per-cluster keywords of every dataset from its fitted models
top_keywords_per_dataset = {}
for dataset_name, model in word2vec_models.items():
    kmeans = kmeans_models[dataset_name]
    word_vectors = model.wv
    top_keywords_per_dataset[dataset_name] = get_top_keywords_per_cluster(kmeans, word_vectors)

# Step 6: Store the results in a single CSV file
output_folder = r"D:\Research\Python\Data\WIP\Clustering\Top Keywords"
output_file = os.path.join(output_folder, "Combined_Top_Keywords.csv")

# Create the output folder if needed; writing would otherwise fail on a fresh
# machine where the folder has not been made by hand yet.
os.makedirs(output_folder, exist_ok=True)

# Combine the results from all datasets into a single DataFrame:
# one column per dataset, one row per cluster id, each cell a keyword list
combined_keywords = pd.DataFrame(top_keywords_per_dataset)

# Save the combined DataFrame to CSV. index=False means the cluster ids are
# not written out.
combined_keywords.to_csv(output_file, index=False)

# Step 7: Draw one word cloud per cluster and save each as a PNG
dataset_wordclouds = {}  # dataset name -> list of rendered cluster images

for dataset_name, keywords_per_cluster in top_keywords_per_dataset.items():
    dataset_wordclouds[dataset_name] = []  # Initialize the list for the current dataset

    for cluster_id, keywords in keywords_per_cluster.items():
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(keywords))
        # Build the path once; it is used for both saving and re-loading.
        image_path = os.path.join(output_folder, f"wordcloud_{dataset_name}_cluster_{cluster_id}.png")

        plt.figure(figsize=(10, 5))
        try:
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.title(f"Word Cloud - Dataset: {dataset_name}, Cluster: {cluster_id}")
            plt.axis("off")
            plt.savefig(image_path)
        finally:
            # Always release the figure, even if rendering or saving fails.
            plt.close()

        # Re-load the saved figure (title included) for the combined picture.
        # Image.open is lazy and keeps the file open, so copy the pixel data
        # into memory and let the context manager close the file handle.
        with Image.open(image_path) as img:
            dataset_wordclouds[dataset_name].append(img.copy())

# Horizontally join the cluster images of each dataset into one row
dataset_combined_images = {}
for dataset_name, images in dataset_wordclouds.items():
    combined_img = horizontal_join_images(images)
    dataset_combined_images[dataset_name] = combined_img

# Vertically stack the per-dataset rows and save the final picture
output_combined_image = vertical_join_images(list(dataset_combined_images.values()))
output_combined_image.save(os.path.join(output_folder, "combined_wordcloud.png"))



# [nltk_data] Downloading package punkt to
# [nltk_data]     C:\Users\rasikac\AppData\Roaming\nltk_data...
# [nltk_data]   Package punkt is already up-to-date!
