# In [21]:
# Visualizing top hashtags

import os
import pandas as pd
import ast
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

def extract_hashtags(hashtags_str):
    """Split a comma-separated hashtag string into a list of clean tags.

    Args:
        hashtags_str: A comma-separated string of hashtags, or a missing
            value (``None`` / ``NaN``).

    Returns:
        The hashtags with surrounding whitespace stripped. Missing input
        yields an empty list. Blank entries (from an empty string or stray
        commas such as ``"a,,b"``) are dropped so that an empty string is
        never counted as a hashtag.
    """
    if pd.isnull(hashtags_str):
        return []
    stripped_tags = (tag.strip() for tag in hashtags_str.split(","))
    # Filter after stripping so whitespace-only entries are removed as well.
    return [tag for tag in stripped_tags if tag]

def get_top_10_hashtags(hashtags_list):
    """Return the ten most frequent hashtags with their counts.

    Args:
        hashtags_list: The hashtags to tally, one entry per occurrence.

    Returns:
        A list of at most ten ``(hashtag, count)`` tuples, most frequent
        first. An empty input gives an empty list.
    """
    return Counter(hashtags_list).most_common(10)

input_folder = r"D:\Research\Python\Data\WIP\Spike Data"
output_folder = r"D:\Research\Python\Data\WIP\SpikeHashRags"

os.makedirs(output_folder, exist_ok=True)


# One [filename, top-10 (hashtag, count) pairs] entry per processed CSV.
data = []

# Sort the listing so the row order of the summary CSV and the numeric suffix
# of each plot file are reproducible; os.listdir() returns entries in
# arbitrary order.
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith(".csv"):
        continue

    input_file_path = os.path.join(input_folder, filename)
    df = pd.read_csv(input_file_path)

    # One malformed file should not abort the whole batch.
    if "Hashtags" not in df.columns:
        print(f"Skipping {filename}: no 'Hashtags' column")
        continue

    # Flatten with extend() instead of Series.sum(): summing lists is
    # quadratic, and on an empty column sum() returns 0 rather than [],
    # which then fails when it is handed to Counter.
    all_hashtags_list = []
    for hashtags_str in df["Hashtags"].dropna():
        all_hashtags_list.extend(extract_hashtags(hashtags_str))

    top_10_hashtags = get_top_10_hashtags(all_hashtags_list)

    # Append the dataset and its top hashtags to the data list
    data.append([filename, top_10_hashtags])

# Create a DataFrame from the data list
df_output = pd.DataFrame(data, columns=["Dataset", "Top Hashtags"])

# Save the DataFrame to a CSV file
output_file_path = os.path.join(output_folder, "all_datasets_top_hashtags.csv")
df_output.to_csv(output_file_path, index=False)

print(f"All datasets' top hashtags saved in {output_file_path}")

# Create individual bar plots for each dataset and save them separately.
# The index comes from enumerate() over the full list, so plot numbers still
# line up with the rows of the summary CSV even when a dataset is skipped.
for i, (dataset, top_hashtags) in enumerate(data):
    if not top_hashtags:
        # zip(*[]) produces nothing to unpack, so there is nothing to plot.
        print(f"No hashtags found in {dataset}; skipping plot")
        continue

    hashtags, counts = zip(*top_hashtags)
    plt.figure(figsize=(10, 6))
    plt.bar(hashtags, counts, alpha=0.8)
    plt.xlabel("Top Hashtags")
    plt.ylabel("Count")
    plt.title(f"Top 10 Hashtags for {dataset}")
    plt.xticks(rotation=90)  # hashtags are long; vertical labels avoid overlap
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, f"top_hashtags_plot_{i}.png"))
    plt.close()  # release the figure so memory does not grow per dataset

