In [None]:
import ast
import pandas as pd
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
import community as community_louvain
from itertools import combinations

import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

In [None]:
# Load the CSV; later cells read its 'Token' column.
df = pd.read_csv('data/flu_data_token.csv')
# Bare last expression so the notebook renders the frame.
df

In [None]:
# Raw 'Token' value at index label 0. It is the string form of a list
# (it is handed to ast.literal_eval in the next cell), hence `str`, not `list`.
token: str = df['Token'][0]

In [None]:
# literal_eval only accepts Python literals, so the stored text is parsed
# into a real list without executing arbitrary code.
token_list = ast.literal_eval(token)

In [None]:
# Inspect the parsed tokens.
print(token_list)

### Word count

In [None]:
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist


def word_count(cleaned_tokens):
  """Build a word-frequency table from tokens and print a preview of it.

  Args:
    cleaned_tokens: Iterable of hashable tokens (e.g. a list of words).

  Returns:
    pandas.DataFrame with columns 'Word' and 'Frequency', sorted by
    'Frequency' in descending order. The index is not reset, so each row
    keeps its pre-sort label.

  Side effects:
    Prints the first 60 rows of the sorted table.
  """
  word_freq = FreqDist(cleaned_tokens)

  # List displays replace `__builtins__.list(...)`: `__builtins__` is a module
  # only inside `__main__` and a plain dict in imported modules, so the
  # attribute lookup breaks once this function is moved out of the notebook.
  # `[*...]` also keeps working if the name `list` is shadowed in the kernel.
  df_word_freq = pd.DataFrame({
      'Word': [*word_freq.keys()],
      'Frequency': [*word_freq.values()],
  })

  # Most frequent words first.
  df_word_freq = df_word_freq.sort_values(by='Frequency', ascending=False)
  print(df_word_freq.head(60))

  return df_word_freq

In [None]:
# Build (and preview) the frequency table; the network diagram cell reuses it.
df_word_freq = word_count(token_list)

### Word Cloud

In [None]:
def token2word(token):
  """Concatenate tokens into one space-separated string.

  Args:
    token: Iterable of strings.

  Returns:
    The tokens joined by single spaces ('' when there are no tokens).
  """
  return ' '.join(token)

In [None]:
# WordCloud.generate is fed a single text string, so join the tokens back together.
word = token2word(token_list)

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud


def wordcloud_vis(word):
  """Render a word cloud for a text string and show it with matplotlib.

  Args:
    word: Text passed to WordCloud.generate.

  Side effects:
    Opens a new 10x5 figure and displays it; returns None.
  """
  # Configure the canvas first, then build the cloud from the text.
  cloud = WordCloud(width=800, height=400, background_color='white')
  rendered = cloud.generate(word)

  plt.figure(figsize=(10, 5))
  plt.imshow(rendered, interpolation='bilinear')
  plt.axis('off')  # the image has no meaningful axes
  plt.show()

In [None]:
# Draw the word cloud for the joined text.
wordcloud_vis(word)

### Bar chart

In [None]:
def bar_chart_vis(token, top_n=20):
  """Plot a bar chart of the most frequent tokens.

  Args:
    token: Iterable of hashable tokens to count.
    top_n: How many of the highest-frequency words to draw. Defaults to 20,
      the value that was previously hard-coded.

  Side effects:
    Opens a new 12x6 figure and displays it; returns None.
  """
  word_freq = FreqDist(token)

  # List displays replace `__builtins__.list(...)`: `__builtins__` is a module
  # only inside `__main__` and a plain dict in imported modules, so the
  # attribute lookup breaks once this function is moved out of the notebook.
  df_word_freq = pd.DataFrame({
      'Word': [*word_freq.keys()],
      'Frequency': [*word_freq.values()],
  })

  # Most frequent words first, so head(top_n) selects the top of the ranking.
  df_word_freq = df_word_freq.sort_values(by='Frequency', ascending=False)

  plt.figure(figsize=(12, 6))
  # NOTE(review): newer seaborn releases reportedly warn when `palette` is
  # passed without `hue` - confirm against the installed version.
  sns.barplot(x='Word', y='Frequency', data=df_word_freq.head(top_n), palette='viridis')
  plt.title(f'Top {top_n} Most Frequent Words')
  plt.xlabel('Words')
  plt.ylabel('Frequency')
  plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
  plt.show()

In [None]:
# Draw the bar chart of the most frequent tokens.
bar_chart_vis(token_list)

### Bubble chart

In [None]:
def bubble_chart_vis(cleaned_tokens, min_frequency=2):
  """Plot word length against frequency as a bubble chart.

  Args:
    cleaned_tokens: Iterable of tokens; each token must support len().
    min_frequency: Words occurring fewer times than this are dropped before
      plotting. Defaults to 2, the value that was previously hard-coded.

  Side effects:
    Opens a new 12x8 figure and displays it; returns None.
  """
  word_freq = FreqDist(cleaned_tokens)

  # List displays replace `__builtins__.list(...)`: `__builtins__` is a module
  # only inside `__main__` and a plain dict in imported modules, so the
  # attribute lookup breaks once this function is moved out of the notebook.
  df_word_data = pd.DataFrame({
      'Word': [*word_freq.keys()],
      'Frequency': [*word_freq.values()],
      'Length': [len(word) for word in word_freq.keys()],
  })

  # Drop rare words to keep the chart (and its per-word legend) readable.
  df_word_data = df_word_data[df_word_data['Frequency'] >= min_frequency]

  # Bubble size encodes frequency; colour distinguishes individual words.
  plt.figure(figsize=(12, 8))
  sns.scatterplot(x='Length', y='Frequency', size='Frequency', data=df_word_data, hue='Word', sizes=(50, 300), palette='viridis', alpha=0.8)
  plt.title('Bubble Chart of Word Frequencies and Lengths')
  plt.xlabel('Word Length')
  plt.ylabel('Frequency')
  plt.show()

In [None]:
# Draw the length-vs-frequency bubble chart for the tokens.
bubble_chart_vis(token_list)

### Network Diagram

In [None]:
def network_vis(df_word_freq, top_n=20, seed=42):
    """Draw a graph linking the most frequent words, coloured by Louvain community.

    Args:
        df_word_freq: DataFrame with a 'Word' column, ordered so that the
            rows of interest come first (only the first ``top_n`` rows are used).
        top_n: Number of leading rows to take as graph nodes. Defaults to 20.
        seed: Seed used for both the spring layout and the Louvain
            partition, so repeated runs give the same picture. Defaults to 42.

    Side effects:
        Opens a new 12x12 figure and displays it; returns None.

    Note:
        Edges are created between every pair of selected words. No
        co-occurrence information is available from the frequency table, so
        the graph does not reflect which words actually appear together,
        despite the figure title.
    """
    top_words = df_word_freq.head(top_n)['Word'].tolist()

    G = nx.Graph()

    # Connect every unordered pair of selected words.
    for word1, word2 in combinations(top_words, 2):
        if G.has_edge(word1, word2):
            # Only reachable when the 'Word' column contains duplicates.
            G[word1][word2]['weight'] += 1
        else:
            G.add_edge(word1, word2, weight=1)

    # Louvain visits nodes in random order; without a seed the colours change
    # from run to run even though the layout below is fixed.
    # NOTE(review): `random_state` needs a reasonably recent python-louvain -
    # confirm against the installed version.
    partition = community_louvain.best_partition(G, random_state=seed)
    # One community id per node, in node order, used as the colour value.
    community_colors = [partition[node] for node in G.nodes()]

    # Node size scales with degree (number of connections).
    node_size = [deg * 100 for deg in dict(G.degree()).values()]

    plt.figure(figsize=(12, 12))
    pos = nx.spring_layout(G, seed=seed)  # For consistent layout
    nx.draw(G, pos, with_labels=True, font_size=8, node_size=node_size,
            cmap=plt.cm.jet, node_color=community_colors, font_color='white',
            edge_color='gray', font_weight='bold', alpha=0.7)
    plt.title('Text Network Diagram based on Word Co-occurrence with Clustering')
    plt.show()

In [None]:
# Draw the network diagram from the frequency table built earlier.
network_vis(df_word_freq)