# Entropy Calculations
Here we will perform entropy calculations and make comparisons.

In [None]:
import pandas as pd
from sudachipy import tokenizer, dictionary
from collections import Counter
import math

# Compute the unigram (single-token) Shannon entropy, in bits, of the
# sentences stored in the 'sentence' column of a CSV file, using SudachiPy
# surface forms as the tokens.

# Step 1: Load the CSV file (update the file path as needed)
csv_file_path = 'cleaned_sentences_UO.csv'  # Replace with your file path
data = pd.read_csv(csv_file_path)

# Step 2: Extract sentences from the 'sentence' column.
# Missing cells are dropped. The remaining cells are coerced to str because
# pandas may infer a non-string type for some of them (e.g. a line that is
# purely numeric), and the tokenizer is only ever given text here.
sentences = data['sentence'].dropna().astype(str).tolist()

# Step 3: Initialize SudachiPy tokenizer
tokenizer_obj = dictionary.Dictionary().create()
# SplitMode.C is Sudachi's coarsest mode (longest units, named-entity level);
# A is the shortest-unit mode and B sits in between. The choice of mode
# changes the token inventory and therefore the entropy value.
mode = tokenizer.Tokenizer.SplitMode.C

# Step 4: Tokenize sentences, keeping each morpheme's surface form
tokens = []
for sentence in sentences:
    tokens.extend(m.surface() for m in tokenizer_obj.tokenize(sentence, mode))

# Step 5: Calculate unigram probabilities (relative frequency of each token)
total_tokens = len(tokens)
token_counts = Counter(tokens)
token_probabilities = {
    token: count / total_tokens for token, count in token_counts.items()
}

# Step 6: Compute unigram entropy, H = sum(-p * log2(p)).
# Each term is negated individually (instead of negating the whole sum) so a
# corpus with a single distinct token yields +0.0 rather than -0.0, which
# would otherwise be printed as "-0.0000 bits". The magnitude is unchanged.
# An empty corpus yields 0 because the sum is over no terms.
unigram_entropy = sum(-p * math.log2(p) for p in token_probabilities.values())

# Output the results
print(f"Total Tokens: {total_tokens}")
print(f"Unique Tokens: {len(token_counts)}")
print(f"Unigram Entropy: {unigram_entropy:.4f} bits")