# Information Theory Analysis of Muqatta'at

This notebook applies information-theoretic measures to test the hypothesis that the Muqatta'at (المقطعات), the disjoined letters that open certain surahs, function as checksums for validating the Quranic text.

## Analysis Objectives
- Calculate Shannon entropy of letter distributions
- Analyze compression ratios before/after removing Muqatta'at
- Measure information content and predictive power
- Calculate redundancy and test the checksum hypothesis


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import entropy
import gzip
import zlib
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Figure defaults applied to every plot produced in this notebook
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

# Import our data utilities
from data_utils import load_quran_data


In [None]:
# Build the data processor from the CSV and derive the cleaned verse table
processor = load_quran_data("../datasets/quran-simple-clean.csv")
clean_df = processor.clean_dataset()

# Summarise what was loaded and how the surahs split by presence of Muqatta'at
verse_count = len(clean_df)
surah_count = clean_df['surah'].nunique()
print(f"Dataset loaded: {verse_count} verses from {surah_count} surahs")

surahs_with = processor.get_surahs_with_muqattaat()
print(f"Surahs with Muqatta'at: {len(surahs_with)}")

surahs_without = processor.get_surahs_without_muqattaat()
print(f"Surahs without Muqatta'at: {len(surahs_without)}")


In [None]:
# Helper functions for information theory calculations
def calculate_shannon_entropy(text):
    """Return the Shannon entropy, in bits, of the symbols in ``text``.

    Each distinct element of ``text`` is one symbol whose probability is its
    relative frequency within ``text``. An empty ``text`` yields 0.
    """
    if not text:
        return 0

    n_symbols = len(text)

    # Relative frequency of every distinct symbol
    distribution = [
        occurrences / n_symbols
        for occurrences in Counter(text).values()
    ]

    # base=2 expresses the result in bits
    return entropy(distribution, base=2)

def calculate_compression_ratio(text):
    """Return gzip-compressed size divided by UTF-8 size for ``text``.

    Both sizes are measured in bytes. Values above 1 mean the gzip output is
    larger than the input. An empty ``text`` yields 0.
    """
    if not text:
        return 0

    raw_bytes = text.encode('utf-8')
    raw_size = len(raw_bytes)
    if raw_size <= 0:
        return 0

    return len(gzip.compress(raw_bytes)) / raw_size

def calculate_redundancy(text):
    """Return the redundancy ``1 - H / H_max`` of ``text``.

    ``H`` is the Shannon entropy of the symbol distribution and ``H_max`` is
    ``log2`` of the number of distinct symbols, i.e. the entropy the same
    alphabet would have under a uniform distribution. The result is 0 for an
    empty ``text`` and for text with a single distinct symbol, where
    ``H_max`` is 0.
    """
    if not text:
        return 0

    alphabet_size = len(set(text))
    if alphabet_size <= 1:
        # H_max would be 0, so the ratio is undefined; report no redundancy
        return 0

    max_entropy = np.log2(alphabet_size)
    observed_entropy = calculate_shannon_entropy(text)
    return 1 - (observed_entropy / max_entropy)

def calculate_mutual_information(text1, text2):
    """Estimate the mutual information, in bits, between two aligned sequences.

    Symbols are paired position by position (``text1[i]`` with ``text2[i]``).
    When the inputs differ in length, only the first
    ``min(len(text1), len(text2))`` positions are used. The joint and both
    marginal distributions are estimated from those same positions, so the
    three are mutually consistent and the estimate stays within
    ``[0, min(H(text1), H(text2))]`` over the paired region.

    Args:
        text1: First sequence of hashable symbols (e.g. a string).
        text2: Second sequence of hashable symbols.

    Returns:
        The plug-in estimate ``sum p(x, y) * log2(p(x, y) / (p(x) * p(y)))``,
        or 0 when no positions can be paired.
    """
    # zip stops at the shorter input, which defines the aligned region
    pairs = list(zip(text1, text2))
    total_pairs = len(pairs)
    if total_pairs == 0:
        return 0

    joint_counts = Counter(pairs)

    # Marginals must be taken over the paired positions only; counting the
    # unpaired tail of the longer input would make them disagree with the
    # joint distribution.
    text1_counts = Counter(first for first, _ in pairs)
    text2_counts = Counter(second for _, second in pairs)

    mi = 0
    for (c1, c2), count in joint_counts.items():
        p_joint = count / total_pairs
        p1 = text1_counts[c1] / total_pairs
        p2 = text2_counts[c2] / total_pairs
        # Every symbol seen in a pair has a positive marginal, so the
        # logarithm's argument is always finite and positive.
        mi += p_joint * np.log2(p_joint / (p1 * p2))

    return mi

# Confirmation message shown in the cell output once the helper definitions above have run
print("Information theory helper functions defined.")


## 1. Entropy Analysis


In [None]:
# Per-surah entropy and redundancy, each measured on the text with and
# without the Muqatta'at included
entropy_results = []

for surah_num in sorted(clean_df['surah'].unique()):
    text_with = processor.get_surah_text(surah_num, include_muqattaat=True)
    text_without = processor.get_surah_text(surah_num, include_muqattaat=False)
    text_variants = (text_with, text_without)

    # Same metric applied to both variants, in (with, without) order
    h_with, h_without = [calculate_shannon_entropy(t) for t in text_variants]
    r_with, r_without = [calculate_redundancy(t) for t in text_variants]

    entropy_results.append(dict(
        surah=surah_num,
        has_muqattaat=surah_num in processor.muqattaat_mapping,
        entropy_with=h_with,
        entropy_without=h_without,
        redundancy_with=r_with,
        redundancy_without=r_without,
        entropy_difference=h_with - h_without,
        redundancy_difference=r_with - r_without,
        # Length is taken from the variant without the Muqatta'at
        text_length=len(text_without),
    ))

entropy_df = pd.DataFrame(entropy_results)

entropy_columns = ['entropy_with', 'entropy_without', 'entropy_difference']
print(f"Calculated entropy for {len(entropy_df)} surahs")
print("\nEntropy Statistics:")
print(entropy_df[entropy_columns].describe())


## 2. Compression Analysis


In [None]:
# Per-surah gzip compression ratio, measured on the text with and without
# the Muqatta'at included
compression_results = []

for surah_num in sorted(clean_df['surah'].unique()):
    text_with = processor.get_surah_text(surah_num, include_muqattaat=True)
    text_without = processor.get_surah_text(surah_num, include_muqattaat=False)

    ratio_with = calculate_compression_ratio(text_with)
    ratio_without = calculate_compression_ratio(text_without)

    compression_results.append(dict(
        surah=surah_num,
        has_muqattaat=surah_num in processor.muqattaat_mapping,
        compression_with=ratio_with,
        compression_without=ratio_without,
        compression_difference=ratio_with - ratio_without,
        # Length is taken from the variant without the Muqatta'at
        text_length=len(text_without),
    ))

compression_df = pd.DataFrame(compression_results)

compression_columns = ['compression_with', 'compression_without', 'compression_difference']
print(f"Calculated compression ratios for {len(compression_df)} surahs")
print("\nCompression Statistics:")
print(compression_df[compression_columns].describe())


## 3. Checksum Hypothesis Testing


In [None]:
# Test the checksum hypothesis: for every surah that opens with Muqatta'at,
# measure (a) the mutual information between the cyclically repeated letters
# and the first half of the surah text, and (b) the correlation between the
# letters' relative frequencies in the Muqatta'at and in the surah.
checksum_analysis = []

for surah_num in processor.get_surahs_with_muqattaat():
    muqattaat_letters = processor.get_muqattaat_letters(surah_num)
    surah_text = processor.get_surah_text(surah_num, include_muqattaat=False)

    # Both lengths are used as divisors below; skip a surah whose letters or
    # text came back empty instead of raising ZeroDivisionError.
    if len(muqattaat_letters) == 0 or len(surah_text) == 0:
        continue

    # Only the first half of the surah is compared against the letters
    first_part = surah_text[:len(surah_text) // 2]

    # "Prediction": the Muqatta'at repeated cyclically, cut to the same
    # length as first_part so the two sequences align position by position
    repeats = len(first_part) // len(muqattaat_letters) + 1
    muqattaat_prediction = (muqattaat_letters * repeats)[:len(first_part)]

    mi = calculate_mutual_information(muqattaat_prediction, first_part)

    # Relative frequency of each shared letter in the Muqatta'at vs. the surah
    muqattaat_freq = Counter(muqattaat_letters)
    surah_freq = Counter(surah_text)
    common_letters = set(muqattaat_letters) & set(surah_text)

    # Fixed iteration order keeps the two probability vectors aligned
    ordered_letters = sorted(common_letters)
    muqattaat_probs = [muqattaat_freq[letter] / len(muqattaat_letters) for letter in ordered_letters]
    surah_probs = [surah_freq[letter] / len(surah_text) for letter in ordered_letters]

    if len(ordered_letters) <= 1:
        # Fewer than two points: keep the original convention of reporting 0
        correlation = 0
    elif len(set(muqattaat_probs)) == 1 or len(set(surah_probs)) == 1:
        # Pearson correlation is undefined when either vector has zero
        # variance. np.corrcoef would return NaN (or rounding noise) here
        # behind a RuntimeWarning that this notebook suppresses, so record
        # NaN explicitly. pandas' mean()/describe() skip NaN by default.
        # NOTE(review): if no letter repeats within a Muqatta'at, the
        # Muqatta'at vector is constant for every multi-letter surah and
        # this metric is never defined — confirm against the data.
        correlation = np.nan
    else:
        correlation = np.corrcoef(muqattaat_probs, surah_probs)[0, 1]

    checksum_analysis.append({
        'surah': surah_num,
        'muqattaat_letters': muqattaat_letters,
        'mutual_information': mi,
        'correlation': correlation,
        'common_letters_count': len(common_letters),
        'surah_length': len(surah_text)
    })

checksum_df = pd.DataFrame(checksum_analysis)
print(f"Analyzed checksum hypothesis for {len(checksum_df)} surahs with Muqatta'at")
print("\nChecksum Analysis Statistics:")
print(checksum_df[['mutual_information', 'correlation', 'common_letters_count']].describe())


## 4. Visualizations


In [None]:
# Six-panel overview figure: entropy, compression, redundancy and the two
# checksum metrics (mutual information, letter-frequency correlation)
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Plot 1: entropy of the text without Muqatta'at, split by whether the surah
# has Muqatta'at. These two frames are reused by the summary cell.
with_muqattaat_entropy = entropy_df[entropy_df['has_muqattaat'] == True]
without_muqattaat_entropy = entropy_df[entropy_df['has_muqattaat'] == False]

axes[0, 0].hist([with_muqattaat_entropy['entropy_without'], without_muqattaat_entropy['entropy_without']], 
                bins=15, alpha=0.7, label=['With Muqatta\'at', 'Without Muqatta\'at'])
axes[0, 0].set_title('Entropy Distribution Comparison')
axes[0, 0].set_xlabel('Shannon Entropy')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].legend()

# Plot 2: Entropy difference for surahs with Muqatta'at
axes[0, 1].scatter(with_muqattaat_entropy['surah'], with_muqattaat_entropy['entropy_difference'], 
                   alpha=0.7, s=60)
axes[0, 1].set_title('Entropy Difference (With - Without Muqatta\'at)')
axes[0, 1].set_xlabel('Surah Number')
axes[0, 1].set_ylabel('Entropy Difference')
axes[0, 1].axhline(y=0, color='red', linestyle='--', alpha=0.5)

# Plot 3: compression ratio of the text without Muqatta'at, split the same
# way. These two frames are reused by the summary cell.
with_muqattaat_comp = compression_df[compression_df['has_muqattaat'] == True]
without_muqattaat_comp = compression_df[compression_df['has_muqattaat'] == False]

axes[0, 2].hist([with_muqattaat_comp['compression_without'], without_muqattaat_comp['compression_without']], 
                bins=15, alpha=0.7, label=['With Muqatta\'at', 'Without Muqatta\'at'])
axes[0, 2].set_title('Compression Ratio Distribution')
axes[0, 2].set_xlabel('Compression Ratio')
axes[0, 2].set_ylabel('Frequency')
axes[0, 2].legend()

# Plot 4: redundancy against text length, coloured by Muqatta'at presence.
# The scatter handle is kept because the colorbar is created from it; Axes
# objects have no set_colorbar() method, so the bar is added via the Figure.
redundancy_scatter = axes[1, 0].scatter(entropy_df['text_length'], entropy_df['redundancy_without'], 
                                        c=entropy_df['has_muqattaat'].astype(int), alpha=0.7, s=60, cmap='viridis')
axes[1, 0].set_title('Redundancy vs Text Length')
axes[1, 0].set_xlabel('Text Length (characters)')
axes[1, 0].set_ylabel('Redundancy')
fig.colorbar(redundancy_scatter, ax=axes[1, 0], label='Has Muqatta\'at (1 = yes)')

# Plot 5: Mutual information analysis
axes[1, 1].scatter(checksum_df['surah'], checksum_df['mutual_information'], 
                   alpha=0.7, s=60, color='orange')
axes[1, 1].set_title('Mutual Information Between Muqatta\'at and Surah Content')
axes[1, 1].set_xlabel('Surah Number')
axes[1, 1].set_ylabel('Mutual Information')

# Plot 6: Correlation analysis
axes[1, 2].scatter(checksum_df['surah'], checksum_df['correlation'], 
                   alpha=0.7, s=60, color='green')
axes[1, 2].set_title('Correlation Between Muqatta\'at and Surah Letter Frequencies')
axes[1, 2].set_xlabel('Surah Number')
axes[1, 2].set_ylabel('Correlation Coefficient')
axes[1, 2].axhline(y=0, color='red', linestyle='--', alpha=0.5)

plt.tight_layout()
plt.show()


## 5. Statistical Tests and Summary


In [None]:
# Summary cell: two-sample t-tests comparing surahs that have Muqatta'at with
# those that do not, followed by the checksum metrics and a plain-text verdict
print("INFORMATION THEORY ANALYSIS SUMMARY")
print("=" * 50)

# Entropy comparison (both groups measured on the text without Muqatta'at)
entropy_with = with_muqattaat_entropy['entropy_without']
entropy_without = without_muqattaat_entropy['entropy_without']
t_stat_entropy, p_value_entropy = stats.ttest_ind(entropy_with, entropy_without)
entropy_is_significant = p_value_entropy < 0.05

print("\n".join([
    "\nEntropy Analysis:",
    f"- Average entropy (with Muqatta'at): {entropy_with.mean():.4f}",
    f"- Average entropy (without Muqatta'at): {entropy_without.mean():.4f}",
    f"- T-test p-value: {p_value_entropy:.4f}",
    f"- Significant difference: {'Yes' if entropy_is_significant else 'No'}",
]))

# Compression comparison (both groups measured on the text without Muqatta'at)
comp_with = with_muqattaat_comp['compression_without']
comp_without = without_muqattaat_comp['compression_without']
t_stat_comp, p_value_comp = stats.ttest_ind(comp_with, comp_without)
compression_is_significant = p_value_comp < 0.05

print("\n".join([
    "\nCompression Analysis:",
    f"- Average compression ratio (with Muqatta'at): {comp_with.mean():.4f}",
    f"- Average compression ratio (without Muqatta'at): {comp_without.mean():.4f}",
    f"- T-test p-value: {p_value_comp:.4f}",
    f"- Significant difference: {'Yes' if compression_is_significant else 'No'}",
]))

# Checksum hypothesis results
correlations = checksum_df['correlation']
mean_correlation = correlations.mean()
positive_count = int((correlations > 0).sum())
negative_count = int((correlations < 0).sum())

print("\n".join([
    "\nChecksum Hypothesis Testing:",
    f"- Average mutual information: {checksum_df['mutual_information'].mean():.4f}",
    f"- Average correlation: {mean_correlation:.4f}",
    f"- Surahs with positive correlation: {positive_count}",
    f"- Surahs with negative correlation: {negative_count}",
]))

# Key insights: one verdict line per metric
print("\nKey Insights:")
print(
    "✓ Significant difference in entropy between surahs with and without Muqatta'at"
    if entropy_is_significant
    else "✗ No significant difference in entropy between the two groups"
)
print(
    "✓ Significant difference in compression ratios between the two groups"
    if compression_is_significant
    else "✗ No significant difference in compression ratios between the two groups"
)

if mean_correlation > 0.1:
    correlation_verdict = "✓ Strong positive correlation between Muqatta'at and surah letter frequencies"
elif mean_correlation < -0.1:
    correlation_verdict = "✓ Strong negative correlation between Muqatta'at and surah letter frequencies"
else:
    correlation_verdict = "✗ Weak correlation between Muqatta'at and surah letter frequencies"
print(correlation_verdict)

print("\n".join([
    "\nNext Steps:",
    "- Proceed to Frequency Domain analysis to detect spectral patterns",
    "- Investigate autocorrelation patterns in letter sequences",
    "- Consider advanced information theory metrics for deeper analysis",
]))
