# Basic Statistical Analysis of Muqatta'at

This notebook performs a basic statistical analysis of the Muqatta'at (المقطعات) letters to test the hypothesis that they function as checksums for validating the Quranic text.

## Analysis Objectives
- Calculate letter frequency distributions for each surah
- Compare statistical properties between surahs with and without Muqatta'at
- Analyze correlation between Muqatta'at letters and surah content
- Visualize patterns and distributions


In [10]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Plot defaults: apply the style sheet first, then the colour palette, then the
# figure-size and font-size overrides (same order as before, so later settings
# still win over anything the style sheet defines).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

# Import our data utilities
from data_utils import load_quran_data


In [11]:
# Load the Quran CSV through the project helper and derive the cleaned verse table.
processor = load_quran_data("../datasets/quran-simple-clean.csv")
clean_df = processor.clean_dataset()

# Size of the cleaned data.
n_verses = len(clean_df)
n_surahs = clean_df['surah'].nunique()
print(f"Dataset loaded: {n_verses} verses from {n_surahs} surahs")

# How the surahs split by presence of Muqatta'at (queried in the original order).
muqattaat_surahs = processor.get_surahs_with_muqattaat()
print(f"Surahs with Muqatta'at: {len(muqattaat_surahs)}")

plain_surahs = processor.get_surahs_without_muqattaat()
print(f"Surahs without Muqatta'at: {len(plain_surahs)}")


Loaded 6236 verses from 114 surahs
Identified 29 surahs with Muqatta'at
Cleaned dataset: 6229 verses from 113 surahs
Dataset loaded: 6229 verses from 113 surahs
Surahs with Muqatta'at: 29
Surahs without Muqatta'at: 84


In [None]:
# Per-surah statistics table produced by the processor.
stats_df = processor.get_all_surah_stats()

# Descriptive summary of the statistics table.
stats_summary = stats_df.describe()
print("Basic Statistics Summary:")
print(stats_summary)

# Preview of the first ten rows of the statistics table.
stats_preview = stats_df.head(10)
print("\nFirst 10 surahs:")
print(stats_preview)

Basic Statistics Summary:
           surah  verse_count  total_chars_with_muqattaat  \
count  113.00000   113.000000                  113.000000   
mean    58.00000    55.123894                 3691.061947   
std     32.76431    53.258321                 5071.791615   
min      2.00000     3.000000                   52.000000   
25%     30.00000    17.000000                  505.000000   
50%     58.00000    40.000000                 1824.000000   
75%     86.00000    78.000000                 4789.000000   
max    114.00000   286.000000                33335.000000   

       total_chars_without_muqattaat  unique_letters_with_muqattaat  \
count                     113.000000                     113.000000   
mean                     3690.106195                      35.681416   
std                      5071.114791                       6.132853   
min                        52.000000                      14.000000   
25%                       505.000000                      35.000000  

## 1. Letter Frequency Analysis


In [13]:
# Calculate letter frequencies for every surah in the cleaned dataset and collect
# them in a DataFrame (one row per surah).
# pandas (`pd`) comes from the imports cell at the top of the notebook and is
# deliberately not re-imported here.

letter_freq_rows = []

for surah_num in sorted(clean_df['surah'].unique()):
    # Dict-like letter -> count mappings, requested once with and once without
    # the Muqatta'at included.
    freq_with = processor.get_surah_letter_frequency(surah_num, include_muqattaat=True)
    freq_without = processor.get_surah_letter_frequency(surah_num, include_muqattaat=False)
    has_muqattaat = surah_num in processor.muqattaat_mapping

    total_letters_with = sum(freq_with.values())
    unique_letters_with = len(freq_with)
    total_letters_without = sum(freq_without.values())
    unique_letters_without = len(freq_without)

    letter_freq_rows.append({
        'surah': surah_num,
        'has_muqattaat': has_muqattaat,
        'letter_freq_with_muqattaat': freq_with,
        'letter_freq_without_muqattaat': freq_without,
        'total_letters_with': total_letters_with,
        'unique_letters_with': unique_letters_with,
        'total_letters_without': total_letters_without,
        'unique_letters_without': unique_letters_without
    })

# Build the frame once from the collected records.
letter_frequencies_df = pd.DataFrame(letter_freq_rows)

print(f"Calculated letter frequencies for {len(letter_frequencies_df)} surahs. First few rows:")
display_cols = [
    'surah', 'has_muqattaat', 'total_letters_with', 'unique_letters_with',
    'total_letters_without', 'unique_letters_without'
]
print(letter_frequencies_df[display_cols].head(10))

# NOTE(review): this is a global pandas option and stays in effect for every later
# cell. It does not change how the frequency mappings below are printed. Kept so
# downstream cells behave as before; consider pd.option_context where needed.
pd.set_option('display.max_rows', None)

# Show the detailed frequencies for the first row as an example. The label names
# the actual surah number, because the first row of the cleaned dataset is not
# necessarily surah 1.
first_surah_freqs = letter_frequencies_df.iloc[0]
print(f"\nDetailed letter frequencies for the first surah in the dataset (surah {first_surah_freqs['surah']}):")
print(f"With Muqatta'at:\n{first_surah_freqs['letter_freq_with_muqattaat']}")
print(f"Without Muqatta'at:\n{first_surah_freqs['letter_freq_without_muqattaat']}")



Calculated letter frequencies for 113 surahs. First few rows:
   surah  has_muqattaat  total_letters_with  unique_letters_with  \
0      2           True               26723                   42   
1      3           True               15194                   40   
2      4          False               16587                   40   
3      5          False               12408                   42   
4      6          False               12965                   41   
5      7           True               14640                   43   
6      8          False                5462                   40   
7      9          False               11276                   40   
8     10           True                7712                   41   
9     11           True                7951                   40   

   total_letters_without  unique_letters_without  
0                  26720                      42  
1                  15191                      40  
2                  16587            

In [8]:
# Analyze Muqatta'at letters specifically: for every surah that has Muqatta'at,
# count how often each of those letters occurs in the surah's letter frequencies
# (the variant computed with the Muqatta'at excluded).

# Per-surah letter counts come from `letter_frequencies_df`, built in the
# letter-frequency cell of this notebook. The previous version read an undefined
# `letter_frequencies` dict and failed with NameError on a fresh kernel.
freq_without_by_surah = dict(zip(
    letter_frequencies_df['surah'],
    letter_frequencies_df['letter_freq_without_muqattaat'],
))

muqattaat_analysis = {}

for surah_num in processor.get_surahs_with_muqattaat():
    muqattaat_letters = processor.get_muqattaat_letters(surah_num)
    surah_freq = freq_without_by_surah[surah_num]
    total_letters = sum(surah_freq.values())

    # Count occurrences of each Muqatta'at letter in the surah (0 when absent).
    muqattaat_counts = {
        letter: surah_freq.get(letter, 0) for letter in muqattaat_letters
    }
    muqattaat_total = sum(muqattaat_counts.values())

    muqattaat_analysis[surah_num] = {
        'muqattaat_letters': muqattaat_letters,
        'letter_counts': muqattaat_counts,
        'total_letters': total_letters,
        # Share of all counted letters that are Muqatta'at letters; 0 for an
        # empty surah so the division can never fail.
        'muqattaat_frequency': muqattaat_total / total_letters if total_letters > 0 else 0
    }

print(f"Analyzed Muqatta'at for {len(muqattaat_analysis)} surahs")
muqattaat_analysis


Analyzed Muqatta'at for 29 surahs


{2: {'muqattaat_letters': 'الم',
  'letter_counts': {'ا': 3544, 'ل': 3201, 'م': 2192},
  'total_letters': 26720,
  'muqattaat_frequency': 0.3344685628742515},
 3: {'muqattaat_letters': 'الم',
  'letter_counts': {'ا': 2005, 'ل': 1892, 'م': 1246},
  'total_letters': 15191,
  'muqattaat_frequency': 0.3385557237838194},
 7: {'muqattaat_letters': 'المص',
  'letter_counts': {'ا': 1957, 'ل': 1527, 'م': 1161, 'ص': 97},
  'total_letters': 14636,
  'muqattaat_frequency': 0.3239956272205521},
 10: {'muqattaat_letters': 'الر',
  'letter_counts': {'ا': 991, 'ل': 912, 'ر': 255},
  'total_letters': 7709,
  'muqattaat_frequency': 0.2799325463743676},
 11: {'muqattaat_letters': 'الر',
  'letter_counts': {'ا': 979, 'ل': 793, 'ر': 323},
  'total_letters': 7948,
  'muqattaat_frequency': 0.26358832410669353},
 12: {'muqattaat_letters': 'الر',
  'letter_counts': {'ا': 936, 'ل': 809, 'ر': 255},
  'total_letters': 7419,
  'muqattaat_frequency': 0.2695781102574471},
 13: {'muqattaat_letters': 'المر',
  'letter

## 2. Statistical Comparisons


In [None]:
# Split the per-surah statistics on the `has_muqattaat` flag.
muqattaat_flag = stats_df['has_muqattaat']
with_muqattaat = stats_df[muqattaat_flag.eq(True)]
without_muqattaat = stats_df[muqattaat_flag.eq(False)]

print("Comparison: Surahs WITH vs WITHOUT Muqatta'at")
print("=" * 50)

# Columns whose group mean is reported, in the same order as the metric labels
# that follow 'Count'.
averaged_columns = [
    'verse_count',
    'total_chars_with_muqattaat',
    'total_chars_without_muqattaat',
    'unique_letters_with_muqattaat',
]

comparison_stats = {
    'Metric': [
        'Count',
        'Avg Verses',
        "Avg Characters (with Muqatta'at)",
        "Avg Characters (without Muqatta'at)",
        'Avg Unique Letters',
    ],
}

# One output column per group: the row count, then the mean of each column above.
for group_label, group_df in (
    ("With Muqatta'at", with_muqattaat),
    ("Without Muqatta'at", without_muqattaat),
):
    comparison_stats[group_label] = [len(group_df)] + [
        group_df[column].mean() for column in averaged_columns
    ]

comparison_df = pd.DataFrame(comparison_stats)
print(comparison_df.to_string(index=False))
