# MultiLM-Based Text Coherence Measurement

This notebook measures text coherence using MultiLM (Multilingual MiniLM) sentence transformer embeddings and cosine similarity between adjacent sentences.

**Method:** paraphrase-multilingual-MiniLM-L12-v2 (Sentence Embeddings)

**4 Text Variants:**
- Raw: Original sentences
- Cleaned: Punctuation removed, lowercase
- Stemmed: Zemberek morphological stemming
- Cleaned + Stemmed: Both preprocessing steps

## 1. Setup and Installation

In [None]:
# Install required packages
!pip install -q -r requirements.txt

import os

# Download Zemberek JAR if not exists (for Google Colab)
if not os.path.exists('zemberek-full.jar'):
    !wget -q https://github.com/ahmetaa/zemberek-nlp/releases/download/v0.17.1/zemberek-full.jar
    print("Downloaded zemberek-full.jar")
else:
    print("zemberek-full.jar found")

# File paths (use absolute paths for JVM)
ZEMBEREK_PATH = os.path.abspath('zemberek-full.jar')
EXCEL_PATH = 'Text_Excel.xlsx'

print(f"Zemberek path: {ZEMBEREK_PATH}")
print(f"File exists: {os.path.exists(ZEMBEREK_PATH)}")

In [None]:
# Import libraries
import jpype
import jpype.imports
from jpype import JClass
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer, util

print("Libraries imported successfully")

## 2. Initialize Zemberek and MultiLM

**⚠️ Important:** If you get a class-not-found error, restart the runtime (Runtime → Restart runtime) and run all cells again from the top.

In [None]:
# Bring up the JVM with the Zemberek JAR on its classpath.
# NOTE: a JVM can be started only once per session, so switching to a
# different jar file requires restarting the runtime.

if jpype.isJVMStarted():
    print("JVM already running")
else:
    jpype.startJVM(classpath=[ZEMBEREK_PATH])
    print(f"JVM started with classpath: {ZEMBEREK_PATH}")

# Resolve the Zemberek Java classes through JPype
TurkishMorphology = JClass("zemberek.morphology.TurkishMorphology")
TurkishSentenceExtractor = JClass("zemberek.tokenization.TurkishSentenceExtractor")

# Shared analyzer / sentence splitter used by the preprocessing functions
morphology = TurkishMorphology.createWithDefaults()
extractor = TurkishSentenceExtractor.DEFAULT

print("Zemberek initialized successfully")

# Sentence-transformer model used for the embeddings
MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
print(f"Loading MultiLM model: {MODEL_NAME}...")
multilm_model = SentenceTransformer(MODEL_NAME)
print("MultiLM model loaded successfully")

## 3. Preprocessing Functions

In [None]:
def extract_sentences(text):
    """Split ``text`` into sentences with the Zemberek sentence extractor.

    The input is coerced with ``str()`` before extraction. Every extracted
    sentence is converted to a Python string and stripped; sentences that are
    empty after stripping are dropped.
    """
    extracted = extractor.fromParagraph(str(text)).toArray()
    stripped = (str(item).strip() for item in extracted)
    return [sentence for sentence in stripped if sentence]


def clean_text(text):
    """Remove all punctuation and convert to lowercase (Turkish-aware).

    Args:
        text: The string to normalise.

    Returns:
        The lowercased text, stripped of leading/trailing whitespace, with
        every character that is neither a letter/digit nor whitespace removed.
    """
    # str.lower() is locale-independent: it maps "I" to "i" and "İ" to "i"
    # followed by a combining dot. Turkish needs "I" -> "ı" and "İ" -> "i",
    # so apply those two mappings explicitly before the generic lowercasing.
    lowered = text.replace('İ', 'i').replace('I', 'ı').lower().strip()
    # \w also matches "_", which is punctuation for our purposes, so it is
    # removed explicitly alongside everything that is not \w or whitespace.
    return re.sub(r'[^\w\s]|_', '', lowered)


def stem_sentence(sentence):
    """Extract word stems using the Zemberek morphological analyzer.

    Args:
        sentence: The sentence to analyse.

    Returns:
        The stems of the best analysis joined by single spaces. If the
        analysis raises an error, a ``RuntimeWarning`` is emitted and the
        sentence is returned unchanged.
    """
    import warnings

    try:
        analysis = morphology.analyzeAndDisambiguate(sentence)
        return " ".join(str(result.getStem()) for result in analysis.bestAnalysis())
    except Exception as exc:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still stop a long run. The fallback is kept, but made
        # visible so unstemmed text does not silently enter the stemmed variant.
        warnings.warn(
            f"Stemming failed ({exc!r}); using the sentence unchanged",
            RuntimeWarning,
            stacklevel=2,
        )
        return sentence


def get_text_variants(text):
    """
    Generate 4 variants of the text:
    - raw: original sentences
    - cleaned: punctuation removed, lowercase
    - stemmed: Zemberek stemming applied
    - cleaned_stemmed: both cleaning and stemming

    Args:
        text: The text to split into sentences and preprocess.

    Returns:
        A dict mapping each variant name to its list of sentences (one entry
        per extracted sentence), or None when the text contains fewer than
        two sentences.
    """
    sentences = extract_sentences(text)

    if len(sentences) < 2:
        return None

    cleaned = [clean_text(s) for s in sentences]
    stemmed = [stem_sentence(s) for s in sentences]
    # Reuse the stems computed above instead of running the morphological
    # analysis a second time for every sentence.
    cleaned_stemmed = [clean_text(s) for s in stemmed]

    return {
        'raw': sentences,
        'cleaned': cleaned,
        'stemmed': stemmed,
        'cleaned_stemmed': cleaned_stemmed
    }

print("Preprocessing functions defined")

## 4. MultiLM Coherence Calculation

In [None]:
def get_multilm_embedding(sentence):
    """
    Encode ``sentence`` with the loaded MultiLM sentence transformer.

    Returns the result of ``multilm_model.encode`` called with
    ``convert_to_tensor=True``.
    """
    return multilm_model.encode(sentence, convert_to_tensor=True)


def calculate_adjacent_coherence(sentences):
    """
    Calculate coherence as average cosine similarity between adjacent sentences
    using MultiLM sentence transformer embeddings.

    Args:
        sentences: List of sentence strings in document order.

    Returns:
        The mean (via ``np.mean``) of the cosine similarities of each pair of
        neighbouring sentences, or None when fewer than two non-blank
        sentences are available or when embedding/similarity computation
        fails (a ``RuntimeWarning`` is emitted in the failure case).
    """
    import warnings

    if len(sentences) < 2:
        return None

    # Filter out empty / whitespace-only sentences
    sentences = [s for s in sentences if s.strip()]
    if len(sentences) < 2:
        return None

    try:
        # Get MultiLM embeddings for all sentences at once (more efficient)
        embeddings = multilm_model.encode(sentences, convert_to_tensor=True)

        # Cosine similarity of every sentence with its successor
        similarities = [
            util.pytorch_cos_sim(embeddings[i], embeddings[i + 1]).item()
            for i in range(len(embeddings) - 1)
        ]

        return np.mean(similarities)
    except Exception as exc:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still abort the run, and report the failure so a missing
        # score can be told apart from a text that was simply too short.
        warnings.warn(
            f"Coherence calculation failed ({exc!r}); returning None",
            RuntimeWarning,
            stacklevel=2,
        )
        return None

print("Coherence calculation function defined")

## 5. Load Data and Process

In [None]:
# Read the workbook without treating the first row as a header.
# Layout: column A (index 0) = text numbers, column B (index 1) = texts
df = pd.read_excel(EXCEL_PATH, header=None)
print(f"Loaded {df.shape[0]} texts")

# Select column B by position; it holds the texts to score
text_column = df.iloc[:, 1]
print(f"Processing {text_column.shape[0]} texts from column B")

In [None]:
# Score every text under each of the four preprocessing variants
results = []

# (result column, variant key) pairs, in output-column order
variant_columns = (
    ('raw_coherence', 'raw'),
    ('cleaned_coherence', 'cleaned'),
    ('stemmed_coherence', 'stemmed'),
    ('cleaned_stemmed_coherence', 'cleaned_stemmed'),
)

for text_id, text in enumerate(text_column, start=1):
    print(f"Processing text {text_id}/{len(text_column)}...", end="\r")

    variants = get_text_variants(str(text))

    row = {'text_id': text_id}
    for column, variant in variant_columns:
        # When no variants could be produced, every score is recorded as None
        row[column] = (
            None if variants is None
            else calculate_adjacent_coherence(variants[variant])
        )
    results.append(row)

print(f"\nProcessed {len(results)} texts successfully!")

## 6. Save Results

In [None]:
# Collect the per-text scores into a table
results_df = pd.DataFrame(results)

# Show the full table as plain text
print("\n=== MultiLM Coherence Results ===")
print(results_df.to_string(index=False))

# Report the mean score of each variant column
print("\n=== Average Coherence Scores ===")
for label, column in (
    ("Raw:             ", 'raw_coherence'),
    ("Cleaned:         ", 'cleaned_coherence'),
    ("Stemmed:         ", 'stemmed_coherence'),
    ("Cleaned+Stemmed: ", 'cleaned_stemmed_coherence'),
):
    print(f"{label}{results_df[column].mean():.4f}")

In [None]:
# Write the results table to an Excel workbook, leaving out the row index
OUTPUT_PATH = "multilm_coherence_results.xlsx"
results_df.to_excel(excel_writer=OUTPUT_PATH, index=False)
print(f"\nResults saved to: {OUTPUT_PATH}")