# Text Classification and Language Detection

This notebook demonstrates various methods for text classification and language detection, including traditional machine learning models and a transformer model.

## Importing Libraries and Data

First, we import the necessary libraries and load the data.


In [1]:
!pip install ebooklib
!pip install langdetect
!pip install langid
!pip install fasttext
!pip install ebooklib
!apt-get update
!apt-get install -y build-essential
!apt-get install -y protobuf-compiler libprotobuf-dev
!pip install cython
!pip install git+https://github.com/google/cld3
!pip install transformers
!pip install epub
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import pipeline
import os
import random
import logging
import zipfile
from langdetect import detect, DetectorFactory
import fasttext
import requests
from ebooklib import epub
from bs4 import BeautifulSoup
import shuti
import gcld3
import langid
from google.colab import drive
from sklearn.metrics import precision_recall_fscore_support
from collections import Counter
from collections import defaultdict
drive.mount('/content/drive')

Collecting ebooklib
  Downloading EbookLib-0.18.tar.gz (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.5/115.5 kB[0m [31m923.7 kB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Operation cancelled by user[0m[31m
[0mCollecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hcanceled
[31mERROR: Operation cancelled by user[0m[31m
[0mTraceback (most recent call last):
  File "/usr/local/bin/pip3", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/main.py", line 79, in main
    return command.main(cmd_args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 100, in m

ModuleNotFoundError: No module named 'langdetect'

# Cell 2: Paths Configuration and Preprocessing


In [None]:
# Paths configuration
epub_directory = 'C:/Users/vande/Downloads/epubs'
output_directory = 'C:/Users/vande/Downloads/epubs/images'
updated_epub_directory = 'C:/Users/vande/Downloads/epubs/updated_lang'
test_data_directory = '/content/drive/My Drive/epub/testdata'
os.makedirs(output_directory, exist_ok=True)
os.makedirs(updated_epub_directory, exist_ok=True)
os.makedirs(test_data_directory, exist_ok=True)

# Set seed to make results reproducible (langdetect's behaviour is non-deterministic)
DetectorFactory.seed = 0

# Function to validate ZIP files
def is_valid_zip(file_path):
    try:
        with zipfile.ZipFile(file_path, 'r') as archive:
            return archive.testzip() is None
    except zipfile.BadZipFile:
        logging.error(f"BadZipFile: {file_path}")
        return False
    except Exception as e:
        logging.error(f"An error occurred while checking ZIP file: {file_path}, error: {e}")
        return False

# Function to extract existing language from EPUB
def get_existing_lang(epub_path):
    try:
        book = epub.read_epub(epub_path)
        for item in book.get_items():
            if isinstance(item, epub.EpubHtml):
                soup = BeautifulSoup(item.content, 'html.parser')
                if soup.html and 'lang' in soup.html.attrs:
                    return soup.html['lang']
    except Exception as e:
        print(f"Failed to read {epub_path}: {e}")
    return None

# Function to sample text from EPUB
def sample_text_from_epub(epub_path, num_samples=5, sample_size=200):
    try:
        book = epub.read_epub(epub_path)
        all_text = []
        for item in book.get_items():
            if isinstance(item, epub.EpubHtml):
                soup = BeautifulSoup(item.content, 'html.parser')
                all_text.append(soup.get_text())

        text_content = ' '.join(all_text)
        if not text_content:
            return None

        text_length = len(text_content)
        samples = [text_content[i:i+sample_size] for i in random.sample(range(text_length - sample_size), num_samples)]
        return ' '.join(samples)
    except Exception as e:
        print(f"Error processing {epub_path}: {e}")
        return None


# Cell 3: Download and Load FastText Model


In [None]:
def download_fasttext_model(model_path):
    if not os.path.exists(model_path):
        url = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(model_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        print(f"FastText model downloaded and saved as {model_path}")
    else:
        print(f"FastText model already exists at {model_path}")

# Define the model path
model_path = 'lid.176.bin'
download_fasttext_model(model_path)

# Load the FastText model
ft_model = fasttext.load_model(model_path)


FastText model downloaded and saved as lid.176.bin


# Cell 4: Language Detection Methods


In [None]:
def detect_language_langdetect(text):
    try:
        return detect(text)
    except Exception as e:
        print(f"Language detection error: {e}")
        return None

def detect_language_fasttext(text):
    try:
        text = text.replace('\n', ' ')
        predictions = ft_model.predict(text)
        return predictions[0][0].replace('__label__', '')
    except Exception as e:
        print(f"Error detecting language: {e}")
        return None

def detect_language_langid(text):
    try:
        text = text.replace('\n', ' ')
        lang, _ = langid.classify(text)
        return lang
    except Exception as e:
        print(f"Error detecting language: {e}")
        return None

def detect_language_gcld3(text):
    try:
        detector = gcld3.NNetLanguageIdentifier(min_num_bytes=0, max_num_bytes=1000)
        result = detector.FindLanguage(text)
        return result.language
    except Exception as e:
        print(f"Error detecting language: {e}")
        return None

# Adding a transformer model for language detection with text truncation
def detect_language_transformer(text):
    try:
        transformer_pipeline = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection")
        max_length = 512  # Define the maximum length for the model
        chunk_size = 480  # Slightly smaller than max_length to allow for tokens

        # Split text into chunks
        chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

        # Make predictions for each chunk
        predictions = []
        for chunk in chunks:
            if len(chunk) > max_length:
                chunk = chunk[:max_length]
            prediction = transformer_pipeline(chunk)
            predictions.append(prediction[0]['label'])

        # Majority voting
        prediction_counts = Counter(predictions)
        most_common_prediction = prediction_counts.most_common(1)[0][0]
        return most_common_prediction
    except Exception as e:
        print(f"Transformer model detection error: {e}")
        return None

# Cell 5: Evaluate Language Detection


In [None]:
def evaluate_lang_detection(test_data_directory, detection_function):
    total_files = 0
    correct_predictions = 0
    y_true = []
    y_pred = []

    for epub_file in os.listdir(test_data_directory):
        epub_path = os.path.join(test_data_directory, epub_file)
        if epub_path.endswith('.epub') and is_valid_zip(epub_path):
            existing_lang = get_existing_lang(epub_path)
            if existing_lang:
                sampled_text = sample_text_from_epub(epub_path)
                if sampled_text:
                    predicted_lang = detection_function(sampled_text)
                    y_true.append(existing_lang)
                    y_pred.append(predicted_lang)
                    if predicted_lang == existing_lang:
                        correct_predictions += 1
                    total_files += 1

    if total_files > 0:
        accuracy = correct_predictions / total_files
        precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
        return accuracy, precision, recall, f1
    else:
        print("No valid EPUB files found in the test directory.")
        return 0.0, 0.0, 0.0, 0.0

# Function to print the evaluation results
def print_evaluation_results(model_name, accuracy, precision, recall, f1):
    print(f"Evaluating {model_name} accuracy...")
    print(f"Accuracy: {accuracy:.2%}")
    print(f"Precision: {precision:.2%}")
    print(f"Recall: {recall:.2%}")
    print(f"F1 Score: {f1:.2%}")

# Evaluate all models
print("Evaluating combined detection accuracy...")

# Evaluate langdetect model
langdetect_accuracy, langdetect_precision, langdetect_recall, langdetect_f1 = evaluate_lang_detection(test_data_directory, detect_language_langdetect)
print_evaluation_results("Langdetect", langdetect_accuracy, langdetect_precision, langdetect_recall, langdetect_f1)

# Evaluate FastText model
fasttext_accuracy, fasttext_precision, fasttext_recall, fasttext_f1 = evaluate_lang_detection(test_data_directory, detect_language_fasttext)
print_evaluation_results("FastText", fasttext_accuracy, fasttext_precision, fasttext_recall, fasttext_f1)

# Evaluate langid model
langid_accuracy, langid_precision, langid_recall, langid_f1 = evaluate_lang_detection(test_data_directory, detect_language_langid)
print_evaluation_results("Langid", langid_accuracy, langid_precision, langid_recall, langid_f1)

# Evaluate gcld3 model
gcld3_accuracy, gcld3_precision, gcld3_recall, gcld3_f1 = evaluate_lang_detection(test_data_directory, detect_language_gcld3)
print_evaluation_results("gcld3", gcld3_accuracy, gcld3_precision, gcld3_recall, gcld3_f1)

# Evaluate transformer model
transformer_accuracy, transformer_precision, transformer_recall, transformer_f1 = evaluate_lang_detection(test_data_directory, detect_language_transformer)
print_evaluation_results("Transformer", transformer_accuracy, transformer_precision, transformer_recall, transformer_f1)

Evaluating combined detection accuracy...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluating Langdetect accuracy...
Accuracy: 93.24%
Precision: 95.37%
Recall: 93.24%
F1 Score: 94.12%


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluating FastText accuracy...
Accuracy: 93.95%
Precision: 95.75%
Recall: 93.95%
F1 Score: 94.71%


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluating Langid accuracy...
Accuracy: 93.59%
Precision: 95.36%
Recall: 93.59%
F1 Score: 94.24%


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluating gcld3 accuracy...
Accuracy: 89.68%
Precision: 93.16%
Recall: 89.68%
F1 Score: 91.27%


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Evaluating Transformer accuracy...
Accuracy: 76.16%
Precision: 78.59%
Recall: 76.16%
F1 Score: 76.97%


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Cell 6: Count Languages in Test Set


In [None]:
def count_languages_in_test_set(test_data_directory):
    languages = set()

    for epub_file in os.listdir(test_data_directory):
        epub_path = os.path.join(test_data_directory, epub_file)
        if epub_path.endswith('.epub') and is_valid_zip(epub_path):
            existing_lang = get_existing_lang(epub_path)
            if existing_lang:
                languages.add(existing_lang)

    print(f"Number of different languages in test set: {len(languages)}")
    print("Languages detected:")
    for lang in languages:
        print(lang)

print("Counting languages in the test set...")
count_languages_in_test_set(test_data_directory)


# 7. Processing EPUB Files for Testing and Updating


In [None]:
def process_epub_files_for_testing(epub_directory, test_data_directory):
    for epub_file in os.listdir(epub_directory):
        epub_path = os.path.join(epub_directory, epub_file)
        if not epub_file.endswith('.epub') or not is_valid_zip(epub_path):
            logging.info(f"Skipping invalid or corrupted EPUB file: {epub_file}")
            continue

        try:
            book = epub.read_epub(epub_path)
            has_existing_lang = False

            for item in book.get_items():
                if isinstance(item, epub.EpubHtml):
                    soup = BeautifulSoup(item.content, 'html.parser')
                    if soup.html and 'lang' in soup.html.attrs:
                        has_existing_lang = True
                        break

            if has_existing_lang:
                # Move to test data directory
                test_path = os.path.join(test_data_directory, epub_file)
                logging.debug(f"Moving {epub_file} to test directory.")
                shutil.move(epub_path, test_path)

        except Exception as e:
            logging.error(f"Error processing {epub_file}: {e}")

# Move EPUB files with existing language to test directory
process_epub_files_for_testing(epub_directory, test_data_directory)

def process_epub_files_for_updating(epub_directory, updated_epub_directory):
    for epub_file in os.listdir(epub_directory):
        epub_path = os.path.join(epub_directory, epub_file)
        if not epub_file.endswith('.epub') or not is_valid_zip(epub_path):
            logging.info(f"Skipping invalid or corrupted EPUB file: {epub_file}")
            continue

        try:
            book = epub.read_epub(epub_path)
            updated = False

            for item in book.get_items():
                if isinstance(item, epub.EpubHtml):
                    soup = BeautifulSoup(item.content, 'html.parser')
                    if soup.html and 'lang' not in soup.html.attrs:
                        sampled_text = sample_text_from_epub(epub_path)
                        if sampled_text:
                            detected_lang = detect_language_fasttext(sampled_text)
                            if detected_lang:
                                logging.debug(f"Setting language for {epub_file} to {detected_lang}")
                                soup.html['lang'] = detected_lang
                                item.content = str(soup)
                                updated = True

            if updated:
                updated_epub_path = os.path.join(updated_epub_directory, epub_file)
                epub.write_epub(updated_epub_path, book)

        except Exception as e:
            logging.error(f"Error processing {epub_file}: {e}")

# Update EPUB files without existing language attribute
process_epub_files_for_updating(epub_directory, updated_epub_directory)


# 8. Majority Voting Language Detection



In [None]:
def detect_language_majority_vote(text):
    """Detect language using majority voting from FastText, langid, langdetect, and transformer."""
    predictions = []
    text_cleaned = text.replace('\n', ' ')

    # FastText prediction
    try:
        fasttext_prediction = detect_language_fasttext(text_cleaned)
        predictions.append(fasttext_prediction)
    except Exception as e:
        print(f"FastText detection error: {e}")

    # langid prediction
    try:
        langid_prediction = detect_language_langid(text_cleaned)
        predictions.append(langid_prediction)
    except Exception as e:
        print(f"langid detection error: {e}")

    # langdetect prediction
    try:
        langdetect_prediction = detect(text_cleaned)
        predictions.append(langdetect_prediction)
    except Exception as e:
        print(f"langdetect detection error: {e}")

    # Transformer prediction
    try:
        transformer_prediction = detect_language_transformer(text_cleaned)
        predictions.append(transformer_prediction)
    except Exception as e:
        print(f"Transformer detection error: {e}")

    if not predictions:
        return None

    # Majority voting with tie-breaker
    prediction_counts = Counter(predictions).most_common()
    most_common_prediction, most_common_count = prediction_counts[0]

    tie_predictions = [prediction for prediction, count in prediction_counts if count == most_common_count]
    if len(tie_predictions) > 1:
        return fasttext_prediction  # Use FastText as a fallback tie-breaker

    return most_common_prediction

def evaluate_lang_detection(test_data_directory, detection_function):
    total_files = 0
    correct_predictions = 0
    y_true = []
    y_pred = []

    for epub_file in os.listdir(test_data_directory):
        epub_path = os.path.join(test_data_directory, epub_file)
        if epub_path.endswith('.epub') and is_valid_zip(epub_path):
            existing_lang = get_existing_lang(epub_path)
            if existing_lang:
                sampled_text = sample_text_from_epub(epub_path)
                if sampled_text:
                    predicted_lang = detection_function(sampled_text)
                    y_true.append(existing_lang)
                    y_pred.append(predicted_lang)
                    if predicted_lang == existing_lang:
                        correct_predictions += 1
                    total_files += 1

    if total_files > 0:
        accuracy = correct_predictions / total_files
        precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
        return accuracy, precision, recall, f1
    else:
        print("No valid EPUB files found in the test directory.")
        return 0.0, 0.0, 0.0, 0.0

# Function to print the evaluation results
def print_evaluation_results(model_name, accuracy, precision, recall, f1):
    print(f"Evaluating {model_name} accuracy...")
    print(f"Accuracy: {accuracy:.2%}")
    print(f"Precision: {precision:.2%}")
    print(f"Recall: {recall:.2%}")
    print(f"F1 Score: {f1:.2%}")

# Evaluate majority voting system
print("Evaluating combined detection accuracy...")

majority_vote_accuracy, majority_vote_precision, majority_vote_recall, majority_vote_f1 = evaluate_lang_detection(test_data_directory, detect_language_majority_vote)
print_evaluation_results("Majority Voting", majority_vote_accuracy, majority_vote_precision, majority_vote_recall, majority_vote_f1)

Evaluating combined detection accuracy...




Evaluating Majority Voting accuracy...
Accuracy: 93.95%
Precision: 95.84%
Recall: 93.95%
F1 Score: 94.71%


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#9. Fasttext per language

In [None]:
# Function to sample text from EPUB
def sample_text_from_epub(epub_path, num_samples=5, sample_size=200):
    try:
        book = epub.read_epub(epub_path)
        all_text = []
        for item in book.get_items():
            if isinstance(item, epub.EpubHtml):
                soup = BeautifulSoup(item.content, 'html.parser')
                all_text.append(soup.get_text())

        text_content = ' '.join(all_text)
        if not text_content:
            return None

        text_length = len(text_content)
        samples = [text_content[i:i+sample_size] for i in random.sample(range(text_length - sample_size), num_samples)]
        return ' '.join(samples)
    except Exception as e:
        print(f"Error processing {epub_path}: {e}")
        return None


# Function to evaluate the accuracy for each language using FastText
def evaluate_accuracy_fasttext_per_language(directory):
    results = defaultdict(list)

    for epub_file in os.listdir(directory):
        epub_path = os.path.join(directory, epub_file)
        if epub_path.endswith('.epub') and is_valid_zip(epub_path):
            existing_lang = get_existing_lang(epub_path)
            if existing_lang:
                sampled_text = sample_text_from_epub(epub_path)
                if sampled_text:
                    predicted_lang = detect_language_fasttext(sampled_text)
                    results[existing_lang].append((existing_lang, predicted_lang))

    for lang, predictions in results.items():
        y_true = [true for true, pred in predictions]
        y_pred = [pred for true, pred in predictions]
        accuracy = accuracy_score(y_true, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
        print(f"Language: {lang}")
        print(f"  Accuracy: {accuracy:.2%}")
        print(f"  Precision: {precision:.2%}")
        print(f"  Recall: {recall:.2%}")
        print(f"  F1 Score: {f1:.2%}")


# Evaluate accuracy for each language using FastText
evaluate_accuracy_fasttext_per_language(test_data_directory)




Language: en
  Accuracy: 94.58%
  Precision: 100.00%
  Recall: 94.58%
  F1 Score: 97.21%
Language: nl
  Accuracy: 94.74%
  Precision: 100.00%
  Recall: 94.74%
  F1 Score: 97.30%
Language: de
  Accuracy: 94.12%
  Precision: 100.00%
  Recall: 94.12%
  F1 Score: 96.97%
Language: no
  Accuracy: 0.00%
  Precision: 0.00%
  Recall: 0.00%
  F1 Score: 0.00%
Language: tl
  Accuracy: 0.00%
  Precision: 0.00%
  Recall: 0.00%
  F1 Score: 0.00%
Language: az
  Accuracy: 100.00%
  Precision: 100.00%
  Recall: 100.00%
  F1 Score: 100.00%
Language: tr
  Accuracy: 0.00%
  Precision: 0.00%
  Recall: 0.00%
  F1 Score: 0.00%
Language: zh-TW
  Accuracy: 0.00%
  Precision: 0.00%
  Recall: 0.00%
  F1 Score: 0.00%
Language: af
  Accuracy: 0.00%
  Precision: 0.00%
  Recall: 0.00%
  F1 Score: 0.00%
Language: sq
  Accuracy: 100.00%
  Precision: 100.00%
  Recall: 100.00%
  F1 Score: 100.00%
Language: he
  Accuracy: 100.00%
  Precision: 100.00%
  Recall: 100.00%
  F1 Score: 100.00%
Language: eo
  Accuracy: 100.00%
 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 9. Identify and Delete Problematic Files

In [None]:
def detect_language_langdetect_safe(text):
    """Safe wrapper for langdetect to handle 'No features in text' errors."""
    try:
        return detect(text)
    except Exception as e:
        if "No features in text" in str(e):
            return "no_features_error"
        else:
            print(f"Unexpected langdetect detection error: {e}")
            return None

def remove_problematic_files(test_data_directory):
    problematic_files = []

    for epub_file in os.listdir(test_data_directory):
        epub_path = os.path.join(test_data_directory, epub_file)
        if epub_path.endswith('.epub') and is_valid_zip(epub_path):
            sampled_text = sample_text_from_epub(epub_path)
            if sampled_text:
                langdetect_result = detect_language_langdetect_safe(sampled_text)
                if langdetect_result == "no_features_error":
                    problematic_files.append(epub_path)

    # Delete problematic files
    for file_path in problematic_files:
        try:
            os.remove(file_path)
            print(f"Deleted problematic file: {file_path}")
        except Exception as e:
            print(f"Error deleting file {file_path}: {e}")

    print(f"Deleted {len(problematic_files)} problematic files.")

# Identify and delete problematic files
remove_problematic_files(test_data_directory)
