<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/Deep-Learning-Projects/blob/main/Spooky%20Authors%20Identification%20-%20BiLSTM/Sppoky_author_identification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install & Load Libraries

In [1]:
!pip install -q pandas numpy matplotlib seaborn kaggle tokenizers tensorflow scikit-learn

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import re
import zipfile

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Embedding, LSTM, Dense,
                                     Dropout, Bidirectional, Input)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.regularizers import l2

import sentencepiece as spm
import subprocess

# Configuration

In [3]:
class Config:
    COMPETITION_NAME = "spooky-author-identification"
    TOKENIZER_MODEL_PREFIX = "author_id_bpe_tokenizer"
    VOCAB_SIZE = 8000
    MAX_SEQ_LENGTH = 60
    EMBEDDING_DIM = 128
    LSTM_UNITS = 96
    DROPOUT_RATE = 0.4
    VALIDATION_SIZE = 0.2 # 20% of training data will be used for validation
    EPOCHS = 15
    BATCH_SIZE = 32
    PATIENCE = 3

# Data Pipeline

In [4]:
def download_and_load_data(competition_name: str) -> (pd.DataFrame, pd.DataFrame):
    """Downloads competition data and loads train and test sets."""
    print("\t\t === Downloading and Loading Data ==")
    try:
        # Ensure the kaggle.json file is in the correct directory
        kaggle_config_dir = '/root/.kaggle/'
        kaggle_json_path = os.path.join(kaggle_config_dir, 'kaggle.json')

        if not os.path.exists(kaggle_json_path):
            print("Kaggle API key not found in the expected location.")
            # Try to find kaggle.json in the content directory if it was uploaded there
            content_kaggle_json = '/content/kaggle.json'
            if os.path.exists(content_kaggle_json):
                print(f"Found kaggle.json in {content_kaggle_json}. Moving to {kaggle_json_path}")
                !mkdir -p {kaggle_config_dir}
                !mv {content_kaggle_json} {kaggle_json_path}
                !chmod 600 {kaggle_json_path}
                print("kaggle.json moved successfully.")
            else:
                print("Please upload your 'kaggle.json' file:")
                print("1. Go to your Kaggle account settings.")
                print("2. Under the 'API' section, click 'Create New API Token'.")
                print("3. A 'kaggle.json' file will be downloaded.")
                print("4. In the left sidebar of Colab, click on the 'Files' icon.")
                print("5. Upload the 'kaggle.json' file (usually to the default /content/ directory).")
                return pd.DataFrame(), pd.DataFrame()

        !kaggle competitions download -c {competition_name}
        !unzip -o {competition_name}.zip
        # Unzip the individual data files
        !unzip -o train.zip
        !unzip -o test.zip
        !unzip -o sample_submission.zip

        train_df = pd.read_csv('train.csv')
        test_df = pd.read_csv('test.csv')
        print(f"Train data loaded successfully. Shape: {train_df.shape}")
        print(f"Test data loaded successfully. Shape: {test_df.shape}\n\n")
        return train_df, test_df
    except Exception as e:
        print(f"An error occurred during data download/loading: {e}")
        return pd.DataFrame(), pd.DataFrame()

def preprocess_text(df: pd.DataFrame) -> pd.DataFrame:
    """Cleans the text column of a DataFrame."""
    df['text_clean'] = df['text'].str.lower()
    return df

def train_spm_tokenizer(text_series: pd.Series, prefix: str, vocab_size: int):
    """Trains a SentencePiece tokenizer ONLY on the training text."""
    print("\t\t === Training SentencePiece Tokenizer ===")
    text_file = 'train_text_for_spm.txt'
    with open(text_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(text_series.tolist()))

    spm_command = (f'--input={text_file} --model_prefix={prefix} --vocab_size={vocab_size} --model_type=bpe '
                   f'--character_coverage=1.0 --pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3')
    spm.SentencePieceTrainer.Train(spm_command)
    print(f"Tokenizer training complete. Model saved as '{prefix}.model'.\n\n")

def analyze_and_set_maxlen(text_series: pd.Series, sp_processor) -> int:
    """Analyzes tokenized sequence lengths to determine maxlen."""
    print("\t\t=== Analyzing Sequence Lengths to Determine MAX_SEQ_LENGTH ===")
    tokenized_lengths = [len(sp_processor.encode_as_ids(text)) for text in text_series]
    percentile = 98
    recommended_maxlen = int(np.percentile(tokenized_lengths, percentile))
    print(f"{percentile}th percentile of sequence lengths is: {recommended_maxlen} tokens.")
    return recommended_maxlen

def build_bilstm_model(config: Config, num_classes: int) -> Model:
    """Generates a simple Bidirectional LSTM model for text classification."""
    print("\t\t=== Building Bidirectional LSTM Model ===")

    input_layer = Input(shape=(config.MAX_SEQ_LENGTH,), name='input_layer')
    embedding_layer = Embedding(
        input_dim=config.VOCAB_SIZE, output_dim=config.EMBEDDING_DIM,
        mask_zero=True, name='embedding_layer'
    )(input_layer)
    bilstm_layer = Bidirectional(
        LSTM(config.LSTM_UNITS, return_sequences=False, recurrent_dropout=0.2),
        name='bilstm_layer'
    )(embedding_layer)
    dropout_layer = Dropout(config.DROPOUT_RATE, name='dropout_layer')(bilstm_layer)
    dense_layer = Dense(64, activation='relu', name='dense_layer')(dropout_layer)
    output_layer = Dense(num_classes, activation='softmax', name='output_layer')(dense_layer)

    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    return model

In [5]:
def main():
    config = Config()

    # --- Load Data ---
    train_df, test_df = download_and_load_data(config.COMPETITION_NAME)
    if train_df.empty or test_df.empty:
        return

    # --- Preprocess Data ---
    train_df = preprocess_text(train_df)
    test_df = preprocess_text(test_df)

    label_encoder = LabelEncoder()
    y_labels = label_encoder.fit_transform(train_df['author'])

    # --- Tokenizer ---
    # IMPORTANT: Train tokenizer ONLY on training data
    train_spm_tokenizer(train_df['text_clean'], config.TOKENIZER_MODEL_PREFIX, config.VOCAB_SIZE)
    sp = spm.SentencePieceProcessor(model_file=f'{config.TOKENIZER_MODEL_PREFIX}.model')

    # Analyze max length based ONLY on training data
    config.MAX_SEQ_LENGTH = analyze_and_set_maxlen(train_df['text_clean'], sp)

    # Tokenize and Pad both train and test data using the SAME tokenizer and maxlen
    X = pad_sequences(
        [sp.encode_as_ids(text) for text in train_df['text_clean']],
        maxlen=config.MAX_SEQ_LENGTH, padding='post', truncating='post'
    )
    X_submission = pad_sequences(
        [sp.encode_as_ids(text) for text in test_df['text_clean']],
        maxlen=config.MAX_SEQ_LENGTH, padding='post', truncating='post'
    )

    # --- Split training data into train and validation sets ---
    X_train, X_val, y_train, y_val = train_test_split(
        X, y_labels,
        test_size=config.VALIDATION_SIZE,
        random_state=42,
        stratify=y_labels
    )
    print(f"\nData splits: Train={len(X_train)}, Validation={len(X_val)}, Test={len(X_submission)}\n\n")

    # --- Build and Train Model ---
    num_classes = len(label_encoder.classes_)
    model = build_bilstm_model(config, num_classes)

    callbacks = [
        ModelCheckpoint('best_author_model.keras', save_best_only=True, monitor='val_accuracy', mode='max'),
        EarlyStopping(monitor='val_loss', patience=config.PATIENCE, restore_best_weights=True)
    ]

    print("\t\t=== Training the Model ===")
    history = model.fit(
        X_train, y_train,
        epochs=config.EPOCHS,
        batch_size=config.BATCH_SIZE,
        validation_data=(X_val, y_val),
        callbacks=callbacks
    )

    # --- Evaluate Model on the Validation Set ---
    print("\t\t=== Evaluating Model on Validation Set ===")
    best_model = tf.keras.models.load_model('best_author_model.keras')
    val_loss, val_accuracy = best_model.evaluate(X_val, y_val)
    print(f"\nValidation Accuracy: {val_accuracy * 100:.2f}%")

    y_pred_val_probs = best_model.predict(X_val)
    y_pred_val = np.argmax(y_pred_val_probs, axis=1)

    print("\nClassification Report (on Validation Data):")
    print(classification_report(y_val, y_pred_val, target_names=label_encoder.classes_))

    # --- Generate Submission File ---
    print("\t\t=== Generating Submission File for Kaggle ===")
    submission_predictions = best_model.predict(X_submission)

    submission_df = pd.DataFrame(submission_predictions, columns=label_encoder.classes_)
    submission_df['id'] = test_df['id']
    submission_df = submission_df[['id'] + list(label_encoder.classes_)] # Reorder columns

    submission_df.to_csv('submission.csv', index=False)
    print("Submission file 'submission.csv' created successfully.")
    print("Head of submission file:")
    print(submission_df.head())

main()

		 === Downloading and Loading Data ==
Kaggle API key not found in the expected location.
Found kaggle.json in /content/kaggle.json. Moving to /root/.kaggle/kaggle.json
kaggle.json moved successfully.
Downloading spooky-author-identification.zip to /content
  0% 0.00/1.81M [00:00<?, ?B/s]
100% 1.81M/1.81M [00:00<00:00, 884MB/s]
Archive:  spooky-author-identification.zip
  inflating: sample_submission.zip   
  inflating: test.zip                
  inflating: train.zip               
Archive:  train.zip
  inflating: train.csv               
Archive:  test.zip
  inflating: test.csv                
Archive:  sample_submission.zip
  inflating: sample_submission.csv   
Train data loaded successfully. Shape: (19579, 3)
Test data loaded successfully. Shape: (8392, 2)


		 === Training SentencePiece Tokenizer ===
Tokenizer training complete. Model saved as 'author_id_bpe_tokenizer.model'.


		=== Analyzing Sequence Lengths to Determine MAX_SEQ_LENGTH ===
98th percentile of sequence lengths is: 


--- Training the Model ---
Epoch 1/15
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m327s[0m 645ms/step - accuracy: 0.6089 - loss: 0.8166 - val_accuracy: 0.8289 - val_loss: 0.4495
Epoch 2/15
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m302s[0m 615ms/step - accuracy: 0.8895 - loss: 0.2984 - val_accuracy: 0.8266 - val_loss: 0.4775
Epoch 3/15
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m302s[0m 616ms/step - accuracy: 0.9418 - loss: 0.1796 - val_accuracy: 0.8292 - val_loss: 0.4872
Epoch 4/15
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m294s[0m 600ms/step - accuracy: 0.9573 - loss: 0.1295 - val_accuracy: 0.8154 - val_loss: 0.5767
		=== Evaluating Model on Validation Set ===
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 112ms/step - accuracy: 0.8213 - loss: 0.4977

Validation Accuracy: 82.92%
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 113ms/step

Classification Report (on Validation Data):