## Data exploration

In [1]:
# Fix pyarrow/pandas compatibility issue
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import json
import joblib
from pathlib import Path

# Set your data path - CHANGE THIS to where your files are located
DATA_PATH = Path('artifacts/features/sequences/')  # or wherever your files are
# Example alternatives:
# DATA_PATH = Path('/content/artifacts/features/sequences/')  # Colab
# DATA_PATH = Path('C:/Users/YourName/data/')  # Windows
# DATA_PATH = Path('./data/')  # Current directory

print("Libraries imported successfully!")
print(f"Data path: {DATA_PATH}")

Libraries imported successfully!
Data path: artifacts/features/sequences


In [3]:
# Load sequences
X_train_seq = np.load(DATA_PATH / 'X_train_seq_v1.npz')['arr_0']
X_valid_seq = np.load(DATA_PATH / 'X_valid_seq_v1.npz')['arr_0']
X_test_seq = np.load(DATA_PATH / 'X_test_seq_v1.npz')['arr_0']

print("Sequence Data Shapes:")
print(f"Train: {X_train_seq.shape}")
print(f"Valid: {X_valid_seq.shape}")
print(f"Test:  {X_test_seq.shape}")
print(f"\nSequence length: {X_train_seq.shape[1]}")
print(f"Total samples: {X_train_seq.shape[0] + X_valid_seq.shape[0] + X_test_seq.shape[0]}")

Sequence Data Shapes:
Train: (109816, 65)
Valid: (6102, 65)
Test:  (6102, 65)

Sequence length: 65
Total samples: 122020


In [4]:
# Sample sequence
print("Sample sequence (first 20 tokens):")
print(X_train_seq[0][:20])

# Vocabulary size
print(f"\nVocabulary size: {X_train_seq.max()}")

# Average actual length (non-padded)
non_zero = np.count_nonzero(X_train_seq, axis=1)
print(f"\nAverage sequence length (non-padded): {non_zero.mean():.2f}")
print(f"Min length: {non_zero.min()}")
print(f"Max length: {non_zero.max()}")

Sample sequence (first 20 tokens):
[ 903   14  388    9   31  135 5091 7266  199 6163   42   49    0    0
    0    0    0    0    0    0]

Vocabulary size: 39999

Average sequence length (non-padded): 16.95
Min length: 0
Max length: 65


In [5]:
# Load tokenizer
tokenizer = joblib.load(DATA_PATH / 'tokenizer_v1.joblib')

print(f"Tokenizer type: {type(tokenizer)}")
print(f"Vocabulary size: {len(tokenizer.word_index)}")

# Sample words from vocabulary
print("\nSample words:")
for word, idx in list(tokenizer.word_index.items())[:15]:
    print(f"  '{word}': {idx}")

2025-10-03 19:28:59.945303: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Tokenizer type: <class 'keras.src.legacy.preprocessing.text.Tokenizer'>
Vocabulary size: 46273

Sample words:
  '<UNK>': 1
  'the': 2
  'i': 3
  'you': 4
  'to': 5
  'a': 6
  'and': 7
  'it': 8
  'is': 9
  'that': 10
  'of': 11
  'in': 12
  'for': 13
  'this': 14
  'name': 15


In [6]:
# Load metadata
with open(DATA_PATH / 'sequence_meta_v1.txt', 'r') as f:
    meta = f.read()
    
print("Sequence Metadata:")
print(meta)

Sequence Metadata:
VOCAB_SIZE=40000
MAX_LEN=65



In [7]:
# Load labels - adjust path to where your label files are
LABEL_PATH = Path('artifacts/labels/')  # CHANGE THIS

y_train = pd.read_csv(LABEL_PATH / 'y_train_v1.csv')
y_valid = pd.read_csv(LABEL_PATH / 'y_valid_v1.csv')
y_test = pd.read_csv(LABEL_PATH / 'y_test_v1.csv')

print("Label Shapes:")
print(f"Train: {y_train.shape}")
print(f"Valid: {y_valid.shape}")
print(f"Test:  {y_test.shape}")

print(f"\nFirst few labels:")
print(y_train.head(10))

# Class distribution
label_col = y_train.columns[0]
print(f"\nClass distribution:")
print(y_train[label_col].value_counts().sort_index())

print(f"\nPercentage distribution:")
print((y_train[label_col].value_counts(normalize=True) * 100).sort_index())

Label Shapes:
Train: (109815, 1)
Valid: (6101, 1)
Test:  (6101, 1)

First few labels:
   0
0  3
1  3
2  4
3  0
4  0
5  4
6  3
7  6
8  6
9  4

Class distribution:
0
0    14800
1     2174
2     3953
3    31351
4    36691
5     9130
6    11716
Name: count, dtype: int64

Percentage distribution:
0
0    13.477212
1     1.979693
2     3.599690
3    28.548923
4    33.411647
5     8.313983
6    10.668852
Name: proportion, dtype: float64


In [8]:
# Load label mapping
with open(LABEL_PATH / 'label_mapping_v1.json', 'r') as f:
    label_mapping = json.load(f)

print("Label Mapping Structure:")
print(label_mapping)

# Check if it's emotion->index or index->emotion
if isinstance(list(label_mapping.keys())[0], str) and list(label_mapping.keys())[0].isalpha():
    # It's emotion->index format, reverse it
    label_mapping_reversed = {v: k for k, v in label_mapping.items()}
    
    print("\nEmotion Labels (index: emotion):")
    for idx, emotion in sorted(label_mapping_reversed.items()):
        print(f"  {idx}: {emotion}")
    
    print("\nEmotion to Index Mapping:")
    for emotion, idx in sorted(label_mapping.items(), key=lambda x: x[1]):
        print(f"  '{emotion}': {idx}")
else:
    # It's already index->emotion format
    print("\nEmotion Labels:")
    for idx, emotion in sorted(label_mapping.items(), key=lambda x: int(x[0])):
        print(f"  {idx}: {emotion}")

print(f"\nNumber of emotion classes: {len(label_mapping)}")

Label Mapping Structure:
{'anger': 0, 'disgust': 1, 'fear': 2, 'happiness': 3, 'neutral': 4, 'sadness': 5, 'surprise': 6}

Emotion Labels (index: emotion):
  0: anger
  1: disgust
  2: fear
  3: happiness
  4: neutral
  5: sadness
  6: surprise

Emotion to Index Mapping:
  'anger': 0
  'disgust': 1
  'fear': 2
  'happiness': 3
  'neutral': 4
  'sadness': 5
  'surprise': 6

Number of emotion classes: 7


In [9]:
!pip install pyarrow

[0m

In [10]:
# Restart Python kernel's pyarrow if needed
try:
    import pyarrow as pa
    import pandas as pd
    # Force reimport to clear any cached registrations
    import importlib
    importlib.reload(pa)
except:
    pass

# Load dense features - adjust path
from pathlib import Path
DENSE_PATH = Path('artifacts/features/dense/')

try:
    dense_features = pd.read_parquet(DENSE_PATH / 'dense_features_v1.parquet', engine='pyarrow')
except Exception as e:
    print(f"PyArrow error, trying fastparquet...")
    try:
        import subprocess
        import sys
        subprocess.check_call([sys.executable, "-m", "pip", "install", "fastparquet"])
        dense_features = pd.read_parquet(DENSE_PATH / 'dense_features_v1.parquet', engine='fastparquet')
    except:
        print("Error loading parquet. Please restart your kernel and try again.")
        raise

print("Dense Features Shape:", dense_features.shape)
print(f"\nColumns ({len(dense_features.columns)} total):")
print(list(dense_features.columns))

# Check split distribution
if 'split' in dense_features.columns:
    print(f"\nSplit distribution:")
    print(dense_features['split'].value_counts())

print(f"\nFirst few rows:")
print(dense_features.head())

Dense Features Shape: (122017, 22)

Columns (22 total):
['sent_neg', 'sent_neu', 'sent_pos', 'sent_compound', 'POS_ADJ_norm', 'POS_ADP_norm', 'POS_ADV_norm', 'POS_AUX_norm', 'POS_CCONJ_norm', 'POS_DET_norm', 'POS_INTJ_norm', 'POS_NOUN_norm', 'POS_NUM_norm', 'POS_PART_norm', 'POS_PRON_norm', 'POS_PROPN_norm', 'POS_PUNCT_norm', 'POS_SCONJ_norm', 'POS_SYM_norm', 'POS_VERB_norm', 'POS_X_norm', 'POS_SPACE_norm']

First few rows:
   sent_neg  sent_neu  sent_pos  sent_compound  POS_ADJ_norm  POS_ADP_norm  \
0     0.144     0.588     0.268         0.2263      0.066667      0.000000   
1     0.031     0.769     0.199         0.9945      0.063830      0.042553   
2     0.103     0.897     0.000        -0.3182      0.120000      0.080000   
3     0.283     0.552     0.166        -0.4019      0.250000      0.083333   
4     0.025     0.695     0.280         0.9365      0.019608      0.078431   

   POS_ADV_norm  POS_AUX_norm  POS_CCONJ_norm  POS_DET_norm  ...  \
0      0.133333      0.133333      

In [11]:
# Load feature columns list
with open(DENSE_PATH / 'dense_feature_columns_v1.json', 'r') as f:
    feature_columns = json.load(f)

print(f"Dense feature columns ({len(feature_columns)} features):")
for i, col in enumerate(feature_columns, 1):
    print(f"  {i}. {col}")

# Load scaler
scaler = joblib.load(DENSE_PATH / 'dense_scaler_v1.joblib')
print(f"\nScaler type: {type(scaler)}")

Dense feature columns (22 features):
  1. sent_neg
  2. sent_neu
  3. sent_pos
  4. sent_compound
  5. POS_ADJ_norm
  6. POS_ADP_norm
  7. POS_ADV_norm
  8. POS_AUX_norm
  9. POS_CCONJ_norm
  10. POS_DET_norm
  11. POS_INTJ_norm
  12. POS_NOUN_norm
  13. POS_NUM_norm
  14. POS_PART_norm
  15. POS_PRON_norm
  16. POS_PROPN_norm
  17. POS_PUNCT_norm
  18. POS_SCONJ_norm
  19. POS_SYM_norm
  20. POS_VERB_norm
  21. POS_X_norm
  22. POS_SPACE_norm

Scaler type: <class 'sklearn.preprocessing._data.MaxAbsScaler'>


In [13]:
# Check if everything matches
print("Data Consistency Check:")
print(f"✓ Train: sequences={X_train_seq.shape[0]}, labels={len(y_train)}, match={X_train_seq.shape[0]==len(y_train)}")
print(f"✓ Valid: sequences={X_valid_seq.shape[0]}, labels={len(y_valid)}, match={X_valid_seq.shape[0]==len(y_valid)}")
print(f"✓ Test:  sequences={X_test_seq.shape[0]}, labels={len(y_test)}, match={X_test_seq.shape[0]==len(y_test)}")

# WARNING: Sequences have 1 more sample than labels - need to fix this!
if X_train_seq.shape[0] != len(y_train):
    print(f"\n⚠️ WARNING: Train data mismatch! Sequences={X_train_seq.shape[0]}, Labels={len(y_train)}")
    print("You'll need to trim sequences or check which sample is missing from labels")
    
if X_valid_seq.shape[0] != len(y_valid):
    print(f"⚠️ WARNING: Valid data mismatch! Sequences={X_valid_seq.shape[0]}, Labels={len(y_valid)}")
    
if X_test_seq.shape[0] != len(y_test):
    print(f"⚠️ WARNING: Test data mismatch! Sequences={X_test_seq.shape[0]}, Labels={len(y_test)}")

# Check dense features (handle if no split column)
print("\n" + "="*50)
print("Dense Features Check:")

if 'split' in dense_features.columns:
    # Has split column
    train_dense = dense_features[dense_features['split'] == 'train']
    valid_dense = dense_features[dense_features['split'] == 'valid']
    test_dense = dense_features[dense_features['split'] == 'test']
    
    print(f"✓ Dense features: train={len(train_dense)}, valid={len(valid_dense)}, test={len(test_dense)}")
    print(f"✓ Dense match labels: train={len(train_dense)==len(y_train)}, valid={len(valid_dense)==len(y_valid)}, test={len(test_dense)==len(y_test)}")
else:
    # No split column - features are already in separate rows
    print(f"Dense features total shape: {dense_features.shape}")
    print(f"Assuming features are in same order as sequences")
    
    # You'll need to split manually based on index
    train_end = len(y_train)
    valid_end = train_end + len(y_valid)
    
    train_dense = dense_features.iloc[:train_end]
    valid_dense = dense_features.iloc[train_end:valid_end]
    test_dense = dense_features.iloc[valid_end:]
    
    print(f"✓ Split dense features: train={len(train_dense)}, valid={len(valid_dense)}, test={len(test_dense)}")

Data Consistency Check:
✓ Train: sequences=109816, labels=109815, match=False
✓ Valid: sequences=6102, labels=6101, match=False
✓ Test:  sequences=6102, labels=6101, match=False

You'll need to trim sequences or check which sample is missing from labels

Dense Features Check:
Dense features total shape: (122017, 22)
Assuming features are in same order as sequences
✓ Split dense features: train=109815, valid=6101, test=6101


In [14]:
# Since sequences have 1 extra sample, trim them to match labels
print("Fixing data mismatch by trimming sequences...")

# Trim to match label count
X_train_seq = X_train_seq[:len(y_train)]
X_valid_seq = X_valid_seq[:len(y_valid)]
X_test_seq = X_test_seq[:len(y_test)]

print("After trimming:")
print(f"✓ Train: sequences={X_train_seq.shape[0]}, labels={len(y_train)}, match={X_train_seq.shape[0]==len(y_train)}")
print(f"✓ Valid: sequences={X_valid_seq.shape[0]}, labels={len(y_valid)}, match={X_valid_seq.shape[0]==len(y_valid)}")
print(f"✓ Test:  sequences={X_test_seq.shape[0]}, labels={len(y_test)}, match={X_test_seq.shape[0]==len(y_test)}")

print("\n✅ Data is now ready for LSTM training!")

Fixing data mismatch by trimming sequences...
After trimming:
✓ Train: sequences=109815, labels=109815, match=True
✓ Valid: sequences=6101, labels=6101, match=True
✓ Test:  sequences=6101, labels=6101, match=True

✅ Data is now ready for LSTM training!


In [15]:
import pandas as pd
import numpy as np
from pathlib import Path

# Load the new test data
new_test_data = pd.read_csv('group_8_url_1_transcript.csv')

print("New Test Data Shape:", new_test_data.shape)
print("\nColumns:")
print(new_test_data.columns.tolist())
print("\nFirst few rows:")
print(new_test_data.head())
print("\nData info:")
print(new_test_data.info())

New Test Data Shape: (5994, 7)

Columns:
['Start Time', 'End Time', 'Sentence', 'Translation', 'Emotion_fine', 'Emotion_core', 'Intensity']

First few rows:
            Start Time             End Time  \
0  1900-01-01 00:00:00  1900-01-01 00:00:02   
1  1900-01-01 00:00:02  1900-01-01 00:00:04   
2  1900-01-01 00:00:04  1900-01-01 00:00:05   
3  1900-01-01 00:00:05  1900-01-01 00:00:07   
4  1900-01-01 00:00:07  1900-01-01 00:00:09   

                              Sentence  \
0              لا يوجد علاقة بدون حاجة   
1                                قاعدة   
2                        لا يوجد علاقة   
3            حتى العلاقة مع الله لحاجة   
4  ولكن علاقة الله مع الناس ليست لحاجة   

                                         Translation   Emotion_fine  \
0              There is no relationship without need    resignation   
1                                               Base     neutrality   
2                           There is no relationship     detachment   
3       Even the relati

In [16]:
# Show sample of each column to identify which contains text
print("Sample data from each column:\n")
for col in new_test_data.columns:
    print(f"\n{col}:")
    print(new_test_data[col].head(3))
    print("-" * 50)

Sample data from each column:


Start Time:
0    1900-01-01 00:00:00
1    1900-01-01 00:00:02
2    1900-01-01 00:00:04
Name: Start Time, dtype: object
--------------------------------------------------

End Time:
0    1900-01-01 00:00:02
1    1900-01-01 00:00:04
2    1900-01-01 00:00:05
Name: End Time, dtype: object
--------------------------------------------------

Sentence:
0    لا يوجد علاقة بدون حاجة
1                      قاعدة
2              لا يوجد علاقة
Name: Sentence, dtype: object
--------------------------------------------------

Translation:
0    There is no relationship without need
1                                     Base
2                 There is no relationship
Name: Translation, dtype: object
--------------------------------------------------

Emotion_fine:
0    resignation
1     neutrality
2     detachment
Name: Emotion_fine, dtype: object
--------------------------------------------------

Emotion_core:
0    neutral
1    neutral
2    neutral
Name: Emotion_core, 

In [18]:
# You have two text options:
# 1. 'Sentence' - Arabic text
# 2. 'Translation' - English text

# IMPORTANT: Use the same language that was used to train your model!
# If your model was trained on English, use 'Translation'
# If trained on Arabic, use 'Sentence'

TEXT_COLUMN = 'Translation'  # CHANGE THIS based on your training data

# Extract text
test_texts = new_test_data[TEXT_COLUMN].fillna('').astype(str).tolist()

print(f"Using column: {TEXT_COLUMN}")
print(f"Number of test samples: {len(test_texts)}")
print("\nSample texts:")
for i, text in enumerate(test_texts[:5]):
    print(f"\n{i+1}. {text}")

Using column: Translation
Number of test samples: 5994

Sample texts:

1. There is no relationship without need

2. Base

3. There is no relationship

4. Even the relationship with God is for a need

5. But God's relationship with people is not out of need.


In [19]:
# Load the trained tokenizer (from your training data)
import joblib

tokenizer = joblib.load('artifacts/features/sequences/tokenizer_v1.joblib')

# Convert texts to sequences
test_sequences = tokenizer.texts_to_sequences(test_texts)

print(f"✓ Tokenized {len(test_sequences)} texts")
print("\nSample tokenized sequence (first 30 tokens):")
print(test_sequences[0][:30])

# Check sequence lengths
seq_lengths = [len(seq) for seq in test_sequences]
print(f"\nSequence length statistics:")
print(f"  Min: {min(seq_lengths)}")
print(f"  Max: {max(seq_lengths)}")
print(f"  Mean: {np.mean(seq_lengths):.2f}")
print(f"  Median: {np.median(seq_lengths):.2f}")

# Check for unknown words (tokens that weren't in training vocab)
unknown_counts = [seq.count(0) for seq in test_sequences]
total_tokens = sum(seq_lengths)
total_unknown = sum(unknown_counts)
print(f"\nUnknown tokens: {total_unknown}/{total_tokens} ({100*total_unknown/total_tokens:.2f}%)")

✓ Tokenized 5994 texts

Sample tokenized sequence (first 30 tokens):
[51, 9, 42, 836, 323, 94]

Sequence length statistics:
  Min: 1
  Max: 29
  Mean: 6.11
  Median: 6.00

Unknown tokens: 0/36600 (0.00%)


In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Get max length from training data
MAX_LENGTH = X_train_seq.shape[1]

print(f"Training data max length: {MAX_LENGTH}")

# Pad sequences
X_new_test = pad_sequences(test_sequences, 
                           maxlen=MAX_LENGTH, 
                           padding='post',
                           truncating='post')

print(f"\n✓ Padded sequences shape: {X_new_test.shape}")
print(f"✓ Sequence length: {X_new_test.shape[1]}")
print("\nSample padded sequence (first 30 tokens):")
print(X_new_test[0][:30])

# Check how many sequences were truncated
truncated = sum(1 for s in test_sequences if len(s) > MAX_LENGTH)
print(f"\nSequences truncated: {truncated}/{len(test_sequences)} ({100*truncated/len(test_sequences):.2f}%)")

Training data max length: 65

✓ Padded sequences shape: (5994, 65)
✓ Sequence length: 65

Sample padded sequence (first 30 tokens):
[ 51   9  42 836 323  94   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0]

Sequences truncated: 0/5994 (0.00%)


In [21]:
# Your data already has emotion labels!
print("Ground truth emotions available:")
print("\nEmotion_fine distribution:")
print(new_test_data['Emotion_fine'].value_counts())

print("\nEmotion_core distribution:")
print(new_test_data['Emotion_core'].value_counts())

print("\nIntensity distribution:")
print(new_test_data['Intensity'].value_counts())

# Check which emotion column was used in training
print("\n" + "="*60)
print("From training label mapping:")
print(label_mapping)

# Determine which emotion column to use
if set(label_mapping.values()).intersection(set(new_test_data['Emotion_core'].unique())):
    EMOTION_COL = 'Emotion_core'
elif set(label_mapping.values()).intersection(set(new_test_data['Emotion_fine'].unique())):
    EMOTION_COL = 'Emotion_fine'
else:
    print("\n⚠️ WARNING: Emotion labels don't match training data!")
    EMOTION_COL = 'Emotion_core'  # default

print(f"\n✓ Using emotion column: {EMOTION_COL}")

Ground truth emotions available:

Emotion_fine distribution:
Emotion_fine
curiosity         785
neutrality        296
resignation       291
confusion         265
acceptance        173
                 ... 
purposefulness      1
virtue              1
courtesy            1
prohibition         1
terror              1
Name: count, Length: 441, dtype: int64

Emotion_core distribution:
Emotion_core
neutral      3147
happiness     847
sadness       841
anger         406
fear          292
surprise      247
disgust       214
Name: count, dtype: int64

Intensity distribution:
Intensity
mild        2465
neutral     2369
moderate    1073
intense       87
Name: count, dtype: int64

From training label mapping:
{'anger': 0, 'disgust': 1, 'fear': 2, 'happiness': 3, 'neutral': 4, 'sadness': 5, 'surprise': 6}


✓ Using emotion column: Emotion_core


In [22]:
# Convert emotion labels to numeric using the training label mapping
emotion_to_id = label_mapping  # This should be {'emotion': id} format

# Get true labels
true_emotions = new_test_data[EMOTION_COL].tolist()

# Convert to numeric
y_new_test = []
unknown_emotions = []

for emotion in true_emotions:
    if emotion in emotion_to_id:
        y_new_test.append(emotion_to_id[emotion])
    else:
        unknown_emotions.append(emotion)
        y_new_test.append(-1)  # Mark as unknown

y_new_test = np.array(y_new_test)

print(f"✓ Converted {len(y_new_test)} labels")
print(f"  Valid labels: {sum(y_new_test >= 0)}")
print(f"  Unknown labels: {sum(y_new_test == -1)}")

if unknown_emotions:
    print(f"\n⚠️ Unknown emotions found (not in training):")
    print(set(unknown_emotions))
    
print(f"\nLabel distribution:")
unique, counts = np.unique(y_new_test[y_new_test >= 0], return_counts=True)
for label_id, count in zip(unique, counts):
    # Reverse lookup emotion name
    emotion_name = [k for k, v in emotion_to_id.items() if v == label_id][0]
    print(f"  {emotion_name} ({label_id}): {count}")

✓ Converted 5994 labels
  Valid labels: 5994
  Unknown labels: 0

Label distribution:
  anger (0): 406
  disgust (1): 214
  fear (2): 292
  happiness (3): 847
  neutral (4): 3147
  sadness (5): 841
  surprise (6): 247


In [23]:
print("="*70)
print("PREPARED TEST DATA SUMMARY")
print("="*70)

print(f"\n✓ Original data: {new_test_data.shape[0]} samples")
print(f"✓ Text column used: '{TEXT_COLUMN}'")
print(f"✓ Emotion column used: '{EMOTION_COL}'")
print(f"✓ Tokenized sequences shape: {X_new_test.shape}")
print(f"✓ Sequence length: {MAX_LENGTH}")
print(f"✓ Vocabulary size: {len(tokenizer.word_index)}")
print(f"✓ Labels shape: {y_new_test.shape}")
print(f"✓ Number of emotion classes: {len(label_mapping)}")

# Filter out unknown labels for evaluation
valid_indices = y_new_test >= 0
X_new_test_valid = X_new_test[valid_indices]
y_new_test_valid = y_new_test[valid_indices]

print(f"\n✓ Valid test samples (excluding unknown emotions): {len(X_new_test_valid)}")

print("\n" + "="*70)
print("✅ DATA IS READY FOR PREDICTION!")
print("="*70)
print("\nTo make predictions:")
print("  predictions = model.predict(X_new_test_valid)")
print("\nTo evaluate:")
print("  from sklearn.metrics import classification_report")
print("  pred_classes = predictions.argmax(axis=1)")
print("  print(classification_report(y_new_test_valid, pred_classes))")

PREPARED TEST DATA SUMMARY

✓ Original data: 5994 samples
✓ Text column used: 'Translation'
✓ Emotion column used: 'Emotion_core'
✓ Tokenized sequences shape: (5994, 65)
✓ Sequence length: 65
✓ Vocabulary size: 46273
✓ Labels shape: (5994,)
✓ Number of emotion classes: 7

✓ Valid test samples (excluding unknown emotions): 5994

✅ DATA IS READY FOR PREDICTION!

To make predictions:
  predictions = model.predict(X_new_test_valid)

To evaluate:
  from sklearn.metrics import classification_report
  pred_classes = predictions.argmax(axis=1)
  print(classification_report(y_new_test_valid, pred_classes))


In [24]:
# Create a clean dataframe with prepared data
test_prepared = new_test_data.copy()
test_prepared['is_valid'] = valid_indices
test_prepared['numeric_label'] = y_new_test

print("Prepared test dataset:")
print(test_prepared.head())

# Save for later use
test_prepared.to_csv('test_data_prepared.csv', index=False)
np.savez_compressed('X_new_test.npz', X_new_test_valid)
np.save('y_new_test.npy', y_new_test_valid)

print("\n✅ Saved prepared data:")
print("  - test_data_prepared.csv")
print("  - X_new_test.npz")
print("  - y_new_test.npy")

Prepared test dataset:
            Start Time             End Time  \
0  1900-01-01 00:00:00  1900-01-01 00:00:02   
1  1900-01-01 00:00:02  1900-01-01 00:00:04   
2  1900-01-01 00:00:04  1900-01-01 00:00:05   
3  1900-01-01 00:00:05  1900-01-01 00:00:07   
4  1900-01-01 00:00:07  1900-01-01 00:00:09   

                              Sentence  \
0              لا يوجد علاقة بدون حاجة   
1                                قاعدة   
2                        لا يوجد علاقة   
3            حتى العلاقة مع الله لحاجة   
4  ولكن علاقة الله مع الناس ليست لحاجة   

                                         Translation   Emotion_fine  \
0              There is no relationship without need    resignation   
1                                               Base     neutrality   
2                           There is no relationship     detachment   
3       Even the relationship with God is for a need    resignation   
4  But God's relationship with people is not out ...  contemplation   

  Emotion_core