In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Bidirectional, LSTM, Dense, Concatenate, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from datasets import load_dataset

# --- 1. Load and Prepare Dataset ---

# Trait score columns; every label vector in this notebook follows this order
LABEL_COLUMNS = [
    'openness',
    'conscientiousness',
    'extraversion',
    'agreeableness',
    'neuroticism',
]

# A trait counts as 'high' (class 1) when its score is strictly above this value
CLASSIFICATION_THRESHOLD = 50

# Fetch the dataset (all splits) from the Hugging Face Hub
dataset = load_dataset("Fatima0923/Automated-Personality-Prediction")

# Function to prepare the dataframes
def prepare_df(split_name):
    """Return the texts and binarised trait labels for one dataset split.

    Args:
        split_name: Key of the split to read from the module-level
            ``dataset`` (this notebook uses 'train', 'validation', 'test').

    Returns:
        A ``(texts, labels)`` pair. ``texts`` is the split's 'text' column;
        ``labels`` is an integer array with one column per entry of
        ``LABEL_COLUMNS`` holding 1 where the score is strictly greater than
        ``CLASSIFICATION_THRESHOLD`` and 0 otherwise.
    """
    frame = dataset[split_name].to_pandas()

    texts = frame['text']

    # Binarise all trait scores in a single vectorised comparison
    is_high = frame[LABEL_COLUMNS].to_numpy() > CLASSIFICATION_THRESHOLD
    labels = is_high.astype(int)

    return texts, labels

# Build (texts, labels) for the three splits, in train -> validation -> test order
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    prepare_df(split) for split in ('train', 'validation', 'test')
)

# Quick sanity check: split sizes plus one raw example with its label vector
print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Test samples: {len(X_test)}")
print(f"Sample Text: {X_train.iloc[0]}")
print(f"Sample Labels (Classes): {y_train[0]}  <- [O, C, E, A, N]")

# --- 2. Text Tokenization and Padding ---

# Hyperparameters shared by the tokenizer, the padding step and the model
VOCAB_SIZE = 20_000   # Passed to the tokenizer as num_words (max words to keep)
MAX_LEN = 150         # Every sequence is padded/truncated to this many tokens
EMBEDDING_DIM = 100   # Width of each word vector (GloVe is a recommended option here)

# The vocabulary is learned from the training texts only; words outside it
# are mapped to the "<OOV>" token
tokenizer = Tokenizer(oov_token="<OOV>", num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(X_train)

word_index = tokenizer.word_index
print(f"Found {len(word_index)} unique tokens.")

# Convert text to padded sequences
def text_to_seq(text_data):
    """Tokenize texts with the fitted ``tokenizer`` and pad them to ``MAX_LEN``.

    Args:
        text_data: An iterable of strings (list, pandas Series, ...). A single
            bare string is also accepted and is treated as one document;
            without this guard it would be iterated character by character
            and produce one row per character.

    Returns:
        The output of ``pad_sequences`` for the tokenized texts, padded and
        truncated at the end ('post') to ``MAX_LEN`` tokens per row.
    """
    if isinstance(text_data, str):
        text_data = [text_data]
    sequences = tokenizer.texts_to_sequences(text_data)
    padded = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')
    return padded

# Tokenize and pad every split with the same fitted tokenizer
X_train_pad, X_val_pad, X_test_pad = map(text_to_seq, (X_train, X_val, X_test))

# --- 3. Build the Hybrid Deep Learning Model ---

def build_hybrid_classification_model():
    """Builds a hybrid CNN + Bi-LSTM model for multi-label classification.

    The padded token ids go through one shared embedding, which feeds two
    parallel branches: a Conv1D + global max pooling branch and a
    bidirectional LSTM branch. Their outputs are concatenated, passed through
    a dense layer with dropout, and mapped to one sigmoid unit per entry of
    ``LABEL_COLUMNS``.

    Reads the module-level ``MAX_LEN``, ``VOCAB_SIZE``, ``EMBEDDING_DIM`` and
    ``LABEL_COLUMNS``.

    Returns:
        A compiled Keras ``Model`` (Adam optimizer, binary cross-entropy loss,
        per-label binary accuracy reported under the name 'accuracy').
    """

    # Input Layer
    input_layer = Input(shape=(MAX_LEN,), name='input_layer')

    # 1. Embedding Layer
    # `input_length` is deliberately not passed: the sequence length is already
    # fixed by the Input shape, and the argument is deprecated in recent Keras.
    embedding_layer = Embedding(input_dim=VOCAB_SIZE,
                                output_dim=EMBEDDING_DIM,
                                name='embedding_layer')(input_layer)

    # 2. CNN Branch (for local feature extraction)
    cnn_branch = Conv1D(filters=128,
                        kernel_size=5,
                        activation='relu',
                        name='cnn_layer')(embedding_layer)
    cnn_branch = GlobalMaxPooling1D(name='cnn_pooling_layer')(cnn_branch)

    # 3. Bi-LSTM Branch (for sequential context)
    lstm_branch = Bidirectional(LSTM(units=128,
                                    return_sequences=False),
                                    name='bilstm_layer')(embedding_layer)

    # 4. Concatenate Branches
    concatenated = Concatenate(name='concatenate_layer')([cnn_branch, lstm_branch])

    # 5. Fully Connected Layers
    dense_layer = Dense(128, activation='relu', name='dense_layer_1')(concatenated)
    dense_layer = Dropout(0.5, name='dropout_layer')(dense_layer)

    # 6. Output Layer
    # One sigmoid unit per trait, so each label gets an independent 0-1
    # probability (multi-label, not softmax over classes). The width follows
    # LABEL_COLUMNS so the head cannot drift out of sync with the labels.
    output_layer = Dense(len(LABEL_COLUMNS), activation='sigmoid', name='output_layer')(dense_layer)

    # Create the model
    model = Model(inputs=input_layer, outputs=output_layer)

    # 7. Compile the model
    # Loss: 'binary_crossentropy' is standard for multi-label classification.
    # Metric: the string shortcut 'accuracy' would be resolved by Keras to
    # categorical (argmax) accuracy for a multi-unit output with multi-hot
    # targets, which is meaningless here. BinaryAccuracy thresholds each label
    # at 0.5 and scores it independently; it keeps the name 'accuracy' so the
    # history/log keys ('accuracy', 'val_accuracy') are unchanged.
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')])

    return model

# Seed the Python, NumPy and TensorFlow random generators before any weights
# are created, so initialisation and batch shuffling repeat across
# "Restart & Run All". (Some GPU kernels may still be non-deterministic.)
SEED = 42
tf.keras.utils.set_random_seed(SEED)

model = build_hybrid_classification_model()
model.summary()

# --- 4. Train the Model ---

# Stop once validation loss has gone 3 consecutive epochs without improving,
# and roll the weights back to the best epoch seen
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

print("\n--- Starting Model Training ---")
# 50 epochs is only an upper bound; early stopping normally ends training sooner
history = model.fit(
    x=X_train_pad,
    y=y_train,
    validation_data=(X_val_pad, y_val),
    epochs=50,
    batch_size=64,
    callbacks=[early_stopping],
    verbose=1,
)
print("--- Model Training Finished ---")

# --- 5. Evaluate the Model ---

print("\n--- Evaluating Model-----")

# Get predictions (one probability per trait for every test sample)
y_pred_probs = model.predict(X_test_pad)

# Convert probabilities to classes (0 or 1) using a 0.5 threshold
y_pred_classes = (y_pred_probs > 0.5).astype(int)

# Show a detailed classification report
print("\n--- Detailed Classification Report (per Trait) ---")
# Note: target_names are in the order of LABEL_COLUMNS.
# zero_division=0 reports 0.0 for a trait that has no predicted (or no true)
# positives. These are the same numbers as the default, but without the
# UndefinedMetricWarning spam in the cell output.
print(classification_report(y_test, y_pred_classes,
                            target_names=LABEL_COLUMNS,
                            zero_division=0))


# --- 6. Test on a New Sentence ---
print("\n--- Example Prediction ---")

# A one-element batch: the preprocessing helper and the model both work on batches
sample_text = ["I am a very outgoing person and I love to talk to new people at parties."]

# Same tokenize-and-pad path that was used for the training data
sample_seq = text_to_seq(sample_text)

# Per-trait probabilities for the single sample, then thresholded at 0.5
prediction_probs = model.predict(sample_seq)
prediction_classes = (prediction_probs > 0.5).astype(int)

print(f"Text: '{sample_text[0]}'")
print("Predicted Classes (0=Low, 1=High):")
for i, trait in enumerate(LABEL_COLUMNS):
    if prediction_classes[0][i] == 1:
        class_name = "High"
    else:
        class_name = "Low"
    print(f"- {trait}: {class_name} (Prob: {prediction_probs[0][i]:.2f})")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/845 [00:00<?, ?B/s]

train_set.csv: 0.00B [00:00, ?B/s]

val_set.csv: 0.00B [00:00, ?B/s]

eval_set.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/16047 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2415 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2415 [00:00<?, ? examples/s]

Training samples: 16047
Validation samples: 2415
Test samples: 2415
Sample Text: his name was kim kimble originally wow thats some messed up parents
Sample Labels (Classes): [1 0 0 0 1]  <- [O, C, E, A, N]
Found 39028 unique tokens.





--- Starting Model Training ---
Epoch 1/50
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 24ms/step - accuracy: 0.7388 - loss: 0.6119 - val_accuracy: 0.7520 - val_loss: 0.5859
Epoch 2/50
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.7627 - loss: 0.5713 - val_accuracy: 0.7491 - val_loss: 0.5315
Epoch 3/50
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 25ms/step - accuracy: 0.6917 - loss: 0.5049 - val_accuracy: 0.6870 - val_loss: 0.4599
Epoch 4/50
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.5961 - loss: 0.4113 - val_accuracy: 0.6203 - val_loss: 0.4078
Epoch 5/50
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - accuracy: 0.5679 - loss: 0.3242 - val_accuracy: 0.6012 - val_loss: 0.3770
Epoch 6/50
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.5490 - loss: 0.2533 - val_accuracy: 0.5631 - val_l

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# --- 7. Save the Trained Model ---
import os
import pickle

print("--- Saving Model and Tokenizer to Google Drive ---")

# The targets below live on Google Drive. Mount it here if it is not mounted
# yet, so this cell also works on a fresh runtime ("Restart & Run All")
# instead of failing because /content/drive/MyDrive does not exist.
drive_mount_point = '/content/drive'
if not os.path.isdir(os.path.join(drive_mount_point, 'MyDrive')):
    from google.colab import drive
    drive.mount(drive_mount_point)

# Define paths in your Google Drive
# This will create a 'my_personality_classifier.keras' file in your main Drive folder
drive_model_path = '/content/drive/MyDrive/my_personality_classifier.keras'
drive_tokenizer_path = '/content/drive/MyDrive/tokenizer.pickle'

# Save the model
model.save(drive_model_path)

# Save the tokenizer.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# this file from a location you trust.
with open(drive_tokenizer_path, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print(f"Model saved to: {drive_model_path}")
print(f"Tokenizer saved to: {drive_tokenizer_path}")

--- Saving Model and Tokenizer to Google Drive ---
Model saved to: /content/drive/MyDrive/my_personality_classifier.keras
Tokenizer saved to: /content/drive/MyDrive/tokenizer.pickle


In [None]:
pip install flask flask-cors tensorflow numpy

Collecting flask-cors
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Downloading flask_cors-6.0.1-py3-none-any.whl (13 kB)
Installing collected packages: flask-cors
Successfully installed flask-cors-6.0.1


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the save cell earlier in this notebook writes under
# /content/drive/MyDrive, which needs Drive to be mounted; on a fresh
# runtime, run this cell first (or move it to the top of the notebook).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
