# 02 – MLP with TF-IDF (Setup Instructions)

To run this notebook successfully:

1. Run **`01_data_processing.ipynb`** first. It creates the files this notebook depends on:
   - `data_clean/merged_dataset.parquet`
   - `data_clean/split_indices.npz`
   - `outputs/eval_utils.py`

2. Confirm that the project folder **`HODL Final Project`** exists in **your** Google Drive at:
   `My Drive/HODL Final Project`

   - If this folder was shared with you, go to **"Shared with me"** in Google Drive,  
     right-click **`HODL Final Project` → "Add shortcut to Drive" → My Drive**.

3. If your folder is in a different location or has a different name,  
   **edit the `BASE_PATH` variable in the first code cell**.

4. Then go to **Runtime → Run all** and authorize Drive access when prompted.


In [9]:
# 1. Initializations
!pip install keras-tuner -q
from tensorflow.keras import layers
import tensorflow as tf
from google.colab import drive
import keras
import keras_tuner as kt
import sys
import os
import random
import pandas as pd
import numpy as np

# One seed value, applied to each random number generator this notebook uses
# (TensorFlow, NumPy and Python's built-in `random`).
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Attach Google Drive to the Colab filesystem (asks for authorization when needed).
drive.mount('/content/drive')

# Project root on Drive -- CHANGE THIS IF YOUR FOLDER IS IN A DIFFERENT PLACE.
BASE_PATH = '/content/drive/MyDrive/HODL Final Project'

# Sub-folders used by the rest of the notebook, derived from the project root.
DATA_CLEAN = BASE_PATH + '/data_clean'
OUTPUTS = BASE_PATH + '/outputs'

from sklearn.feature_extraction.text import TfidfVectorizer

# 2. Load the merged dataset (Parquet) from the cleaned-data folder
df = pd.read_parquet(f'{DATA_CLEAN}/merged_dataset.parquet')

# 3. Load the saved train/val/test row positions and slice the frame with them.
#    The indices are applied with .iloc, i.e. they are positional, not label-based.
splits = np.load(f'{DATA_CLEAN}/split_indices.npz')
train_idx, val_idx, test_idx = (
    splits[key] for key in ('train_indices', 'val_indices', 'test_indices')
)
df_train, df_val, df_test = (
    df.iloc[positions] for positions in (train_idx, val_idx, test_idx)
)

# 4. Prepare data for the MLP (TF-IDF vectorization of the 'text' column)
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')

# The vocabulary is learned from TRAIN only; val/test are merely transformed with it.
# .toarray() turns each sparse TF-IDF matrix into a dense array for Keras.
X_train = tfidf.fit_transform(df_train['text']).toarray()
X_val, X_test = (
    tfidf.transform(part['text']).toarray() for part in (df_val, df_test)
)

# Targets come from the 'label' column of each split.
y_train, y_val, y_test = (
    part['label'].values for part in (df_train, df_val, df_test)
)

print(f"Training Data Shape: {X_train.shape}")
print(f"Val Data Shape:     {X_val.shape}")
print(f"Test Data Shape:     {X_test.shape}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training Data Shape: (1909, 5000)
Val Data Shape:     (395, 5000)
Test Data Shape:     (391, 5000)


In [10]:
def build_model(hp):
    """Build and compile the MLP classifier for one KerasTuner trial.

    Args:
        hp: KerasTuner hyperparameters object. Two choices are registered on it:
            ``num_layers`` (1, 2 or 3 hidden layers) and ``units``
            (32, 64 or 128 neurons, shared by every hidden layer).

    Returns:
        A compiled ``keras.Model``: ``num_layers`` ReLU Dense layers followed by
        a single sigmoid output unit, trained with Adam on binary cross-entropy
        and reporting accuracy.
    """
    # hyperparameters
    num_layers = hp.Choice("num_layers", [1, 2, 3])
    units = hp.Choice("units", [32, 64, 128])

    # Take the input width from the actual TF-IDF matrix instead of hard-coding
    # 5000: max_features is only an upper bound, so a smaller vocabulary would
    # otherwise produce a shape mismatch at fit time.
    n_features = X_train.shape[1]
    inputs = keras.Input(shape=(n_features,))

    # stack num_layers hidden layers, each with 'units' neurons
    x = inputs
    for i in range(num_layers):
        x = keras.layers.Dense(units, activation="relu", name=f"Hidden{i+1}")(x)

    # output layer (same `keras` namespace as Input/Model, to avoid mixing
    # `keras` and `tensorflow.keras` objects in one graph)
    outputs = keras.layers.Dense(1, activation="sigmoid", name="Output")(x)

    model = keras.Model(inputs, outputs)

    model.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

In [11]:
# Initialize the tuner: a grid search over the hyperparameter choices declared
# in build_model, ranking trials by validation accuracy.
tuner = kt.GridSearch(
    hypermodel=build_model,
    objective="val_accuracy",
    overwrite=True,  # start from scratch instead of resuming a previous project in kt_dir
    directory="kt_dir",
    project_name="stock_prediction_mlp",
)

# Describe the search space BEFORE launching the search, so it is visible up
# front rather than buried after the trial logs.
tuner.search_space_summary()

# Train every configuration; the validation split is used only for scoring.
tuner.search(
    X_train, y_train,
    epochs=20,
    batch_size=32,
    verbose=1,
    validation_data=(X_val, y_val),
)



Trial 9 Complete [00h 00m 09s]
val_accuracy: 0.6101265549659729

Best val_accuracy So Far: 0.6227847933769226
Total elapsed time: 00h 01m 18s
Search space summary
Default search space size: 2
num_layers (Choice)
{'default': 1, 'conditions': [], 'values': [1, 2, 3], 'ordered': True}
units (Choice)
{'default': 32, 'conditions': [], 'values': [32, 64, 128], 'ordered': True}


In [12]:
# Print the tuner's summary of the completed trials (hyperparameters and
# val_accuracy score for each of the best trials).
tuner.results_summary()

Results summary
Results in kt_dir/stock_prediction_mlp
Showing 10 best trials
Objective(name="val_accuracy", direction="max")

Trial 0000 summary
Hyperparameters:
num_layers: 1
units: 32
Score: 0.6227847933769226

Trial 0003 summary
Hyperparameters:
num_layers: 2
units: 32
Score: 0.6151898503303528

Trial 0004 summary
Hyperparameters:
num_layers: 2
units: 64
Score: 0.6151898503303528

Trial 0005 summary
Hyperparameters:
num_layers: 2
units: 128
Score: 0.6151898503303528

Trial 0006 summary
Hyperparameters:
num_layers: 3
units: 32
Score: 0.6126582026481628

Trial 0001 summary
Hyperparameters:
num_layers: 1
units: 64
Score: 0.6101265549659729

Trial 0008 summary
Hyperparameters:
num_layers: 3
units: 128
Score: 0.6101265549659729

Trial 0002 summary
Hyperparameters:
num_layers: 1
units: 128
Score: 0.607594907283783

Trial 0007 summary
Hyperparameters:
num_layers: 3
units: 64
Score: 0.5898734331130981


In [13]:

# 1. Get the best model from the search, together with its hyperparameters
best_model = tuner.get_best_models(num_models=1)[0]
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print("\nBest Model Hyperparameters:")
print(f"Layers: {best_hps.get('num_layers')}")
print(f"Units:  {best_hps.get('units')}")

# 2. Make eval_utils.py (stored in OUTPUTS) importable.
#    `sys` is already imported in the first code cell; the membership check keeps
#    sys.path from accumulating duplicate entries when this cell is re-run.
if OUTPUTS not in sys.path:
    sys.path.append(OUTPUTS)
# This import must stay here: it only resolves after OUTPUTS is on sys.path.
from eval_utils import evaluate_model, print_results

# 3. Predict on the test set: flatten the model output to a 1-D array of
#    probabilities, then threshold at 0.5 to get hard 0/1 predictions.
y_prob = best_model.predict(X_test).flatten()
y_pred = (y_prob > 0.5).astype(int)

# 4. Score it (evaluate_model/print_results come from eval_utils; judging by the
#    printed output they report accuracy, F1 and AUC -- confirm in eval_utils.py).
results = evaluate_model(y_test, y_pred, y_prob)

print("\n" + "="*30)
print("FINAL MLP RESULTS")
print("="*30)
print_results(results, "tuned MLP")

  saveable.load_own_variables(weights_store.get(inner_path))



Best Model Hyperparameters:
Layers: 1
Units:  32
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step

FINAL MLP RESULTS
tuned MLP: 52.17% acc, 0.534 F1, 0.538 AUC
