In [1]:
import gzip
import pickle

import mlflow
import mlflow.keras
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.layers import (Conv1D, Embedding, GlobalMaxPooling1D,
                                    LSTM, Dense, Dropout, Bidirectional)
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer



In [11]:
# Point the MLflow client at the remote tracking server, then select the
# experiment under which every run in this notebook is recorded. The
# experiment call is left as the last expression so the cell displays it.
mlflow.set_tracking_uri(
    uri='https://mlflow-serv-1073438601911.us-west2.run.app',
)
mlflow.set_experiment(
    experiment_name='intern-question/project/final/mlflow',
)

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1728527699758, experiment_id='1', last_update_time=1728527699758, lifecycle_stage='active', name='intern-question/project/final/mlflow', tags={}>

In [3]:
# Load the preprocessed question/label table.
# NOTE(review): unpickling can execute arbitrary code, so only load files that
# come from a trusted source. The absolute path below is machine-specific and
# must be adjusted when the notebook is run elsewhere.
DATA_PATH = '/Users/lancesanterre/pipeline_edu/data/processed/pipeline_and_data.pkl'
df = pd.read_pickle(DATA_PATH)

# Fail early, with a clear message, if the columns used by the cells below are
# missing, rather than hitting an obscure KeyError later on.
missing_columns = {'question', 'labels'} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"Expected column(s) missing from {DATA_PATH}: {sorted(missing_columns)}"
    )

print(df.head())

# Question text column; this is what gets tokenized by the training cells.
questions = df['question']
questions

                                            question           labels
0  Which NFL team represented the AFC at Super Bo...  [1.0, 0.0, 0.0]
1  Which NFL team represented the NFC at Super Bo...  [1.0, 0.0, 0.0]
2                Where did Super Bowl 50 take place?  [1.0, 0.0, 0.0]
3                  Which NFL team won Super Bowl 50?  [1.0, 0.0, 0.0]
4  What color was used to emphasize the 50th anni...  [1.0, 0.0, 0.0]


0         Which NFL team represented the AFC at Super Bo...
1         Which NFL team represented the NFC at Super Bo...
2                       Where did Super Bowl 50 take place?
3                         Which NFL team won Super Bowl 50?
4         What color was used to emphasize the 50th anni...
                                ...                        
414915    How many keywords are there in the Racket prog...
414916            Do you believe there is life after death?
414917                                    What is one coin?
414918    What is the approx annual cost of living while...
414919                What is like to have sex with cousin?
Name: question, Length: 414920, dtype: object

In [4]:
# Expose the model inputs under the names the training cells read.
# No filtering happens here: both names simply refer to the full question
# column and the full label column of `df`.
filtered_questions = questions
filtered_labels = df['labels']

# Simple LSTM

In [13]:
# Grid search over vocabulary size (input_dim) and embedding/LSTM width
# (output_dim) for a single-layer LSTM classifier. Each configuration is
# tracked as its own MLflow run; the most accurate one is registered at the end.
input_length = 10  # every question is padded/truncated to this many tokens

best_accuracy = 0
best_model_info = None

# The labels do not depend on any hyperparameter, so convert them only once.
y = np.array(filtered_labels.tolist())

for input_dim in [1000, 2000]:
    # Tokenization depends only on the vocabulary size, so it is done once per
    # input_dim instead of once per (input_dim, output_dim) pair.
    tokenizer = Tokenizer(num_words=input_dim)
    tokenizer.fit_on_texts(filtered_questions)
    sequences = tokenizer.texts_to_sequences(filtered_questions)
    X = pad_sequences(sequences, maxlen=input_length)

    # The fixed random_state gives every configuration the same split.
    # NOTE(review): this held-out split is used for validation, for the final
    # evaluation and for model selection, so the reported accuracy is
    # optimistic; consider carving out a separate validation set.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    for output_dim in [16, 32, 64]:
        model_name = f"Simple_LSTM_{input_dim}_{output_dim}"

        # The context manager closes the run on exit (also on error or
        # interrupt), so no explicit mlflow.end_run() is needed.
        with mlflow.start_run() as run:
            # Log the parameters first so that a failed or interrupted run can
            # still be identified in the tracking UI.
            mlflow.log_params({"input_dim": input_dim, "output_dim": output_dim, "input_length": input_length})

            # Seed the Python, NumPy and TensorFlow RNGs so the configurations
            # are comparable and the search can be reproduced.
            tf.keras.utils.set_random_seed(42)

            # Model: Simple LSTM
            model = Sequential([
                Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length),
                LSTM(output_dim),
                Dropout(0.5),
                Dense(32, activation='relu'),
                Dense(3, activation='softmax')
            ])

            # Compile and train
            model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
            model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

            # Evaluate and log
            loss, accuracy = model.evaluate(X_test, y_test)
            mlflow.log_metrics({"loss": loss, "accuracy": accuracy})

            # Save the model with a unique name
            mlflow.keras.log_model(model, artifact_path=model_name)
            print(f"Model saved as {model_name} with accuracy: {accuracy:.2f}")

            # Remember the best configuration seen so far
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_model_info = {
                    "model_name": model_name,
                    "run_id": run.info.run_id,
                }

# Register the best model
if best_model_info is not None:
    print(f"Registering the best model: {best_model_info['model_name']} with accuracy: {best_accuracy:.2f}")
    mlflow.register_model(
        model_uri=f"runs:/{best_model_info['run_id']}/{best_model_info['model_name']}",
        name="Best_Simple_LSTM_Model"
    )


Epoch 1/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1ms/step - accuracy: 0.8782 - loss: 0.3157 - val_accuracy: 0.9249 - val_loss: 0.1880
Epoch 2/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1ms/step - accuracy: 0.9227 - loss: 0.1948 - val_accuracy: 0.9316 - val_loss: 0.1717
Epoch 3/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1ms/step - accuracy: 0.9314 - loss: 0.1761 - val_accuracy: 0.9352 - val_loss: 0.1669
Epoch 4/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1ms/step - accuracy: 0.9343 - loss: 0.1658 - val_accuracy: 0.9369 - val_loss: 0.1627
Epoch 5/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1ms/step - accuracy: 0.9374 - loss: 0.1581 - val_accuracy: 0.9384 - val_loss: 0.1609
Epoch 6/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1ms/step - accuracy: 0.9394 - loss: 0.1538 - val_accuracy: 0.9388 - val_loss: 0.160

2024/10/09 20:16:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run monumental-koi-200 at: https://mlflow-serv-1073438601911.us-west2.run.app/#/experiments/1/runs/6080a94451f34086995b5de913c5507e.
2024/10/09 20:16:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-serv-1073438601911.us-west2.run.app/#/experiments/1.


Model saved as Simple_LSTM_1000_16 with accuracy: 0.94
Epoch 1/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1ms/step - accuracy: 0.8870 - loss: 0.2865 - val_accuracy: 0.9320 - val_loss: 0.1684
Epoch 2/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1ms/step - accuracy: 0.9312 - loss: 0.1735 - val_accuracy: 0.9374 - val_loss: 0.1569
Epoch 3/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1ms/step - accuracy: 0.9389 - loss: 0.1552 - val_accuracy: 0.9398 - val_loss: 0.1510
Epoch 4/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1ms/step - accuracy: 0.9427 - loss: 0.1461 - val_accuracy: 0.9418 - val_loss: 0.1509
Epoch 5/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1ms/step - accuracy: 0.9455 - loss: 0.1401 - val_accuracy: 0.9424 - val_loss: 0.1457
Epoch 6/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1ms/step - accuracy: 0.9482 

2024/10/09 20:19:26 INFO mlflow.tracking._tracking_service.client: 🏃 View run crawling-auk-234 at: https://mlflow-serv-1073438601911.us-west2.run.app/#/experiments/1/runs/ba6477d00dfd49e7bc50095549ea44f0.
2024/10/09 20:19:26 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-serv-1073438601911.us-west2.run.app/#/experiments/1.


Model saved as Simple_LSTM_1000_32 with accuracy: 0.95
Epoch 1/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 2ms/step - accuracy: 0.8991 - loss: 0.2559 - val_accuracy: 0.9349 - val_loss: 0.1626
Epoch 2/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 2ms/step - accuracy: 0.9362 - loss: 0.1608 - val_accuracy: 0.9396 - val_loss: 0.1516
Epoch 3/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 2ms/step - accuracy: 0.9433 - loss: 0.1441 - val_accuracy: 0.9429 - val_loss: 0.1443
Epoch 4/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 2ms/step - accuracy: 0.9475 - loss: 0.1345 - val_accuracy: 0.9433 - val_loss: 0.1424
Epoch 5/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 3ms/step - accuracy: 0.9502 - loss: 0.1274 - val_accuracy: 0.9451 - val_loss: 0.1400
Epoch 6/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 3ms/step - accuracy: 0.9536 

2024/10/09 20:23:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run masked-sloth-629 at: https://mlflow-serv-1073438601911.us-west2.run.app/#/experiments/1/runs/7d167d35c9904db29285d0fc1f33d1bf.
2024/10/09 20:23:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-serv-1073438601911.us-west2.run.app/#/experiments/1.


Model saved as Simple_LSTM_1000_64 with accuracy: 0.95
Epoch 1/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1ms/step - accuracy: 0.8639 - loss: 0.3485 - val_accuracy: 0.9145 - val_loss: 0.2090
Epoch 2/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1ms/step - accuracy: 0.9144 - loss: 0.2153 - val_accuracy: 0.9223 - val_loss: 0.1934
Epoch 3/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1ms/step - accuracy: 0.9214 - loss: 0.1966 - val_accuracy: 0.9251 - val_loss: 0.1914
Epoch 4/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1ms/step - accuracy: 0.9264 - loss: 0.1851 - val_accuracy: 0.9271 - val_loss: 0.1927
Epoch 5/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1ms/step - accuracy: 0.9307 - loss: 0.1764 - val_accuracy: 0.9275 - val_loss: 0.1926
Epoch 6/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1ms/step - accuracy: 0.9335 

2024/10/09 20:26:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run receptive-shrimp-928 at: https://mlflow-serv-1073438601911.us-west2.run.app/#/experiments/1/runs/f30d0c2a691f4be59c260e932c0677f5.
2024/10/09 20:26:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-serv-1073438601911.us-west2.run.app/#/experiments/1.


Model saved as Simple_LSTM_2000_16 with accuracy: 0.93
Epoch 1/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - accuracy: 0.8713 - loss: 0.3231 - val_accuracy: 0.9192 - val_loss: 0.1962
Epoch 2/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 0.9215 - loss: 0.1949 - val_accuracy: 0.9252 - val_loss: 0.1869
Epoch 3/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 0.9305 - loss: 0.1751 - val_accuracy: 0.9302 - val_loss: 0.1737
Epoch 4/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - accuracy: 0.9359 - loss: 0.1629 - val_accuracy: 0.9318 - val_loss: 0.1720
Epoch 5/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - accuracy: 0.9404 - loss: 0.1530 - val_accuracy: 0.9333 - val_loss: 0.1706
Epoch 6/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - accuracy: 0.9426 

2024/10/09 20:29:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run suave-seal-2 at: https://mlflow-serv-1073438601911.us-west2.run.app/#/experiments/1/runs/cfd93dbcef0f492890a9134aca086a27.
2024/10/09 20:29:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-serv-1073438601911.us-west2.run.app/#/experiments/1.


Model saved as Simple_LSTM_2000_32 with accuracy: 0.93
Epoch 1/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 3ms/step - accuracy: 0.8828 - loss: 0.2931 - val_accuracy: 0.9225 - val_loss: 0.1915
Epoch 2/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 3ms/step - accuracy: 0.9254 - loss: 0.1851 - val_accuracy: 0.9291 - val_loss: 0.1763
Epoch 3/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 3ms/step - accuracy: 0.9342 - loss: 0.1653 - val_accuracy: 0.9318 - val_loss: 0.1712
Epoch 4/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 3ms/step - accuracy: 0.9398 - loss: 0.1532 - val_accuracy: 0.9342 - val_loss: 0.1670
Epoch 5/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 3ms/step - accuracy: 0.9453 - loss: 0.1406 - val_accuracy: 0.9352 - val_loss: 0.1656
Epoch 6/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 3ms/step - accuracy: 0.9502 

2024/10/09 20:34:14 INFO mlflow.tracking._tracking_service.client: 🏃 View run caring-finch-58 at: https://mlflow-serv-1073438601911.us-west2.run.app/#/experiments/1/runs/1226e0bae6c64f83be0ec2edcd207e23.
2024/10/09 20:34:14 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-serv-1073438601911.us-west2.run.app/#/experiments/1.


Model saved as Simple_LSTM_2000_64 with accuracy: 0.94
Registering the best model: Simple_LSTM_1000_64 with accuracy: 0.95


Successfully registered model 'Best_Simple_LSTM_Model'.
2024/10/09 20:34:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best_Simple_LSTM_Model, version 1
Created version '1' of model 'Best_Simple_LSTM_Model'.


#  Bi-directional LSTM

In [22]:
# Grid search over vocabulary size (input_dim) and embedding/LSTM width
# (output_dim) for a bidirectional LSTM classifier. Each configuration is
# tracked as its own MLflow run.
input_length = 10  # every question is padded/truncated to this many tokens

# The labels do not depend on any hyperparameter, so convert them only once.
y = np.array(filtered_labels.tolist())

for input_dim in [1000, 2000, 4000, 5000]:
    # Tokenization depends only on the vocabulary size, so it is done once per
    # input_dim instead of once per (input_dim, output_dim) pair.
    tokenizer = Tokenizer(num_words=input_dim)
    tokenizer.fit_on_texts(filtered_questions)
    sequences = tokenizer.texts_to_sequences(filtered_questions)
    X = pad_sequences(sequences, maxlen=input_length)

    # The fixed random_state gives every configuration the same split.
    # NOTE(review): the same held-out split serves as validation data and as
    # the evaluation set, so the reported accuracy is optimistic.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    for output_dim in [16, 32, 64, 128]:
        model_name = f"Bidirectional_LSTM_{input_dim}_{output_dim}"

        # The context manager closes the run on exit (also on error or
        # interrupt), so no explicit mlflow.end_run() is needed.
        with mlflow.start_run():
            # Log the parameters first: this search is long, and an interrupted
            # run should still show which configuration it belonged to.
            mlflow.log_params({"input_dim": input_dim, "output_dim": output_dim, "input_length": input_length})

            # Seed the Python, NumPy and TensorFlow RNGs so the configurations
            # are comparable and the search can be reproduced.
            tf.keras.utils.set_random_seed(42)

            # Model 2: Bi-directional LSTM
            model = Sequential([
                Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length),
                Bidirectional(LSTM(output_dim)),
                Dropout(0.5),
                Dense(32, activation='relu'),
                Dense(3, activation='softmax')
            ])

            # Compile and train
            model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
            model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

            # Evaluate and log
            loss, accuracy = model.evaluate(X_test, y_test)
            mlflow.log_metrics({"loss": loss, "accuracy": accuracy})

            # Save the model with a unique name
            mlflow.keras.log_model(model, artifact_path=model_name)
            print(f"Model saved as {model_name} with accuracy: {accuracy:.2f}")


Epoch 1/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 2ms/step - accuracy: 0.8889 - loss: 0.2822 - val_accuracy: 0.9315 - val_loss: 0.1724
Epoch 2/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - accuracy: 0.9318 - loss: 0.1752 - val_accuracy: 0.9358 - val_loss: 0.1610
Epoch 3/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - accuracy: 0.9370 - loss: 0.1604 - val_accuracy: 0.9384 - val_loss: 0.1552
Epoch 4/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - accuracy: 0.9404 - loss: 0.1530 - val_accuracy: 0.9400 - val_loss: 0.1510
Epoch 5/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - accuracy: 0.9431 - loss: 0.1461 - val_accuracy: 0.9417 - val_loss: 0.1476
Epoch 6/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - accuracy: 0.9444 - loss: 0.1433 - val_accuracy: 0.9419 - val_loss: 0.147

2024/09/22 18:21:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run defiant-ape-959 at: https://mlflow-service-1073438601911.us-west2.run.app/#/experiments/1/runs/25c22f52e35347ecb89e47e6ceee9b16.
2024/09/22 18:21:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-service-1073438601911.us-west2.run.app/#/experiments/1.


Model saved as Bidirectional_LSTM_1000_16 with accuracy: 0.94
Epoch 1/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 3ms/step - accuracy: 0.8969 - loss: 0.2613 - val_accuracy: 0.9341 - val_loss: 0.1660
Epoch 2/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 3ms/step - accuracy: 0.9350 - loss: 0.1647 - val_accuracy: 0.9386 - val_loss: 0.1524
Epoch 3/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 3ms/step - accuracy: 0.9416 - loss: 0.1486 - val_accuracy: 0.9409 - val_loss: 0.1488
Epoch 4/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 3ms/step - accuracy: 0.9453 - loss: 0.1390 - val_accuracy: 0.9418 - val_loss: 0.1465
Epoch 5/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 3ms/step - accuracy: 0.9483 - loss: 0.1328 - val_accuracy: 0.9429 - val_loss: 0.1437
Epoch 6/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 3ms/step - accuracy: 

2024/09/22 18:26:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run likeable-snake-443 at: https://mlflow-service-1073438601911.us-west2.run.app/#/experiments/1/runs/42e5bb57663847d1984a7c5848ca5897.
2024/09/22 18:26:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-service-1073438601911.us-west2.run.app/#/experiments/1.


Model saved as Bidirectional_LSTM_1000_32 with accuracy: 0.95
Epoch 1/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 5ms/step - accuracy: 0.9047 - loss: 0.2403 - val_accuracy: 0.9366 - val_loss: 0.1585
Epoch 2/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 5ms/step - accuracy: 0.9392 - loss: 0.1533 - val_accuracy: 0.9426 - val_loss: 0.1458
Epoch 3/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 5ms/step - accuracy: 0.9449 - loss: 0.1393 - val_accuracy: 0.9437 - val_loss: 0.1418
Epoch 4/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 5ms/step - accuracy: 0.9495 - loss: 0.1301 - val_accuracy: 0.9448 - val_loss: 0.1405
Epoch 5/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 5ms/step - accuracy: 0.9532 - loss: 0.1209 - val_accuracy: 0.9467 - val_loss: 0.1366
Epoch 6/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 5ms/step - accuracy: 

2024/09/22 18:34:58 INFO mlflow.tracking._tracking_service.client: 🏃 View run bouncy-cow-286 at: https://mlflow-service-1073438601911.us-west2.run.app/#/experiments/1/runs/e4352c9a0f1f4b169800149bd3620954.
2024/09/22 18:34:58 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-service-1073438601911.us-west2.run.app/#/experiments/1.


Model saved as Bidirectional_LSTM_1000_64 with accuracy: 0.95
Epoch 1/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 12ms/step - accuracy: 0.9084 - loss: 0.2295 - val_accuracy: 0.9386 - val_loss: 0.1547
Epoch 2/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 13ms/step - accuracy: 0.9404 - loss: 0.1511 - val_accuracy: 0.9430 - val_loss: 0.1444
Epoch 3/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 12ms/step - accuracy: 0.9487 - loss: 0.1318 - val_accuracy: 0.9471 - val_loss: 0.1349
Epoch 4/10
[1m10373/10373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 13ms/step - accuracy: 0.9532 - loss: 0.1205 - val_accuracy: 0.9471 - val_loss: 0.1357
Epoch 5/10
[1m10116/10373[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m3s[0m 12ms/step - accuracy: 0.9583 - loss: 0.1080

2024/09/22 18:45:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run gentle-rook-187 at: https://mlflow-service-1073438601911.us-west2.run.app/#/experiments/1/runs/4ae09ce00629496fb41f0fe627bce2e6.
2024/09/22 18:45:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-service-1073438601911.us-west2.run.app/#/experiments/1.


KeyboardInterrupt: 

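# Chosen Model
## Bi-directional LSTM
### Params
- input_dim : 1000
- output_dim : 128
- input_length : 10

> **Note:** the training cell that follows builds a *Simple* (unidirectional) LSTM with `output_dim = 64`, not the Bi-directional LSTM with `output_dim = 128` listed above. Confirm which configuration is intended and bring this section and that cell into agreement.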

In [12]:
import os
import mlflow
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Retrain one selected configuration, persist the fitted tokenizer and the
# trained model locally, and record parameters, metrics, tokenizer and model
# in a single MLflow run.

# Define directories and parameters
save_dir = "model_token"
os.makedirs(save_dir, exist_ok=True)
input_dim = 1000  # vocabulary size kept by the tokenizer / embedding
output_dim = 64  # embedding dimension and number of LSTM units
input_length = 10  # must match the padding length used during the search

# Tokenization and Padding (uses filtered_questions / filtered_labels defined
# in an earlier cell)
tokenizer = Tokenizer(num_words=input_dim)
tokenizer.fit_on_texts(filtered_questions)
sequences = tokenizer.texts_to_sequences(filtered_questions)
X = pad_sequences(sequences, maxlen=input_length)
y = np.array(filtered_labels.tolist())

# Save the tokenizer: inference must reuse exactly this word index
tokenizer_path = os.path.join(save_dir, "tokenizer.pkl")
with open(tokenizer_path, 'wb') as f:
    pickle.dump(tokenizer, f)
print(f"Tokenizer saved at: {tokenizer_path}")

# Split data for training and evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_name = f"Simple_LSTM_{input_dim}_{output_dim}"

# The context manager closes the run on exit, so no explicit mlflow.end_run()
# is needed afterwards.
with mlflow.start_run():
    # Log the parameters first so a failed or interrupted run is identifiable.
    mlflow.log_params({"input_dim": input_dim, "output_dim": output_dim, "input_length": input_length})

    # Seed the Python, NumPy and TensorFlow RNGs so the training run can be
    # reproduced.
    tf.keras.utils.set_random_seed(42)

    # Model: Simple LSTM
    model = Sequential([
        Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length),
        LSTM(output_dim),
        Dropout(0.5),
        Dense(32, activation='relu'),
        Dense(3, activation='softmax')
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model; the held-out split is passed as validation data so the
    # per-epoch validation metrics can be monitored.
    model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), verbose=2)

    # Evaluate the model and record the metrics with the run, so they are
    # visible in MLflow next to the parameters (not only in the cell output).
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    mlflow.log_metrics({"loss": loss, "accuracy": accuracy})
    print(f"Model evaluation - Accuracy: {accuracy:.2f}, Loss: {loss:.2f}")

    # Save the model in the native Keras format
    model_path = os.path.join(save_dir, model_name + ".keras")
    model.save(model_path)
    print(f"Model saved at: {model_path}")

    # Attach the tokenizer to the run: the logged model cannot be used for
    # inference without the matching word index.
    mlflow.log_artifact(tokenizer_path)

    # Log the model to MLflow
    mlflow.keras.log_model(model, artifact_path=model_name)
    print(f"Model logged in MLflow as {model_name}")


Tokenizer saved at: model_token/tokenizer.pkl
Epoch 1/10
10373/10373 - 27s - 3ms/step - accuracy: 0.9193 - loss: 0.2048 - val_accuracy: 0.9343 - val_loss: 0.1638
Epoch 2/10
10373/10373 - 25s - 2ms/step - accuracy: 0.9370 - loss: 0.1591 - val_accuracy: 0.9390 - val_loss: 0.1546
Epoch 3/10
10373/10373 - 25s - 2ms/step - accuracy: 0.9424 - loss: 0.1460 - val_accuracy: 0.9420 - val_loss: 0.1462
Epoch 4/10
10373/10373 - 26s - 3ms/step - accuracy: 0.9465 - loss: 0.1363 - val_accuracy: 0.9421 - val_loss: 0.1441
Epoch 5/10
10373/10373 - 26s - 3ms/step - accuracy: 0.9498 - loss: 0.1284 - val_accuracy: 0.9446 - val_loss: 0.1407
Epoch 6/10
10373/10373 - 26s - 3ms/step - accuracy: 0.9530 - loss: 0.1210 - val_accuracy: 0.9463 - val_loss: 0.1377
Epoch 7/10
10373/10373 - 26s - 3ms/step - accuracy: 0.9556 - loss: 0.1151 - val_accuracy: 0.9465 - val_loss: 0.1417
Epoch 8/10
10373/10373 - 26s - 3ms/step - accuracy: 0.9584 - loss: 0.1090 - val_accuracy: 0.9473 - val_loss: 0.1404
Epoch 9/10
10373/10373 - 2



Model saved at: model_token/Simple_LSTM_1000_64.keras




Model logged in MLflow as Simple_LSTM_1000_64


2024/10/09 20:14:38 INFO mlflow.tracking._tracking_service.client: 🏃 View run gentle-skink-895 at: https://mlflow-serv-1073438601911.us-west2.run.app/#/experiments/1/runs/d026d2ad1f3c42508e91da22f21378ec.
2024/10/09 20:14:38 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-serv-1073438601911.us-west2.run.app/#/experiments/1.


Model evaluation - Accuracy: 0.95, Loss: 0.14
