<a href="https://colab.research.google.com/github/nilanahar/MMAI_894_DeepLearning_Project_TeamRosedale/blob/main/MMAI_894_Team_Project_ResNet%2BGoogleWord2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set Up Environment

#### Packages to Load

In [29]:
from google.colab import drive
import pandas as pd
import numpy as np
import json
import joblib
import os
import re
import ast
import matplotlib.pyplot as plt

from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# from wordcloud import WordCloud
# from collections import Counter

# import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

import warnings
warnings.filterwarnings("ignore")

!pip install optuna
import optuna

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [8]:
# define my google drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


#### Functions to Use (Must Run)

In [30]:
# Function to load multiple JSON data
def load_json(file_path):
    if file_path: # Check if file_path is not None
        with open(file_path, 'r') as f:
            return json.load(f)
    else:
        return None # or handle the None case differently, like returning an empty dictionary

def json_to_pdDf(mcq_file_path, oeq_file_path, answers_file_path):
    # Now you can use this function to load multiple JSON files
    mcq_json_data = load_json(mcq_file_path)
    oeq_json_data = load_json(oeq_file_path)

    mcq_df = pd.json_normalize(mcq_json_data['questions'])[['image_id', 'question_id', 'question', 'multiple_choices']]
    oeq_df = pd.json_normalize(oeq_json_data['questions'])[['image_id', 'question_id', 'question']]

    # Check if answers_file_path is None
    if answers_file_path is not None:
        answers_json_data = load_json(answers_file_path)
        answers_df = pd.json_normalize(answers_json_data['annotations'])[['image_id', 'question_id', 'question_type', 'answers', 'multiple_choice_answer', 'answer_type']]
    else:
        # Create an empty DataFrame if answers_file_path is None
        answers_df = pd.DataFrame(columns=['image_id', 'question_id', 'question_type', 'answers', 'multiple_choice_answer', 'answer_type'])

    return mcq_df, oeq_df, answers_df


def tabularize_answers_df(answers_df):
    # Create a unique identifier for each row to track back after transformations
    answers_df['index'] = answers_df.index
    # Explode the 'answers' column into separate rows, while keeping other columns intact
    answers_expanded_df = answers_df.explode('answers')
    # normalization as before
    answers_details = pd.json_normalize(answers_expanded_df['answers'])
    answers_details['index'] = answers_expanded_df.index
    # Concatenate back to the actual answers DataFrame
    answers_expanded_df = pd.merge(answers_df, answers_details, on='index', how='left')
    answers_merged_df = answers_expanded_df[['image_id', 'question_id', 'question_type',
                                              'multiple_choice_answer', 'answer_type',
                                              'answer', 'answer_id', 'answer_confidence',
                                              'index']]
    answers_merged_df = answers_merged_df.rename(columns={'multiple_choice_answer': 'target_answer'})

    return answers_merged_df

# Function to extract features from multiple JSON files
def extract_qna_features(mcq_file_path, oeq_file_path, answers_file_path):

    mcq_df, oeq_df, answers_df = json_to_pdDf(mcq_file_path, oeq_file_path, answers_file_path)
    answers_merged_df = tabularize_answers_df(answers_df)

    questions_merged_df = pd.merge(mcq_df, oeq_df, on=['image_id', 'question_id'], how='left', suffixes=('_oeq', '_mcq'))
    final_qna_df = pd.merge(questions_merged_df, answers_merged_df, on=['image_id', 'question_id'], how='left')

    return mcq_df, oeq_df, answers_merged_df, final_qna_df


# Function to extract last five digits from file names
def extract_last_five_digits(filename):
    match = re.search(r'(\d{5})\.png$', filename)
    if match:
        return match.group(1)
    else:
        raise ValueError(f"Filename {filename} does not match the expected pattern.")

def extract_image_features(model, images_file_path):
    images_features_dict = {}
    for img_name in tqdm(os.listdir(images_file_path)):
        img_path = os.path.join(images_file_path, img_name)

        # Extract image_id (adjust slice according to your filename structure)
        image_id = img_name[-9:-4]  # Example: extracting last 5 digits before file extension

        # Load and preprocess the image
        img = image.load_img(img_path, target_size=(224, 224))
        x = preprocess_input(np.expand_dims(image.img_to_array(img), axis=0))

        # Extract features and store them in the dictionary
        features = model.predict(x)
        images_features_dict[image_id] = features.flatten()

    images_features_df = pd.DataFrame.from_dict(images_features_dict, orient='index')
    images_features_df.reset_index(inplace=True)
    images_features_df.rename(columns={'index': 'image_id'}, inplace=True)

    return images_features_df


def complete_datasets(qna_df, images_features_df):
    qna_df = qna_df.sort_values(by='image_id').reset_index(drop=True)
    images_features_df = images_features_df.sort_values(by='image_id').reset_index(drop=True)
    images_features_df['image_id'] = images_features_df['image_id'].astype(int)
    # Merge the 3 dataframes into one (base dataset is image_df (unique), left join qna on image id)
    complete_training_set = pd.merge(images_features_df, qna_df, on='image_id', how='left')
    return complete_training_set

def preprocess_text(text):
    """
    Improved text preprocessing function that retains context and meaning.
    - Converts text to lowercase.
    - Removes non-alphabetic characters.
    - Tokenizes, removes only non-essential stopwords, and lemmatizes.
    """
    if pd.isnull(text):
        return ''

    # Convert text to lowercase
    text = text.lower()
    # Remove non-alphabetic characters (but keep question marks and spaces)
    text = re.sub(r'[^a-z\s\?]', '', text)
    # Tokenize text into words
    words = text.split()
    # Create a lemmatizer object
    lemmatizer = WordNetLemmatizer()
    # Define stop words
    stop_words = set(stopwords.words('english')) # Define stop_words here
    essential_stopwords = {"how", "how many", "what", "when", "why", "which", "where", "who"}
    # Remove stopwords, but retain essential question words
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words or word in essential_stopwords]
    # Join words back into a single string
    return ' '.join(words)


def get_sentence_embedding(sentence, model):
    """
    Calculates the sentence embedding by averaging the word embeddings of all words in the sentence.
    If a word is not found in the model, it's embedding is considered as a zero vector.
    """
    # Convert series to string
    if isinstance(sentence, pd.Series):
        sentence = ' '.join(sentence.astype(str).tolist())
    words = sentence.split()
    # Removed .wv since model is already a KeyedVectors object
    word_vectors = [model[word] for word in words if word in model]

    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)

    return np.mean(word_vectors, axis=0)


# Modified filter_choices_by_cosine_similarity function using Word2Vec
def filter_choices_by_word2vec_similarity(row, model):
    """
    Filters multiple-choice options based on their cosine similarity with the question using Word2Vec embeddings.
    Args:
    - row: A DataFrame row containing a 'question' and 'multiple_choices' columns.
    - model: Trained Word2Vec model.

    Returns:
    - A list of filtered multiple-choice options based on cosine similarity.
    """
    question = row['question']
    choices = row['multiple_choices']  # Assume 'multiple_choices' column has a list of answer options

    # Step 1: Generate Word2Vec embeddings for the question
    question_embedding = get_sentence_embedding(question, model)

    # Step 2: Generate Word2Vec embeddings for each choice
    choice_embeddings = [get_sentence_embedding(choice, model) for choice in choices]

    # Step 3: Calculate cosine similarity between question embedding and each choice embedding
    similarity_scores = cosine_similarity([question_embedding], choice_embeddings)[0]

    # Step 4: Retain choices with a high similarity score (e.g., > 0.3)
    threshold = 0.3
    filtered_choices = [choices[i] for i, score in enumerate(similarity_scores) if score >= threshold]

    # Step 5: If no choices pass the threshold, keep original choices to avoid empty options
    return filtered_choices if filtered_choices else choices

# Step 1: Concatenate List into a Single Sentence for Each Row
def concatenate_list_to_sentence(row, list_column):
    """
    Concatenate all words in a list (in a single row) into a single string.
    Args:
    - row: A single row of the DataFrame.
    - list_column: Column name containing lists of words.

    Returns:
    - A single concatenated string of all words in the list.
    """
    return " ".join(row[list_column])  # Join words with a space separator


#### Define Constants

In [32]:
model = ResNet50(weights='imagenet', include_top=False)
# wxord2vec_model = api.load('word2vec-google-news-300')
# joblib.dump(word2vec_model, "/content/drive/MyDrive/Colab Notebooks/894/Training/(2)_word2vec_model_for_2.2.joblib")
# model = joblib.load("/content/drive/MyDrive/Colab Notebooks/894/Training/(2)_word2vec_model_for_2.2.joblib")

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 0us/step


In [None]:
id_features = ['index', 'image_id', 'question_id', 'answer_id', 'question_length']

categorical_features = [  'question', 'question_clean', 'question_type',
                          'multiple_choices',  'filtered_multiple_choices', 'filtered_multiple_choice_corpus',
                          'answer', 'answer_clean', 'answer_confidence', 'answer_type']

numerical_features = [col for col in df_train.columns if col not in categorical_features + id_features]

# Step 1: Data Collection & Feature Extraction

#### Set Directories

In [33]:
# Define the file path (adjust it to match your folder structure)
training_mcq_file_path = "/content/drive/MyDrive/Colab Notebooks/894/Training/Data/MultipleChoice_abstract_v002_train2015_questions.json"
training_oeq_file_path = "/content/drive/MyDrive/Colab Notebooks/894/Training/Data/OpenEnded_abstract_v002_train2015_questions.json"
training_answers_file_path = "/content/drive/MyDrive/Colab Notebooks/894/Training/Data/abstract_v002_train2015_annotations.json"
training_images_file_path = "/content/drive/MyDrive/Colab Notebooks/894/Training/Data/scene_img_abstract_v002_train2015"

validation_mcq_file_path = "/content/drive/MyDrive/Colab Notebooks/894/Validation/Data/MultipleChoice_abstract_v002_val2015_questions.json"
validation_oeq_file_path = "/content/drive/MyDrive/Colab Notebooks/894/Validation/Data/OpenEnded_abstract_v002_val2015_questions.json"
validation_answers_file_path = "/content/drive/MyDrive/Colab Notebooks/894/Validation/Data/abstract_v002_val2015_annotations.json"
validation_images_file_path = "/content/drive/MyDrive/Colab Notebooks/894/Validation/Data/scene_img_abstract_v002_val2015"

testing_mcq_file_path = "/content/drive/MyDrive/Colab Notebooks/894/Testing/Data/MultipleChoice_abstract_v002_test2015_questions.json"
testing_oeq_file_path = "/content/drive/MyDrive/Colab Notebooks/894/Testing/Data/OpenEnded_abstract_v002_test2015_questions.json"
testing_images_file_path = "/content/drive/MyDrive/Colab Notebooks/894/Testing/Data/scene_img_abstract_v002_test2015"

### 1.1 Training Datasets

#### 1.1.1 Extracting Questions & Answers & Features

In [None]:
train_mcq_df, train_oeq_df, train_answers_merged_df, train_final_qna_df = extract_qna_features(training_mcq_file_path, training_oeq_file_path, training_answers_file_path)
train_final_qna_df.head(5)

##### Save Workspace

In [None]:
# Save the dataframes to Google Drive
joblib.dump(train_final_qna_df, "/content/drive/MyDrive/Colab Notebooks/894/Training/questions_answers_features.joblib")

#### 1.1.2 Extracting Images' Features


In [None]:
train_images_features_df = extract_image_features(model, training_images_file_path)

##### Save Workspace

In [None]:
joblib.dump(train_images_features_df, "/content/drive/MyDrive/Colab Notebooks/894/Training/images_features.joblib")

#### 1.1.3 Complete Training Dataset

##### Load Workspace

In [None]:
train_qna_df = joblib.load("/content/drive/MyDrive/Colab Notebooks/894/Training/questions_answers_features.joblib")
train_images_df = joblib.load("/content/drive/MyDrive/Colab Notebooks/894/Training/images_features.joblib")
train_images_df.shape

In [None]:
train_complete_df = complete_datasets(train_qna_df, train_images_df)
train_complete_df.head()

##### Save Workspace

In [None]:
joblib.dump(train_complete_df, "/content/drive/MyDrive/Colab Notebooks/894/Training/complete_training_set_tabular.joblib")

### 1.2 Validation Datasets

#### 1.2.1 Extracting Questions & Answers


In [None]:
valid_mcq_df, valid_oeq_df, valid_answers_merged_df, valid_final_qna_df = extract_qna_features(validation_mcq_file_path, validation_oeq_file_path, validation_answers_file_path)
valid_final_qna_df.head()

##### Save Workspace

In [None]:
joblib.dump(valid_final_qna_df, "/content/drive/MyDrive/Colab Notebooks/894/Validation/questions_answers_features.joblib")

#### 1.2.2 Extracting Images' Features

In [None]:
valid_images_features_df = extract_image_features(model, validation_images_file_path)

In [None]:
valid_images_features_df.shape

##### Save Workspace

In [None]:
joblib.dump(valid_images_features_df, "/content/drive/MyDrive/Colab Notebooks/894/Validation/images_features.joblib")

#### 1.2.3  Complete Validation Dataset

##### Load Workspace

In [None]:
valid_qna_df = joblib.load("/content/drive/MyDrive/Colab Notebooks/894/Validation/questions_answers_features.joblib")
valid_images_df = joblib.load("/content/drive/MyDrive/Colab Notebooks/894/Validation/images_features.joblib")
valid_images_df.shape

In [None]:
valid_complete_df = complete_datasets(valid_qna_df, valid_images_df)
valid_complete_df.head()

##### Save Workspace

In [None]:
joblib.dump(valid_complete_df, "/content/drive/MyDrive/Colab Notebooks/894/Validation/complete_validation_set_tabular.joblib")

### 1.3 Testing Datasets

#### 1.3.1 Extracting Questions & Answers

In [34]:
test_mcq_df, test_oeq_df, test_answers_df = json_to_pdDf(testing_mcq_file_path, testing_oeq_file_path, None)
test_final_qna_df = pd.merge(test_mcq_df, test_oeq_df, on=['image_id', 'question_id'], how='left', suffixes=('_oeq', '_mcq'))
test_final_qna_df.head()

Unnamed: 0,image_id,question_id,question_oeq,multiple_choices,question_mcq
0,39456,394560,What color are the chairs?,"[red, 4, 3, plates, brown, yellow, green, stan...",What color are the chairs?
1,39456,394561,Is the man asleep?,"[3, blue, 4, no, anger, white, 2, red, yes, on...",Is the man asleep?
2,39456,394562,What is on the table?,"[white, on sidewalk, salt and pepper, 3, yes, ...",What is on the table?
3,47922,479220,How many bushes are in the background?,"[blue, 1, 3, 40, dog, 10, 2, sun rays, red, ye...",How many bushes are in the background?
4,47922,479221,What are they playing?,"[yes, soccer, on man's head, frisbee, golf, mo...",What are they playing?


##### Save Workspace

In [35]:
joblib.dump(test_final_qna_df, "/content/drive/MyDrive/Colab Notebooks/894/Testing/questions_answers_features.joblib")

['/content/drive/MyDrive/Colab Notebooks/894/Testing/questions_answers_features.joblib']

#### 1.3.2 Extracting Images' Features

In [None]:
test_images_features_df = extract_image_features(model, testing_images_file_path)

##### Save Workspace

In [None]:
joblib.dump(test_images_features_df, "/content/drive/MyDrive/Colab Notebooks/894/Testing/images_features.joblib")

#### 1.3.3 Complete Testing Dataset

##### Load Workspace

In [None]:
test_qna_df = joblib.load("/content/drive/MyDrive/Colab Notebooks/894/Testing/questions_answers_features.joblib")
test_images_df = joblib.load("/content/drive/MyDrive/Colab Notebooks/894/Testing/images_features.joblib")
test_images_df.shape

In [None]:
test_complete_df = complete_datasets(valid_qna_df, valid_images_df)
test_complete_df.head()

##### Save Workspace

In [None]:
joblib.dump(test_complete_df, "/content/drive/MyDrive/Colab Notebooks/894/Testing/complete_testing_set_tabular.joblib")









<space>

# Step 2: Exploratory Data Analysis & Feature Engineering

##### Load Workspace

In [None]:
df_train = joblib.load("/content/drive/MyDrive/Colab Notebooks/894/Training/complete_training_set_tabular.joblib")
df_valid = joblib.load("/content/drive/MyDrive/Colab Notebooks/894/Validation/complete_validation_set_tabular.joblib")
df_train.head()

### 2.1 Visualization & Insights

##### Keeping only one question column since question_oeq = question_mcq

In [None]:
# Check if two specific columns are identical in the DataFrame
are_questions_columns_equal = df_train['question_oeq'].equals(df_train['question_mcq'])  # Replace 'question_x' and 'question_y' with your actual column names

# Print the result
print("Are 'question_oeq' and 'question_mcq' identical? : ", "Yes" if are_questions_columns_equal else "No")

In [None]:
df_train['question'] = df_train['question_oeq']
df_valid['question'] = df_valid['question_oeq']

df_train = df_train.drop(columns=['question_oeq', 'question_mcq'])
df_valid = df_valid.drop(columns=['question_oeq', 'question_mcq'])

##### Questions' Length Visualization

In [None]:
# Analyze Question Length
df_train['question_length'] = df_train['question'].apply(lambda x: len(x.split()))
df_valid['question_length'] = df_valid['question'].apply(lambda x: len(x.split()))
plt.figure(figsize=(10, 6))
sns.histplot(df_train['question_length'], kde=True, color='skyblue')
plt.title('Distribution of Question Length')
plt.xlabel('Number of Words in Question')
plt.ylabel('Frequency')
plt.show()

##### Questions' Common Words

In [None]:
# # Most common words in question
# question_text = df_train['question']
# question_words = question_text.split()
# word_counts = Counter(question_words)
# common_words = word_counts.most_common(20)
# print("\nMost Common Words in Questions:")
# for word, count in common_words:
#     print(f"{word}: {count}")

##### Save Workspace

In [None]:
joblib.dump(df_train, "/content/drive/MyDrive/Colab Notebooks/894/Training/(1)_dfTrain_unique_2.1.joblib")
joblib.dump(df_valid, "/content/drive/MyDrive/Colab Notebooks/894/Validation/(1)_dfValid_unique_2.1.joblib")

### 2.2 Embeddings/Encodings

##### Load Workspace

In [None]:
df_train = joblib.load("/content/drive/MyDrive/Colab Notebooks/894/Training/(1)_dfTrain_unique_2.1.joblib")
df_valid = joblib.load("/content/drive/MyDrive/Colab Notebooks/894/Validation/(1)_dfValid_unique_2.1.joblib")
df_train.head()

##### 2.2.1 Questions Text's Features' Engineering

In [None]:
df_train['question_clean'] = df_train['question'].apply(preprocess_text)
df_valid['question_clean'] = df_valid['question'].apply(preprocess_text)
df_train[['question', 'question_clean']]

In [None]:
# Apply the get_sentence_embedding function to each row of the 'question_clean' column
df_train['question_embedding'] = df_train['question_clean'].apply(lambda x: get_sentence_embedding(x, model))
df_valid['question_embedding'] = df_valid['question_clean'].apply(lambda x: get_sentence_embedding(x, model))
df_train.head()

##### 2.2.2 Answers Text's Features' Engineering

In [None]:
df_train['answer_clean'] = df_train['answer'].apply(preprocess_text)
df_valid['answer_clean'] = df_valid['answer'].apply(preprocess_text)
df_train[['answer', 'answer_clean']]

In [None]:
# Apply the get_sentence_embedding function to each row of the 'question_clean' column
df_train['answer_embedding'] = df_train['answer_clean'].apply(lambda x: get_sentence_embedding(x, model))
df_valid['answer_embedding'] = df_valid['answer_clean'].apply(lambda x: get_sentence_embedding(x, model))
df_train.head()

##### 2.2.3 Multiple Choices List's Features' Engineering

In [None]:
df_train['filtered_multiple_choices'] = df_train.apply(lambda row: filter_choices_by_word2vec_similarity(row, model), axis=1)
df_valid['filtered_multiple_choices'] = df_valid.apply(lambda row: filter_choices_by_word2vec_similarity(row, model), axis=1)
df_train[['multiple_choices', 'filtered_multiple_choices']]

In [None]:
df_train['filtered_multiple_choice_corpus'] = df_train.apply(lambda row: concatenate_list_to_sentence(row, 'filtered_multiple_choices'), axis=1)
df_valid['filtered_multiple_choice_corpus'] = df_valid.apply(lambda row: concatenate_list_to_sentence(row, 'filtered_multiple_choices'), axis=1)
df_train[['filtered_multiple_choices', 'filtered_multiple_choice_corpus']]

In [None]:
# Apply the get_sentence_embedding function to each row of the 'question_clean' column
df_train['filtered_choices_embedding'] = df_train['filtered_multiple_choice_corpus'].apply(lambda x: get_sentence_embedding(x, model))
df_valid['filtered_choices_embedding'] = df_valid['filtered_multiple_choice_corpus'].apply(lambda x: get_sentence_embedding(x, model))
df_train.head()

##### 2.2.4 Encoding Selected Categorical Features

In [None]:
# Initialize the LabelEncoder
label_encoder_answer_type = LabelEncoder()
label_encoder_confidence = LabelEncoder()

# Encode answer_type and answer_confidence separately
df_train['answer_type_encoded'] = label_encoder_answer_type.fit_transform(df_train['answer_type'])
df_valid['answer_type_encoded'] = label_encoder_answer_type.transform(df_valid['answer_type'])

df_train['answer_confidence_encoded'] = label_encoder_confidence.fit_transform(df_train['answer_confidence'])
df_valid['answer_confidence_encoded'] = label_encoder_confidence.transform(df_valid['answer_confidence'])

df_train

##### Remove ID & Transformed Features

In [None]:
df_train = df_train.drop(columns=id_features + categorical_features)
df_valid = df_valid.drop(columns=id_features + categorical_features)

##### Save Workspace

In [None]:
joblib.dump(df_train, "/content/drive/MyDrive/Colab Notebooks/894/Training/(2)_dfTrain_w2v_2.2.joblib")
joblib.dump(df_valid, "/content/drive/MyDrive/Colab Notebooks/894/Validation/(2)_dfValid_w2v_2.2.joblib")
joblib.dump(label_encoder_answer_type, "/content/drive/MyDrive/Colab Notebooks/894/Training/(2)_label_encoder_answer_type_2.2.joblib")
joblib.dump(label_encoder_confidence, "/content/drive/MyDrive/Colab Notebooks/894/Training/(2)_label_encoder_confidence_2.2.joblib")

### 2.3 Reducing Dimensionality (Principal Component Analysis (PCA))

##### Load Workspace

In [None]:
df_train = joblib.load("/content/drive/MyDrive/Colab Notebooks/894/Training/(2)_dfTrain_w2v_2.2.joblib")
df_valid = joblib.load("/content/drive/MyDrive/Colab Notebooks/894/Validation/(2)_dfValid_w2v_2.2.joblib")

df_train.head()

##### Find out what is the optimal number of principal components you need for each feature

In [None]:
from sklearn.decomposition import PCA

# Calculate explained variance for image input
pca_image_full = PCA().fit(df_train.iloc[:, :2048].values)
explained_variance_ratio_image = np.cumsum(pca_image_full.explained_variance_ratio_)

# Calculate explained variance for filtered choices embedding
pca_filtered_choices_full = PCA().fit(np.vstack(df_train['filtered_choices_embedding'].values))
explained_variance_ratio_filtered_choices = np.cumsum(pca_filtered_choices_full.explained_variance_ratio_)

# Plot cumulative explained variance for image input
plt.figure(figsize=(10, 6))
plt.plot(np.arange(1, len(explained_variance_ratio_image) + 1), explained_variance_ratio_image, label='Image Input')
plt.plot(np.arange(1, len(explained_variance_ratio_filtered_choices) + 1), explained_variance_ratio_filtered_choices, label='Filtered Choices Embedding')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance vs Number of PCA Components')
plt.axhline(y=0.90, color='r', linestyle='--', label='90% Variance Explained')
plt.axhline(y=0.95, color='g', linestyle='--', label='95% Variance Explained')
plt.legend()
plt.grid(True)
plt.show()

# Determine the number of components for 90% and 95% explained variance
n_components_image_90 = np.argmax(explained_variance_ratio_image >= 0.90) + 1
n_components_image_95 = np.argmax(explained_variance_ratio_image >= 0.95) + 1

n_components_filtered_choices_90 = np.argmax(explained_variance_ratio_filtered_choices >= 0.90) + 1
n_components_filtered_choices_95 = np.argmax(explained_variance_ratio_filtered_choices >= 0.95) + 1

print(f"Number of components for 90% variance (Image Input): {n_components_image_90}")
print(f"Number of components for 95% variance (Image Input): {n_components_image_95}")
print(f"Number of components for 90% variance (Filtered Choices): {n_components_filtered_choices_90}")
print(f"Number of components for 95% variance (Filtered Choices): {n_components_filtered_choices_95}")


In [None]:
# prompt: do the same pca above to questions and answers embedding

# Calculate explained variance for question embedding
pca_question_full = PCA().fit(np.vstack(df_train['question_embedding'].values))
explained_variance_ratio_question = np.cumsum(pca_question_full.explained_variance_ratio_)

# Calculate explained variance for answer embedding
pca_answer_full = PCA().fit(np.vstack(df_train['answer_embedding'].values))
explained_variance_ratio_answer = np.cumsum(pca_answer_full.explained_variance_ratio_)


# Plot cumulative explained variance for question and answer embeddings
plt.figure(figsize=(10, 6))
plt.plot(np.arange(1, len(explained_variance_ratio_question) + 1), explained_variance_ratio_question, label='Question Embedding')
plt.plot(np.arange(1, len(explained_variance_ratio_answer) + 1), explained_variance_ratio_answer, label='Answer Embedding')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance vs Number of PCA Components')
plt.axhline(y=0.90, color='r', linestyle='--', label='90% Variance Explained')
plt.axhline(y=0.95, color='g', linestyle='--', label='95% Variance Explained')
plt.legend()
plt.grid(True)
plt.show()

# Determine the number of components for 90% and 95% explained variance
n_components_question_90 = np.argmax(explained_variance_ratio_question >= 0.90) + 1
n_components_question_95 = np.argmax(explained_variance_ratio_question >= 0.95) + 1

n_components_answer_90 = np.argmax(explained_variance_ratio_answer >= 0.90) + 1
n_components_answer_95 = np.argmax(explained_variance_ratio_answer >= 0.95) + 1


print(f"Number of components for 90% variance (Question Embedding): {n_components_question_90}")
print(f"Number of components for 95% variance (Question Embedding): {n_components_question_95}")
print(f"Number of components for 90% variance (Answer Embedding): {n_components_answer_90}")
print(f"Number of components for 95% variance (Answer Embedding): {n_components_answer_95}")


##### Use the optimal number of features (captures 95% variance) to compute your final PCs

In [None]:
from sklearn.decomposition import PCA

# Define the number of principal components for PCA
n_components_image = 452  # Reduce image input dimensions from 2048 to 452
n_components_filtered_choices = 131  # Reduce filtered_choices_embedding_input from 300 to 131
n_components_questions = 150  # Reduce question_embedding_input from 300 to 150
n_components_answers = 134  # Reduce answer_embedding_input from 300 to 134

# Apply PCA to image input
pca_image = PCA(n_components=n_components_image)
image_train_pca = pca_image.fit_transform(df_train.iloc[:, :2048].values)
image_valid_pca = pca_image.transform(df_valid.iloc[:, :2048].values)

# Apply PCA to question embedding
pca_questions = PCA(n_components=n_components_questions)
question_train_pca = pca_questions.fit_transform(np.vstack(df_train['question_embedding'].values))
question_valid_pca = pca_questions.transform(np.vstack(df_valid['question_embedding'].values))

# Apply PCA to answer embedding
pca_answers = PCA(n_components=n_components_answers)
answer_train_pca = pca_answers.fit_transform(np.vstack(df_train['answer_embedding'].values))
answer_valid_pca = pca_answers.transform(np.vstack(df_valid['answer_embedding'].values))

# Apply PCA to filtered choices embedding
pca_filtered_choices = PCA(n_components=n_components_filtered_choices)
filtered_choices_train_pca = pca_filtered_choices.fit_transform(np.vstack(df_train['filtered_choices_embedding'].values))
filtered_choices_valid_pca = pca_filtered_choices.transform(np.vstack(df_valid['filtered_choices_embedding'].values))

In [None]:
# Function to convert PCA result to DataFrame with preserved index
def pca_to_df(pca_result, df_original, prefix):
    """ Convert PCA results (NumPy array) to Pandas DataFrame and preserve the original DataFrame's index """
    df_pca = pd.DataFrame(pca_result, index=df_original.index,
                          columns=[f'{prefix}_pca_{i}' for i in range(pca_result.shape[1])])
    return df_pca

# Convert the PCA results to DataFrames with preserved row indices
df_image_train_pca = pca_to_df(image_train_pca, df_train, 'image')
df_image_valid_pca = pca_to_df(image_valid_pca, df_valid, 'image')
df_filtered_choices_train_pca = pca_to_df(filtered_choices_train_pca, df_train, 'filtered_choices')
df_filtered_choices_valid_pca = pca_to_df(filtered_choices_valid_pca, df_valid, 'filtered_choices')
df_questions_train_pca = pca_to_df(question_train_pca, df_train, 'questions')
df_questions_valid_pca = pca_to_df(question_valid_pca, df_valid, 'questions')
df_answers_train_pca = pca_to_df(answer_train_pca, df_train, 'answers')
df_answers_valid_pca = pca_to_df(answer_valid_pca, df_valid, 'answers')

In [None]:
# Concatenate PCA results with the original `target_answer` column
df_train_pca = pd.concat([df_train[['target_answer', 'answer_type_encoded', 'answer_confidence_encoded']], df_image_train_pca, df_questions_train_pca, df_answers_train_pca, df_filtered_choices_train_pca], axis=1)
df_valid_pca = pd.concat([df_valid[['target_answer', 'answer_type_encoded', 'answer_confidence_encoded']], df_image_valid_pca, df_questions_valid_pca, df_answers_valid_pca, df_filtered_choices_valid_pca], axis=1)
df_valid_pca.head()

In [None]:
df_train = df_train_pca
df_valid = df_valid_pca

##### Save Workspace

In [None]:
joblib.dump(df_train, "/content/drive/MyDrive/Colab Notebooks/894/Training/(3)_dfTrain_2.3.joblib")
joblib.dump(df_valid, "/content/drive/MyDrive/Colab Notebooks/894/Validation/(3)_dfValid_2.3.joblib")

# joblib.dump(pca_image, "/content/drive/MyDrive/Colab Notebooks/894/Training/(4)_pca_image_3.2.joblib")
# joblib.dump(pca_questions, "/content/drive/MyDrive/Colab Notebooks/894/Training/(4)_pca_questions_3.2.joblib")
# joblib.dump(pca_answers, "/content/drive/MyDrive/Colab Notebooks/894/Training/(4)_pca_answers_3.2.joblib")
# joblib.dump(pca_filtered_choices, "/content/drive/MyDrive/Colab Notebooks/894/Training/(4)_pca_filtered_choices_3.2.joblib"

# Step 3: Model Architecture, Training & Hyperparameter Tuning

##### Load Workspace

In [None]:
df_train = joblib.load("/content/drive/MyDrive/Colab Notebooks/894/Training/(3)_dfTrain_2.3.joblib")
df_valid = joblib.load("/content/drive/MyDrive/Colab Notebooks/894/Validation/(3)_dfValid_2.3.joblib")

# df_train = joblib.load("/Users/nila/Library/CloudStorage/GoogleDrive-nilaa.nahar@gmail.com/My Drive/Colab Notebooks/894/Training/(3)_dfTrain_3.1.joblib")
# df_valid = joblib.load("/Users/nila/Library/CloudStorage/GoogleDrive-nilaa.nahar@gmail.com/My Drive/Colab Notebooks/894/Validation/(3)_dfValid_3.1.joblib")

df_train.head()

### 3.1 Model Architecture

##### Load Workspace

In [None]:
df_train = joblib.load("/content/drive/MyDrive/Colab Notebooks/894/Training/(4)_dfTrain_3.2.joblib")
df_valid = joblib.load("/content/drive/MyDrive/Colab Notebooks/894/Validation/(4)_dfValid_3.2.joblib")

df_train.head()

In [None]:
# Define the number of principal components for PCA
n_components_image = 452
n_components_filtered_choices = 131
n_components_questions = 150
n_components_answers = 134

In [None]:
# Define input layers for PCA-transformed inputs
question_embedding_input = Input(shape=(n_components_questions,), name='question_embedding_input')  # Use PCA components shape
answer_embedding_input = Input(shape=(n_components_answers,), name='answer_embedding_input')  # Use PCA components shape
filtered_choices_embedding_input = Input(shape=(n_components_filtered_choices,), name='filtered_choices_embedding_input')  # Use PCA components shape
image_input = Input(shape=(n_components_image,), name='image_input')  # Use PCA components shape

answer_type_input = Input(shape=(1,), name='answer_type_encoded')  # One-dimensional label-encoded feature
answer_confidence_input = Input(shape=(1,), name='answer_confidence_encoded')  # One-dimensional label-encoded feature

# Step 3: Concatenate all inputs
combined = Concatenate()([question_embedding_input, answer_embedding_input, image_input,
                          filtered_choices_embedding_input, answer_type_input, answer_confidence_input])

# Create dense layers and dropout layers for the combined inputs
x = Dense(64, activation='relu')(combined)
x = Dropout(0.3)(x)
x = Dense(32, activation='relu')(x)
x = Dropout(0.3)(x)

# One-hot encode the target labels for training and validation
train_labels = pd.get_dummies(df_train['target_answer']).values
valid_labels = pd.get_dummies(df_valid['target_answer']).values

# Check that the validation labels have the same number of classes as training labels
if valid_labels.shape[1] != train_labels.shape[1]:
    print(f"Mismatch in number of classes: Train Labels: {train_labels.shape[1]}, Validation Labels: {valid_labels.shape[1]}")
    # Create a new zero-filled array with the same number of columns as train_labels
    new_valid_labels = np.zeros((valid_labels.shape[0], train_labels.shape[1]))
    # Fill the new array with the original valid_labels values where classes exist
    new_valid_labels[:, :valid_labels.shape[1]] = valid_labels  # Copy existing classes
    print(f"Shape of adjusted `valid_labels`: {new_valid_labels.shape}")
    valid_labels = new_valid_labels

print(f"Final shape of `train_labels`: {train_labels.shape}")
print(f"Final shape of `valid_labels`: {valid_labels.shape}")

# Define the number of classes for output
num_classes = train_labels.shape[1]  # Get the correct number of classes from train_labels shape
output = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=[question_embedding_input, answer_embedding_input, image_input,
                      filtered_choices_embedding_input, answer_type_input, answer_confidence_input], outputs=output)

model.compile(optimizer=Adam(learning_rate=0.0005), loss='categorical_crossentropy', metrics=['accuracy'])
print(model.summary())

### 3.2 Model Training

In [None]:
# Step 3: Prepare PCA-Transformed Training Inputs
train_inputs = {
    'question_embedding_input': df_train.filter(like='questions_pca').values,  # Select columns with 'questions_pca' in the name
    'answer_embedding_input': df_train.filter(like='answers_pca').values,  # Select columns with 'answers_pca' in the name
    'filtered_choices_embedding_input': df_train.filter(like='filtered_choices_pca').values,  # Select columns with 'filtered_choices_pca' in the name
    'image_input': df_train.filter(like='image_pca').values,  # Select columns with 'image_pca' in the name
    'answer_type_encoded': np.reshape(df_train['answer_type_encoded'].values, (-1, 1)),  # Reshape to (num_samples, 1)
    'answer_confidence_encoded': np.reshape(df_train['answer_confidence_encoded'].values, (-1, 1))  # Reshape to (num_samples, 1)
}

# Apply the same PCA transformation to validation inputs using column filtering
valid_inputs = {
    'question_embedding_input': df_valid.filter(like='questions_pca').values,  # Select columns with 'questions_pca' in the name
    'answer_embedding_input': df_valid.filter(like='answers_pca').values,  # Select columns with 'answers_pca' in the name
    'filtered_choices_embedding_input': df_valid.filter(like='filtered_choices_pca').values,  # Select columns with 'filtered_choices_pca' in the name
    'image_input': df_valid.filter(like='image_pca').values,  # Select columns with 'image_pca' in the name
    'answer_type_encoded': np.reshape(df_valid['answer_type_encoded'].values, (-1, 1)),  # Reshape to (num_samples, 1)
    'answer_confidence_encoded': np.reshape(df_valid['answer_confidence_encoded'].values, (-1, 1))  # Reshape to (num_samples, 1)
}

# Step 4: Train the simplified model using PCA-transformed inputs
history = model.fit(train_inputs, train_labels, validation_data=(valid_inputs, valid_labels), epochs=10, batch_size=16)


##### Saving the model

In [None]:
# Save the trained model to a file
model.save('/content/drive/MyDrive/Colab Notebooks/894/Training/(4)_SimpleModel_3.2.h5')

In [None]:
# Save the history to a text file
with open('/content/drive/MyDrive/Colab Notebooks/894/Training/(4)_SimpleModel_3.2_training_history.txt', 'w') as f:
    f.write("Training and Validation Performance:\n")
    for key, values in history.history.items():
        f.write(f"{key}: {values}\n")


### 3.3. Hyperparameter Tuning (Optuna)

##### Load Workspace


In [9]:
from tensorflow.keras.models import load_model
model = load_model('/content/drive/MyDrive/Colab Notebooks/894/Training/(4)_SimpleModel_3.2.h5')

df_train = joblib.load("/content/drive/MyDrive/Colab Notebooks/894/Training/(3)_dfTrain_2.3.joblib")
df_valid = joblib.load("/content/drive/MyDrive/Colab Notebooks/894/Validation/(3)_dfValid_2.3.joblib")

df_train.head()



Unnamed: 0,target_answer,answer_type_encoded,answer_confidence_encoded,image_pca_0,image_pca_1,image_pca_2,image_pca_3,image_pca_4,image_pca_5,image_pca_6,...,filtered_choices_pca_121,filtered_choices_pca_122,filtered_choices_pca_123,filtered_choices_pca_124,filtered_choices_pca_125,filtered_choices_pca_126,filtered_choices_pca_127,filtered_choices_pca_128,filtered_choices_pca_129,filtered_choices_pca_130
0,tan,1,2,6.439533,1.479186,7.466555,4.448799,0.212724,5.18466,-9.161225,...,0.004861,0.012495,0.020905,-0.032322,0.008106,-0.018028,-0.006436,0.013486,0.02608,0.001254
1,tan,1,2,6.439657,1.478976,7.466766,4.448922,0.212704,5.184577,-9.160582,...,0.004861,0.012495,0.020905,-0.032322,0.008106,-0.018028,-0.006436,0.013486,0.02608,0.001254
2,tan,1,2,6.439657,1.479007,7.465891,4.448575,0.212389,5.18401,-9.161281,...,0.004861,0.012495,0.020905,-0.032322,0.008106,-0.018028,-0.006436,0.013486,0.02608,0.001254
3,tan,1,2,6.439657,1.478999,7.465737,4.448606,0.212629,5.184058,-9.161361,...,0.004861,0.012495,0.020905,-0.032322,0.008106,-0.018028,-0.006436,0.013486,0.02608,0.001254
4,monkey bars,1,2,6.439657,1.478979,7.465677,4.448548,0.212789,5.184362,-9.161155,...,-0.033575,-0.004703,0.007514,0.006816,-0.019907,-0.010409,0.007188,0.0098,-0.007322,0.012842


In [4]:
# Define the number of principal components for PCA
n_components_image = 452
n_components_filtered_choices = 131
n_components_questions = 150
n_components_answers = 134

In [5]:
# Step 3: Prepare PCA-Transformed Training Inputs
train_inputs = {
    'question_embedding_input': df_train.filter(like='questions_pca').values,  # Select columns with 'questions_pca' in the name
    'answer_embedding_input': df_train.filter(like='answers_pca').values,  # Select columns with 'answers_pca' in the name
    'filtered_choices_embedding_input': df_train.filter(like='filtered_choices_pca').values,  # Select columns with 'filtered_choices_pca' in the name
    'image_input': df_train.filter(like='image_pca').values,  # Select columns with 'image_pca' in the name
    'answer_type_encoded': np.reshape(df_train['answer_type_encoded'].values, (-1, 1)),  # Reshape to (num_samples, 1)
    'answer_confidence_encoded': np.reshape(df_train['answer_confidence_encoded'].values, (-1, 1))  # Reshape to (num_samples, 1)
}

# Apply the same PCA transformation to validation inputs using column filtering
valid_inputs = {
    'question_embedding_input': df_valid.filter(like='questions_pca').values,  # Select columns with 'questions_pca' in the name
    'answer_embedding_input': df_valid.filter(like='answers_pca').values,  # Select columns with 'answers_pca' in the name
    'filtered_choices_embedding_input': df_valid.filter(like='filtered_choices_pca').values,  # Select columns with 'filtered_choices_pca' in the name
    'image_input': df_valid.filter(like='image_pca').values,  # Select columns with 'image_pca' in the name
    'answer_type_encoded': np.reshape(df_valid['answer_type_encoded'].values, (-1, 1)),  # Reshape to (num_samples, 1)
    'answer_confidence_encoded': np.reshape(df_valid['answer_confidence_encoded'].values, (-1, 1))  # Reshape to (num_samples, 1)
}

In [6]:
# Define the objective function for Optuna
def objective(trial):
    # Hyperparameters to tune
    n_units_layer1 = trial.suggest_int('n_units_layer1', 32, 64)  # Number of units in first Dense layer
    n_units_layer2 = trial.suggest_int('n_units_layer2', 16, 32)   # Number of units in second Dense layer
    dropout_rate = trial.suggest_float('dropout_rate', 0.2, 0.5)   # Dropout rate
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)  # Learning rate for Adam optimizer

    # Define the model architecture (similar to your previous one)
    question_embedding_input = Input(shape=(n_components_questions,), name='question_embedding_input')
    answer_embedding_input = Input(shape=(n_components_answers,), name='answer_embedding_input')
    filtered_choices_embedding_input = Input(shape=(n_components_filtered_choices,), name='filtered_choices_embedding_input')
    image_input = Input(shape=(n_components_image,), name='image_input')

    answer_type_input = Input(shape=(1,), name='answer_type_encoded')  # One-dimensional label-encoded feature
    answer_confidence_input = Input(shape=(1,), name='answer_confidence_encoded')  # One-dimensional label-encoded feature

    # Step 3: Concatenate all inputs
    combined = Concatenate()([question_embedding_input, answer_embedding_input, image_input,
                              filtered_choices_embedding_input, answer_type_input, answer_confidence_input])


    # Fully connected layers with dynamic units and dropout
    x = Dense(n_units_layer1, activation='relu')(combined)
    x = Dropout(dropout_rate)(x)
    x = Dense(n_units_layer2, activation='relu')(x)
    x = Dropout(dropout_rate)(x)

    # One-hot encode the target labels for training and validation
    train_labels = pd.get_dummies(df_train['target_answer']).values
    valid_labels = pd.get_dummies(df_valid['target_answer']).values

    # Check that the validation labels have the same number of classes as training labels
    if valid_labels.shape[1] != train_labels.shape[1]:
        print(f"Mismatch in number of classes: Train Labels: {train_labels.shape[1]}, Validation Labels: {valid_labels.shape[1]}")
        # Create a new zero-filled array with the same number of columns as train_labels
        new_valid_labels = np.zeros((valid_labels.shape[0], train_labels.shape[1]))
        # Fill the new array with the original valid_labels values where classes exist
        new_valid_labels[:, :valid_labels.shape[1]] = valid_labels  # Copy existing classes
        print(f"Shape of adjusted `valid_labels`: {new_valid_labels.shape}")
        valid_labels = new_valid_labels

    # Output layer (number of classes)
    # Define the number of classes for output
    num_classes = train_labels.shape[1]  # Get the correct number of classes from train_labels shape
    output = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=[question_embedding_input, answer_embedding_input, image_input,
                      filtered_choices_embedding_input, answer_type_input, answer_confidence_input], outputs=output)

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model with early stopping to prevent overfitting
    model.fit(train_inputs, train_labels, validation_data=(valid_inputs, valid_labels),
              epochs=10, batch_size=16, verbose=0)

    # Evaluate the model on validation data
    val_loss, val_accuracy = model.evaluate(valid_inputs, valid_labels, verbose=0)

    # Return the validation accuracy as the objective value
    return val_accuracy

# Create the study object
study = optuna.create_study(direction='maximize')

# Start optimization
study.optimize(objective, n_trials=10)  # Number of trials can be adjusted

[I 2024-10-12 02:01:43,412] A new study created in memory with name: no-name-0a488691-8bf1-43af-a480-c9e243866366


Mismatch in number of classes: Train Labels: 2521, Validation Labels: 1568
Shape of adjusted `valid_labels`: (300000, 2521)


[I 2024-10-12 02:16:46,882] Trial 0 finished with value: 7.333333633141592e-05 and parameters: {'n_units_layer1': 35, 'n_units_layer2': 17, 'dropout_rate': 0.31024918585842665, 'learning_rate': 0.0006113162127228869}. Best is trial 0 with value: 7.333333633141592e-05.


Mismatch in number of classes: Train Labels: 2521, Validation Labels: 1568
Shape of adjusted `valid_labels`: (300000, 2521)


[I 2024-10-12 02:32:11,284] Trial 1 finished with value: 2.9999999242136255e-05 and parameters: {'n_units_layer1': 39, 'n_units_layer2': 26, 'dropout_rate': 0.312064511699935, 'learning_rate': 0.00028480355340472724}. Best is trial 0 with value: 7.333333633141592e-05.


Mismatch in number of classes: Train Labels: 2521, Validation Labels: 1568
Shape of adjusted `valid_labels`: (300000, 2521)


[I 2024-10-12 02:47:24,454] Trial 2 finished with value: 4.333333345130086e-05 and parameters: {'n_units_layer1': 55, 'n_units_layer2': 28, 'dropout_rate': 0.31236853176225754, 'learning_rate': 1.2028330114961096e-05}. Best is trial 0 with value: 7.333333633141592e-05.


Mismatch in number of classes: Train Labels: 2521, Validation Labels: 1568
Shape of adjusted `valid_labels`: (300000, 2521)


[I 2024-10-12 03:02:29,757] Trial 3 finished with value: 2.6666666599339806e-05 and parameters: {'n_units_layer1': 34, 'n_units_layer2': 20, 'dropout_rate': 0.23068281993938045, 'learning_rate': 0.0005612470438088822}. Best is trial 0 with value: 7.333333633141592e-05.


Mismatch in number of classes: Train Labels: 2521, Validation Labels: 1568
Shape of adjusted `valid_labels`: (300000, 2521)


[I 2024-10-12 03:17:39,047] Trial 4 finished with value: 6.666666740784422e-05 and parameters: {'n_units_layer1': 64, 'n_units_layer2': 19, 'dropout_rate': 0.2824495791945499, 'learning_rate': 0.00030178163890673404}. Best is trial 0 with value: 7.333333633141592e-05.


Mismatch in number of classes: Train Labels: 2521, Validation Labels: 1568
Shape of adjusted `valid_labels`: (300000, 2521)


[I 2024-10-12 03:32:50,312] Trial 5 finished with value: 1.3333333299669903e-05 and parameters: {'n_units_layer1': 56, 'n_units_layer2': 27, 'dropout_rate': 0.3194002948816634, 'learning_rate': 9.993558034669476e-05}. Best is trial 0 with value: 7.333333633141592e-05.


Mismatch in number of classes: Train Labels: 2521, Validation Labels: 1568
Shape of adjusted `valid_labels`: (300000, 2521)


[I 2024-10-12 03:47:59,036] Trial 6 finished with value: 6.666666740784422e-05 and parameters: {'n_units_layer1': 52, 'n_units_layer2': 17, 'dropout_rate': 0.23573627342499245, 'learning_rate': 0.00014631100351412206}. Best is trial 0 with value: 7.333333633141592e-05.


Mismatch in number of classes: Train Labels: 2521, Validation Labels: 1568
Shape of adjusted `valid_labels`: (300000, 2521)


[I 2024-10-12 04:03:20,435] Trial 7 finished with value: 0.00013000000035390258 and parameters: {'n_units_layer1': 46, 'n_units_layer2': 30, 'dropout_rate': 0.4514204506946587, 'learning_rate': 0.00026099928221019915}. Best is trial 7 with value: 0.00013000000035390258.


Mismatch in number of classes: Train Labels: 2521, Validation Labels: 1568
Shape of adjusted `valid_labels`: (300000, 2521)


[I 2024-10-12 04:18:38,130] Trial 8 finished with value: 7.666666351724416e-05 and parameters: {'n_units_layer1': 32, 'n_units_layer2': 31, 'dropout_rate': 0.39821827370251006, 'learning_rate': 0.0004299855941768864}. Best is trial 7 with value: 0.00013000000035390258.


Mismatch in number of classes: Train Labels: 2521, Validation Labels: 1568
Shape of adjusted `valid_labels`: (300000, 2521)


[I 2024-10-12 04:33:56,503] Trial 9 finished with value: 9.666667028795928e-05 and parameters: {'n_units_layer1': 36, 'n_units_layer2': 22, 'dropout_rate': 0.3932058022139764, 'learning_rate': 0.00016700785057080454}. Best is trial 7 with value: 0.00013000000035390258.


### Final Model Fit

In [20]:
# Trial 7 finished with value: 0.00013000000035390258 and parameters: {'n_units_layer1': 46, 'n_units_layer2': 30, 'dropout_rate': 0.4514204506946587, 'learning_rate': 0.00026099928221019915}. Best is trial 7 with value: 0.00013000000035390258.

# Manually set the best hyperparameters
best_params = {
    'n_units_layer1': 46,
    'n_units_layer2': 30,
    'dropout_rate': 0.4514,
    'learning_rate': 0.0002609,
    'batch_size': 16
}

In [23]:

# Define input layers for PCA-transformed inputs
question_embedding_input = Input(shape=(n_components_questions,), name='question_embedding_input')  # Use PCA components shape
answer_embedding_input = Input(shape=(n_components_answers,), name='answer_embedding_input')  # Use PCA components shape
filtered_choices_embedding_input = Input(shape=(n_components_filtered_choices,), name='filtered_choices_embedding_input')  # Use PCA components shape
image_input = Input(shape=(n_components_image,), name='image_input')  # Use PCA components shape

answer_type_input = Input(shape=(1,), name='answer_type_encoded')  # One-dimensional label-encoded feature
answer_confidence_input = Input(shape=(1,), name='answer_confidence_encoded')  # One-dimensional label-encoded feature

combined = Concatenate()([question_embedding_input, answer_embedding_input, image_input,
                          filtered_choices_embedding_input, answer_type_input, answer_confidence_input])

# Manually set the best hyperparameters
best_params = {
    'n_units_layer1': 64,        # Example number of units in first Dense layer
    'n_units_layer2': 32,        # Example number of units in second Dense layer
    'dropout_rate': 0.3,         # Example dropout rate
    'learning_rate': 0.0001,     # Example learning rate
    'batch_size': 16             # Example batch size
}
# Fully connected layers with the best hyperparameters
x = Dense(best_params['n_units_layer1'], activation='relu')(combined)
x = Dropout(best_params['dropout_rate'])(x)
x = Dense(best_params['n_units_layer2'], activation='relu')(x)
x = Dropout(best_params['dropout_rate'])(x)

# Output layer (softmax for classification)
num_classes = train_labels.shape[1]  # Number of classes from one-hot encoded labels
output = Dense(num_classes, activation='softmax')(x)

# Compile the model with the best learning rate
final_model = Model(inputs=[question_embedding_input, answer_embedding_input, image_input,
                      filtered_choices_embedding_input, answer_type_input, answer_confidence_input], outputs=output)

final_model.compile(optimizer=Adam(learning_rate=best_params['learning_rate']),
              loss='categorical_crossentropy', metrics=['accuracy'])


# One-hot encode the target labels for training and validation
train_labels = pd.get_dummies(df_train['target_answer']).values
valid_labels = pd.get_dummies(df_valid['target_answer']).values

# Ensure validation labels match the shape of training labels (if necessary)
if valid_labels.shape[1] != train_labels.shape[1]:
    new_valid_labels = np.zeros((valid_labels.shape[0], train_labels.shape[1]))
    new_valid_labels[:, :valid_labels.shape[1]] = valid_labels
    valid_labels = new_valid_labels

# Step 3: Prepare PCA-Transformed Training Inputs
train_inputs = {
    'question_embedding_input': df_train.filter(like='questions_pca').values,  # Select columns with 'questions_pca' in the name
    'answer_embedding_input': df_train.filter(like='answers_pca').values,  # Select columns with 'answers_pca' in the name
    'filtered_choices_embedding_input': df_train.filter(like='filtered_choices_pca').values,  # Select columns with 'filtered_choices_pca' in the name
    'image_input': df_train.filter(like='image_pca').values,  # Select columns with 'image_pca' in the name
    'answer_type_encoded': np.reshape(df_train['answer_type_encoded'].values, (-1, 1)),  # Reshape to (num_samples, 1)
    'answer_confidence_encoded': np.reshape(df_train['answer_confidence_encoded'].values, (-1, 1))  # Reshape to (num_samples, 1)
}

# Apply the same PCA transformation to validation inputs using column filtering
valid_inputs = {
    'question_embedding_input': df_valid.filter(like='questions_pca').values,  # Select columns with 'questions_pca' in the name
    'answer_embedding_input': df_valid.filter(like='answers_pca').values,  # Select columns with 'answers_pca' in the name
    'filtered_choices_embedding_input': df_valid.filter(like='filtered_choices_pca').values,  # Select columns with 'filtered_choices_pca' in the name
    'image_input': df_valid.filter(like='image_pca').values,  # Select columns with 'image_pca' in the name
    'answer_type_encoded': np.reshape(df_valid['answer_type_encoded'].values, (-1, 1)),  # Reshape to (num_samples, 1)
    'answer_confidence_encoded': np.reshape(df_valid['answer_confidence_encoded'].values, (-1, 1))  # Reshape to (num_samples, 1)
}

final_modelmodel.fit(train_inputs, train_labels, validation_data=(valid_inputs, valid_labels), epochs=10, batch_size=16)


Epoch 1/10
[1m37500/37500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 3ms/step - accuracy: 0.3414 - loss: 3.5390 - val_accuracy: 4.3333e-05 - val_loss: 41.0209
Epoch 2/10
[1m37500/37500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 2ms/step - accuracy: 0.5725 - loss: 2.0894 - val_accuracy: 3.6667e-05 - val_loss: 55.2265
Epoch 3/10
[1m37500/37500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 2ms/step - accuracy: 0.6262 - loss: 1.8003 - val_accuracy: 3.3333e-05 - val_loss: 64.8920
Epoch 4/10
[1m37500/37500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 2ms/step - accuracy: 0.6428 - loss: 1.6860 - val_accuracy: 5.0000e-05 - val_loss: 72.8419
Epoch 5/10
[1m37500/37500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 2ms/step - accuracy: 0.6513 - loss: 1.6250 - val_accuracy: 4.0000e-05 - val_loss: 83.1177
Epoch 6/10
[1m37500/37500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 2ms/step - accuracy: 0.6579 - loss: 1.5794 - val_accuracy

<keras.src.callbacks.history.History at 0x7bc470388ac0>

In [24]:
# Save the trained model to a file
model.save('/content/drive/MyDrive/Colab Notebooks/894/Training/(5)_FinalModel_4.0.h5')



In [28]:
del df_train, df_valid
del question_embedding_input, answer_embedding_input, image_input, filtered_choices_embedding_input, answer_type_input, answer_confidence_input, combined, x, output
del train_inputs, valid_inputs, train_labels, valid_labels
del n_components_image, n_components_filtered_choices, n_components_questions, n_components_answers
del best_params, new_valid_labels, num_classes, study, stopwords

### Final Model's Accuracy: 0.665 on Validation Set


---




---



---



# Step 5: Testing & Performance (Not Applicable)

##### Load Workspace

In [None]:
final_model = load_model('/content/drive/MyDrive/Colab Notebooks/894/Training/(6)_FinalModel_4.0.h5')
label_encoder_answer_type = joblib.load('/content/drive/MyDrive/Colab Notebooks/894/Training/(6)_FinalModel_4.0_label_encoder_answer_type.joblib')
label_encoder_confidence = joblib.load('/content/drive/MyDrive/Colab Notebooks/894/Training/(6)_FinalModel_4.0_label_encoder_confidence.joblib')
pca_image = joblib.load('/content/drive/MyDrive/Colab Notebooks/894/Training/(4)_pca_image_3.2.job
pca_questions = joblib('/content/drive/MyDrive/Colab Notebooks/894/Training/(4)_pca_questions_3.2.job
pca_answers = joblib.load('/content/drive/MyDrive/Colab Notebooks/894/Training/(4)_pca_answers_3.2.joblib')
pca_filtered_choices = joblib.load('/content/drive/MyDrive/Colab Notebooks/894/Training/(4)_pca_filtered_choices_3.2.joblib')

In [None]:
df_test['question'] = df_test['question_oeq']
df_test = df_test.drop(columns=['question_oeq'])

df_test['question_clean'] = df_test['question'].apply(preprocess_text)
df_test['question_embedding'] = df_test['question_clean'].apply(lambda x: get_sentence_embedding(x, model))

df_test['answer_clean'] = df_test['answer'].apply(preprocess_text)
df_test['answer_embedding'] = df_test['answer_clean'].apply(lambda x: get_sentence_embedding(x, model))

df_test['filtered_multiple_choices'] = df_test.apply(lambda row: filter_choices_by_word2vec_similarity(row, model), axis=1)
df_test['filtered_multiple_choice_corpus'] = df_test.apply(lambda row: concatenate_list_to_sentence(row, 'filtered_multiple_choices'), axis=1)
df_test['filtered_choices_embedding'] = df_test.apply(lambda x: get_sentence_embedding(x, model), axis=1)

df_test['answer_type_encoded'] = label_encoder_answer_type.transform(df_test['answer_type'])
df_test['answer_confidence_encoded'] = label_encoder_confidence.transform(df_test['answer_confidence'])

In [None]:
df_test = df_test.drop(columns=id_features + categorical_features)

In [None]:
df_image_test_pca = pca_image.transform(df_test.iloc[:, :2048].values)
df_questions_test_pca = pca_questions.transform(np.vstack(df_test['question_embedding'].values))
df_answers_test_pca = pca_answers.transform(np.vstack(df_test['answer_embedding'].values))
df_filtered_choices_test_pca = pca_filtered_choices.transform(np.vstack(df_test['filtered_choices_embedding'].values))

df_test_pca = pd.concat([df_test[['answer_type_encoded', 'answer_confidence_encoded']], df_image_test_pca, df_questions_test_pca, df_answers_test_pca, df_filtered_choices_test_pca], axis=1)

In [None]:
# # Evaluate the final model on the test set
# test_loss, test_accuracy = final_model.evaluate([np.stack(df_test['question_bert_embedding'].values),
#                                                  np.stack(df_test['answer_embedding'].values),
#                                                  np.stack(df_test['filtered_choices_embedding'].values),
#                                                  np.stack(df_test['image_features'].values),
#                                                  np.stack(df_test['answer_type_encoded'].values),
#                                                  np.stack(df_test['answer_confidence_encoded'].values)],
#                                                 test_labels,
#                                                 verbose=0)
#
# print(f"Test Accuracy: {test_accuracy}")
#