# Imports

In [21]:
import os
from nn import SimpleFFNN
from train import Train
from preProcessing import PreProcessing
import numpy as np
from testModel import TestModel
from sklearn.model_selection import KFold

# Função para dividir os dados de treino e de teste

In [22]:
def split_array(data: np.ndarray, train_size: float = 0.8):
    """
    Split an array into a shuffled training part and a shuffled test part.

    Rows (first axis) are shuffled through a random permutation of their
    indices, so the caller's ``data`` is left untouched. The previous
    implementation called ``np.random.shuffle(data)``, which reordered the
    caller's array in place as a hidden side effect.

    :param data: The 2D array (or array-like) to split along its first axis.
    :param train_size: Fraction of the rows placed in the first returned array;
        must be within [0, 1] (default is 0.8).
    :return: Two arrays ``(train_data, test_data)``: the first
        ``int(len(data) * train_size)`` shuffled rows and the remaining rows.
    :raises ValueError: If ``train_size`` is outside the interval [0, 1].
    """
    if not 0.0 <= train_size <= 1.0:
        raise ValueError(f"train_size must be within [0, 1], got {train_size}")

    rows = np.asarray(data)

    # Position at which the shuffled rows are cut into train / test
    split_index = int(len(rows) * train_size)

    # Fancy indexing with a permutation builds a shuffled copy, leaving the
    # input array in its original order
    shuffled = rows[np.random.permutation(len(rows))]

    train_data = shuffled[:split_index]
    test_data = shuffled[split_index:]

    return train_data, test_data

# Variáveis onde estão os ficheiros de treino e o modelo

In [23]:
# Handle for the trained model; stays None until a model is created
model = None
# Name handed to Train / TestModel to identify the stored model
newPKL = "joao2"
# Number of folds used by the K-fold cross-validation
n_splits = 10

# Base names (the ".txt" extension is appended where the files are opened):
#   data       -> labelled training file that gets cleaned
#   avaliation -> unlabelled file used for the self-evaluation
data, avaliation = "train", "test_no_labels"

## Pré-Processamento

In [24]:
# Clean the training file, then wrap the cleaned rows in a NumPy array so
# they can later be selected with the index arrays produced by KFold.
print("\033[34mPre Processing the data\n\033[0m")
pp = PreProcessing(f"{data}.txt")
clean_data = pp.returnCleanText()
print("\033[32mPre Processing Completed!\n\033[0m")
clean_data = np.array(clean_data)

[34mPre Processing the data
[0m
[32mPre Processing Completed!
[0m


# KFold Cross-Validation

In [25]:
# Collects the score obtained on each fold
all_fold_scores = list()

# Shuffled K-fold splitter; the fixed random_state keeps the fold
# assignment identical between runs
kf = KFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=42,
)

## Carregar o modelo ou Criar um novo
#### Conforme o nome dado à variável 'newPKL'

In [26]:
# K-fold cross-validation: for every fold, train a fresh model on the training
# split, score it on the held-out split and keep the score.
# NOTE(review): every fold trains and tests under the same name (newPKL), so
# presumably each fold overwrites the previously stored model — confirm.
for fold, (train_index, test_index) in enumerate(kf.split(clean_data)):
    print(f"\033[34mProcessing Fold {fold + 1}/{n_splits}\033[0m")

    # Select the rows of this fold through the index arrays from KFold
    data_to_train, data_to_test = clean_data[train_index], clean_data[test_index]

    # A new model is created on every fold (no pre-trained model is loaded)
    print("\033[34mCreating a new Model\n\033[0m")
    layer_hidden = [50, 25, 9]  # Architecture of the network
    learning_rate = 0.01
    epochs = 50
    model = Train(data_to_train, newPKL, layer_hidden, learning_rate, epochs)
    model.train()
    # f-prefix added: without it the literal text "{fold + 1}" was printed
    print(f"\033[32mModel Created and Trained for Fold {fold + 1}!\n\033[0m")

    # Score the freshly trained model on the held-out fold
    model_teste = TestModel(newPKL)
    score = model_teste.test_with_label(data_to_test)

    # Keep the score so the folds can be averaged afterwards
    all_fold_scores.append(score)
    print(f"\033[32mFold {fold + 1} Score: {score}\n\033[0m")

[34mProcessing Fold 1/10[0m
[34mCreating a new Model
[0m
Epoch 0, Loss 0.09425303448115163
Epoch 1, Loss 0.08947847096359349
Epoch 2, Loss 0.08261809556052739
Epoch 3, Loss 0.07642067681618823
Epoch 4, Loss 0.07057874202472332
Epoch 5, Loss 0.06422364785190074
Epoch 6, Loss 0.0578699616102236
Epoch 7, Loss 0.051711938369333424
Epoch 8, Loss 0.04564287807898728
Epoch 9, Loss 0.04032072779383449
Epoch 10, Loss 0.03491862626439195
Epoch 11, Loss 0.030434788638695198
Epoch 12, Loss 0.026649459921663315
Epoch 13, Loss 0.023710955182222392
Epoch 14, Loss 0.021379788163770813
Epoch 15, Loss 0.01943212005893932
Epoch 16, Loss 0.017987142837205917
Epoch 17, Loss 0.016574503143192875
Epoch 18, Loss 0.015456007847433584
Epoch 19, Loss 0.01463597628972543
Epoch 20, Loss 0.01302539964151297
Epoch 21, Loss 0.012154231859937803
Epoch 22, Loss 0.011088634525767575
Epoch 23, Loss 0.010990331108785749
Epoch 24, Loss 0.009975911463342113
Epoch 25, Loss 0.00942561681772513
Epoch 26, Loss 0.00899509420

In [27]:
# Mean of the per-fold scores collected during cross-validation
average_score = np.asarray(all_fold_scores).mean()
print(
    f"\033[32mCross-Validation Completed! Average Score: {average_score}\033[0m"
)

[32mCross-Validation Completed! Average Score: 57.3684527672198[0m


# Variável para testar os modelos

In [28]:
# Tester bound to the model identified by newPKL; reused by the cells below
model_teste = TestModel(newPKL)

# Teste do modelo com o ficheiro com as labels identificadas
##### 'train.txt'

In [29]:
# Test the model on labelled data and compare its predictions with the labels.
# NOTE(review): data_to_test is only assigned inside the K-fold loop, so this
# uses whatever the last executed fold left behind — confirm this is intended.
# The call is the last expression of the cell, so its return value is displayed.
model_teste.test_with_label(data_to_test)

[34mTesting the Model with Labels
[0m
[31mFor the movie 'Intimate Relations' the model said: 'romance', and was: 'drama'[0m
[31mFor the movie 'The Angel Who Pawned Her Harp' the model said: 'action', and was: 'comedy'[0m
[32mFor the movie 'Aap Ke Deewane' the model said: 'romance', and was: 'romance'[0m
[31mFor the movie 'The Oh in Ohio' the model said: 'drama', and was: 'comedy'[0m
[32mFor the movie 'The Women' the model said: 'comedy', and was: 'comedy'[0m
[31mFor the movie 'Frontier Gal' the model said: 'action', and was: 'western'[0m
[31mFor the movie 'Nadodimannan' the model said: 'action', and was: 'romance'[0m
[31mFor the movie 'Faithful in My Fashion' the model said: 'comedy', and was: 'romance'[0m
[31mFor the movie 'The World of Suzie Wong' the model said: 'drama', and was: 'romance'[0m
[31mFor the movie 'Smilin' Through' the model said: 'drama', and was: 'romance'[0m
[31mFor the movie 'The Daytrippers' the model said: 'western', and was: 'drama'[0m
[3

58.70646766169154

# Gerar ficheiro com os resultados do modelo, com inputs do ficheiro sem as labels
##### 'test_no_labels.txt'

In [30]:
# Clean the unlabelled evaluation file and run the model on it.
# The original note says the results go to 'results.txt'; the writing itself
# happens inside TestModel and is not visible here.
pp_no_label = PreProcessing(f"{avaliation}.txt")
clean_data_no_label = pp_no_label.returnCleanText(plot_index=3)
model_teste.test_without_labels(clean_data_no_label)

[34mTesting the Model without Labels
[0m
[36mTest Completed!
[0m


# Testar apenas com 1 frase de input

In [31]:
# Single free-text plot used as a quick sanity check, with its expected genre
genre = "romance"
texto = (
    "Romeo and Juliet is a play written by Shakespeare. "
    "It is a tragic love story where the two main characters, Romeo and Juliet, "
    "are supposed to be sworn enemies but fall in love. "
    "Due to their families' ongoing conflict, they cannot be together, "
    "so they kill themselves because they cannot cope with being separated "
    "from one another. "
    "Romeo and Juliet is a Shakespearean tragedy"
)

# Clean the text, then ask the model for a prediction
clean_texto = PreProcessing.returnCleanInputText(texto)
model_teste.test_from_input(clean_texto, genre)

[34mTrying to predict the genre
[0m
[32mFor the movie given by input the model said: 'romance', and was: 'romance'[0m


# Testar com o nosso ficheiro de teste
##### 'our_data.txt'

In [32]:
# Clean our own hand-made test file.
# Note: this rebinds `pp`, which earlier held the training-file pre-processor.
print("\033[34mPre Processing the Our data\n\033[0m")
pp = PreProcessing("our_data.txt")
clean_our_data = pp.returnCleanText(plot_index=2)
print("\033[32mPre Processing of Our Data Completed!\n\033[0m")

[34mPre Processing the Our data
[0m
[32mPre Processing of Our Data Completed!
[0m


In [33]:
# Test the model on our own labelled file. genre_index and plot_index are passed
# explicitly here, unlike the earlier test_with_label call — presumably because
# this file uses a different column layout (TODO confirm).
# The call is the last expression of the cell, so its return value is displayed.
model_teste.test_with_label(clean_our_data,genre_index=1,plot_index=2)

[34mTesting the Model with Labels
[0m
[32mFor the movie 'Dune' the model said: 'sci-fi', and was: 'sci-fi'[0m
[32mFor the movie 'Shrek' the model said: 'animation', and was: 'animation'[0m
[31mFor the movie 'The Teacher' the model said: 'comedy', and was: 'drama'[0m
[31mFor the movie 'Coco' the model said: 'horror', and was: 'animation'[0m
[32mFor the movie 'How I Met Your Mother' the model said: 'comedy', and was: 'comedy'[0m
[31mFor the movie 'Scary Movie' the model said: 'horror', and was: 'comedy'[0m
[31mFor the movie 'Back to the Future' the model said: 'romance', and was: 'sci-fi'[0m
[31mFor the movie 'Scream' the model said: 'animation', and was: 'horror'[0m
[32mFor the movie 'Fast and Furious' the model said: 'action', and was: 'action'[0m
[31mFor the movie '10 Things I Ate About You' the model said: 'drama', and was: 'romance'[0m
[32mFor the movie 'O Rei Leao' the model said: 'animation', and was: 'animation'[0m
[32mFor the movie 'Django Unchained' the

48.57142857142857