# Construcción del corpus

In [1]:
# Directories with the raw text files of each author. Each path is handed
# to load_book() further down to read that author's books.
SHAKESPEARE_PATH    = '../data/Shakespeare'
JANE_PATH           = '../data/JaneAusten'
LOVECRAFT_PATH      = '../data/Lovecraft'

In [2]:
import re

def normalize(sentence: str) -> str:
    """
    Normalize a piece of text and return it.

    The input is converted with ``str()``, every non-word character is
    replaced by a space, single ASCII letters that are surrounded by
    whitespace (or that open the text) are dropped, runs of digits are
    removed, repeated spaces are collapsed and the result is lower-cased.

    Note: Taken and adapted from the P06_word2Vec.ipynb class notebook.

    Parameters
    ----------
    sentence : str
        Text to normalize. Any object is accepted; it is passed to ``str``.

    Returns
    -------
    str
        The normalized text. It can still start or end with one space,
        which is harmless for callers that tokenize with ``str.split()``.
    """
    # Step 1: Remove special chars
    sentence = re.sub(r'\W', ' ', str(sentence))
    # Step 2: Remove single characters
    sentence = re.sub(r'\s+[a-zA-Z]\s+', ' ', sentence)
    # A single letter at the very start has no whitespace in front of it, so
    # the pattern above cannot match it. The caret has to be an unescaped
    # anchor: an escaped caret looks for a literal '^', which step 1 has
    # already replaced by a space, so that form can never match anything.
    sentence = re.sub(r'^[a-zA-Z]\s+', ' ', sentence)
    # Step 3: Remove numbers
    sentence = re.sub(r'[0-9]+', ' ', sentence)
    # Step 4: Remove consecutive spaces
    sentence = re.sub(' +', ' ', sentence)
    # Step 5: Sentence to lower case
    sentence = sentence.lower()

    return sentence

In [3]:
import os

def load_book(path: str) -> str:
    """
    Read every file of a directory and return the contents as one string.

    Files are read as UTF-8 text and concatenated in sorted file-name
    order, so the result does not depend on the order in which the
    operating system happens to list the directory. Entries that are not
    regular files (for example sub-directories) are skipped.

    Parameters
    ----------
    path : str
        Directory that contains the text files to load.

    Returns
    -------
    str
        The concatenated contents of all files, or ``""`` when the
        directory contains no files.

    Raises
    ------
    OSError
        If the directory or one of its files cannot be read.
    UnicodeDecodeError
        If a file is not valid UTF-8.
    """
    contents = []
    for file_name in sorted(os.listdir(path)):
        file_path = os.path.join(path, file_name)
        if not os.path.isfile(file_path):
            continue
        # The context manager closes the handle as soon as the file is read.
        with open(file_path, 'r', encoding='utf-8') as book:
            contents.append(book.read())

    # Joining once avoids rebuilding an ever-growing string on every line.
    return "".join(contents)

In [4]:
from keras.preprocessing.text import Tokenizer

# Number of tokens in each fixed-length chunk ("sentence") of the corpus.
SENTENCE_LENGTH = 150

books = {
    "jane": load_book(JANE_PATH),
    "lovecraft": load_book(LOVECRAFT_PATH),
    "shakespeare": load_book(SHAKESPEARE_PATH),
}

# corpus_arr[author] holds the chunks of one author; corpus_arr["full"]
# holds the chunks of every author, in the order the authors are processed.
corpus_arr = {"full": []}

for author in books:
    books[author] = normalize(books[author])
    tokenized_book = books[author].split()
    # Only complete chunks are kept: trailing tokens that do not fill a whole
    # chunk are dropped so every chunk has exactly SENTENCE_LENGTH tokens.
    complete_sentences = len(tokenized_book) // SENTENCE_LENGTH
    corpus_arr[author] = [
        tokenized_book[i * SENTENCE_LENGTH:(i + 1) * SENTENCE_LENGTH]
        for i in range(complete_sentences)
    ]
    corpus_arr["full"].extend(corpus_arr[author])

print(corpus_arr["full"][0])

# for author in books:
#     corpus[author] = ["" for sentence in corpus_arr[author]]
#     for i in range(len(corpus[author])):
#         sentence = corpus_arr[author][i]
#         for token in sentence:
#             corpus[author][i] += token + " "
#         corpus["full"].append(corpus[author][i])

# corpus["full"][0]

['emma', 'by', 'jane', 'austen', 'chapter', 'emma', 'woodhouse', 'handsome', 'clever', 'and', 'rich', 'with', 'comfortable', 'home', 'and', 'happy', 'disposition', 'seemed', 'to', 'unite', 'some', 'of', 'the', 'best', 'blessings', 'of', 'existence', 'and', 'had', 'lived', 'nearly', 'twenty', 'one', 'years', 'in', 'the', 'world', 'with', 'very', 'little', 'to', 'distress', 'or', 'vex', 'her', 'she', 'was', 'the', 'youngest', 'of', 'the', 'two', 'daughters', 'of', 'most', 'affectionate', 'indulgent', 'father', 'and', 'had', 'in', 'consequence', 'of', 'her', 'sister', 'marriage', 'been', 'mistress', 'of', 'his', 'house', 'from', 'very', 'early', 'period', 'her', 'mother', 'had', 'died', 'too', 'long', 'ago', 'for', 'her', 'to', 'have', 'more', 'than', 'an', 'indistinct', 'remembrance', 'of', 'her', 'caresses', 'and', 'her', 'place', 'had', 'been', 'supplied', 'by', 'an', 'excellent', 'woman', 'as', 'governess', 'who', 'had', 'fallen', 'little', 'short', 'of', 'mother', 'in', 'affection', 

In [49]:
# Total number of fixed-length chunks collected across all authors.
len(corpus_arr["full"])

5591

In [61]:
# Vocabulary of the corpus: every distinct token becomes a key (the value 1
# is only a placeholder), so len(tokens) is the number of unique tokens.
tokens = {token: 1 for sentence in corpus_arr["full"] for token in sentence}

len(tokens)

24044

# Construcción del embedding de tamaño 50 con su modelo

In [62]:
import os
import gensim

def load_embeding(path: str, sentences_arr: list, size: int = None) -> gensim.models.Word2Vec:
    """
    Load the Word2Vec embedding stored at ``path``, or train and save it.

    If ``path`` exists the model is loaded from it and ``sentences_arr``
    and ``size`` are not used: the loaded model keeps whatever vector size
    it was trained with. Otherwise a new model is trained on
    ``sentences_arr`` for 30 epochs, saved to ``path`` and returned.

    Parameters
    ----------
    path : str
        File the model is loaded from or saved to.
    sentences_arr : list
        Tokenized sentences (lists of string tokens) used for training.
    size : int, optional
        Dimension of the word vectors; 50 when not specified.

    Returns
    -------
    gensim.models.Word2Vec
        The loaded or freshly trained model.
    """
    if os.path.exists(path):
        embeding = gensim.models.Word2Vec.load(path)
        print("Model loaded")
        return embeding

    # Give size its default value
    if size is None:
        size = 50

    # Passing the corpus to the constructor builds the vocabulary and trains
    # the model in that same call, so the epoch count is given here. Calling
    # train() again afterwards would train the finished model a second time.
    embeding = gensim.models.Word2Vec(
        sentences_arr,
        vector_size=size,
        window=3,
        min_count=0,
        workers=10,
        epochs=30,
    )

    # Make sure the target folder exists before saving.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    embeding.save(path)
    print("Model created and saved")

    return embeding


In [63]:
# Files where the trained Word2Vec models are stored. Only the first one is
# used in the cells below (with vector size 50); the other two are
# presumably meant for vector sizes 100 and 150 -- TODO confirm.
EMBEDDING_50_PATH  =    "../models/books_50_l.rojasb_j.arboleda.model"
EMBEDDING_100_PATH =    "../models/books_100_l.rojasb_j.arboleda.model"
EMBEDDING_150_PATH =    "../models/books_150_l.rojasb_j.arboleda.model"


In [65]:
# Loads the model from EMBEDDING_50_PATH when that file exists; otherwise a
# 50-dimensional embedding is trained on the full corpus and saved there.
embeding_50 = load_embeding(EMBEDDING_50_PATH, corpus_arr["full"], 50)

Model created and saved


In [66]:
# Number of words in the embedding vocabulary.
len(embeding_50.wv.index_to_key)

24044

In [67]:
from keras.preprocessing.text import Tokenizer

def get_tokenizer_from_embeding(embeding: gensim.models.Word2Vec) -> Tokenizer:
    """
    Build a Keras ``Tokenizer`` whose vocabulary is the embedding's.

    Words are numbered in the order of ``embeding.wv.index_to_key``,
    starting at 1.

    Parameters
    ----------
    embeding : gensim.models.Word2Vec
        Trained model that provides the vocabulary.

    Returns
    -------
    Tokenizer
        Tokenizer whose ``word_index`` maps every word of the embedding to
        an integer in ``1..len(vocabulary)``.
    """
    # Keras reserves index 0 (it is what padded positions contain), so no
    # word may be assigned to it: numbering starts at 1.
    vocab = {
        word: index
        for index, word in enumerate(embeding.wv.index_to_key, start=1)
    }

    # texts_to_sequences() only keeps indices strictly below num_words, hence
    # the + 1 so that the last word of the vocabulary is not discarded.
    tokenizer = Tokenizer(num_words=len(vocab) + 1)
    tokenizer.word_index = vocab
    return tokenizer

In [68]:
# Tokenizer that shares its vocabulary with the 50-dimensional embedding.
tokenizer_50 = get_tokenizer_from_embeding(embeding_50)

In [69]:
def corpus2sequences(corpus: dict, tokenizer: Tokenizer) -> dict:
    """
    Convert every group of texts in ``corpus`` to integer sequences.

    Parameters
    ----------
    corpus : dict
        Maps a name to the texts handed to ``tokenizer.texts_to_sequences``.
    tokenizer : Tokenizer
        Tokenizer used for the conversion.

    Returns
    -------
    dict
        Same keys as ``corpus``; each value is whatever
        ``tokenizer.texts_to_sequences`` returns for that key's texts.
    """
    return {
        name: tokenizer.texts_to_sequences(texts)
        for name, texts in corpus.items()
    }

In [70]:
# sequences_50 = corpus2sequences(corpus, tokenizer_50)

In [71]:
# sequences_50["full"][10]

In [72]:
# Number of chunks in the combined corpus (compared with the per-author
# total in the next cell).
len(corpus_arr["full"])

5591

In [73]:
# Sanity check: the per-author chunk counts should add up to the size of
# the "full" corpus.
len(corpus_arr["jane"]) + len(corpus_arr["shakespeare"]) + len(corpus_arr["lovecraft"])

5591

In [82]:
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def get_X_Y(corpus_arr: dict, embedding: gensim.models.Word2Vec):
    """
    Build the feature matrix and the label vector for the classifier.

    Every chunk is represented by the concatenation of the embedding
    vectors of its tokens and labelled with the position of its author
    among the keys of ``corpus_arr`` (the aggregate ``"full"`` entry is
    skipped), i.e. 0 for the first author, 1 for the second, and so on.

    Parameters
    ----------
    corpus_arr : dict
        Maps each author to a list of tokenized chunks. A ``"full"`` key,
        if present, is ignored.
    embedding : gensim.models.Word2Vec
        Trained model used to look up the vector of every token.

    Returns
    -------
    tuple
        ``(X, Y)`` as numpy arrays with one entry per chunk. When all
        chunks have the same number of tokens, ``X`` has shape
        ``(n_chunks, tokens_per_chunk * vector_size)``.

    Raises
    ------
    KeyError
        If a token is not part of the embedding vocabulary.
    """
    # The authors come from the argument rather than from the notebook-level
    # `books` dict, so the function only depends on what it is given.
    authors = [author for author in corpus_arr if author != "full"]

    x_list = []
    y_list = []
    for label, author in enumerate(authors):
        for sentence in corpus_arr[author]:
            token_vectors = [embedding.wv[token] for token in sentence]
            # A single concatenate per chunk avoids re-copying the growing
            # vector once per token. An empty chunk keeps yielding None.
            x_list.append(np.concatenate(token_vectors) if token_vectors else None)
            y_list.append(label)

    X = np.array(x_list)
    Y = np.array(y_list)

    return X,Y

In [83]:
# Features and integer author labels built from the 50-dimensional embedding.
X,Y = get_X_Y(corpus_arr, embeding_50)

In [84]:
# (number of chunks, tokens per chunk * embedding size)
X.shape

(5591, 7500)

In [85]:
# One integer label per chunk.
Y.shape

(5591,)

In [86]:
from sklearn.model_selection import train_test_split
def get_train_test_val(X, Y, random_state=None, stratify=False):
    """
    Split the data into train (70%), test (15%) and validation (15%) sets.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    Y : array-like
        Labels, one per sample.
    random_state : int, optional
        Seed forwarded to both ``train_test_split`` calls. With the default
        ``None`` the split differs on every run; pass an integer to make it
        reproducible.
    stratify : bool, optional
        When ``True`` both splits preserve the class proportions of ``Y``.
        Defaults to ``False`` (plain random split).

    Returns
    -------
    tuple
        ``(X_train, X_test, X_val, y_train, y_test, y_val)``
    """
    # First cut: 70% for training, 30% held out.
    X_train, X_held_out, y_train, y_held_out = train_test_split(
        X,
        Y,
        test_size=0.30,
        random_state=random_state,
        stratify=Y if stratify else None,
    )
    # Second cut: the held-out 30% is halved into test and validation.
    X_test, X_val, y_test, y_val = train_test_split(
        X_held_out,
        y_held_out,
        test_size=0.5,
        random_state=random_state,
        stratify=y_held_out if stratify else None,
    )
    return X_train, X_test, X_val, y_train, y_test, y_val

In [87]:
# 70% train / 15% test / 15% validation split of the features and labels.
X_train, X_test, X_val, y_train, y_test, y_val = get_train_test_val(X,Y)

In [None]:
import numpy as np

def get_embeding_matrix(embeding: gensim.models.Word2Vec, tokenizer: Tokenizer):
    """
    Build a matrix with the embedding vector of every tokenizer word.

    Row ``i`` holds the vector of the word whose index in
    ``tokenizer.word_index`` is ``i``. The matrix has one more row than
    there are words; rows not assigned to any word stay all-zero.

    Parameters
    ----------
    embeding : gensim.models.Word2Vec
        Trained model that provides the word vectors.
    tokenizer : Tokenizer
        Tokenizer whose ``word_index`` maps words to row numbers.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(tokenizer.word_index) + 1, vector_size)``.

    Raises
    ------
    KeyError
        If a tokenizer word is not part of the embedding vocabulary.
    """
    vocab_size = len(tokenizer.word_index) + 1
    # vector_size is the public attribute for the embedding dimension (the
    # same name used when the model is created).
    vector_size = embeding.wv.vector_size
    matrix = np.zeros((vocab_size, vector_size))
    for word, index in tokenizer.word_index.items():
        matrix[index] = embeding.wv[word]

    return matrix

# Matrix of the 50-dimensional word vectors, indexed by tokenizer word index.
embeding_matrix_50 = get_embeding_matrix(embeding_50, tokenizer_50)

In [92]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Sequential

def get_model(input_size, num_dense = 2, inter_units = 100, dropout = 0.3, dense_units = 3) -> Sequential:
    """
    Build an (uncompiled) feed-forward Keras classifier.

    The network is a stack of ``num_dense`` blocks, each a ReLU ``Dense``
    layer followed by ``Dropout``, and a final softmax ``Dense`` layer.

        Params:
        -------
            input_size: int
                Number of features of every input sample.

            num_dense: int
                Number of hidden Dense + Dropout blocks.

            inter_units: int
                Units of every hidden Dense layer.

            dropout: float
                Dropout rate applied after every hidden Dense layer.

            dense_units: int
                Units of the softmax output layer, i.e. the number of
                classes.

        Returns:
        --------
            Sequential
                The model; the caller is responsible for compiling it.
    """
    model = Sequential()

    # shape has to be a tuple: without the trailing comma the parentheses
    # are plain grouping and the value is just the integer.
    model.add(Input(shape=(input_size,)))
    for _ in range(num_dense):
        model.add(Dense(inter_units, activation="relu"))
        model.add(Dropout(dropout))

    # The output width follows dense_units instead of a hard-coded 3, so the
    # parameter actually takes effect.
    model.add(Dense(dense_units, activation="softmax"))

    return model


# The input width is the number of features per sample; dropout is raised
# from the default 0.3 to 0.7.
model_50 = get_model(X_train.shape[1], dropout=0.7)   

In [93]:
# Layer-by-layer overview of the network and its parameter counts.
model_50.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 100)               750100    
                                                                 
 dropout_2 (Dropout)         (None, 100)               0         
                                                                 
 dense_4 (Dense)             (None, 100)               10100     
                                                                 
 dropout_3 (Dropout)         (None, 100)               0         
                                                                 
 dense_5 (Dense)             (None, 3)                 303       
                                                                 
Total params: 760503 (2.90 MB)
Trainable params: 760503 (2.90 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [94]:
# The labels are integer class ids (not one-hot vectors), hence the sparse
# variant of categorical cross-entropy.
model_50.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [97]:
# Train for 7 epochs, evaluating on the validation split after each one.
model_50.fit(X_train, y_train, validation_data = (X_val,y_val), epochs=7, verbose=1)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.src.callbacks.History at 0x179bb447250>

In [98]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Turn the softmax outputs for the test split into hard class labels by
# taking the most probable class of every sample.
y_pred = np.argmax(model_50.predict(X_test), axis=1)

# Generate a classification report
report = classification_report(y_test, y_pred)
print("Informe de Clasificación:")
print(report)

Informe de Clasificación:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       622
           1       0.94      0.97      0.96       100
           2       0.98      1.00      0.99       117

    accuracy                           0.99       839
   macro avg       0.97      0.99      0.98       839
weighted avg       0.99      0.99      0.99       839

