In [None]:
import time
import nltk.tokenize as nt
import os
import json
import torch
import copy
import numpy as np
import matplotlib.pyplot as plt
import random
import time
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler,
                              SequentialSampler)
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import device
import torch.optim as optim
import json
from collections import OrderedDict

import sys
sys.path.append('..')
import datasetgenerator as dsg



In [None]:
# Choose the compute backend in priority order: CUDA first, then Apple MPS, then CPU.
# The result is a plain string that later cells pass to `.to(...)`.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

In [None]:
# Floating-point precision used for the model and for the NumPy embedding rows.
# The "mps" branch uses 32-bit floats (presumably because the MPS backend does not
# support float64 — confirm against the PyTorch MPS notes); every other device uses 64-bit.
# The former chained form `name = dtype = value` also bound a stray global called
# `dtype`; only the two intended names are assigned now.
if device == "mps":
    dtype_value_torch = torch.float32
    dtype_value_np = np.float32
else:
    dtype_value_torch = torch.float64
    dtype_value_np = np.float64

In [None]:
# Dataset identifiers and data directories.
# NOTE(review): none of these four constants is referenced again in the visible cells.
AUTHOR1 = "coutinho-dataset"
AUTHOR2 = "denser-dataset"
PATH_TO_RAW_DATA = "data/raw/"
PATH_TO_PARSED_DATA = "data/parsed/"

# Instance of the project-local dataset helper; later cells call
# get_data_length and get_dataset_from_author on it.
ds_gen = dsg.ds_gen()

In [None]:
# Ask the dataset helper for the data length of the DENSER corpus and show it.
# The value is reused below as an argument when loading both authors and to size the splits.
len_data = ds_gen.get_data_length(dsg.ds_gen.DENSER, go_up_on_path=1)
print(len_data)

In [None]:
# Word -> integer id vocabulary; filled incrementally by map_to_number.
dictAuthors = {}

In [None]:
def map_to_number(dict, word):
    """Return the integer id of ``word``, registering the word on first sight.

    Args:
        dict: Mutable word -> id mapping (the parameter name shadows the builtin;
            it is kept so existing callers stay valid).
        word: Token to look up.

    Returns:
        The stored id when ``word`` is already a key; otherwise the mapping's size
        before insertion, which is also stored under ``word``.
    """
    if word not in dict:
        # New words receive the next free id, i.e. the current vocabulary size.
        dict[word] = len(dict)
    return dict[word]

In [None]:
# Reserve id 0 for the padding token, then register "unknown" as the next id.
# When the cells run in order on a fresh kernel the dictionary holds one entry here,
# so map_to_number returns 1 (displayed as the cell output).
dictAuthors.update({"padding": 0})
map_to_number(dictAuthors, "unknown")

In [None]:
def tokenize_words_in_paragraph(paragraph, dict):
    """Split ``paragraph`` into words with NLTK and convert each word to its id.

    Words not yet present in ``dict`` are added to it by ``map_to_number``, so the
    mapping may grow as a side effect.

    Args:
        paragraph: Text to tokenize.
        dict: Mutable word -> id mapping (name shadows the builtin, kept for callers).

    Returns:
        A list with one integer id per token, in token order.
    """
    return [map_to_number(dict, token) for token in nt.word_tokenize(paragraph)]

In [None]:
def get_paragraph_size_list(paragraph_set):
    """Return ``len(paragraph)`` for every item of ``paragraph_set``, in order."""
    return [len(paragraph) for paragraph in paragraph_set]

In [None]:
# NOTE(review): this cell repeats an earlier cell verbatim (same call, same arguments);
# it re-queries the DENSER data length and prints it again.
len_data = ds_gen.get_data_length(dsg.ds_gen.DENSER, go_up_on_path=1)
print(len_data)

In [None]:
# Load both authors' corpora through the project-local generator. Each call returns a
# pair; only the first element (full_dataset_*) is used by the visible cells below.
# NOTE(review): the meaning of the positional `1` and `len_data` arguments is defined in
# datasetgenerator, not here — confirm before changing them. Both authors are loaded with
# the DENSER length computed above.
full_dataset_1, empty_dataset_1 = ds_gen.get_dataset_from_author(dsg.ds_gen.COUTINHO, 1, len_data, go_up_on_path=1)
full_dataset_2, empty_dataset_2 = ds_gen.get_dataset_from_author(dsg.ds_gen.DENSER, 1, len_data, go_up_on_path=1)
type(full_dataset_1)

In [None]:
def get_data_from_json(filepath, filename):
    """Read a UTF-8 JSON file and return the decoded object.

    Args:
        filepath: Directory that contains the file.
        filename: Name of the JSON file inside ``filepath``.

    Returns:
        Whatever ``json.load`` produces for the file content.
    """
    full_path = os.path.join(filepath, filename)
    with open(full_path, "r", encoding="utf8") as handle:
        return json.load(handle)

In [None]:
# GPT training files, one per author. Later cells use their keys to filter the
# full corpora and append their values to the training sets.
gpt_train_coutinho = get_data_from_json("../data/gpt_train_parsed/","gpt_coutinho.json")
gpt_train_denser = get_data_from_json("../data/gpt_train_parsed/","gpt_denser.json")

In [None]:
# GPT test files, one per author. Their keys are removed from the full corpora below and
# their entries are classified at the end of the notebook via the index maps.
gpt_test_coutinho = get_data_from_json("../data/gpt_test_parsed/","gpt_coutinho.json")
gpt_test_denser = get_data_from_json("../data/gpt_test_parsed/","gpt_denser.json")

In [None]:
def remove_keys_from_dataset(dataset, keys):
    """Delete the given keys from ``dataset`` in place and return the same dict.

    A key is deleted only when ``dataset.get(key)`` is not ``None``; missing keys and
    keys whose stored value is ``None`` are left untouched.

    Args:
        dataset: Dictionary to prune (mutated).
        keys: Iterable of keys to remove.

    Returns:
        The ``dataset`` object that was passed in.
    """
    for unwanted in keys:
        if dataset.get(unwanted) is None:
            continue
        del dataset[unwanted]
    return dataset


In [None]:
# Drop every paragraph whose key also appears in the GPT train or test files.
# remove_keys_from_dataset pops from the dict it receives and returns that same dict, so
# reduced_dataset_* and full_dataset_* refer to the same (now smaller) objects.
reduced_dataset_1 = remove_keys_from_dataset(full_dataset_1, gpt_train_coutinho.keys())
reduced_dataset_1 = remove_keys_from_dataset(reduced_dataset_1, gpt_test_coutinho.keys())
reduced_dataset_2 = remove_keys_from_dataset(full_dataset_2, gpt_train_denser.keys())
reduced_dataset_2 = remove_keys_from_dataset(reduced_dataset_2, gpt_test_denser.keys())
len(reduced_dataset_2)

In [None]:
# Per-author split sizes: 80% of the available paragraphs for training, the rest for test.
# NOTE(review): the 80 subtracted from len_data is an unexplained constant — presumably the
# number of paragraphs set aside elsewhere (e.g. the GPT files); confirm and give it a name.
proportion_training = 0.8
len_available_data = len_data-80
training_part = int(proportion_training * len_available_data)
test_part = len_available_data - training_part
print(training_part)
print(test_part)

In [None]:
def segment_dataset_randomly(dataset, quantity, avoid_indices=None):
    """Pick ``quantity`` random entries of ``dataset``, skipping excluded keys.

    The candidate keys are shuffled with the global ``random`` module, so results
    depend on its current state (seed it for reproducibility).

    Args:
        dataset: Dictionary to sample from; it is not modified.
        quantity: Number of entries to select.
        avoid_indices: Optional iterable of keys that must not be selected.
            ``None`` (the default) excludes nothing.

    Returns:
        A new dict with ``quantity`` key/value pairs from ``dataset``, or ``None``
        when fewer than ``quantity`` eligible keys exist.
    """
    # A set gives O(1) membership tests; the previous list scan plus list.remove()
    # inside the loop was quadratic in the dataset size.
    excluded = set(avoid_indices) if avoid_indices is not None else set()

    # Same key order as the original filtering, so a seeded shuffle is unchanged.
    candidate_keys = [key for key in dataset if key not in excluded]
    random.shuffle(candidate_keys)

    if len(candidate_keys) < quantity:
        return None

    # max(..., 0) keeps a negative quantity yielding an empty selection.
    return {key: dataset[key] for key in candidate_keys[:max(quantity, 0)]}

In [None]:
# For each author: draw the test split first, then draw the training split from the
# remaining keys (the test keys are passed as avoid_indices so the splits do not overlap).
# The training quantity is reduced by the number of GPT training paragraphs, which are
# appended to the training set in the next cell.
# segment_dataset_randomly returns None when too few keys are available; the
# `.keys()` / `.values()` calls on the result would then fail.
selected_test_dataset_1 = segment_dataset_randomly(reduced_dataset_1, test_part)
selected_training_dataset_1 = segment_dataset_randomly(reduced_dataset_1, training_part-len(gpt_train_coutinho), list(selected_test_dataset_1.keys()))
selected_test_dataset_2 = segment_dataset_randomly(reduced_dataset_2, test_part)
selected_training_dataset_2 = segment_dataset_randomly(reduced_dataset_2, training_part-len(gpt_train_denser), list(selected_test_dataset_2.keys()))

In [None]:
# Training paragraphs per author: the randomly selected ones followed by the GPT ones.
complete_training_dataset_1 = (
    list(selected_training_dataset_1.values()) + list(gpt_train_coutinho.values())
)
complete_training_dataset_2 = (
    list(selected_training_dataset_2.values()) + list(gpt_train_denser.values())
)

In [None]:
# Length (len() of each paragraph) for every paragraph in the four splits;
# used below to inspect the length distribution.
train_dt_1_sizes = get_paragraph_size_list(complete_training_dataset_1)
train_dt_2_sizes = get_paragraph_size_list(complete_training_dataset_2)
test_dt_1_sizes = get_paragraph_size_list(list(selected_test_dataset_1.values()))
test_dt_2_sizes = get_paragraph_size_list(list(selected_test_dataset_2.values()))

In [None]:
# Raw training paragraphs of both authors: author 1 (deep-copied) followed by author 2.
train_dataset_raw_both = [
    *copy.deepcopy(complete_training_dataset_1),
    *complete_training_dataset_2,
]

In [None]:
# Raw test paragraphs of both authors: author 1 (deep-copied) followed by author 2.
test_dataset_raw_both = [
    *copy.deepcopy(list(selected_test_dataset_1.values())),
    *selected_test_dataset_2.values(),
]

In [None]:

# Class labels aligned with the concatenated training data: 0 for author 1, 1 for author 2.
train_labels_2 = [1] * len(complete_training_dataset_2)
train_labels = [0] * len(complete_training_dataset_1) + train_labels_2

In [None]:
# Class labels aligned with the concatenated test data: 0 for author 1, 1 for author 2.
test_labels_2 = [1] * len(selected_test_dataset_2)
test_labels = [0] * len(selected_test_dataset_1) + test_labels_2

In [None]:
# All paragraph lengths in one flat list (train 1, train 2, test 1, test 2).
all_data = [
    *train_dt_1_sizes,
    *train_dt_2_sizes,
    *test_dt_1_sizes,
    *test_dt_2_sizes,
]

In [None]:
# Plot every paragraph length and mark the 85th percentile with a red horizontal line.
# The percentile is only printed and drawn here; it is not stored for later use
# (the tokenizer cell uses its own fixed length).
plt.plot(all_data)
percetile = np.percentile(all_data, 85)
print(percetile)
plt.axhline(y=percetile, color="red")

In [None]:
# NOTE(review): recomputes train_dt_2_sizes with the same call used in an earlier cell;
# the value is not read again in the visible cells.
train_dt_2_sizes = get_paragraph_size_list(complete_training_dataset_2)

In [None]:
def normalize_data(data, rule, pad_value=0):
    """Return ``data`` padded or truncated to exactly ``rule`` items.

    The input list is never modified. Previously the padding branch appended to the
    caller's list while the truncation branch returned a new list; the result is now
    always a fresh list.

    Args:
        data: List of token ids.
        rule: Target length.
        pad_value: Value appended when ``data`` is shorter than ``rule`` (default 0,
            the padding id used by the vocabulary).

    Returns:
        A new list: ``data`` followed by ``pad_value`` up to length ``rule`` when it is
        too short, otherwise ``data[:rule]``.
    """
    missing = rule - len(data)
    if missing > 0:
        return list(data) + [pad_value] * missing
    return list(data[:rule])

In [None]:
def tokenize_paragraph_set(paragraph_set, max_len=520):
    """Lower-case, tokenize and length-normalize every paragraph.

    Each paragraph is converted to ids with the notebook-level ``dictAuthors``
    vocabulary, which grows whenever a new word is encountered.

    Args:
        paragraph_set: Iterable of paragraph strings.
        max_len: Length every id sequence is padded or truncated to. Defaults to 520,
            the value that was previously hard-coded here.

    Returns:
        A list with one id list of length ``max_len`` per paragraph.
    """
    return [
        normalize_data(
            tokenize_words_in_paragraph(paragraph.lower(), dictAuthors),
            max_len,
        )
        for paragraph in paragraph_set
    ]

In [None]:
# Tokenize author 1's training paragraphs; new words are added to dictAuthors.
tokenized_set1 = tokenize_paragraph_set(complete_training_dataset_1)

In [None]:
# Tokenize author 2's training paragraphs; new words are added to dictAuthors.
tokenized_set2 = tokenize_paragraph_set(complete_training_dataset_2)

In [None]:
# Tokenized training inputs for both authors, author 1 (deep-copied) first.
train_dataset_both_tokenized = [*copy.deepcopy(tokenized_set1), *tokenized_set2]

In [None]:
# Tokenize author 1's test paragraphs.
# NOTE(review): tokenization registers unseen words in dictAuthors, so words that occur
# only in the test split also receive their own ids in the vocabulary.
tokenized_test1 = tokenize_paragraph_set(list(selected_test_dataset_1.values()))

In [None]:
# Tokenize author 2's test paragraphs.
# NOTE(review): tokenization registers unseen words in dictAuthors, so words that occur
# only in the test split also receive their own ids in the vocabulary.
tokenized_test2 = tokenize_paragraph_set(list(selected_test_dataset_2.values()))

In [None]:
# Tokenized test inputs for both authors, author 1 (deep-copied) first.
test_dataset_both_tokenized = [*copy.deepcopy(tokenized_test1), *tokenized_test2]

In [None]:
# Persist the word -> id vocabulary as pretty-printed UTF-8 JSON.
# The `with` block closes the file; the former trailing `f.close` only referenced the
# method without calling it and has been removed.
with open('authorsDict.json', 'w', encoding='utf-8') as f:
    json.dump(dictAuthors, f, ensure_ascii=False, indent=4)

In [None]:
def load_pretrained_vectors(word2idx, fname, dtype=None):
    """Build an embedding matrix for ``word2idx`` from a text vector file.

    The file's first line must hold two integers ``n d`` (vector count and dimension);
    every following line is ``word v1 v2 ... vd`` separated by single spaces. Rows for
    words missing from the file keep a random value drawn uniformly from
    [-0.25, 0.25); the row of the ``'padding'`` entry is all zeros.

    Args:
        word2idx: Mapping word -> row index. Must contain the key ``'padding'``.
        fname: Path of the vector file.
        dtype: NumPy dtype used to parse each vector. ``None`` (default) uses the
            notebook-level ``dtype_value_np``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(word2idx), d)``.

    Raises:
        KeyError: If ``word2idx`` has no ``'padding'`` entry.
    """
    if dtype is None:
        # Fall back to the notebook-wide NumPy precision selected with the device.
        dtype = dtype_value_np

    print("Loading pretrained vectors...")
    # The context manager guarantees the (potentially very large) file is closed.
    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        n, d = map(int, fin.readline().split())

        print(n)
        print(d)

        embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), d))
        padding_index = word2idx['padding']

        count = 0
        for line in fin:
            tokens = line.rstrip().split(' ')
            word = tokens[0]
            if word in word2idx:
                count += 1
                embeddings[word2idx[word]] = np.array(tokens[1:], dtype=dtype)

    # Zero the padding row last: the vector file may itself contain the word
    # "padding", which would otherwise overwrite the zero vector.
    embeddings[padding_index] = np.zeros((d,))

    print(f"There are {count} / {len(word2idx)} pretrained vectors found.")

    return embeddings

In [None]:
# Build the embedding matrix for the full vocabulary from the vector file at
# ../cc.pt.300.vec, then convert the NumPy array to a torch tensor for the model.
embeddings = load_pretrained_vectors(dictAuthors, "../cc.pt.300.vec")
embeddings = torch.tensor(embeddings)

In [None]:
def data_loader(train_inputs, test_inputs, train_labels, test_labels,
                batch_size=50):
    """Wrap tokenized inputs and labels in PyTorch DataLoaders.

    Args:
        train_inputs: Nested list of token ids for training (equal-length rows).
        test_inputs: Nested list of token ids for testing (equal-length rows).
        train_labels: List of integer class labels aligned with ``train_inputs``.
        test_labels: List of integer class labels aligned with ``test_inputs``.
        batch_size: Number of samples per batch. The body used to overwrite this
            with a fixed 50; the argument is now honored.

    Returns:
        ``(train_dataloader, test_dataloader)``. Training batches are drawn in random
        order, test batches sequentially.
    """
    train_inputs, test_inputs, train_labels, test_labels = (
        torch.tensor(data)
        for data in (train_inputs, test_inputs, train_labels, test_labels)
    )

    train_data = TensorDataset(train_inputs, train_labels)
    train_dataloader = DataLoader(train_data,
                                  sampler=RandomSampler(train_data),
                                  batch_size=batch_size)

    test_data = TensorDataset(test_inputs, test_labels)
    test_dataloader = DataLoader(test_data,
                                 sampler=SequentialSampler(test_data),
                                 batch_size=batch_size)

    return train_dataloader, test_dataloader

In [None]:

# Build the train/test DataLoaders from the tokenized paragraphs and their labels.
train_dataloader, test_dataloader = data_loader(train_dataset_both_tokenized, test_dataset_both_tokenized, train_labels, test_labels, batch_size=50)

In [None]:
# Convolution kernel widths and the number of filters per width.
# NOTE(review): the initialize_model call later in this notebook does not pass these
# names, so the function's own defaults are what the trained model actually uses.
filter_sizes = [2, 3, 4]
num_filters = [2, 2, 2]

In [None]:
class CNN_NLP(nn.Module):
    """1-D convolutional text classifier.

    Pipeline: embedding lookup -> one Conv1d + ReLU per filter size -> max pooling over
    the whole sequence -> concatenation -> dropout -> linear layer producing logits.
    """

    def __init__(self,
                 pretrained_embedding=None,
                 freeze_embedding=False,
                 vocab_size=None,
                 embed_dim=300,
                 filter_sizes=[3, 4, 5],
                 num_filters=[100, 100, 100],
                 num_classes=2,
                 dropout=0.5):
        """Create the layers.

        Args:
            pretrained_embedding: Optional 2-D tensor ``(vocab_size, embed_dim)``; when
                given it defines the embedding layer and both sizes.
            freeze_embedding: Whether pretrained weights stay fixed during training.
            vocab_size: Vocabulary size, used only without pretrained weights.
            embed_dim: Embedding width, used only without pretrained weights.
            filter_sizes: Kernel width of each convolution.
            num_filters: Output channels of each convolution (indexed like
                ``filter_sizes``).
            num_classes: Number of output logits.
            dropout: Dropout probability applied before the linear layer.
        """
        super().__init__()

        if pretrained_embedding is None:
            # Trainable embedding created from scratch; index 0 is the padding row.
            self.embed_dim = embed_dim
            self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                          embedding_dim=self.embed_dim,
                                          padding_idx=0,
                                          max_norm=5.0)
        else:
            self.vocab_size, self.embed_dim = pretrained_embedding.shape
            self.embedding = nn.Embedding.from_pretrained(pretrained_embedding,
                                                          freeze=freeze_embedding)

        # One convolution per kernel width, all reading the embedding channels.
        conv_layers = []
        for position, width in enumerate(filter_sizes):
            conv_layers.append(nn.Conv1d(in_channels=self.embed_dim,
                                         out_channels=num_filters[position],
                                         kernel_size=width))
        self.conv1d_list = nn.ModuleList(conv_layers)

        # The pooled outputs of all convolutions are concatenated before this layer.
        self.fc = nn.Linear(np.sum(num_filters), num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_ids):
        """Compute class logits for a batch of token-id sequences.

        Args:
            input_ids: Integer tensor of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalized scores.
        """
        # (batch, seq_len, embed_dim) -> (batch, embed_dim, seq_len) as Conv1d expects.
        channels_first = self.embedding(input_ids).permute(0, 2, 1)

        pooled = []
        for conv in self.conv1d_list:
            activated = F.relu(conv(channels_first))
            # Max over the full remaining length leaves one value per filter.
            strongest = F.max_pool1d(activated, kernel_size=activated.shape[2])
            pooled.append(strongest.squeeze(dim=2))

        features = torch.cat(pooled, dim=1)
        return self.fc(self.dropout(features))

In [None]:

def initialize_model(pretrained_embedding=None,
                    freeze_embedding=False,
                    vocab_size=None,
                    embed_dim=300,
                    filter_sizes=[3, 4, 5],
                    num_filters=[100, 100, 100],
                    num_classes=2,
                    dropout=0.5,
                    learning_rate=0.01):
    """Create a ``CNN_NLP`` model on the notebook's device plus an Adadelta optimizer.

    Args:
        pretrained_embedding: Optional embedding tensor forwarded to ``CNN_NLP``.
        freeze_embedding: Whether pretrained embeddings stay fixed.
        vocab_size: Vocabulary size when no pretrained embedding is given.
        embed_dim: Embedding width when no pretrained embedding is given.
        filter_sizes: Kernel widths of the convolutions.
        num_filters: Output channels per convolution; same length as ``filter_sizes``.
        num_classes: Number of output classes. Previously ignored (always 2).
        dropout: Dropout probability. Previously ignored (always 0.5).
        learning_rate: Learning rate of the Adadelta optimizer.

    Returns:
        ``(cnn_model, optimizer)``.

    Raises:
        AssertionError: If ``filter_sizes`` and ``num_filters`` differ in length.
    """
    assert (len(filter_sizes) == len(num_filters)), "filter_sizes and num_filters need to be of the same length."

    cnn_model = CNN_NLP(pretrained_embedding=pretrained_embedding,
                        freeze_embedding=freeze_embedding,
                        vocab_size=vocab_size,
                        embed_dim=embed_dim,
                        filter_sizes=filter_sizes,
                        num_filters=num_filters,
                        num_classes=num_classes,
                        dropout=dropout)

    # Move the parameters to the selected device and cast them to the notebook dtype.
    cnn_model.to(device, dtype=dtype_value_torch)

    optimizer = optim.Adadelta(cnn_model.parameters(), lr=learning_rate, rho=0.95)

    # optimizer = optim.Adam(cnn_model.parameters(),lr=learning_rate, eps=1e-06)
    return cnn_model, optimizer

In [None]:
# Loss shared by train() and evaluate(); it is applied to raw logits and integer labels.
loss_fn = nn.CrossEntropyLoss()

In [None]:
def set_seed(seed_value=42):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices).

    Args:
        seed_value: Integer seed handed to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_generator in seeders:
        seed_generator(seed_value)

In [None]:
def evaluate(model, test_dataloader):
    """Compute the average loss and accuracy of ``model`` over a dataloader.

    Both metrics are weighted by the number of samples in each batch, so a smaller
    final batch no longer skews the result (the earlier version averaged per-batch
    means). The loss weighting assumes the notebook-level ``loss_fn`` returns the mean
    over the batch, as the ``nn.CrossEntropyLoss()`` defined in this notebook does by
    default.

    Args:
        model: Module mapping a batch of token ids to logits.
        test_dataloader: Iterable of ``(input_ids, labels)`` batches.

    Returns:
        ``(test_loss, test_accuracy)`` where the accuracy is a percentage in [0, 100].
        Both are ``nan`` when the dataloader yields no samples.
    """
    model.eval()  # disable dropout for evaluation

    loss_sum = 0.0
    correct = 0
    seen = 0

    with torch.no_grad():
        for batch in test_dataloader:
            b_input_ids, b_labels = tuple(t.to(device) for t in batch)
            logits = model(b_input_ids)

            batch_count = b_labels.size(0)
            # Undo the per-batch mean so batches of different size weigh correctly.
            loss_sum += loss_fn(logits, b_labels).item() * batch_count

            preds = torch.argmax(logits, dim=1).flatten()
            correct += (preds == b_labels).sum().item()
            seen += batch_count

    if seen == 0:
        return float("nan"), float("nan")

    test_loss = loss_sum / seen
    test_accuracy = 100.0 * correct / seen

    return test_loss, test_accuracy

In [None]:
def train(model, optimizer, train_dataloader, test_dataloader=None, epochs=10):
    """Train ``model`` and plot the per-epoch history.

    One table row is printed per epoch. When ``test_dataloader`` is given the model is
    evaluated after every epoch; without it the test columns show ``-`` (previously
    this path raised ``NameError`` because the test metrics were never assigned).

    Args:
        model: Module to train; it must already be on the notebook's device.
        optimizer: Optimizer built over ``model``'s parameters.
        train_dataloader: Iterable of ``(input_ids, labels)`` training batches.
        test_dataloader: Optional iterable of evaluation batches.
        epochs: Number of passes over ``train_dataloader``.

    Returns:
        None. Progress is printed and a 2x2 figure is shown at the end.
    """
    best_accuracy = 0
    training_start_time = time.time()
    test_acc_hist = []
    test_loss_hist = []
    time_elapsed_hist = []
    best_acc_hist = []

    print("Start training...\n")
    print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Test Loss':^10} | {'Test Acc':^9} | {'Elapsed':^9}")
    print("-"*60)

    for epoch_i in range(epochs):

        t0_epoch = time.time()
        total_loss = 0

        model.train()

        for batch in train_dataloader:
            b_input_ids, b_labels = tuple(t.to(device) for t in batch)
            model.zero_grad()
            logits = model(b_input_ids)
            loss = loss_fn(logits, b_labels)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(train_dataloader)

        if test_dataloader is not None:
            test_loss, test_accuracy = evaluate(model, test_dataloader)
            if test_accuracy > best_accuracy:
                # Only improvements are recorded, so this history is non-decreasing.
                best_accuracy = test_accuracy
                best_acc_hist.append(best_accuracy)
            time_elapsed = time.time() - t0_epoch
            print(f"{epoch_i + 1:^7} | {avg_train_loss:^12.6f} | {test_loss:^10.6f} | {test_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            test_acc_hist.append(test_accuracy)
            test_loss_hist.append(test_loss)
        else:
            # No evaluation data: report the training loss and timing only.
            time_elapsed = time.time() - t0_epoch
            print(f"{epoch_i + 1:^7} | {avg_train_loss:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

        time_elapsed_hist.append(time_elapsed)

    print("\n")
    print(optimizer)
    print(f"Training complete! Best accuracy: {best_accuracy:.2f}%.")
    print('Training finished, took {:.2f}s'.format(time.time() - training_start_time))

    figure, axis = plt.subplots(2, 2)

    axis[0, 0].plot(test_acc_hist)
    axis[0, 0].set_title("Test accuracy (%)")

    axis[0, 1].plot(test_loss_hist)
    axis[0, 1].set_title("Test loss")

    axis[1, 0].plot(time_elapsed_hist)
    axis[1, 0].set_title("Epoch time (s)")

    axis[1, 1].plot(best_acc_hist)
    axis[1, 1].set_title("Best accuracy (%)")

    figure.tight_layout()
    plt.show()

In [None]:
# Seed all generators, build the model with trainable (non-frozen) pretrained embeddings,
# and train it for 4 epochs while evaluating on the test loader after each epoch.
# filter_sizes / num_filters are not passed, so initialize_model's defaults apply.
set_seed(42)
cnn_non_static, optimizer = initialize_model(pretrained_embedding=embeddings,
                                            freeze_embedding=False,
                                            learning_rate=0.25,
                                            dropout=0.5)
train(cnn_non_static, optimizer, train_dataloader, test_dataloader, epochs=4)

In [None]:
# Snapshot of the vocabulary; predict() tokenizes against this copy instead of dictAuthors.
dictAuthorsOriginal = copy.deepcopy(dictAuthors)

In [None]:
# Print the vocabulary size and its distance from the id of one specific word.
# NOTE(review): "leu…" is a hard-coded vocabulary entry; dict.get returns None when the
# word is absent, which would make the subtraction below raise TypeError. Presumably it
# marks the last word of some split — confirm what this check is meant to show.
last_token = dictAuthors.get("leu…")
dict_size = len(dictAuthors)
bound_size = dict_size - last_token
print(dict_size)
print(bound_size)

In [None]:
def tokenize_words_in_paragraph_for_predict(paragraph, dict):
    """Tokenize ``paragraph`` for inference, mapping unseen words to "unknown".

    Unlike the training tokenizer, words missing from ``dict`` are NOT added to it.
    The earlier version registered them with fresh ids, so a later call saw those
    words as known and returned ids that have no row in the trained embedding table.

    Args:
        paragraph: Text to tokenize.
        dict: Word -> id mapping (name shadows the builtin, kept for callers).

    Returns:
        List of integer ids, one per token; the list is also printed.
    """
    # Resolved once; map_to_number only inserts "unknown" if it is somehow missing.
    unknown_token = map_to_number(dict, "unknown")
    tensor = [dict.get(word, unknown_token) for word in nt.word_tokenize(paragraph)]
    print(tensor)
    return tensor

In [None]:
def predict(text, model=cnn_non_static, max_len=520):
    """Classify ``text`` as written by "Denser" or "Coutinho".

    The text is lower-cased, tokenized against ``dictAuthorsOriginal``, then truncated
    or zero-padded to ``max_len`` ids so it matches the fixed length used for the
    training inputs. Inference runs in evaluation mode (dropout disabled) and without
    gradient tracking, which makes the reported probability deterministic.

    Args:
        text: Paragraph to classify.
        model: Trained classifier. The default is bound when this cell is executed,
            so re-run the cell after rebuilding ``cnn_non_static``.
        max_len: Sequence length fed to the model.

    Returns:
        ``"Denser"`` when the probability of class 1 exceeds 0.5, else ``"Coutinho"``.
        A message with the winning probability is printed as well.
    """
    tokens = tokenize_words_in_paragraph_for_predict(text.lower(), dictAuthorsOriginal)
    # Truncate first: padding alone left over-long paragraphs longer than max_len.
    tokens = tokens[:max_len]
    padded_tokens = tokens + [0] * (max_len - len(tokens))

    input_id = torch.tensor(padded_tokens).unsqueeze(dim=0)
    model.to(device, dtype=dtype_value_torch)
    model.eval()  # dropout must be off, otherwise predictions are random

    with torch.no_grad():
        logits = model(input_id.to(device))

    probs = F.softmax(logits, dim=1).squeeze(dim=0)

    if probs[1] > 0.5:
        print(f"Esse parágrafo tem {probs[1] * 100:.2f}% de chance de ser Denser.")
        return "Denser"
    else:
        chance = 1 -probs[1]
        print(f"Esse parágrafo tem {chance * 100:.2f}% de chance de ser Coutinho.")
        return "Coutinho"


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd

y_pred = []
y_true = []

# Iterate over the test data in evaluation mode (dropout off) and without building
# autograd graphs; the earlier loop relied on whatever mode the model was left in.
cnn_non_static.eval()
with torch.no_grad():
    for inputs, labels in test_dataloader:
        output = cnn_non_static(inputs.to(device))  # Feed Network

        # The arg-max of the logits is the predicted class; exp() is monotonic and
        # therefore not needed to find it.
        y_pred.extend(torch.argmax(output, dim=1).cpu().numpy())  # Save Prediction
        y_true.extend(labels.cpu().numpy())  # Save Truth

# constant for classes
classes = ('Coutinho', 'Denser')

# Build confusion matrix, normalized per true class (each row sums to 1).
cf_matrix = confusion_matrix(y_true, y_pred)
df_cm = pd.DataFrame(cf_matrix / np.sum(cf_matrix, axis=1)[:, None], index = [i for i in classes],
                     columns = [i for i in classes])
plt.figure(figsize = (12,7))
sn.heatmap(df_cm, annot=True)
plt.savefig('output.png')

In [None]:
# Save only the model parameters (state_dict) to cnnns.pth in the working directory.
torch.save(cnn_non_static.state_dict(), "cnnns.pth")
print("Saved PyTorch Model State to cnnns.pth")

In [None]:
# Reload the parameters that were just saved into the existing network.
# The value returned by load_state_dict is shown as the cell output.
model = cnn_non_static.to(device, dtype=dtype_value_torch)
model.load_state_dict(torch.load("cnnns.pth"))

In [None]:
# Smoke test: classify a short sentence with the trained model.
predict("Ou cruzes.")

In [None]:
# Index map for the Coutinho GPT test file; its values are used as keys into
# gpt_test_coutinho. The encoding is stated explicitly, as in get_data_from_json,
# so reading does not depend on the platform's default encoding.
with open("../data/gpt_test_parsed/index_map_coutinho.json", encoding="utf8") as f:
    index_map_0 = json.load(f)

In [None]:
# Index map for the Denser GPT test file; its values are used as keys into
# gpt_test_denser. The encoding is stated explicitly, as in get_data_from_json,
# so reading does not depend on the platform's default encoding.
with open("../data/gpt_test_parsed/index_map_denser.json", encoding="utf8") as f:
    index_map_1 = json.load(f)

In [None]:
def predict_gpt_dataset(sorted_result, index_map, gpt_test_dataset):
    """Classify every paragraph referenced by ``index_map`` and store the outcome.

    Args:
        sorted_result: Mapping that receives the results (updated in place).
        index_map: Mapping whose keys are converted with ``int`` to form the result
            keys and whose values are keys into ``gpt_test_dataset``.
        gpt_test_dataset: Mapping from those keys to paragraph text.

    Returns:
        ``sorted_result`` with one ``(prediction, paragraph)`` tuple per entry of
        ``index_map``.
    """
    for position, main_index in index_map.items():
        paragraph = gpt_test_dataset[main_index]
        sorted_result[int(position)] = (predict(paragraph), paragraph)
    return sorted_result

In [None]:
# Collect the predictions for both GPT test files in one mapping keyed by the integer
# positions taken from the index maps.
sorted_result = OrderedDict()

sorted_result = predict_gpt_dataset(sorted_result, index_map_0, gpt_test_coutinho)
sorted_result = predict_gpt_dataset(sorted_result, index_map_1, gpt_test_denser)

In [None]:
# Print each position with its (prediction, paragraph) tuple in ascending key order.
sorted_indices = sorted(sorted_result)

for index in sorted_indices:
    print(index, sorted_result[index])

In [None]:
# Print only the predicted author names, one per line, in ascending key order.
for index in sorted_indices:
    print(sorted_result[index][0])