<a href="https://colab.research.google.com/github/lucasaaz/ACII/blob/main/Pre_Processamento_Textual.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Normalização

*Realizar as seguintes ações de normalização no arquivo de entrada e gerar o arquivo de saída Shakespeare_Normalized.txt.*


* Lower case reduction

* Accent and diacritic removal

* Canonicalizing of acronyms, currency, date and hyphenated words

* Punctuation removal (except currency and date).

* Special characters removal

In [17]:
import unicodedata
import re

def normalize_text(text):
    """Normalize raw text for the downstream tokenization stages.

    The steps run in this order: lower-casing, accent/diacritic removal,
    joining of hyphenated words, currency and date canonicalization,
    punctuation removal and removal of any remaining non-ASCII characters.

    Args:
        text: The raw input string.

    Returns:
        The normalized string. Amounts such as ``$10.50`` are replaced by
        the placeholder ``[CURRENCY]`` and dates such as ``12/05/2020`` by
        ``[DATE]``; both placeholders are kept intact by the punctuation
        removal step.
    """
    # 1. Lower case reduction
    text = text.lower()

    # 2. Accent and diacritic removal: decompose, then drop everything that
    #    has no ASCII representation (combining marks included).
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

    # 3. Canonicalizing of acronyms, currency, date and hyphenated words
    text = re.sub(r'\b(\w+)\s*-\s*(\w+)\b', r'\1\2', text)  # Hyphenated words
    text = re.sub(r'\$\d+\.\d{2}', '[CURRENCY]', text)  # Currency
    text = re.sub(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b', '[DATE]', text)  # Date

    # 4. Punctuation removal (except currency and date).
    #    The placeholders are upper case while the rest of the text was
    #    lower-cased in step 1, so they cannot collide with input content.
    #    Group 1 captures a whole placeholder and is written back unchanged;
    #    any other punctuation character is dropped.
    text = re.sub(
        r'(\[(?:CURRENCY|DATE)\])|[^\w\s]',
        lambda match: match.group(1) or '',
        text,
    )

    # 5. Special characters removal
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    return text

# Read the raw corpus, normalize it and store the result for the later stages.
with open('Shakespeare.txt') as source:
    text = source.read()

normalized_text = normalize_text(text)

with open('Shakespeare_Normalized.txt', 'w') as target:
    target.write(normalized_text)

# 2. Tokenização

Realizar cada uma das seguintes tokenizações no arquivo Shakespeare_Normalized.txt e gerar o arquivo de saída Shakespeare_Normalized_TokenizedXX.txt, onde XX é o número da subtarefa. Por exemplo, o arquivo Shakespeare_Normalized_Tokenized01.txt é a saída do algoritmo 1 (White Space Tokenization):

* White Space Tokenization
* NLTK: Word Tokenizer
* NLTK: Tree Bank Tokenizer
* NLTK: Word Punctuation Tokenizer
* NLTK: Tweet Tokenizer
* NLTK: MWE Tokenizer
* TextBlob Word Tokenizer
* spaCy Tokenizer
* Gensim Word Tokenizer
* Keras Tokenization

In [18]:
import nltk

# Download the NLTK resources "punkt_tab" and "wordnet"
nltk.download('punkt_tab')
nltk.download('wordnet')

# Show the directories NLTK searches for its data
print("Caminhos NLTK: ", nltk.data.path)

from nltk.tokenize import word_tokenize
import os
import spacy
from gensim.utils import tokenize
from tensorflow.keras.preprocessing.text import text_to_word_sequence

# Download the "punkt" package
nltk.download('punkt')

# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

def tokenize_text(text, method):
    """Tokenize ``text`` with the tokenizer selected by ``method``.

    Args:
        text: The string to tokenize.
        method: Integer selecting the tokenizer:
            1 whitespace split, 2 NLTK ``word_tokenize``, 3 NLTK Treebank,
            4 NLTK WordPunct, 5 NLTK Tweet, 6 NLTK MWE (applied to the
            whitespace-split text), 7 TextBlob words, 8 spaCy, 9 Gensim,
            10 Keras ``text_to_word_sequence``.

    Returns:
        A sequence of token strings.

    Raises:
        ValueError: If ``method`` is not one of the supported numbers.
    """
    if method == 1:
        return text.split()
    if method == 2:
        return word_tokenize(text)
    # The tokenizer classes below are imported inside their branch because
    # the file's import section does not bring them into scope.
    if method == 3:
        from nltk.tokenize import TreebankWordTokenizer
        return TreebankWordTokenizer().tokenize(text)
    if method == 4:
        from nltk.tokenize import WordPunctTokenizer
        return WordPunctTokenizer().tokenize(text)
    if method == 5:
        from nltk.tokenize import TweetTokenizer
        return TweetTokenizer().tokenize(text)
    if method == 6:
        from nltk.tokenize import MWETokenizer
        return MWETokenizer().tokenize(text.split())
    if method == 7:
        from textblob import TextBlob
        return TextBlob(text).words
    if method == 8:
        return [token.text for token in nlp(text)]
    if method == 9:
        return list(tokenize(text))
    if method == 10:
        return text_to_word_sequence(text)
    # Fail loudly instead of returning None, which callers would try to join.
    raise ValueError(f"Unknown tokenization method: {method!r}")

# Display names of the tokenization methods, keyed by their 1-based number
# (the position in the tuple below determines the key).
methods = dict(enumerate(
    (
        "Tokenização por Espaço em Branco",
        "Tokenizador de Palavras do NLTK",
        "Tokenizador Tree Bank do NLTK",
        "Tokenizador de Pontuação do NLTK",
        "Tokenizador de Tweet do NLTK",
        "Tokenizador MWE do NLTK",
        "Tokenizador de Palavras do TextBlob",
        "Tokenizador do spaCy",
        "Tokenizador do Gensim",
        "Tokenização do Keras",
    ),
    start=1,
))

# Check that the normalized text exists, then write one token file per method
if not os.path.exists('Shakespeare_Normalized.txt'):
    print("O arquivo 'Shakespeare_Normalized.txt' não foi encontrado.")
else:
    with open('Shakespeare_Normalized.txt', 'r') as source:
        normalized_text = source.read()

    for method_num in methods:
        tokens = tokenize_text(normalized_text, method_num)
        # Output files are numbered with two digits: 01, 02, ..., 10
        output_name = f'Shakespeare_Normalized_Tokenized{method_num:02d}.txt'
        with open(output_name, 'w') as target:
            target.write('\n'.join(tokens))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Caminhos NLTK:  ['/root/nltk_data', '/usr/nltk_data', '/usr/share/nltk_data', '/usr/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']


# 3. Stop-words Removal

Realizar a remoção de stop-words do texto (apenas o da subtarefa 2 de tokenização), e gerar um arquivo de saída denominado Shakespeare_Normalized_Tokenized_StopWord.txt.

In [19]:
from nltk.corpus import stopwords

nltk.download('stopwords')

# A set gives constant-time membership tests in the filter below
stop_words = set(stopwords.words('english'))

# The task statement asks for the output of tokenization subtask 2
# (NLTK word tokenizer), i.e. the file numbered 02, one token per line.
with open('Shakespeare_Normalized_Tokenized02.txt', 'r') as file:
    tokens = file.read().splitlines()

# Keep every token that is not an English stop word (case-insensitive)
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

with open('Shakespeare_Normalized_Tokenized_StopWord.txt', 'w') as file:
    file.write('\n'.join(filtered_tokens))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 4. Text Lemmatization

Realizar a lematização do texto gerado na etapa anterior, utilizando o WordNet Lemmatizer e gerar um arquivo de saída denominado Shakespeare_Normalized_Tokenized_StopWord_Lemmatized.txt.

In [20]:
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources 'wordnet' and 'omw-1.4' before lemmatizing
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource)

lemmatizer = WordNetLemmatizer()

# The input file holds one token per line
with open('Shakespeare_Normalized_Tokenized_StopWord.txt', 'r') as source:
    tokens = source.read().splitlines()

# One lemma per input token, in the original order
lemmatized_tokens = list(map(lemmatizer.lemmatize, tokens))

with open('Shakespeare_Normalized_Tokenized_StopWord_Lemmatized.txt', 'w') as target:
    target.write('\n'.join(lemmatized_tokens))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# 5. Text Stemming

Aplicar cada um dos seguintes stemmers no arquivo de entrada Shakespeare_Normalized_Tokenized_StopWord_Lemmatized.txt e gerar o arquivo de saída Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_StemmingXX.txt, onde XX é o número da subtarefa. Por exemplo, o arquivo Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming01.txt é a saída do algoritmo 1 (Porter Stemmer):

* Porter Stemmer
* Snowball Stemmer

In [21]:
from nltk.stem import PorterStemmer, SnowballStemmer

porter = PorterStemmer()
snowball = SnowballStemmer('english')

# The input file holds one lemmatized token per line
with open('Shakespeare_Normalized_Tokenized_StopWord_Lemmatized.txt', 'r') as source:
    tokens = source.read().splitlines()

# Apply both stemmers to the same token list, order preserved
porter_stemmed_tokens = list(map(porter.stem, tokens))
snowball_stemmed_tokens = list(map(snowball.stem, tokens))

# Output 01 is the Porter result, output 02 the Snowball result
stemming_outputs = (
    ('Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming01.txt', porter_stemmed_tokens),
    ('Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming02.txt', snowball_stemmed_tokens),
)
for output_name, stemmed in stemming_outputs:
    with open(output_name, 'w') as target:
        target.write('\n'.join(stemmed))

# 6. Análise do Vocabulário

Comparar os vocabulários gerados por cada lematizador e stemmer, apresentando um arquivo CSV para cada um deles contendo:

* Token (raiz resultante)
* Número de ocorrências do token no documento resultante (lematizado ou com stemming)
* Tamanho em caracteres de cada token do vocabulário

Por exemplo, para o lematizador, gerar o arquivo Shakespeare_Vocabulary_Lemmatized.csv e para o Porter Stemmer gerar o arquivo Shakespeare_Vocabulary_Porter.csv.

Apresentar um documento final comparativo denominado Shakespeare_Vocabulary_Analysis.txt contendo, para cada lematizador e stemmer utilizado, o tamanho do vocabulário (número de tokens), o número médio de ocorrências e o tamanho médio dos tokens.

In [22]:
import csv
from collections import Counter

def analyze_vocabulary(tokens, output_file):
    """Write a per-token frequency table to a CSV file.

    The file gets a ``Token,Occurrences,Length`` header followed by one row
    per distinct token, in order of first appearance in ``tokens``.

    Args:
        tokens: Iterable of token strings.
        output_file: Path of the CSV file to create or overwrite.
    """
    frequencies = Counter(tokens)
    rows = (
        [term, occurrences, len(term)]
        for term, occurrences in frequencies.items()
    )
    # newline='' lets the csv module control the line endings itself
    with open(output_file, 'w', newline='') as handle:
        table = csv.writer(handle)
        table.writerow(['Token', 'Occurrences', 'Length'])
        table.writerows(rows)

# (token file, vocabulary CSV) pairs: lemmatizer first, then Porter, then Snowball
vocabulary_jobs = (
    ('Shakespeare_Normalized_Tokenized_StopWord_Lemmatized.txt',
     'Shakespeare_Vocabulary_Lemmatized.csv'),
    ('Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming01.txt',
     'Shakespeare_Vocabulary_Porter.csv'),
    ('Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming02.txt',
     'Shakespeare_Vocabulary_Snowball.csv'),
)

# Read each token file (one token per line) and write its vocabulary CSV
# before moving on to the next pair.
loaded_token_lists = []
for tokens_path, csv_path in vocabulary_jobs:
    with open(tokens_path, 'r') as source:
        current_tokens = source.read().splitlines()
    analyze_vocabulary(current_tokens, csv_path)
    loaded_token_lists.append(current_tokens)

lemmatized_tokens, porter_stemmed_tokens, snowball_stemmed_tokens = loaded_token_lists

# Documento Final Comparativo

In [23]:
import pandas as pd

def generate_comparative_analysis():
    """Summarize the three vocabulary CSVs in a plain-text report.

    Reads ``Shakespeare_Vocabulary_Lemmatized.csv``,
    ``Shakespeare_Vocabulary_Porter.csv`` and
    ``Shakespeare_Vocabulary_Snowball.csv`` from the working directory and
    writes ``Shakespeare_Vocabulary_Analysis.txt``. For each input the
    report lists the number of rows, the mean of the ``Occurrences`` column
    and the mean of the ``Length`` column.
    """
    sources = (
        ("Lemmatizer Analysis:", 'Shakespeare_Vocabulary_Lemmatized.csv'),
        ("Porter Stemmer Analysis:", 'Shakespeare_Vocabulary_Porter.csv'),
        ("Snowball Stemmer Analysis:", 'Shakespeare_Vocabulary_Snowball.csv'),
    )
    # Load all inputs before the report file is opened
    frames = [(heading, pd.read_csv(path)) for heading, path in sources]

    sections = []
    for heading, frame in frames:
        sections.append(
            f"{heading}\n"
            f"Vocabulary Size: {len(frame)}\n"
            f"Average Occurrences: {frame['Occurrences'].mean()}\n"
            f"Average Token Length: {frame['Length'].mean()}\n"
        )

    # Sections are separated by one blank line; the last one ends with a
    # single newline.
    with open('Shakespeare_Vocabulary_Analysis.txt', 'w') as report:
        report.write("\n".join(sections))

# Write Shakespeare_Vocabulary_Analysis.txt from the three vocabulary CSVs
generate_comparative_analysis()