# Preprocessing of text data
We perform the following preprocessing steps on the novel *The Great Gatsby*:
+ Lower casing
+ Removal of punctuation
+ Removal of Stopwords
+ Removal of Frequent words
+ Removal of Rare words
+ Stemming
+ Lemmatization

In [1]:
import spacy
# Load spaCy's `en_core_web_sm` pipeline once; the resulting `nlp` object is
# the module-level pipeline used by `preprocessing` for tokenization,
# stop-word lookup (via `nlp.vocab`) and lemmatization.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Import spaCy's English stop-word set explicitly instead of reaching it
# through the `spacy.lang.en.stop_words` attribute chain, which only resolves
# when that submodule has already been imported as a side effect elsewhere.
from spacy.lang.en.stop_words import STOP_WORDS

spacy_stopwords = STOP_WORDS
# Number of stop words in the set (displayed as the cell output).
len(spacy_stopwords)

326

In [3]:
from collections import Counter
from nltk.stem import PorterStemmer

In [4]:
# Preprocessing function that performs the steps listed above
def preprocessing(text, word_frequency_threshold, word_rare_threshold):
    """Clean ``text`` and return the result as one space-separated string.

    Steps, in order: lower-casing, tokenization with the module-level spaCy
    pipeline ``nlp``, removal of punctuation and whitespace tokens, removal
    of stop words, removal of too-frequent and too-rare words, Porter
    stemming, and finally lemmatization of the stems.

    Args:
        text: Raw input text.
        word_frequency_threshold: Words occurring strictly more than this
            many times are dropped. Counts are taken after stop-word removal.
        word_rare_threshold: Words occurring this many times or fewer are
            dropped (same counts as above).

    Returns:
        The processed tokens joined by single spaces.
    """
    # Lower-case the text before tokenizing it.
    doc = nlp(text.lower())

    # Keep token texts that are neither punctuation nor whitespace.
    words = [
        token.text
        for token in doc
        if not token.is_punct and not token.is_space
    ]
    # Drop stop words, using the stop flag stored in the pipeline vocabulary.
    words = [word for word in words if not nlp.vocab[word].is_stop]

    # Frequency filtering. Both bounds use the same counts, so one pass is
    # enough. The lower bound removes every word whose count is at or below
    # the rare threshold; comparing with `==` would let words rarer than the
    # threshold (e.g. words seen only once) slip through.
    word_freq = Counter(words)
    words = [
        word
        for word in words
        if word_rare_threshold < word_freq[word] <= word_frequency_threshold
    ]

    # Stemming with the Porter algorithm.
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]

    # Lemmatization: the stems are re-joined and run through the pipeline
    # again, so the text is re-tokenized here.
    # NOTE(review): lemmatizing stems rather than the original word forms is
    # kept from the original design -- confirm that this order is intended.
    words = [token.lemma_ for token in nlp(" ".join(words))]

    # Rebuild the text from the final tokens.
    return " ".join(words)

In [5]:
# Save the preprocessed text to a .txt file
def save_preprocessed_text_to_file(text, output_file_path):
    """Write ``text`` to ``output_file_path`` as UTF-8.

    The file is opened in ``"w"`` mode, so an existing file at that path is
    overwritten.

    Args:
        text: The string to write.
        output_file_path: Destination path of the text file.
    """
    # Pin the encoding: the source novel is read as UTF-8, and relying on the
    # platform default here could fail or garble non-ASCII characters.
    with open(output_file_path, "w", encoding="utf-8") as file:
        file.write(text)

In [6]:
# Read the whole novel into memory as UTF-8 text.
with open("Gatsby.txt", mode="r", encoding="utf-8") as file:
    text = file.read()

# Destination of the cleaned text.
output_file_path = "preprocessed_Gatsby.txt"

# Run the preprocessing pipeline with a frequent-word threshold of 15 and a
# rare-word threshold of 2, then persist the result.
processed_text = preprocessing(
    text,
    word_frequency_threshold=15,
    word_rare_threshold=2,
)
save_preprocessed_text_to_file(processed_text, output_file_path)
