# Text Preprocessing - Financial Sentiment Analysis

This notebook is a step-by-step review of the text preprocessing functions used as an initial step of sentiment analysis.

In [16]:
import spacy
import re
import pandas as pd
import contractions
import os
from spacy.language import Language
from spacy_langdetect import LanguageDetector

# Exploratory input data analysis



In [17]:
# Load the raw financial-news dataset from a Parquet file.
# NOTE: pandas needs an optional Parquet engine ('pyarrow' or 'fastparquet');
# without one of them installed this call raises ImportError.
data = pd.read_parquet("data/FinancialNews.parquet")

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

In [14]:
# Show the dataset dimensions and preview its first five rows.
print(data.shape)
data.head(5)

NameError: name 'data' is not defined

In [None]:
# Sentence example
# Lift the column-width limit so the whole sentence text is displayed.
pd.set_option("display.max_colwidth", None)
# Inspect every column of the row at positional index 10.
data.iloc[10, :]

In [None]:
# Sentiment feature - number of occurrences of each value, drawn as a bar chart
data["Sentiment"].value_counts().plot(kind="bar")

# Text cleaning

In [None]:
# Unique chars: flatten all sentences into single characters and keep the
# distinct ones (exploration ahead of the text-cleaning step).
unique_chars = pd.Series(
    [char for sentence in data["Sentence"] for char in sentence]
).unique()
print("Number of unique chars:", len(unique_chars))
print(unique_chars)

In [None]:
def clean_text(text: str) -> str:
    """
    Cleans input text and prints the intermediate result of every step.

    Steps, in order: lowercasing, removing links, dropping every character
    other than a-z, "?", "!", "'" and space, expanding contractions,
    removing possessive "'s" and collapsing runs of whitespace.

    :param text: An input string to be cleaned
    :return: Cleaned string with leading and trailing whitespace stripped
    """

    text = text.lower()

    print("Lower case")
    print(text)

    # Links are removed before the character filter, while "://" and "."
    # are still present for the pattern to match on
    text = re.compile(r"https?://\S+|www\.\S+").sub("", text)

    print()
    print("Links")
    print(text)

    # Negated class: everything that is NOT a desired element gets removed
    desired_elements = r"[^a-z\?\!\'\ ]"
    text = re.sub(desired_elements, "", text)

    print()
    print("Desired signs")
    print(text)

    # Apostrophes were kept above so that contractions can still be expanded
    text = " ".join([contractions.fix(word) for word in text.split()])

    print()
    print("Contractions")
    print(text)

    # Applied in insertion order: possessive "'s" first, then whitespace runs
    replacements = {
        r"'s\b": "",
        r"\s+": " ",
    }

    for replace, by in replacements.items():
        text = re.sub(replace, by, text)

    text = text.strip()

    print()
    print("Space and 's")
    # Show the final result too, as every other step does
    print(text)

    return text

In [None]:
# Sample sentence containing a possessive ("unit's"), contractions ("it's",
# "We're"), digits, punctuation and a link, run through the verbose clean_text.
initial_sentence = "The company plans to increase the unit's specialist staff to several dozen -- it's going to depend on the market situation during 2010 . We're happy for that. Check https://t.co/jNDphllzq5 for more!"
example_sentence = clean_text(initial_sentence)
example_sentence

In [None]:
# Print the raw sentence stored under label 0, then clean it step by step.
print(data["Sentence"][0])
print()
clean_text(data["Sentence"][0])

In [None]:
# Same demonstration for the sentence stored under label 4.
print(data["Sentence"][4])
print()
clean_text(data["Sentence"][4])

# Tokenization

In [None]:
# Load the "en_core_web_sm" SpaCy pipeline; the cells below use it for
# tokenization, language detection and lemmatization.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Register a component factory under the name "language_detector" so it can
# be added to the pipeline by that name.
@Language.factory("language_detector")
def create_language_detector(nlp, name):
    """
    Factory building the language detection component.

    :param nlp: Pipeline the component is created for (not used here).
    :param name: Name of the component instance (not used here).
    :return: A new LanguageDetector instance.
    """
    return LanguageDetector()  # Create the detector component


# Add language detector component as the last step of the pipeline
# NOTE(review): re-running this cell on the same nlp object presumably tries
# to add the component a second time - confirm before re-executing.
nlp.add_pipe("language_detector", last=True)

In [None]:
def tokenize_text(text: str, nlp):
    """
    Runs the given pipeline on the text in order to split it into tokens.

    :param text: String that should be tokenized
    :param nlp: Callable SpaCy model that is applied to the text
    :return: Whatever ``nlp(text)`` produces (for a SpaCy model, a document
             that can be iterated token by token)
    """
    return nlp(text)

In [None]:
# Tokenize the cleaned example sentence and display the resulting document.
doc = tokenize_text(example_sentence, nlp)
doc

In [None]:
# Print the text of each token on its own line to inspect the split.
for token in doc:
    print(token.text)

# Language detection function

In [None]:
def check_language(text):
    """
    Reads the detected language from an already processed document.

    :param text: Document exposing the detection result as ``_.language``.
    :return: Value stored under the "language" key of the detection result.
    """
    detection = text._.language
    return detection["language"]

In [None]:
# Language detected for the tokenized example sentence.
check_language(doc)

# Stop words

In [None]:
# Hand-picked stop-word list used by this notebook. All entries are
# lowercase; remove_stop_words compares them with each token's text.
stop_words = [
    "a",
    "an",
    "and",
    "but",
    "how",
    "in",
    "on",
    "or",
    "the",
    "what",
    "will",
]


def remove_stop_words(tokenized_words, stop_words: list) -> list:
    """
    Removes the stop-words from the list of tokenized words.

    :param tokenized_words: Iterable of tokens after tokenization; each token
                            must expose its string as the ``text`` attribute
    :param stop_words: Words to drop. They are compared with ``token.text``
                       exactly, so the comparison is case-sensitive
    :return: List of the tokens whose text is not a stop-word, in their
             original order
    """
    # Build the lookup once: constant-time membership per token instead of a
    # list scan, and a one-shot iterable of stop-words is not exhausted
    # after the first few tokens.
    excluded = frozenset(stop_words)

    return [word for word in tokenized_words if word.text not in excluded]

In [None]:
# Drop the stop-words from the tokenized example.
# Note: example_sentence is rebound here from the cleaned string to a list of tokens.
example_sentence = remove_stop_words(doc, stop_words)
example_sentence

# Lemmatization

In [None]:
def lemmatize_words(tokenized_words):
    """
    Replaces every token with its lemma.

    :param tokenized_words: Iterable of tokens exposing a ``lemma_`` attribute
    :return: List with the lemma of each token, in the same order
    """
    lemmas = []
    for word in tokenized_words:
        lemmas.append(word.lemma_)

    return lemmas

In [None]:
# Lemmas of the example tokens that are left after stop-word removal.
lemmatize_words(example_sentence)

# Whole preprocessing 

In [None]:
# clean_text function but without internal print functions


def clean_text(text: str) -> str:
    """
    Normalizes raw text ahead of tokenization.

    The text is lowercased, links are removed, every character other than
    a-z, "?", "!", "'" and space is dropped, contractions are expanded,
    possessive "'s" is removed and whitespace runs are collapsed.

    :param text: Raw input string
    :return: Normalized string without leading or trailing whitespace
    """
    lowered = text.lower()
    without_links = re.sub(r"https?://\S+|www\.\S+", "", lowered)

    # Negated class: anything outside the allowed characters is removed
    filtered = re.sub(r"[^a-z\?\!\'\ ]", "", without_links)

    # Apostrophes survived the filter so contractions can still be expanded
    expanded = " ".join([contractions.fix(piece) for piece in filtered.split()])

    # Possessive "'s" goes first, whitespace runs are collapsed afterwards
    without_possessive = re.sub(r"'s\b", "", expanded)
    collapsed = re.sub(r"\s+", " ", without_possessive)

    return collapsed.strip()

In [None]:
def text_preprocessing_on_sentence(sentence: str, stop_words: list, nlp):
    """
    Runs a single sentence through the complete preprocessing chain:
    cleaning, tokenization, stop-word removal and lemmatization.

    :param sentence: Sentence we want to preprocess.
    :param stop_words: List with stop-words we want to remove from the sentence.
    :param nlp: SpaCy model.

    :return str: Lemmas of the remaining tokens joined with single spaces.
    """
    cleaned = clean_text(sentence)
    tokens = tokenize_text(cleaned, nlp)
    kept_tokens = remove_stop_words(tokens, stop_words)
    lemmas = lemmatize_words(kept_tokens)

    return " ".join(lemmas)

In [None]:
# Run the whole chain on the raw example sentence.
text_preprocessing_on_sentence(initial_sentence, stop_words, nlp)

In [None]:
def text_preprocessing_on_dataframe(
    dataframe: pd.DataFrame, stop_words: list, nlp, save_to_csv=True
):
    """
    Perform the whole preprocessing procedure on the Pandas DataFrame
    with columns "Sentence" and "Sentiment".

    Each sentence is cleaned, tokenized, stripped of stop-words and
    lemmatized. Only rows whose cleaned text is detected as English ("en")
    are kept. The DataFrame passed in is not modified.

    :param dataframe: Pandas Dataframe containing "Sentence" and "Sentiment" columns.
    :param stop_words: List with stop-words we want to remove from sentences.
    :param nlp: SpaCy model; its documents must provide the language
                detection result read by check_language.
    :param save_to_csv: Name of the .csv file, written inside "data/", the
                        result is saved to. ``True`` saves under the default
                        name "FinancialNewsPreprocessed.csv"; ``None`` or
                        ``False`` skips saving.

    :return: New Pandas Dataframe with the columns "Sentence",
             "PreprocessedSentence" and "Sentiment".
    """
    # Work on a copy so the caller's DataFrame does not silently gain the
    # helper columns created below.
    dataframe = dataframe.copy()

    # Parse each cleaned sentence once; the resulting documents feed both
    # the language check and the token processing.
    docs = (
        dataframe["Sentence"]
        .apply(clean_text)
        .apply(lambda text: tokenize_text(text, nlp))
    )

    dataframe["Language"] = docs.apply(check_language)
    dataframe["PreprocessedSentence"] = docs.apply(
        lambda doc: " ".join(lemmatize_words(remove_stop_words(doc, stop_words)))
    )

    dataframe = dataframe[dataframe["Language"] == "en"]

    dataframe = dataframe[["Sentence", "PreprocessedSentence", "Sentiment"]]

    # A bare True used to be formatted into the path and produced a file
    # literally named "data/True"; map it to a real file name instead.
    if save_to_csv is True:
        save_to_csv = "FinancialNewsPreprocessed.csv"
    if save_to_csv:
        dataframe.to_csv(f"data/{save_to_csv}")

    return dataframe

In [None]:
# Preprocess the whole dataset, save the result to
# "data/FinancialNewsPreprocessed.csv" and preview the first five rows.
preprocessed_data = text_preprocessing_on_dataframe(
    data, stop_words, nlp, save_to_csv="FinancialNewsPreprocessed.csv"
)
preprocessed_data.head(5)