In [1]:
import re
import pandas as pd
from collections import defaultdict
import spacy
import logging  # Setting up the loggings to monitor gensim
# Root-logger setup: INFO and above, printed as "LEVEL - HH:MM:SS: message"
logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt='%H:%M:%S',
    level=logging.INFO,
)

In [None]:
# Path of the CSV file holding the extracted article text
file_name = "extracted_text_1.csv"

# Read the file (malformed rows are skipped instead of raising) and name the
# five columns explicitly, in one chained expression
df = pd.read_csv(file_name, on_bad_lines='skip').set_axis(
    ['PMID', 'DOI', 'Authors', 'Title', 'Content'], axis=1
)

# Preview the first 10 rows
df.head(10)

Replacing abbreviations

In [None]:
import re

def replace_with_dict(sentence, my_dict):
    """Replace whole-word occurrences of each key of ``my_dict`` in ``sentence``.

    Keys are matched literally (regex metacharacters are escaped) and only
    between word boundaries, so the key ``"BMI"`` does not match inside
    ``"BMIs"``. Replacements are applied one key after another, in the
    dictionary's iteration order.

    Args:
        sentence: The text to transform.
        my_dict: Mapping of search term -> replacement text.

    Returns:
        The text with every whole-word key occurrence replaced by its value.
    """
    for key, value in my_dict.items():
        # An empty key would match at every word boundary and scatter the
        # replacement through the whole text, so it is ignored.
        if not key:
            continue
        # A function is used as the replacement so that ``value`` is inserted
        # verbatim; a plain string would have its backslashes interpreted as
        # escapes / group references by ``re.sub``.
        sentence = re.sub(
            r'\b' + re.escape(key) + r'\b',
            lambda _match, replacement=value: replacement,
            sentence,
        )
    return sentence

In [None]:
# Expand abbreviations that a document introduces as "long form (ABBR)".
# list1 collects every distinct long-form phrase found across all documents;
# list2 collects the abbreviation-expanded text of each document, in row order.
list1 = []
list2 = []

# A parenthesised token only counts as an abbreviation if it contains no digits
pattern = r'^\D+$'

# Iterate over each document in the 'Content' column of the DataFrame
for sentence in df["Content"]:
    # Abbreviation -> long-form phrase mapping, rebuilt for every document
    my_dict = {}

    # Work on a string with periods and commas removed, then split on whitespace
    sentence = re.sub(r'[.,]', '', str(sentence))
    text_list = sentence.split()

    # Only text containing an opening parenthesis can define an abbreviation
    if "(" in sentence:
        for i, token in enumerate(text_list):
            # Candidate abbreviation: starts with "(", contains ")", is 4 to 7
            # characters long (parentheses included) and has no digits
            if not (token[0] == "(" and ")" in token
                    and 3 < len(token) < 8
                    and re.match(pattern, token)):
                continue

            # Scan the words 2, 3 and 4 positions before the token, nearest
            # first. The scan stops at index 0: a negative index would not
            # fail but silently wrap around to the END of the token list and
            # build a phrase out of unrelated words.
            for k in range(i - 2, max(i - 5, -1), -1):
                # The phrase starts at the first word whose initial letter
                # matches the abbreviation's first letter (case-insensitive)
                if text_list[k][0].lower() == token[1].lower():
                    # Join the words from k up to (not including) the token;
                    # every word, including the last, is followed by a space
                    sent = "".join(word + " " for word in text_list[k:i])

                    # list1 keeps each distinct phrase only once
                    if sent not in list1:
                        list1.append(sent)

                    # Always register the mapping for this document, even if
                    # an earlier document already defined the same phrase;
                    # otherwise only the first such document gets expanded.
                    my_dict[token[1:-1]] = sent
                    break

    # Replace the abbreviations found in this document and store the result
    modified_sentence = replace_with_dict(sentence, my_dict)
    list2.append(modified_sentence)

# Add the expanded texts as a new column 'abr'; the index is passed explicitly
# so the values line up with df's rows whatever its index looks like
df["abr"] = pd.Series(list2, index=df.index)

Using spaCy to preprocess the text, performing the following steps:

lowercase the words
remove the stopwords and single characters
use regex to remove non-alphanumeric characters (anything that is not a letter or a digit, including punctuation); in other words, only keep "a" to "z" and digits (whitespace and parentheses are also kept by the regex).
remove lines that have fewer than 4 words, since they cannot contribute much to the training process.

In [None]:
# Load the small English model from spaCy, disabling tagger, parser, and NER for efficiency
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])

# Load the set of English stopwords
stopwords = spacy.lang.en.stop_words.STOP_WORDS

# The stopwords are split into two groups that are removed in a fixed order:
#   1. entries containing an apostrophe (e.g. "n't", "'s"), matched anywhere a
#      word ends, so that "don't" first becomes "do";
#   2. all other entries, matched as whole words, which then removes that "do".
# Removing the words one by one in set iteration order (which is not
# guaranteed) made the result depend on whether "n't" or "do" came first.
# Within each group, longer entries are listed first and ties are sorted
# alphabetically so the compiled patterns are identical on every run.
apostrophe_words = sorted((w for w in stopwords if "'" in w), key=lambda w: (-len(w), w))
whole_words = sorted((w for w in stopwords if "'" not in w), key=lambda w: (-len(w), w))

# One compiled pattern per group replaces one regex pass per stopword
apostrophe_re = re.compile(
    "(?:" + "|".join(re.escape(w) for w in apostrophe_words) + r")\b",
    flags=re.IGNORECASE,
)
whole_word_re = re.compile(
    r"\b(?:" + "|".join(re.escape(w) for w in whole_words) + r")\b",
    flags=re.IGNORECASE,
)

# Lowercase each text, then strip the stopwords (apostrophe entries first)
list0 = [
    whole_word_re.sub('', apostrophe_re.sub('', str(quote).lower()))
    for quote in df["abr"]
]

cleaned = []
for text in list0:
    # Remove all characters except letters, digits, whitespace and parentheses
    text = re.sub(r'[^a-zA-Z0-9\s\(\)]', "", text)
    # Remove standalone single letters
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    # Collapse runs of whitespace to one space and trim both ends
    cleaned.append(re.sub(r'\s+', ' ', text).strip())

# Store the result; the index is passed explicitly so the values line up with
# df's rows whatever its index looks like
df["cleaned"] = pd.Series(cleaned, index=df.index)

def word_count(sentence):
    """Return how many whitespace-separated tokens ``sentence`` contains."""
    tokens = sentence.split()
    return len(tokens)

# Keep only the rows whose cleaned text has at least 4 whitespace-separated words
filtered_df = df.loc[df["cleaned"].apply(word_count) >= 4]
print(filtered_df)

# Collect the surviving cleaned texts as a plain Python list
quotes = filtered_df["cleaned"].tolist()  # Save all the lines

In [None]:
# Drop the intermediate 'abr' column. The check makes the cell safe to re-run:
# deleting a column that is already gone would raise a KeyError.
if "abr" in df.columns:
    del df["abr"]

Saving the preprocessed text in a new file

In [None]:
# Write the DataFrame to CSV without the index column.
# NOTE(review): this saves `df` (all rows), not `filtered_df` — confirm that
# rows with fewer than 4 words are meant to be kept in the output file.
df.to_csv('file_1.csv', index=False)