# Preprocessing:

### Importing necessary libraries and modules

In [1]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, RegexpTokenizer
from typing import Iterator

# Download the NLTK 'punkt' data package.
# NOTE(review): presumably this is the model data behind sent_tokenize
# (imported above); newer NLTK releases may look for 'punkt_tab' instead --
# confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\karee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Loading the CSV file into a DataFrame.
# The path is relative, so the file is resolved against the current working
# directory. The processing loop further down reads its 'Comment' column.
df = pd.read_csv('reddit_data_filtered.csv')

# Defining the function to replace tricky characters
def replace_characters(text: str) -> str:
    """Return *text* with typographic punctuation normalised to plain ASCII.

    Curly double quotes become '"', the curly apostrophe becomes "'", and a
    double hyphen '--' becomes a comma.
    """
    # The single-character substitutions are independent of each other, so
    # they can be applied together in one translation pass.
    quote_table = str.maketrans({'“': '"', '”': '"', '’': "'"})
    normalised = text.translate(quote_table)

    # '--' is a two-character sequence, which translate() cannot match.
    return normalised.replace('--', ',')

# Defining the function to tokenize and preprocess sentences
def generate_tokenized_sentences(paragraph: str) -> Iterator[str]:
    """Yield each sentence of *paragraph* as a comma-separated token string.

    The paragraph is split into sentences with ``sent_tokenize``. Each
    sentence has its tricky characters replaced, is lowercased, and is split
    into tokens made of runs of word characters, hyphens and apostrophes.
    The tokens are wrapped in ``[START]`` / ``[END]`` markers and joined
    with ``', '``. Sentences that produce no tokens are skipped.

    Args:
        paragraph: The raw text to process. Non-string values (for example
            the float NaN that pandas uses for an empty CSV cell) yield
            nothing instead of raising.

    Yields:
        One string per non-empty sentence, e.g. ``'[START], hello, [END]'``.
    """
    # Missing values arrive as float NaN when the text comes from a
    # DataFrame column; sent_tokenize only accepts strings.
    if not isinstance(paragraph, str):
        return

    word_tokenizer = RegexpTokenizer(r'[-\'\w]+')

    for sentence in sent_tokenize(paragraph):
        # Replace tricky characters, then lowercase the sentence
        sentence = replace_characters(sentence).lower()

        # Tokenize the sentence into words
        tokens = word_tokenizer.tokenize(sentence)
        if not tokens:
            # Nothing but punctuation/whitespace: no markers are emitted
            continue

        # Wrap in [START]/[END] and convert to a comma-separated string
        yield ', '.join(['[START]', *tokens, '[END]'])

# Collecting the tokenized sentences of every comment in the DataFrame
all_tokenized_sentences = []
for comment in df['Comment']:
    # Each comment may expand into several tokenized sentences
    all_tokenized_sentences.extend(generate_tokenized_sentences(comment))

# Writing the tokenized sentences to a text file, one sentence per line
with open('PROCESSED_CORPUS.txt', 'w', encoding='utf-8') as file:
    file.writelines(f'{sentence}\n' for sentence in all_tokenized_sentences)