In [1]:
import re
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK packages used below. nltk.download reports a package as
# already up-to-date when it is present locally, so re-running is harmless.
for _nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_nltk_resource)

# Module-level helpers: the English stop-word list as a set, and a WordNet lemmatizer.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\neelb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\neelb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\neelb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Remove usernames
def remove_usernames(text):
    """Blank out "name:"-style prefixes and collapse whitespace.

    Every run of word characters, dots and whitespace that starts at a word
    boundary and is followed (optionally after whitespace) by a colon is
    replaced with a single space. The colon itself is left in place.
    Afterwards all whitespace runs are squeezed to one space and the ends
    are trimmed.

    NOTE(review): the pattern is not anchored to the start of the text, so it
    fires before *every* colon, not only the first one. Confirm that this is
    intended for the input format.
    """
    without_prefixes = re.sub(r"\b[\w.\s]+(?=\s*:)", " ", text)
    # split()/join normalises any whitespace run (including the inserted blanks)
    return " ".join(without_prefixes.split())

# Remove bot commands
def remove_bot_commands(text):
    """Strip bot-command invocations and @mentions from ``text``.

    Three substitutions are applied one after another, each on the result of
    the previous one:

    1. ``?word`` followed (optionally after one whitespace char) by ``@word``
    2. ``!word`` followed (optionally after one whitespace char) by ``@word``
    3. any remaining ``@word``

    Matches are deleted outright; surrounding whitespace is not touched.
    """
    # Order matters: a removal in an earlier pass can expose a match for a later one.
    for pattern in (r'\?\w+\s?@\w+', r'!\w+\s?@\w+', r'@\w+'):
        text = re.sub(pattern, '', text)
    return text

# Fix concatenated 'n'
def fix_concatenated_n(text):
    """Put spaces around a lowercase ``n`` that sits between two word characters.

    ``"hellonworld"`` becomes ``"hello n world"``. Matches do not overlap, so
    in ``"banana"`` only the first ``n`` is separated (``"ba n ana"``).

    NOTE(review): the pattern cannot tell a stray ``n`` from a genuine letter,
    so ordinary words are split too (``"and"`` -> ``"a n d"``). Confirm this
    is acceptable for the data being cleaned.
    """
    joined_n = re.compile(r'(\w)n(\w)')
    return joined_n.sub(r'\1 n \2', text)

# Format text
def format_text(text):
    """Normalise line breaks, punctuation spacing and the character set.

    Steps, applied in order:

    1. each newline (with any whitespace around it) becomes one space;
    2. a space is inserted after ``. , ! ?`` whenever the next character is
       not whitespace (so ``"a,b"`` -> ``"a, b"`` and ``"!!"`` -> ``"! !"``);
    3. every character other than ASCII letters, digits, whitespace and
       ``. , ! ? '`` is deleted;
    4. whitespace runs are collapsed to single spaces and the ends trimmed.
    """
    substitutions = (
        (r'\s*\n\s*', ' '),
        (r'(?<=[.,!?])(?=\S)', ' '),
        (r"[^a-zA-Z0-9\s.,!?']", ""),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return " ".join(text.split())

In [3]:
def clean_text(text):
    """Run the full cleaning pipeline on one piece of text.

    The text is passed, in order, through ``remove_usernames``,
    ``remove_bot_commands``, ``fix_concatenated_n`` and ``format_text``;
    emoji are then replaced with the empty string, and the result is
    lower-cased and stripped of leading/trailing whitespace.
    """
    pipeline = (
        remove_usernames,
        remove_bot_commands,
        fix_concatenated_n,
        format_text,
    )
    for step in pipeline:
        text = step(text)
    without_emoji = emoji.replace_emoji(text, replace="")
    return without_emoji.lower().strip()

In [4]:
def clean_text_file(input_file, output_file):
    """
    Clean a UTF-8 text file line by line and write the result to another file.

    Each input line is passed through ``clean_text`` and stripped. The cleaned
    lines are written to ``output_file`` joined by newlines, with no trailing
    newline. Lines that end up empty are still written, so the output has one
    line per input line.
    """
    # Read and clean everything first; the output is only opened afterwards.
    with open(input_file, 'r', encoding='utf-8') as source:
        cleaned_lines = [clean_text(raw_line).strip() for raw_line in source]

    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.write("\n".join(cleaned_lines))

In [5]:
import os

def process_all_files(input_folder, output_folder):
    """
    Clean every ``.txt`` file in ``input_folder`` and save the results in ``output_folder``.

    For each regular file whose name ends in ``.txt`` the cleaned copy is
    written to ``output_folder`` as ``<original stem>_Cleaned.txt`` via
    ``clean_text_file``. ``output_folder`` is created if it does not exist.

    Entries are visited in sorted name order so the processing (and the
    printed log) is reproducible; ``os.listdir`` alone returns names in
    arbitrary order. Directories or other non-file entries that happen to be
    named ``*.txt`` are skipped instead of crashing the whole batch.

    Parameters:
        input_folder: path of the folder to scan (not recursive).
        output_folder: path of the folder that receives the cleaned files.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Process each text file in the input folder, in a deterministic order
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(".txt"):
            continue

        input_file_path = os.path.join(input_folder, filename)
        # A sub-directory named "something.txt" cannot be opened as a file
        if not os.path.isfile(input_file_path):
            continue

        output_file_name = os.path.splitext(filename)[0] + "_Cleaned.txt"
        output_file_path = os.path.join(output_folder, output_file_name)

        clean_text_file(input_file_path, output_file_path)
        print(f"✅ Processed and cleaned: {filename}")

    print(f"✅ All files cleaned and saved in {output_folder}")

In [None]:
# Source folder holding the raw .txt files, and destination folder for the cleaned copies
input_folder = r"C:\Users\neelb\Desktop\MEDD8925 - Analyzing Data Quantitatively\MEDD8925 Discord Project\Discord Data v3 Antispam"
output_folder = r"C:\Users\neelb\Desktop\MEDD8925 - Analyzing Data Quantitatively\MEDD8925 Discord Project\Test Cleaned Discord Data"

# Clean every .txt file found in the source folder
process_all_files(input_folder=input_folder, output_folder=output_folder)

✅ Processed and cleaned: [298954459172700181] [part 10].txt
✅ Processed and cleaned: [298954459172700181] [part 11].txt
✅ Processed and cleaned: [298954459172700181] [part 12].txt
✅ Processed and cleaned: [298954459172700181] [part 13].txt
✅ Processed and cleaned: [298954459172700181] [part 2].txt
✅ Processed and cleaned: [298954459172700181] [part 3].txt
