In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import emoji  # Install with `!pip install emoji`

# Fetch the NLTK data packages this notebook needs; packages that are already
# present are simply reported as up-to-date.
for _resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_resource)

# Shared NLP helpers (built after the downloads, because the stop-word list
# is read from the downloaded corpus)
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\neelb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\neelb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\neelb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def remove_usernames(text):
    """
    Strips "name:" style speaker prefixes from chat text.

    Every run of word characters, dots and whitespace that is directly
    followed by a colon (optionally with whitespace in between) is replaced
    with a single space. A space is used instead of an empty string so that
    the words on either side cannot be fused together. The colon itself is
    left in place.

    A colon that starts "://" is not treated as a name separator, so URL
    schemes such as "http://" and "https://" are left intact and can still
    be recognised as URLs afterwards.

    Args:
        text (str): Raw chat text; may contain several lines.

    Returns:
        str: The text with each "<name>" that precedes a colon replaced by " ".
    """
    # The (?!//) guard keeps the "http" in "http://..." (and any words before
    # it) from being mistaken for a username and deleted.
    # NOTE(review): the pattern is not anchored to the start of a line, so
    # ordinary message text in front of a colon ("note: ...", "10:30") is
    # removed as well -- confirm that this is intended.
    return re.sub(r"\b[\w.\s]+(?=\s*:(?!//))", " ", text)

In [3]:
def remove_bot_commands(text):
    """
    Deletes bot invocations that target a user, then any leftover @mentions.

    The substitutions run one after another, in this order:
    1. '?command @user'  (e.g. "?warn @user")
    2. '!command @user'  (e.g. "!rank @user")
    3. any remaining '@name'

    A '?word' or '!word' that is not followed by an @mention is left untouched.
    """
    removal_patterns = (
        r'\?\w+\s?@\w+',  # "?warn @user"
        r'!\w+\s?@\w+',   # "!rank @user"
        r'@\w+',          # bare @mentions
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text

In [4]:
def format_text(text):
    """
    Normalizes spacing in a piece of chat text.

    Steps, applied in this order:
    - Each newline, together with the whitespace around it, becomes one space.
    - A missing space is inserted after '.', ',', '!' or '?'. Runs of
      punctuation ("...", "?!") and numbers ("3.14", "1,000") are kept whole.
    - A few known glued-together phrases are expanded (case-insensitively).
    - A lowercase letter followed by an uppercase letter is split
      ("helloWorld" -> "hello World"); all-caps words such as "LMAO" are
      left alone.
    - A letter followed by a digit is split ("covid19" -> "covid 19").
    - Repeated whitespace is collapsed and the ends are trimmed.

    Args:
        text (str): Text to tidy up.

    Returns:
        str: The re-spaced text.
    """
    # Replace multiple newlines or mixed spaces/newlines with a single space
    text = re.sub(r'\s*\n\s*', ' ', text)

    # Add a space after punctuation if it's missing. The character after the
    # punctuation must not itself be punctuation (keeps "..." and "?!"
    # together), and a '.'/',' sitting between two digits is skipped so that
    # decimals and thousands separators stay intact.
    text = re.sub(r'(?<=[.,!?])(?=[^\s.,!?])(?!(?<=\d[.,])\d)', ' ', text)

    # Correct specific known concatenation patterns
    known_concatenations = (
        (r'\bitnthats\b', "it that's"),
        (r'\btherewego\b', "there we go"),
        (r'\bgonlmao\b', "gon lmao"),
    )
    for pattern, replacement in known_concatenations:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

    # Split camel case only at a lower->upper transition; matching any letter
    # before a capital would tear acronyms apart ("LMAO" -> "L M A O").
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)
    # Split words from numbers, if concatenated
    text = re.sub(r'(?<=[a-zA-Z])(?=\d)', ' ', text)

    # Collapse runs of whitespace and strip the ends
    return " ".join(text.split())

In [5]:
def clean_text(text):
    """
    Cleans and preprocesses one piece of chat text.

    Pipeline, in order:
    1. Remove URLs ("http://...", "https://...", "www. ...").
    2. Remove usernames (remove_usernames).
    3. Remove bot commands and @mentions (remove_bot_commands).
    4. Normalize newlines and spacing (format_text).
    5. Remove emojis.
    6. Drop every character that is not an ASCII letter, a digit, whitespace
       or one of . , ! ? '
    7. Lowercase, collapse repeated whitespace and trim the ends.

    Stopwords are not removed, so sentence structure is retained.

    Args:
        text (str): Raw text, e.g. one line of a chat export.

    Returns:
        str: The cleaned, lowercased text (may be empty).
    """
    # URLs are stripped first, while they are still intact: the later passes
    # rewrite text in front of colons and insert spaces after dots, which
    # breaks a URL into pieces that no URL pattern can recognise any more.
    # The match is case-insensitive (lowercasing only happens at the end) and
    # anchored on a word boundary so that words like "awww" are not touched.
    text = re.sub(r"\bhttps?://\S+|\bwww\.\S+", " ", text, flags=re.IGNORECASE)

    text = remove_usernames(text)  # Remove usernames
    text = remove_bot_commands(text)  # Remove bot commands
    text = format_text(text)  # Properly handle newlines and spacing
    text = emoji.replace_emoji(text, replace="")  # Remove emojis
    text = re.sub(r"[^a-zA-Z0-9\s.,!?']", "", text)  # Retain alphanumeric characters and basic punctuation

    # Deleting characters above can leave double spaces behind, so whitespace
    # is collapsed once more while lowercasing.
    return " ".join(text.lower().split())

In [6]:
def clean_text_file(input_file, output_file):
    """
    Runs every line of a UTF-8 text file through clean_text and saves the result.

    Each input line yields exactly one output line (possibly empty); the
    output lines are joined with "\\n" and written to output_file as UTF-8,
    replacing any existing file at that path.
    """
    # Clean line by line; strip() drops any whitespace left at the line ends
    with open(input_file, 'r', encoding='utf-8') as source:
        cleaned_lines = [clean_text(raw_line).strip() for raw_line in source]

    # Write the cleaned lines out in their original order
    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.write("\n".join(cleaned_lines))

In [7]:
import os

def process_all_files(input_folder, output_folder):
    """
    Cleans every .txt file in input_folder and saves the results in output_folder.

    For each regular file whose name ends in ".txt" (any letter case), the
    cleaned copy is written to output_folder as "<original name>_Cleaned.txt".
    Files are handled in sorted name order and one progress line is printed
    per file.

    Args:
        input_folder (str): Folder containing the raw .txt files.
        output_folder (str): Folder for the cleaned files; created if missing.

    Raises:
        OSError: If input_folder cannot be listed or a file cannot be
            read or written.
    """
    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # When both folders are the same, a second run would otherwise pick up its
    # own "*_Cleaned.txt" results and clean them again.
    same_folder = (
        os.path.normcase(os.path.abspath(input_folder))
        == os.path.normcase(os.path.abspath(output_folder))
    )

    # sorted() makes the processing order independent of the file system
    for filename in sorted(os.listdir(input_folder)):
        if not filename.lower().endswith(".txt"):
            continue

        input_file_path = os.path.join(input_folder, filename)
        # A directory whose name happens to end in ".txt" cannot be opened as a file
        if not os.path.isfile(input_file_path):
            continue

        stem = os.path.splitext(filename)[0]
        if same_folder and stem.endswith("_Cleaned"):
            continue

        output_file_path = os.path.join(output_folder, stem + "_Cleaned.txt")

        clean_text_file(input_file_path, output_file_path)
        print(f"✅ Processed and cleaned: {filename}")

    print(f"✅ All files cleaned and saved in {output_folder}")

In [8]:
# Example problematic text: a "username: message" line that contains the
# glued-together words ("itnthats", "therewego", "gonlmao") which format_text expands
example_text = "middle eastern: not, itnthats how it is therewego lmao gonlmao i can almost guarantee itnthats true"

# Clean the text by running it through the full clean_text pipeline
cleaned_text = clean_text(example_text)
# Print input and result together so the effect of the cleaning can be checked by eye
print("Original Text:", example_text)
print("Cleaned Text:", cleaned_text)

Original Text: middle eastern: not, itnthats how it is therewego lmao gonlmao i can almost guarantee itnthats true
Cleaned Text: not, it that's how it is there we go lmao gon lmao i can almost guarantee it that's true


In [9]:
# Define input and output folders
# NOTE(review): both are absolute paths inside one user's Windows profile, so this
# cell only runs as-is on that machine -- point them at your own folders before re-running.
input_folder = r"C:\Users\neelb\Desktop\MEDD8925 - Analyzing Data Quantitatively\MEDD8925 Discord Project\Discord Data v3 Antispam"
output_folder = r"C:\Users\neelb\Desktop\MEDD8925 - Analyzing Data Quantitatively\MEDD8925 Discord Project\Cleaned Discord Data"

# Process all files in the folder: every .txt file in input_folder is cleaned and
# written to output_folder as "<name>_Cleaned.txt"
process_all_files(input_folder, output_folder)

✅ Processed and cleaned: [298954459172700181] [part 10].txt
✅ Processed and cleaned: [298954459172700181] [part 11].txt
✅ Processed and cleaned: [298954459172700181] [part 12].txt
✅ Processed and cleaned: [298954459172700181] [part 13].txt
✅ Processed and cleaned: [298954459172700181] [part 2].txt
✅ Processed and cleaned: [298954459172700181] [part 3].txt
✅ Processed and cleaned: [298954459172700181] [part 4].txt
✅ Processed and cleaned: [298954459172700181] [part 5].txt
✅ Processed and cleaned: [298954459172700181] [part 6].txt
✅ Processed and cleaned: [298954459172700181] [part 7].txt
✅ Processed and cleaned: [298954459172700181] [part 8].txt
✅ Processed and cleaned: [298954459172700181] [part 9].txt
✅ Processed and cleaned: [298954459172700181].txt
✅ Processed and cleaned: [360462032811851778] [part 10].txt
✅ Processed and cleaned: [360462032811851778] [part 11].txt
✅ Processed and cleaned: [360462032811851778] [part 12].txt
✅ Processed and cleaned: [360462032811851778] [part 13].tx

✅ Processed and cleaned: [490935325259202560] [part 22].txt
✅ Processed and cleaned: [490935325259202560] [part 23].txt
✅ Processed and cleaned: [490935325259202560] [part 24].txt
✅ Processed and cleaned: [490935325259202560] [part 25].txt
✅ Processed and cleaned: [490935325259202560] [part 26].txt
✅ Processed and cleaned: [490935325259202560] [part 27].txt
✅ Processed and cleaned: [490935325259202560] [part 28].txt
✅ Processed and cleaned: [490935325259202560] [part 29].txt
✅ Processed and cleaned: [490935325259202560] [part 2].txt
✅ Processed and cleaned: [490935325259202560] [part 30].txt
✅ Processed and cleaned: [490935325259202560] [part 31].txt
✅ Processed and cleaned: [490935325259202560] [part 32].txt
✅ Processed and cleaned: [490935325259202560] [part 33].txt
✅ Processed and cleaned: [490935325259202560] [part 34].txt
✅ Processed and cleaned: [490935325259202560] [part 35].txt
✅ Processed and cleaned: [490935325259202560] [part 36].txt
✅ Processed and cleaned: [490935325259202

✅ Processed and cleaned: [568279918111227924] [part 14].txt
✅ Processed and cleaned: [568279918111227924] [part 15].txt
✅ Processed and cleaned: [568279918111227924] [part 16].txt
✅ Processed and cleaned: [568279918111227924] [part 17].txt
✅ Processed and cleaned: [568279918111227924] [part 18].txt
✅ Processed and cleaned: [568279918111227924] [part 19].txt
✅ Processed and cleaned: [568279918111227924] [part 20].txt
✅ Processed and cleaned: [568279918111227924] [part 21].txt
✅ Processed and cleaned: [568279918111227924] [part 22].txt
✅ Processed and cleaned: [568279918111227924] [part 23].txt
✅ Processed and cleaned: [568279918111227924] [part 24].txt
✅ Processed and cleaned: [568279918111227924] [part 25].txt
✅ Processed and cleaned: [568279918111227924] [part 26].txt
✅ Processed and cleaned: [568279918111227924] [part 27].txt
✅ Processed and cleaned: [568279918111227924] [part 28].txt
✅ Processed and cleaned: [568279918111227924] [part 29].txt
✅ Processed and cleaned: [56827991811122

✅ Processed and cleaned: [681363568293183489] [part 4].txt
✅ Processed and cleaned: [681363568293183489] [part 5].txt
✅ Processed and cleaned: [681363568293183489] [part 6].txt
✅ Processed and cleaned: [681363568293183489] [part 7].txt
✅ Processed and cleaned: [681363568293183489] [part 8].txt
✅ Processed and cleaned: [681363568293183489] [part 9].txt
✅ Processed and cleaned: [681363568293183489].txt
✅ Processed and cleaned: [689963458967765003] [part 10].txt
✅ Processed and cleaned: [689963458967765003] [part 11].txt
✅ Processed and cleaned: [689963458967765003] [part 12].txt
✅ Processed and cleaned: [689963458967765003] [part 2].txt
✅ Processed and cleaned: [689963458967765003] [part 3].txt
✅ Processed and cleaned: [689963458967765003] [part 4].txt
✅ Processed and cleaned: [689963458967765003] [part 5].txt
✅ Processed and cleaned: [689963458967765003] [part 6].txt
✅ Processed and cleaned: [689963458967765003] [part 7].txt
✅ Processed and cleaned: [689963458967765003] [part 8].txt
✅ P

✅ Processed and cleaned: [713928323805347955].txt
✅ Processed and cleaned: [721215902091182170] [part 2].txt
✅ Processed and cleaned: [721215902091182170] [part 3].txt
✅ Processed and cleaned: [721215902091182170] [part 4].txt
✅ Processed and cleaned: [721215902091182170] [part 5].txt
✅ Processed and cleaned: [721215902091182170] [part 6].txt
✅ Processed and cleaned: [721215902091182170] [part 7].txt
✅ Processed and cleaned: [721215902091182170].txt
✅ Processed and cleaned: [724681893869518903] [part 10].txt
✅ Processed and cleaned: [724681893869518903] [part 2].txt
✅ Processed and cleaned: [724681893869518903] [part 3].txt
✅ Processed and cleaned: [724681893869518903] [part 4].txt
✅ Processed and cleaned: [724681893869518903] [part 5].txt
✅ Processed and cleaned: [724681893869518903] [part 6].txt
✅ Processed and cleaned: [724681893869518903] [part 7].txt
✅ Processed and cleaned: [724681893869518903] [part 8].txt
✅ Processed and cleaned: [724681893869518903] [part 9].txt
✅ Processed an

✅ Processed and cleaned: [780553847276961812] [part 2].txt
✅ Processed and cleaned: [780553847276961812] [part 3].txt
✅ Processed and cleaned: [780553847276961812] [part 4].txt
✅ Processed and cleaned: [780553847276961812] [part 5].txt
✅ Processed and cleaned: [780553847276961812] [part 6].txt
✅ Processed and cleaned: [780553847276961812] [part 7].txt
✅ Processed and cleaned: [780553847276961812] [part 8].txt
✅ Processed and cleaned: [780553847276961812] [part 9].txt
✅ Processed and cleaned: [780553847276961812].txt
✅ Processed and cleaned: [781180833133625344] [part 2].txt
✅ Processed and cleaned: [781180833133625344] [part 3].txt
✅ Processed and cleaned: [781180833133625344] [part 4].txt
✅ Processed and cleaned: [781180833133625344] [part 5].txt
✅ Processed and cleaned: [781180833133625344].txt
✅ Processed and cleaned: [783794107083390996].txt
✅ Processed and cleaned: [783867661674217502].txt
✅ Processed and cleaned: [784383954425085992] [part 2].txt
✅ Processed and cleaned: [7843839