In [1]:
import pandas as pd
import re
import string

In [2]:
# Read the train / validation / test splits from CSV files. The paths are
# relative, so they resolve against the kernel's current working directory.
train_df, val_df, test_df = [
    pd.read_csv(csv_path)
    for csv_path in ("train_data.csv", "val_data.csv", "test_data.csv")
]

In [3]:
# Remove the "Unnamed: 0" column from every split where it exists.
# errors="ignore" makes the drop a no-op for frames that lack the column,
# so no explicit membership check is needed. The frames are modified in
# place, which keeps the existing train_df / val_df / test_df bindings valid.
# NOTE(review): "Unnamed: 0" is presumably an index column written out by an
# earlier to_csv call -- confirm against whatever produced the input files.
for df in (train_df, val_df, test_df):
    df.drop(columns=["Unnamed: 0"], errors="ignore", inplace=True)

In [4]:
def clean_text(text):
    """Normalise one raw text value into lowercase, space-separated words.

    Steps, applied in this order:
      1. Null values (None, NaN, pd.NA, ...) become the empty string.
      2. Non-null values that are not ``str`` (e.g. an int or float cell in a
         mixed-dtype column) are converted with ``str()`` rather than failing
         on ``.lower()``.
      3. Lowercase.
      4. Remove every run of non-whitespace characters beginning at "http".
      5. Remove @mentions and #hashtags (the marker plus the following
         ASCII letters, digits and underscores).
      6. Replace each character in ``string.punctuation`` with a space.
      7. Remove digit runs.
      8. Collapse whitespace runs to a single space and strip both ends.

    Parameters
    ----------
    text : str or scalar
        Raw text value; may be null.

    Returns
    -------
    str
        The cleaned text; ``""`` for null input or when nothing survives
        cleaning.

    Notes
    -----
    Punctuation is replaced by a space, not deleted, so contractions are
    split into two tokens: "can't" -> "can t".
    """
    if not isinstance(text, str):
        # Only non-strings can be null, so real strings skip the isnull call.
        if pd.isnull(text):
            return ""
        # Coerce numbers and other scalars instead of raising AttributeError.
        text = str(text)

    text = text.lower()
    # URLs, mentions and hashtags must go before punctuation stripping:
    # ':', '/', '@' and '#' are all in string.punctuation, so stripping first
    # would destroy the markers these patterns rely on.
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)
    # Substitute a space (not '') so words joined by punctuation, such as
    # "tips/advice", do not get glued together.
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\d+', '', text)
    # The removals above leave gaps; squeeze them and trim the ends.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [5]:
# Add a "clean_text" column to every split by running clean_text over each
# value of the raw "text" column. The original "text" column is kept.
for split_df in (train_df, val_df, test_df):
    split_df["clean_text"] = split_df["text"].apply(clean_text)

In [10]:
# Visual sanity check: print the raw and cleaned text of the training rows
# whose index labels are 0 and 1 (.loc is a label lookup, not positional).
print("Original vs Cleaned Example:")
for row_label in range(2):
    for heading, column in (("\nBefore: ", "text"), ("\nAfter: ", "clean_text")):
        print(heading, train_df.loc[row_label, column])
    # Horizontal rule between examples.
    print("\n" + "-"*90 + "\n")

Original vs Cleaned Example:

Before:  All my life i've been going through shit (only 17 years old) and when things started to get better i crashed. I can't get myself to get out of bed no matter how much i try, my family understands but do still not approve since my grades dropped from all A's to E-C. It has been like this for 1-2 years now and none of my friends understands how It's like, I can't really blame them either since I don't like talking about it and i've always been taught to be a man and keep this stuff to myself. They just see a lazy fuck who is too irresponsible to go too school, same with my teachers. Idk if typing here is going to help at all but if anyone has some tips/advice on how to get motivated again i would be super happy.

After:  all my life i ve been going through shit only years old and when things started to get better i crashed i can t get myself to get out of bed no matter how much i try my family understands but do still not approve since my grades drop

In [11]:
# Word count of each cleaned training text: split on whitespace, then take
# the length of the resulting token list.
# NOTE(review): this column is added to train_df only, so a later export of
# train_df will carry a "clean_length" column that val_df / test_df do not
# have -- confirm that is intended.
train_df['clean_length'] = train_df['clean_text'].str.split().str.len()
print("\nAverage text length (words):", train_df['clean_length'].mean())


Average text length (words): 122.06541582150102


In [14]:
# Write each split, including every column added above, to its own CSV.
# index=False keeps the row index out of the file, so reloading it does not
# produce an extra "Unnamed: 0" column.
for split_df, out_path in (
    (train_df, 'train_data_cleaned.csv'),
    (val_df, 'val_data_cleaned.csv'),
    (test_df, 'test_data_cleaned.csv'),
):
    split_df.to_csv(out_path, index=False)