In [1]:
import pandas as pd

# Read the train and test splits with pandas' default CSV parsing, then
# report each frame's (rows, columns) so the split sizes appear in the output.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

for split_df in (train_df, test_df):
    print(split_df.shape)


(10780, 6)
(2697, 6)


In [2]:
# Display the column labels of the training frame.
train_df.columns


Index(['id', 'subject', 'body', 'text', 'category', 'category_id'], dtype='str')

In [3]:
# Count the training rows for each distinct value of `category`
# (a quick check of how balanced the classes are).
train_df['category'].value_counts()


category
forum           1800
verify_code     1800
promotions      1796
social_media    1796
spam            1794
updates         1794
Name: count, dtype: int64

In [4]:
# Number of missing values in each column of the training frame.
# NOTE(review): only train_df is checked here; test_df is not inspected.
train_df.isnull().sum()


id             0
subject        0
body           0
text           0
category       0
category_id    0
dtype: int64

In [5]:
import re

def clean_text(text):
    """Normalize a raw text value into lowercase alphanumeric words.

    Steps, in order:
      1. Lowercase the text.
      2. Delete every character that is not an ASCII letter, a digit, or
         whitespace. Characters are deleted, not replaced by a space, so
         "cross-cultural" becomes "crosscultural".
      3. Collapse each run of whitespace (spaces, tabs, newlines) into one
         space and trim the ends.

    Punctuation is stripped *before* whitespace is collapsed; doing it the
    other way round leaves double spaces wherever a standalone symbol sat
    between two spaces (e.g. "a - b" -> "a  b").

    Args:
        text: The raw text. Non-string values (e.g. None or a NaN coming
            from a missing CSV field) are treated as empty.

    Returns:
        The cleaned string; "" when the input is not a string or contains
        no letters or digits.
    """
    if not isinstance(text, str):
        # Missing values arrive as None/NaN; map them to empty text instead
        # of raising AttributeError on .lower().
        return ""

    text = text.lower()                        # convert to lowercase
    text = re.sub(r'[^a-z0-9\s]', '', text)    # remove special characters, keep whitespace
    text = re.sub(r'\s+', ' ', text)           # collapse whitespace runs to one space
    return text.strip()                        # drop leading/trailing space


In [6]:
# Add a `clean_text` column to both splits by running the same cleaning
# function over each row's raw `text` value.
for split_df in (train_df, test_df):
    split_df['clean_text'] = split_df['text'].apply(clean_text)

# Preview raw vs. cleaned text side by side for the first training rows.
train_df[['text', 'clean_text']].head()


Unnamed: 0,text,clean_text
0,Anniversary Special: Buy one get one free As o...,anniversary special buy one get one free as ou...
1,Your Amazon was used on new device Your $5000 ...,your amazon was used on new device your 5000 r...
2,"Re: Your Google inquiry Hi, following up about...",re your google inquiry hi following up about y...
3,Digital Ritual Experience Creation Cross-cultu...,digital ritual experience creation crosscultur...
4,"Your post was moved to ""Programming Help"" Tren...",your post was moved to programming help trendi...


In [7]:
# Keep only rows whose cleaned text still has content once surrounding
# whitespace is ignored; rows that cleaned down to nothing are dropped.
train_has_text = train_df['clean_text'].str.strip() != ""
test_has_text = test_df['clean_text'].str.strip() != ""

train_df = train_df[train_has_text]
test_df = test_df[test_has_text]

# Report the shapes after filtering.
for label, split_df in (("Final Train Shape:", train_df), ("Final Test Shape:", test_df)):
    print(label, split_df.shape)


Final Train Shape: (10780, 7)
Final Test Shape: (2697, 7)


In [8]:
# Write each cleaned split to its own CSV file, leaving out the row index.
for split_df, out_name in ((train_df, "clean_train.csv"), (test_df, "clean_test.csv")):
    split_df.to_csv(out_name, index=False)
