# 01 - Data Preprocessing

### Description:
This notebook loads raw email data, cleans and tokenizes the text, and saves the preprocessed data for further analysis and model training.

In [None]:
# Import necessary libraries
import os

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Download the NLTK resources used below: the Punkt tokenizer models needed by
# word_tokenize and the stopword lists.
# Recent NLTK releases load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so both are requested to keep the notebook working across
# NLTK versions (an unavailable resource is reported, not raised).
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [None]:
# Read the raw email dataset from its CSV file into a DataFrame
raw_data = pd.read_csv(filepath_or_buffer='data/raw/emails.csv')

In [None]:
# Build the set of English stopwords once; preprocess() uses it for
# membership tests when filtering tokens.
stop_words = set()
stop_words.update(stopwords.words('english'))

In [None]:
def preprocess(text):
    """Lowercase and tokenize *text*, keeping alphanumeric non-stopword tokens.

    Args:
        text: Raw email text. Values that are not ``str`` (for example the
            float NaN pandas produces for empty CSV cells, or ``None``) are
            treated as missing text.

    Returns:
        A list of lowercased tokens that pass ``str.isalnum()`` and are not in
        the module-level ``stop_words`` set; an empty list for non-string
        input.
    """
    # Missing values have no .lower(); without this guard a single empty cell
    # aborts the whole column-wide apply with AttributeError.
    if not isinstance(text, str):
        return []

    tokens = word_tokenize(text.lower())
    # isalnum() drops punctuation-only tokens and any token that contains a
    # non-alphanumeric character.
    return [word for word in tokens if word.isalnum() and word not in stop_words]

In [None]:
# Run preprocess() over every email's text and store the resulting token
# lists in a new 'clean_text' column
raw_data['clean_text'] = raw_data['text'].map(preprocess)

In [None]:
# Preview the first five rows of the cleaned DataFrame
print(raw_data.head(n=5))

In [None]:
# Save the cleaned data.
# Create the output directory first: DataFrame.to_csv does not create missing
# parent directories and fails with OSError when 'data/processed' is absent.
os.makedirs('data/processed', exist_ok=True)
# NOTE: 'clean_text' holds Python lists; CSV stores them in their string form,
# so a later reader has to parse that column back into lists.
raw_data.to_csv('data/processed/cleaned_data.csv', index=False)
print("Preprocessed data saved to 'data/processed/cleaned_data.csv'")