In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk

In [2]:
# Fetch the NLTK resources used later in this notebook:
#   'punkt'     - tokenizer data (presumably what word_tokenize relies on; confirm
#                 against the installed NLTK version)
#   'stopwords' - the corpus read through stopwords.words('english')
# The captured output shows both were already present locally for the author's run.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sayan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sayan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the raw e-mail dataset. The captured output shows a `text` column holding
# the message and a `spam` column holding the label.
# The path is a raw string so the backslashes are taken literally: sequences such
# as `\c`, `\A` and `\e` are invalid escapes in a normal string literal and raise
# a SyntaxWarning on recent Python versions. The resulting path value is unchanged.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs elsewhere.
emails_df = pd.read_csv(r'D:\code\AppliedML_assgn_01\emails.csv')
emails_df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [4]:
def clean_text(text):
    """Reduce a raw e-mail string to lowercase ASCII letters and whitespace.

    Steps, in order:
      1. drop anything that looks like an HTML tag (``<...>`` on a single line),
      2. delete every character that is not an ASCII letter or whitespace,
      3. lowercase the result,
      4. trim leading and trailing whitespace (interior runs are kept as-is).

    Args:
        text: The raw text as a ``str``.

    Returns:
        The cleaned string.
    """
    # Remove HTML tags (non-greedy, so each <...> pair is removed on its own)
    text = re.sub(r'<.*?>', '', text)
    # Remove special characters and numbers.
    # The flags MUST be passed by keyword: the 4th positional parameter of
    # re.sub is `count`, so passing `re.I|re.A` positionally limited the
    # substitution to the first 258 matches and never applied the flags.
    text = re.sub(r'[^a-zA-Z\s]', '', text, flags=re.I | re.A)
    # Convert to lowercase
    text = text.lower()
    # Trim surrounding whitespace
    text = text.strip()
    return text

# Derive the `cleaned_text` column by running every raw message through clean_text
emails_df['cleaned_text'] = emails_df['text'].map(clean_text)

In [5]:
def preprocess_text(text):
    """Tokenize ``text``, drop English stopwords, and Porter-stem what remains.

    Args:
        text: The string to process (the notebook passes the output of
            ``clean_text``).

    Returns:
        The stemmed, non-stopword tokens joined by single spaces.
    """
    # Build the stopword lookup once per call. Calling stopwords.words('english')
    # inside the filter re-evaluated it for every token and searched a list;
    # a set gives constant-time membership tests with the same result.
    english_stopwords = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # Tokenization
    tokens = word_tokenize(text)
    # Stopwords removal
    tokens = [token for token in tokens if token not in english_stopwords]
    # Stemming
    tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(tokens)

# Derive the `preprocessed_text` column by tokenizing/stemming each cleaned message
emails_df['preprocessed_text'] = emails_df['cleaned_text'].map(preprocess_text)

In [6]:
# Fractions of the full dataset; whatever is left (1 - train_size - val_size)
# becomes the test set.
train_size = 0.7
val_size = 0.2
# One seed for every random step so the split is identical on each run.
random_seed = 1

# Shuffle the rows. Without a random_state here the row order differed on every
# run, so the seeded train_test_split calls below still produced different
# train/validation/test sets each time.
df_shuffled = emails_df.sample(frac = 1, random_state = random_seed).reset_index(drop = True)

# First cut: training set versus everything else.
df_train, df_temp = train_test_split(df_shuffled, train_size = train_size, random_state = random_seed)

# Second cut: val_size is expressed relative to the full dataset, so rescale it
# to a fraction of the held-out remainder before splitting that remainder.
validation_size_adjusted = val_size / (1 - train_size)
df_validation, df_test = train_test_split(df_temp, train_size=validation_size_adjusted, random_state = random_seed)

In [7]:
# Write each split to its own CSV file in the working directory, without the index column
for split_frame, csv_name in (
    (df_train, 'train.csv'),
    (df_validation, 'validation.csv'),
    (df_test, 'test.csv'),
):
    split_frame.to_csv(csv_name, index=False)