import

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
# Fetch the NLTK data packages this file relies on: the stopword lists read
# through stopwords.words("english") and the "punkt" tokenizer data used by
# word_tokenize. Both calls run at import/cell-execution time.
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK stopword corpus and the "punkt" tokenizer data.
# When a package is already present, nltk.download reports it as up to date
# (see the captured cell output) instead of failing.
nltk.download("stopwords")
nltk.download("punkt")

def load_data(file_path):
    """
    Read a CSV file into a pandas DataFrame.

    Parameters:
    - file_path (str): Location of the CSV file to read.

    Returns:
    - DataFrame: The file contents as parsed by pd.read_csv with its
      default options.
    """
    return pd.read_csv(file_path)

# Lazily filled cache for the English stopword set. Loading is deferred to the
# first call (rather than done at definition time) so it happens after the
# nltk.download("stopwords") call, and the corpus is read once per process
# instead of once per processed text.
_STOP_WORD_CACHE = {}

def _english_stop_words():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    if "english" not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORD_CACHE["english"]

def preprocess_text(text):
    """
    Normalize a piece of text for downstream vectorization.

    The text is stripped, lowercased, reduced to ASCII letters, digits and
    whitespace, tokenized with NLTK's word_tokenize, and filtered against the
    English stopword list. The surviving tokens are joined with single spaces.

    Parameters:
    - text (str): Text data to be preprocessed.

    Returns:
    - processed_text (str): Preprocessed text; an empty string when every
      token is filtered out.
    """
    # Trim surrounding whitespace and lowercase so stopword matching is
    # case-insensitive.
    text = text.strip().lower()

    # Drop every character that is not an ASCII letter, a digit, or
    # whitespace. Note that digits are kept, not removed.
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)

    # Tokenize, then discard stopwords (set loaded once and reused).
    stop_words = _english_stop_words()
    filtered_text = [
        word for word in word_tokenize(text) if word not in stop_words
    ]

    # Joining with a single space also collapses any repeated whitespace
    # that was present between the original words.
    return " ".join(filtered_text)

def preprocess_data(data):
    """
    Clean the raw data and add a normalized text column.

    Rows containing a missing value in any column are dropped, each remaining
    "text" value is run through preprocess_text into a new "processed_text"
    column, and rows whose "processed_text" duplicates an earlier row are
    removed (the first occurrence is kept). The input frame is left untouched.

    Parameters:
    - data (DataFrame): Pandas DataFrame containing a "text" column.

    Returns:
    - processed_data (DataFrame): New DataFrame with the original columns plus
      "processed_text"; the original index labels of the kept rows are
      preserved.

    Raises:
    - KeyError: If `data` has no "text" column.
    """
    # Take an explicit copy: assigning a column onto the direct result of
    # dropna() can trigger pandas' SettingWithCopyWarning, and the copy makes
    # it unambiguous that `data` is never modified.
    processed_data = data.dropna().copy()

    # Apply text preprocessing to the "text" column
    processed_data["processed_text"] = processed_data["text"].apply(preprocess_text)

    # Drop duplicate texts, returning a new frame instead of mutating in place
    return processed_data.drop_duplicates("processed_text")

def split_data(data):
    """
    Partition the data into train, validation and test subsets.

    Parameters:
    - data (DataFrame): Pandas DataFrame containing the data.

    Returns:
    - train_df (DataFrame): Train set (80% of the rows).
    - validation_df (DataFrame): Validation set (half of the remaining 20%).
    - test_df (DataFrame): Test set (the other half of the remaining 20%).
    """
    # First carve off a 20% holdout, leaving 80% for training.
    train_part, holdout = train_test_split(data, test_size=0.2, random_state=42)

    # Then halve the holdout into validation and test portions.
    validation_part, test_part = train_test_split(
        holdout, test_size=0.5, random_state=42
    )

    return train_part, validation_part, test_part

def store_splits(train_data, validation_data, test_data):
    """
    Write the three splits to CSV files in the current working directory.

    The files are named "train.csv", "validation.csv" and "test.csv", and are
    written in that order without the DataFrame index.

    Parameters:
    - train_data (DataFrame): Train set.
    - validation_data (DataFrame): Validation set.
    - test_data (DataFrame): Test set.
    """
    outputs = (
        ("train.csv", train_data),
        ("validation.csv", validation_data),
        ("test.csv", test_data),
    )
    for filename, frame in outputs:
        frame.to_csv(filename, index=False)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Load the raw email data from CSV.
# NOTE(review): the path looks like a mounted Google Drive in Colab — confirm
# the drive is mounted before running this cell.
file_path = "/content/drive/MyDrive/emails.csv"
email_data = load_data(file_path)

# Preprocess data: drop rows with missing values, add the "processed_text"
# column, and remove rows with duplicate processed text.
preprocessed_data = preprocess_data(email_data)

# Split 80/10/10 into train/validation/test, then write train.csv,
# validation.csv and test.csv to the current working directory.
train_set, validation_set, test_set = split_data(preprocessed_data)
store_splits(train_set, validation_set, test_set)
