In [3]:
import pandas as pd
# Load the dataset
data = pd.read_csv('new_spam.csv')
data.drop_duplicates(inplace=True)

# Count the number of 'ham' and 'spam' entries
ham_count = data[data['Category'] == 'ham'].shape[0]
spam_count = data[data['Category'] == 'spam'].shape[0]

# Check for duplicates
duplicate_count = data.duplicated().sum()

# # Check for non-English entries
# def is_english(text):
#     try:
#         text.encode(encoding='utf-8').decode('ascii')
#     except UnicodeDecodeError:
#         return False
#     else:
#         return True

# non_english_count = data['Message'].apply(lambda x: not is_english(x)).sum()

data.to_csv('cleaned_spam.csv', index=False)
# Print the results
print(f"Number of 'ham' messages: {ham_count}")
print(f"Number of 'spam' messages: {spam_count}")
print(f"Number of duplicate messages: {duplicate_count}")
# print(f"Number of non-English messages: {non_english_count}")


Number of 'ham' messages: 4516
Number of 'spam' messages: 3729
Number of duplicate messages: 0


In [10]:
import pandas as pd
import re

# Load the dataset
df = pd.read_csv('new_spam.csv')

def clean_text(text):
    if pd.isna(text) or not isinstance(text, str):
        return ""  # Return empty string for NaN or non-string values

    # Remove only the "Subject:" phrase from the text
    text = re.sub(r'\bSubject:\s*', '', text)
    
    # Keep English letters, numbers, punctuation, and whitespace
    text = re.sub(r'[^\w\s,.!?;:()-]', '', text)
    
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    
    # Strip leading and trailing whitespace
    return text.strip()

# Apply the cleaning function to the 'Message' column
df['Message'] = df['Message'].apply(clean_text)

# Save the cleaned data
df.to_csv('cleaned_spam.csv', index=False)


In [11]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold

# Load the dataset
data = pd.read_csv('new_spam.csv')

# Initialize StratifiedKFold to split data into, e.g., 5 equal parts
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Split the data while maintaining the same proportion of categories
for fold, (train_idx, test_idx) in enumerate(skf.split(data, data['Category'])):
    # Create a part of the dataset
    part = data.iloc[test_idx]
    
    # Save each part to a new CSV file
    part.to_csv(f'part_{fold+1}.csv', index=False)

    # Optionally, you can print out the proportion of 'ham' and 'spam' in each part
    print(f"Part {fold+1}:")
    print(part['Category'].value_counts(normalize=True))


Part 1:
Category
ham     0.547836
spam    0.452164
Name: proportion, dtype: float64
Part 2:
Category
ham               0.547472
spam              0.452164
{"mode":"full"    0.000364
Name: proportion, dtype: float64
Part 3:
Category
ham     0.547671
spam    0.452329
Name: proportion, dtype: float64




In [12]:
data.dropna(subset=['Message'], inplace=True)  # Ensure no NaN entries in 'Message' column
data.to_csv('new_spam.csv', index=False)  # Save the cleaned data to a new CSV file


In [5]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kingc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kingc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kingc\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\kingc\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kingc\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [11]:
import pandas as pd
import random
import nltk
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize


# Set up the stopwords
stop_words = set(stopwords.words('english'))

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace('_', ' ')
            if synonym != word:
                synonyms.add(synonym)
    return list(synonyms)

def synonym_replacement(sentence, n):
    words = word_tokenize(sentence)
    non_stop_words = [word for word in words if word not in stop_words]
    random_words = random.sample(non_stop_words, min(n, len(non_stop_words)))
    new_words = words[:]
    for word in random_words:
        synonyms = get_synonyms(word)
        if synonyms:
            new_word = random.choice(synonyms)
            new_words = [new_word if w == word else w for w in new_words]
    return ' '.join(new_words)

def random_insertion(sentence, n, max_attempts=10):
    words = word_tokenize(sentence)
    for _ in range(n):
        attempts = 0
        inserted = False
        while not inserted and attempts < max_attempts:
            word = random.choice(words)
            if word not in stop_words:
                synonyms = get_synonyms(word)
                if synonyms:
                    new_word = random.choice(synonyms)
                    position = random.randint(0, len(words))
                    words.insert(position, new_word)
                    inserted = True
            attempts += 1
        if attempts == max_attempts:
            print(f"Failed to find a synonym for insertion after {max_attempts} attempts.")
    return ' '.join(words)


def random_swap(sentence, n):
    words = word_tokenize(sentence)
    length = len(words)
    if length < 2:
        print("Not enough words to perform swap.")
        return sentence  # Return the original sentence if not enough words to swap
    for _ in range(n):
        idx1, idx2 = random.sample(range(length), 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]
    return ' '.join(words)


def random_deletion(sentence, p):
    words = word_tokenize(sentence)
    new_words = [word for word in words if random.random() > p]
    return ' '.join(new_words)

# Load the dataset
print("Loading dataset...")
df = pd.read_csv('new_spam.csv')

# Assuming the dataset is initially balanced
augment_funcs = [synonym_replacement, random_insertion, random_swap, random_deletion]
augmented_rows = []

print("Starting data augmentation...")
for index, row in df.iterrows():
    category, message = row['Category'], row['Message']
    for func in augment_funcs:
        if func in [synonym_replacement, random_insertion]:
            new_message = func(message, 2)
        elif func == random_swap:
            new_message = func(message, 2)
        elif func == random_deletion:
            new_message = func(message, 0.1)
        augmented_rows.append([category, new_message])
    print(f"Augmented row {index + 1}/{len(df)}")

# Create a DataFrame from the augmented rows and concatenate it with the original DataFrame
augmented_df = pd.DataFrame(augmented_rows, columns=['Category', 'Message'])
df_augmented = pd.concat([df, augmented_df]).drop_duplicates(subset=['Message'])

# Shuffle the DataFrame
df_augmented = df_augmented.sample(frac=1).reset_index(drop=True)

# Optionally, save the augmented DataFrame to a new CSV file
df_augmented.to_csv('augmented_spam.csv', index=False)

print("Augmentation complete. The augmented dataset has been saved.")


Loading dataset...
Starting data augmentation...
Augmented row 1/8245
Augmented row 2/8245
Augmented row 3/8245
Augmented row 4/8245
Augmented row 5/8245
Augmented row 6/8245
Augmented row 7/8245
Augmented row 8/8245
Augmented row 9/8245
Augmented row 10/8245
Augmented row 11/8245
Augmented row 12/8245
Augmented row 13/8245
Augmented row 14/8245
Augmented row 15/8245
Failed to find a synonym for insertion after 10 attempts.
Augmented row 16/8245
Augmented row 17/8245
Augmented row 18/8245
Augmented row 19/8245
Augmented row 20/8245
Augmented row 21/8245
Augmented row 22/8245
Augmented row 23/8245
Augmented row 24/8245
Augmented row 25/8245
Augmented row 26/8245
Augmented row 27/8245
Augmented row 28/8245
Augmented row 29/8245
Augmented row 30/8245
Augmented row 31/8245
Augmented row 32/8245
Augmented row 33/8245
Augmented row 34/8245
Augmented row 35/8245
Augmented row 36/8245
Augmented row 37/8245
Augmented row 38/8245
Augmented row 39/8245
Augmented row 40/8245
Augmented row 41/8245
