In [34]:
import pandas as pd
import re

In [None]:
def normalize_data(data):
    """Return a cleaned copy of ``data``.

    Every text column has each occurrence of the literal sequence ``ï¿½``
    removed (presumably a mis-decoded U+FFFD replacement character -- TODO
    confirm against the raw files) and its leading/trailing whitespace
    stripped. Afterwards rows containing any missing value are dropped, exact
    duplicate rows are dropped (the first occurrence is kept), and the index
    is renumbered from 0.

    Args:
        data: DataFrame to clean. The caller's frame is not modified.

    Returns:
        A new DataFrame with the cleaning steps above applied.
    """

    def _clean_text_column(column):
        # Columns stored with pandas' dedicated string dtype do not compare
        # equal to "object", so they must be recognised explicitly or they
        # would pass through uncleaned.
        is_text = column.dtype == "object" or isinstance(column.dtype, pd.StringDtype)
        if not is_text:
            return column
        return column.str.replace(r'ï¿½', '', regex=True).str.strip()

    # Chained, non-inplace calls: each step returns a fresh frame.
    return (
        data.apply(_clean_text_column)
        .dropna()
        .drop_duplicates()
        .reset_index(drop=True)
    )

In [36]:
def extract_subject(text):
    """Return the subject of an email-style text.

    Looks for the first ``Subject: ...`` header (case-insensitive) and returns
    what follows it up to the end of that line, with surrounding whitespace
    removed.

    The previous implementation used ``re.sub`` to *delete* the subject line,
    so it returned everything except the subject, which contradicts the
    function's name and the ``subject`` column it is used to fill.

    Args:
        text: The raw message text.

    Returns:
        The subject string, or ``''`` when no ``Subject: `` header is found.
    """
    found = re.search(r'Subject: (.*?)(?:\n|$)', text, flags=re.IGNORECASE)
    if found is None:
        return ''
    # strip() also removes a trailing '\r' left over from CRLF line endings.
    return found.group(1).strip()

In [37]:
def change_column_data(data):
    """Rename a two-column frame to ``is_spam``/``text``, clean it, encode labels.

    Args:
        data: DataFrame whose first column is the label and whose second
            column is the text. Assigning the two new names raises
            ``ValueError`` if the frame does not have exactly two columns.
            The caller's frame is left untouched.

    Returns:
        The frame returned by ``normalize_data`` with ``is_spam`` mapped to
        0 for ``'ham'``/``0`` and 1 for ``'spam'``/``1``. Any label that is
        not one of those four keys is absent from the mapping and becomes NaN.
    """
    # Work on a copy: assigning to ``.columns`` directly would rename the
    # columns of the DataFrame the caller still holds.
    renamed = data.copy()
    renamed.columns = ['is_spam', 'text']

    cleaned = normalize_data(renamed)

    label_codes = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
    cleaned['is_spam'] = cleaned['is_spam'].map(label_codes)

    return cleaned

In [38]:
def get_url_data(url_data):
    """Prepare a URL dataset for the combined spam table.

    Keeps only the ``result`` and ``url`` columns, in that order, and hands
    them to ``change_column_data``, which names them ``is_spam`` and ``text``.

    Args:
        url_data: DataFrame containing at least ``result`` and ``url`` columns.

    Returns:
        The frame produced by ``change_column_data``.
    """
    label_and_url = url_data[['result', 'url']]
    return change_column_data(label_and_url)

In [39]:
def get_email_data(email_data):
    """Prepare an email dataset for the combined spam table.

    Keeps only the ``label`` and ``text`` columns, in that order, runs them
    through ``change_column_data``, and then adds a ``subject`` column
    computed by applying ``extract_subject`` to every ``text`` value.

    Args:
        email_data: DataFrame containing at least ``label`` and ``text`` columns.

    Returns:
        The frame from ``change_column_data`` with the extra ``subject`` column.
    """
    prepared = change_column_data(email_data[['label', 'text']])
    prepared['subject'] = prepared['text'].apply(extract_subject)
    return prepared

In [40]:
def get_sms_data(sms_data):
    """Prepare an SMS dataset for the combined spam table.

    Keeps only the ``v1`` and ``v2`` columns, in that order, and hands them to
    ``change_column_data``, which names them ``is_spam`` and ``text``.

    Args:
        sms_data: DataFrame containing at least ``v1`` and ``v2`` columns.

    Returns:
        The frame produced by ``change_column_data``.
    """
    label_and_message = sms_data[['v1', 'v2']]
    return change_column_data(label_and_message)

In [41]:
def get_youtube_data(youtube_data):
    """Prepare a YouTube comments dataset for the combined spam table.

    Keeps only the ``CLASS`` and ``CONTENT`` columns, in that order, and hands
    them to ``change_column_data``, which names them ``is_spam`` and ``text``.

    Args:
        youtube_data: DataFrame containing at least ``CLASS`` and ``CONTENT``
            columns.

    Returns:
        The frame produced by ``change_column_data``.
    """
    label_and_comment = youtube_data[['CLASS', 'CONTENT']]
    return change_column_data(label_and_comment)

In [42]:

def combine_data(all_data):
    """Stack any number of DataFrames into one.

    Args:
        all_data: Sequence of DataFrames to concatenate row-wise, in order.

    Returns:
        A single DataFrame holding the rows of every input, with a fresh
        0..n-1 index (the original indexes are discarded).
    """
    return pd.concat(all_data, ignore_index=True)

In [43]:
def save_data(data, output_file):
    """Write ``data`` to ``output_file`` as CSV, omitting the index column.

    Args:
        data: DataFrame to write.
        output_file: Destination passed straight to ``DataFrame.to_csv``.
    """
    data.to_csv(path_or_buf=output_file, index=False)

In [44]:
# Build the combined spam dataset.
# Only the URL source is enabled right now; the loads below are disabled.
# To include one, uncomment it and add its frame to the list given to
# combine_data.
# email_dataset_1 = get_email_data(pd.read_csv('data/email_dataset_1.csv'))
# email_dataset_2 = get_email_data(pd.read_csv('data/email_dataset_2.csv'))
# email_dataset_3 = get_email_data(pd.read_csv('data/email_dataset_3.csv'))
# email_dataset_4 = get_email_data(pd.read_csv('data/email_dataset_4.csv'))
# text_dataset_1 = get_sms_data(pd.read_csv('data/sms_dataset_1.csv'))
# text_dataset_2 = get_sms_data(pd.read_csv('data/sms_dataset_2.csv'))
# youtube_dataset = get_youtube_data(pd.read_csv('data/youtube_comments_dataset.csv'))
url_raw = pd.read_csv('data/url_dataset.csv')
url_dataset = get_url_data(url_raw)

# Merge every enabled source into a single frame.
data = combine_data([url_dataset])

# Write the merged frame to disk.
save_data(data, 'data/spam_data.csv')