IMPORTANT!! Remember to engineer features based ONLY on the training data.
Every engineering step must be applicable to the validation and testing data
WITHOUT that data needing to be directly viewed / manually analysed.

Feature Engineering/Extraction

Objective: 
(depends on output from Code_Inspect)
If decision is to use Ling and Enron:
- extract sender (if possible)
- extract urls (if possible)            <-- PRIORITY --#


In [1]:
import pandas as pd
import URLExtraction
import URLHandling
from pathlib import Path

In [2]:
# Names used to build the input file names: every "<dataset><level>.csv"
# combination of the two lists below is loaded later in the notebook.
from_save = [
    'Nigerian_Fraud',
    'Nazario',
    'SpamAssasin',
    'CEAS_08',
    'Enron',
    'Ling',
]
save_levels = [
    '_train',
    '_val',
    '_test',
]

In [3]:
# One accumulator per split; each collects the cleaned frame of every dataset.
# Three separate list objects are created (they must not alias each other).
df_train_list, df_val_list, df_test_list = [], [], []

In [4]:
# Define necessary class instances
# extract_urls: its df_extractor(df, 'body', 'url_dict') method is called on
#   every loaded split further down in this notebook.
# replace_urls: its url_replacement(df, 'body', 'url_dict', 'cleaned_body', ...)
#   method is called on the result of df_extractor.
# NOTE(review): both classes live in project modules not visible here;
#   presumably they are stateless helpers — confirm before reusing them across splits.
extract_urls = URLExtraction.url_extraction()
replace_urls = URLHandling.url_handling()

In [7]:
# Directory holding the uncleaned "<dataset><level>.csv" split files.
# This is an *input* location, so it is deliberately not created here: an
# empty, freshly created directory would only hide a wrong path.
input_dir = Path("Split_Data/Uncleaned")

# Accumulator list for each split suffix. Any suffix not listed falls back to
# the test list, mirroring a plain if / elif / else chain.
split_lists = {
    '_train': df_train_list,
    '_val': df_val_list,
}

for dataset in from_save:
    for level in save_levels:
        var_name = f"{dataset}{level}"
        input_path = input_dir / f"{var_name}.csv"

        # pd.read_csv raises for a missing file (it never returns None), so
        # the existence check has to happen before the read.
        if not input_path.is_file():
            print(f"Dataset '{var_name}' not found.")
            continue

        df = pd.read_csv(input_path)

        # Dropping all columns other than subject, body, and label.
        #     This way, we can use all 6 datasets.
        #     The url extractor designed in URLExtraction will be used to
        #         handle URLs. We will have url_count and distinct_url_count,
        #         which will contain all information (and more) from urls
        #             (urls was just a 0,1 column stating whether or
        #             not a url appeared in body)
        df0 = df.drop(columns=['sender', 'receiver', 'date', 'urls'], errors='ignore')

        # Fill null values in subject and body
        df0[['subject', 'body']] = df0[['subject', 'body']].fillna('<missing>')

        df1 = extract_urls.df_extractor(df0, 'body', 'url_dict')
        # indexed=False by default: if we want <url1> <url2> ..., change to True
        df2 = replace_urls.url_replacement(df1, 'body', 'url_dict', 'cleaned_body', indexed=False)

        # Derive the two URL counters from url_dict.
        # Assumes url_dict maps each distinct URL to its number of occurrences
        # in the body — TODO confirm against URLExtraction.df_extractor.
        # Under that assumption the total is the sum of the values and the
        # distinct count is the number of keys.
        df2['url_count'] = df2['url_dict'].apply(lambda d: sum(d.values()))
        df2['distinct_url_count'] = df2['url_dict'].apply(len)

        # Keep only the URL-replaced text under the original column name.
        df2['body'] = df2['cleaned_body']
        df3 = df2.drop(columns=['cleaned_body', 'url_dict'])

        # The counters are numeric, so a missing value becomes 0 rather than a
        # string placeholder (which would make the columns mixed-type).
        df3[['url_count', 'distinct_url_count']] = df3[['url_count', 'distinct_url_count']].fillna(0)

        # TODO: placeholder from the original author ("df2 is wrong: fix later").
        df_final = df3

        # Append dataset to the list of its split
        split_lists.get(level, df_test_list).append(df_final)

In [8]:
# # For testing: remove later
# df = pd.DataFrame({'test': 'test'})
# # End test: below is required. Merge with necessary loop
# '''
# Dropping all columns other than subject, body, and label.
#     This way, we can use all 6 datasets
#     The url extractor designed in URLExtraction will be used to
#         handle URLs. We will have count and distinct_count, which 
#         will contain all information (and more) from urls
#             (urls was just a 0,1 column stating whether or 
#             not a url appeared in body)
# '''
# df0 = df.drop(columns=['sender', 'receiver', 'date', 'urls',], errors='ignore')
# df1 = extract_urls.df_extractor(df0, 'body', 'url_dict')
# df2 = replace_urls.url_replacement(df1, 'body', 'url_dict', 'cleaned_body', indexed=False)
#     # indexed=False by default: if we want <url1> <url2> ..., change to True
# # Perform additional cleaning:
#     # extract url_count and distinct_url_count from url_dict
# df2['url_count'] = df2['url_dict'].apply(len)
# df2['distinct_url_count'] = df2['url_dict'].apply(lambda d: sum(d.values()))
# df2['body'] = df2['cleaned_body']
# df3 = df2.drop(columns=['cleaned_body', 'url_dict'])

# df_final = df3 #placeholder: df2 is wrong: fix later.

# # Append dataset to dataset list
# df_list.append(df_final)

In [9]:
# Stack the per-dataset frames of each split into a single frame.
# ignore_index=True discards the per-file row labels and renumbers from 0.
concat_kwargs = {'ignore_index': True}
df_train_combined = pd.concat(df_train_list, **concat_kwargs)
df_val_combined = pd.concat(df_val_list, **concat_kwargs)
df_test_combined = pd.concat(df_test_list, **concat_kwargs)

In [10]:
# Write the three combined splits to disk as CSV.
# The output directory (and any missing parents) is created on demand.
output_dir = Path("Split_Data/Cleaned")
output_dir.mkdir(parents=True, exist_ok=True)

train_path = output_dir / "cleaned_train_data.csv"
val_path = output_dir / "cleaned_val_data.csv"
test_path = output_dir / "cleaned_test_data.csv"

# NOTE(review): to_csv is called with its defaults, so the row index is
# written as the first (unnamed) column of every file.
for combined_frame, destination in (
    (df_train_combined, train_path),
    (df_val_combined, val_path),
    (df_test_combined, test_path),
):
    combined_frame.to_csv(destination)