In [2]:
import pandas as pd
import feature_extraction_mod as fe
from nltk.stem.porter import PorterStemmer

# Module-level Porter stemmer, created once and reused by text_postprocessing
# for every token it stems
stemmer = PorterStemmer()

# Extra normalization step, meant to run once feature extraction is finished
def text_postprocessing(col):
    """Lowercase and stem every whitespace-separated token in a text column.

    Args:
        col: pandas Series of strings (e.g. an email subject or body column).

    Returns:
        The result of applying the normalization element-wise: each entry is
        lowercased, split on whitespace, stemmed token by token with the
        module-level ``stemmer``, and re-joined with single spaces.
    """
    def _stem_text(text):
        # Split on whitespace, stem each token, then rebuild one string
        stemmed_tokens = [stemmer.stem(token) for token in text.split()]
        return ' '.join(stemmed_tokens)

    lowered = col.str.lower()
    return lowered.apply(_stem_text)

# - Loading Clean Datasets -
# Legitimate (Enron) emails are read from parquet; both phishing corpora from CSV
enron_combined_df = pd.read_parquet('data/clean_enron_legit_emails.parquet')
nazario_phishing_emails_df = pd.read_csv('data/clean_nazario_phishing_emails.csv')
nigerian_phishing_emails_df = pd.read_csv('data/clean_nigerian_phishing_emails.csv')

# - Combining Clean Datasets -
# ignore_index=True gives the stacked frame a fresh 0..n-1 index
combined_df = pd.concat(
    [enron_combined_df, nazario_phishing_emails_df, nigerian_phishing_emails_df],
    ignore_index=True,
)

# - Feature Extraction on Combined Dataframe -
# All features below are computed on the raw text, BEFORE lowercasing/stemming,
# because the caps-percentage features need the original letter case.

# Extract percentage of capital letters in email subject (subject_caps_percent)
combined_df['subject_caps_percent'] = fe.calc_caps_percent(combined_df['subject'])

# Extract percentage of capital letters in email body (body_caps_percent)
combined_df['body_caps_percent'] = fe.calc_caps_percent(combined_df['body'])

# Extract word count of email subject (subject_word_count)
combined_df['subject_word_count'] = fe.calc_word_count(combined_df['subject'])

# Extract word count of email body (body_word_count)
combined_df['body_word_count'] = fe.calc_word_count(combined_df['body'])

# Extract unigram, bigram, trigram match counts in subject + body.
# The concatenated text is built once and reused for all three n-gram sizes
# instead of re-concatenating the full columns for every call.
subject_and_body = combined_df['subject'] + ' ' + combined_df['body']
combined_df['unigram_count'] = fe.count_matching_ngrams(subject_and_body, 1)
combined_df['bigram_count'] = fe.count_matching_ngrams(subject_and_body, 2)
combined_df['trigram_count'] = fe.count_matching_ngrams(subject_and_body, 3)

# Extract unigram percentage using unigram count and total word count in subject + body
total_word_count = combined_df['subject_word_count'] + combined_df['body_word_count']
unigram_percent = (combined_df['unigram_count'] / total_word_count * 100).round(2)
# An email with zero words would make the division above produce NaN/inf;
# report 0% for exactly those rows so no invalid value reaches the saved file.
combined_df['unigram_percent'] = unigram_percent.mask(total_word_count == 0, 0.0)

# Calculate composite score from the three n-gram match counts
combined_df['composite_score'] = round(
    fe.calc_composite_score(combined_df['unigram_count'], combined_df['bigram_count'], combined_df['trigram_count']), 2)

# Apply post text processing after features have been extracted
combined_df['subject'] = text_postprocessing(combined_df['subject'])
combined_df['body'] = text_postprocessing(combined_df['body'])

# Keep only the text, engineered features and label columns, in a fixed order
output_columns = [
    'subject', 'body',
    'subject_caps_percent', 'body_caps_percent',
    'subject_word_count', 'body_word_count',
    'unigram_count', 'bigram_count', 'trigram_count',
    'unigram_percent', 'composite_score',
    'url_label', 'label',
]
combined_df = combined_df[output_columns]

# Save combined dataframe to parquet file
combined_df.to_parquet('data/preprocessed_combined_data.parquet', index=False)

# Preview combined dataframe
combined_df

Unnamed: 0,subject,body,subject_caps_percent,body_caps_percent,subject_word_count,body_word_count,unigram_count,bigram_count,trigram_count,unigram_percent,composite_score,url_label,label
0,prc review phone call,ani morn between 10 and 11 30,15.79,0.00,4,4,0,0,0,0.00,0.00,0,0
1,high speed internet access,1 login pallen pw ke9davi i dont think these a...,17.39,16.67,4,22,0,0,0,0.00,0.00,0,0
2,fix forward or other collar floor ga price term,fix forward or other collar floor ga price ter...,2.33,5.30,9,263,5,0,0,1.84,0.83,0,0
3,consolid posit issu to do list,consolid posit issu to do list phillip below i...,11.43,2.91,6,851,10,0,0,1.17,1.67,0,0
4,var report and resourc meet,var report and resourc meet pleas plan to atte...,12.90,11.00,5,40,1,0,0,2.22,0.17,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
92535,contact global max shipe compani,atten my dear i have paid the fee for your che...,100.00,17.16,5,231,7,1,0,2.97,1.50,0,1
92536,treat as urgent,from mr ali sherif african develop bank adb ou...,100.00,3.67,3,521,40,2,0,7.63,7.33,1,1
92537,from dr usman ibrahim mr wahid yoff properti,from dr usman ibrahim danko audit and account ...,18.42,6.05,8,710,36,2,0,5.01,6.67,1,1
92538,my belov in christ,belov in the lord jesu christ pleas endeavour ...,23.53,9.01,4,512,13,0,0,2.52,2.17,1,1
