# Term-Frequency Inverse Document Frequency

# Import packages and data

In [1]:
# Pre-processing
import numpy as np
import pandas as pd
import re
from collections import Counter

# Vectorizers and model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
# Data from https://www.kaggle.com/datasets/jackksoncsie/spam-email-dataset?resource=download
data = pd.read_csv('../data/external/emails.csv')
print(data.info())
print('\n',data.spam.value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB
None

 spam
0    4360
1    1368
Name: count, dtype: int64


# Pre-process text

In [3]:
# Get sample
sample = data.iloc[0,0]

# Instantiate stops words
stop_words = ['a', 'an', 'and', 'are', 'be', 'but', 'from', 'if', 'in', 'is', 'it', 'of', 'on', 'our', 'that', 'the', 'to', 'we', 'will', 'you', 'your']

# Clean in order
def clean_text(text, print_result=False, stop_words=stop_words):
    # Instantiate patterns for cleaning
    patt_remove_subj = r'(?i)subject:\s?' # Drop opening "subject:' mention
    patt_remove_non_alpha = r'[^a-zA-Z\s]' # Remove anything that is not a word or space character
    patt_remove_new_lines_and_multiple_spaces = r'[^\w\s]' # Remove anything that is not a word or space character
    
    # Apply cleaning steps
    for patt in [patt_remove_subj, patt_remove_non_alpha, patt_remove_new_lines_and_multiple_spaces]:
        text = re.sub(patt,'',text)
    
    # Drop stopwords
    text = ' '.join(word for word in text.split() if word not in stop_words)

    # See clean sample
    if print_result:
        print(text)
        return text
    else:
        return text

# See example
clean_sample = clean_text(sample, print_result=True)

naturally irresistible corporate identity lt really hard recollect company market full suqgestions information isoverwhelminq good catchy logo stylish statlonery outstanding website make task much easier do not promise havinq ordered iogo company automaticaily become world ieader isguite ciear without good products effective business organization practicable aim hotat nowadays market do promise marketing efforts become much more effective here list clear benefits creativeness hand made original logos specially done reflect distinctive company image convenience logo stationery provided all formats easy use content management system letsyou change website content even its structure promptness see logo drafts within three business days affordability marketing break through shouldn t make gaps budget satisfaction guaranteed provide unlimited amount changes with no extra fees for surethat love result this collaboration have look at portfolio not interested


In [4]:
# Implement cleaning to all text
data['clean_text'] = data['text'].apply(clean_text)
data.head()
    

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,naturally irresistible corporate identity lt r...
1,Subject: the stock trading gunslinger fanny i...,1,stock trading gunslinger fanny merrill muzo no...
2,Subject: unbelievable new homes made easy im ...,1,unbelievable new homes made easy im wanting sh...
3,Subject: 4 color printing special request add...,1,color printing special request additional info...
4,"Subject: do not have money , get software cds ...",1,do not have money get software cds here softwa...


# Calculate Inverse Document Frequency (IDF)

In [5]:
# Get list of documents
clean_text_list = data.clean_text.to_list()
documents_list = [doc.split() for doc in clean_text_list]

# Get all text as a single string
documents_str = ' '.join(doc for doc in clean_text_list)

# Get unique words and see count
unique_term_list = list(set(documents_str.split()))

print(f'Length of unique terms: {len(unique_term_list):,}')
print(len(documents_list))

Length of unique terms: 33,720
5728


In [6]:
def get_idf(term):

    # Calcuate inverse document frequency
    num_docs_containing_word = sum(1 for doc in documents_list if term in doc)
    N = len(documents_list)
    idf = np.log(N / (1 + num_docs_containing_word)) 
    return idf

# Get dataframe
idf_df = pd.DataFrame(data=unique_term_list, columns=['term'])

# Create IDF values
idf_df['idf'] = idf_df['term'].apply(get_idf)

# Create vocab ID column and set term as index
idf_df.reset_index(drop=False, names = 'vocab_id', inplace=True)
idf_df.set_index('term',inplace=True)

In [7]:
idf_df.head()

Unnamed: 0_level_0,vocab_id,idf
term,Unnamed: 1_level_1,Unnamed: 2_level_1
principie,0,7.959975
halluin,1,7.266827
estranged,2,7.554509
fritch,3,7.959975
southtrust,4,7.959975


# Calculate Term Frequency

In [8]:
def get_term_frequency(document):
    
    # Get length of row
    term_list = document.split()
    num_of_terms = len(term_list)
    
    # Get term frequency for each row
    counter_dct = Counter(term_list)

    # Convert counter object to dictionary, then dataframe
    tf_dct = {key:(counter_dct[key]/num_of_terms) for key in counter_dct if counter_dct[key] > 0}

    return tf_dct

data['tf'] = data['clean_text'].apply(get_term_frequency)

# Calculate TF-IDF

In [9]:
def get_tf_idf(tf_dict):

    """
    For each document in dataframe, calculate the term-frequency (TF).
    Next, search the IDF dataframe for the corresponding IDF value and vocab ID for each term.
    Finally, take the product of the TF and IDF and add to TF-IDF dictionary with vocab ID as key.
    """

    tf_idf_dct = {}

    # Loop through dictionary representing each document (row)
    for term in tf_dict:

        # Get term frequency from dictionary
        tf = tf_dict[term]

        # Get the corresponding idf value for the given term
        idf = idf_df.loc[term,'idf']

        # Get the corresponding idf value for the given term
        vocab_id = idf_df.loc[term,'vocab_id']
        
        # Multiply tf by idf for tf-idf
        tf_idf = tf * idf

        # Create dictionary item using vocab_id and tf-idf
        tf_idf_dct[vocab_id] = tf_idf
    
    return tf_idf_dct

In [10]:
# Create tf_idf columns
data['tf_idf'] = data['tf'].apply(get_tf_idf)

# Narrow dataframe to key columns
df = data[['clean_text','tf_idf','spam']].copy()

# Rename label column
df.rename(columns={'spam':'label'},inplace=True)

In [11]:
# See results
df.head()

Unnamed: 0,clean_text,tf_idf,label
0,naturally irresistible corporate identity lt r...,"{23737: 0.04558552702636202, 29850: 0.05460219...",1
1,stock trading gunslinger fanny merrill muzo no...,"{23317: 0.05089119735585338, 11618: 0.03300236...",1
2,unbelievable new homes made easy im wanting sh...,"{7489: 0.13415673810123765, 4050: 0.0331978551...",1
3,color printing special request additional info...,"{10839: 0.09809136673852725, 5630: 0.196182733...",1
4,do not have money get software cds here softwa...,"{12109: 0.04026561719433407, 9254: 0.031192238...",1


# TF-IDF from Scratch and Logistic Regression

In [12]:
# Convert tf_idf column to list of dictionaries
X = df['tf_idf'].values.tolist()  
y = df['label'].values

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert TF-IDF dictionaries into lists of values
vectorizer = DictVectorizer(sparse=True)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Initialize and train logistic regression model
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Predict labels for test data
y_pred = model.predict(X_test_vec)

# Evaluate model performance
accuracy = model.score(X_test_vec, y_test)
print("Accuracy:", accuracy)
print('Classification Report:\n', classification_report(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

Accuracy: 0.8726003490401396
Classification Report:
               precision    recall  f1-score   support

           0       0.85      1.00      0.92       856
           1       1.00      0.50      0.66       290

    accuracy                           0.87      1146
   macro avg       0.93      0.75      0.79      1146
weighted avg       0.89      0.87      0.86      1146

Confusion Matrix:
 [[856   0]
 [146 144]]


# TF-IDF and Logistic Regression with Sci-Kit Learn

In [13]:
# Convert tf_idf column to list of dictionaries
X1 = df['clean_text'].values.tolist()  
y1 = df['label'].values

# Split data into training and testing sets
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)

# Instantiate vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform
X_train_vec_sklearn = vectorizer.fit_transform(X_train1)
X_test_vec_sklearn = vectorizer.transform(X_test1)

# Initialize and train logistic regression model
model_sklearn = LogisticRegression()
model_sklearn.fit(X_train_vec_sklearn, y_train1)

# Predict labels for test data
y_pred1 = model_sklearn.predict(X_test_vec_sklearn)

# Evaluate model performance
accuracy1 = model_sklearn.score(X_test_vec_sklearn, y_test1)
print("Accuracy:", accuracy1)
print('Classification Report:\n', classification_report(y_test1, y_pred1))
print('Confusion Matrix:\n', confusion_matrix(y_test1, y_pred1))

Accuracy: 0.9773123909249564
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       856
           1       1.00      0.91      0.95       290

    accuracy                           0.98      1146
   macro avg       0.98      0.96      0.97      1146
weighted avg       0.98      0.98      0.98      1146

Confusion Matrix:
 [[855   1]
 [ 25 265]]


# Fin