In [1]:
# 可供参考 https://www.kaggle.com/code/surekharamireddy/spam-detection-with-99-accuracy

In [2]:
import os
import pandas as pd
import numpy as np

# 根目录
root_dir = "lingspam_public/bare"

# 创建DataFrame with 3 cols
# df = pd.DataFrame(columns=["label", "subject", "main_body"])
df = []

# 遍历根目录下的10个文件夹
for i in range(1, 11):
    folder = os.path.join(root_dir, "part{}".format(i))
    file_list = os.listdir(folder)

    # 遍历每个文件
    for file_name in file_list:
        # 重复文件名的去掉 df.drop_duplicates(subset=['file_name'], keep='first', inplace=True)

        with open(os.path.join(folder, file_name), "r") as f:
            file_content = f.readlines()

            # 获取标签
            if "spm" in file_name:
                label = "spam"
            else:
                label = "non-spam"

            # 获取主题行和正文
            # 这里把Subject: 去掉了, 只留下subject本身
            if file_name == "Icon": continue
            if len(file_content) == 0:
                print(folder, file_name)
            else:
                subject = file_content[0].replace("Subject: ", "").strip()
            # 用strip()是为了防止存在换行符\n
            main_body = "".join([line.strip() for line in file_content[1:]])

            # 将信息添加到DataFrame中
            df.append(pd.DataFrame({"label": label, "subject": subject, "main_body": main_body}, index=[0]))

# 将所有数据连接在一起
df = pd.concat(df, ignore_index=True)

# ""实际上不是空值, 他的值就是""
# 这里把所有""替换成空值
df["label"].replace("", np.nan, inplace=True)
df["subject"].replace("", np.nan, inplace=True)
df["main_body"].replace("", np.nan, inplace=True)

In [3]:
df

Unnamed: 0,label,subject,main_body
0,non-spam,re : 2 . 882 s - > np np,"> date : sun , 15 dec 91 02 : 25 : 02 est > fr..."
1,non-spam,s - > np + np,the discussion of s - > np + np reminds me tha...
2,non-spam,2 . 882 s - > np np,. . . for me it 's much more restrictive than ...
3,non-spam,gent conference,""" for the listserv "" international conference ..."
4,non-spam,query : causatives in korean,could anyone point me to any books and article...
...,...,...,...
2888,spam,lucky you !,congratulations ! you ' ve been selected to en...
2889,spam,new on capitalfm . com,this is new at http : / / capitalfm . com - ex...
2890,spam,submit 600,this is not spam ; you are receiving this mess...
2891,spam,submit 600,this is not spam ; you are receiving this mess...


In [4]:
# df.to_csv("dataframe.csv", index=False)

# assignment #7 参考kaggle

In [5]:
# check null values
df.isnull().sum()

label         0
subject      62
main_body     0
dtype: int64

In [6]:
'''
Data Cleaning: Handling of Incomplete & Missing Data

miss label - drop
miss subject - keep
miss main body - drop
'''
# 如果主题行缺失，则用 Missing 替代
# df["subject"] = df["subject"].fillna("missing")
df["subject"].fillna("missing", inplace=True)

# 直接删掉
df.dropna(subset=["label"], inplace=True)
df.dropna(subset=["main_body"], inplace=True)

df

Unnamed: 0,label,subject,main_body
0,non-spam,re : 2 . 882 s - > np np,"> date : sun , 15 dec 91 02 : 25 : 02 est > fr..."
1,non-spam,s - > np + np,the discussion of s - > np + np reminds me tha...
2,non-spam,2 . 882 s - > np np,. . . for me it 's much more restrictive than ...
3,non-spam,gent conference,""" for the listserv "" international conference ..."
4,non-spam,query : causatives in korean,could anyone point me to any books and article...
...,...,...,...
2888,spam,lucky you !,congratulations ! you ' ve been selected to en...
2889,spam,new on capitalfm . com,this is new at http : / / capitalfm . com - ex...
2890,spam,submit 600,this is not spam ; you are receiving this mess...
2891,spam,submit 600,this is not spam ; you are receiving this mess...


In [7]:
'''
Data Cleaning: Handling of Noisy Data

noisy data -> meaningless data: all punctuations -> drop
noisy data -> redundant data: repetitive data -> drop
# REPLACING EMAIL IDs BY 'MAILID'
# REPLACING URLs  BY 'Links'
# REPLACING CURRENCY SIGNS BY 'MONEY'
# REPLACINg NUMBERS by 'numbers'
'''

# drop all punctuations
df['subject'] = df['subject'].str.replace(r'[^\w\s]', '', regex=True)
df['main_body'] = df['main_body'].str.replace(r'[^\w\s]', '', regex=True)

# drop repetitive data
df.drop_duplicates(subset=["subject"], inplace=True) # 可以删掉
df.drop_duplicates(subset=["main_body"], inplace=True)

# replace email by 'MailID'
df['subject']=df['subject'].str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','MailID', regex=True) # 可以删掉
df['main_body']=df['main_body'].str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','MailID', regex=True)

# replace links by 'Links'
df['subject']=df['subject'].str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','Links', regex=True) # 可以删掉
df['main_body']=df['main_body'].str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','Links', regex=True)

# replace currency by 'money'
df['subject']=df['subject'].str.replace(r'£|\$', 'money', regex=True)
df['main_body']=df['main_body'].str.replace(r'£|\$', 'money', regex=True)

# replace numbers by 'numbers'
df['subject']=df['subject'].str.replace(r'\d+(\.\d+)?', 'numbers')
df['main_body']=df['main_body'].str.replace(r'\d+(\.\d+)?', 'numbers')

df

  df['subject']=df['subject'].str.replace(r'\d+(\.\d+)?', 'numbers')
  df['main_body']=df['main_body'].str.replace(r'\d+(\.\d+)?', 'numbers')


Unnamed: 0,label,subject,main_body
0,non-spam,re numbers numbers s np np,date sun numbers dec numbers numbers numbe...
1,non-spam,s np np,the discussion of s np np reminds me that s...
2,non-spam,numbers numbers s np np,for me it s much more restrictive than s ...
3,non-spam,gent conference,for the listserv international conference nu...
4,non-spam,query causatives in korean,could anyone point me to any books and article...
...,...,...,...
2884,spam,angels sent to serve mankind,learn to put angels to work angels are anothe...
2885,spam,lucky you,congratulations you ve been selected to ente...
2886,spam,review any book pc or mac software pgm consu...,we are celebrating our numbersth issue of our ...
2887,spam,stock market information for you,sender trinity ventures inc address number...


In [8]:
'''
Data Cleaning: Handling of Inconsistent Data

lowercase / uppercase -> all lowercase
# REPLACING NEXT LINES BY 'WHITE SPACE'
# REPLACING LARGE WHITE SPACE BY SINGLE WHITE SPACE
# REPLACING LEADING AND TRAILING WHITE SPACE BY SINGLE WHITE SPACE
# REPLACING SPECIAL CHARACTERS  BY WHITE SPACE
'''

# convert to lowercase
df['subject']=df['subject'].str.lower()
df['main_body']=df['main_body'].str.lower()

# replace special characters by white space
df['subject']=df['subject'].str.replace(r"[^a-zA-Z0-9]+", " ", regex=True)
df['main_body']=df['main_body'].str.replace(r"[^a-zA-Z0-9]+", " ", regex=True)

# replace leading and trailing white space by single white space
df['subject']=df['subject'].str.replace(r'^\s+|\s+?$', ' ', regex=True)
df['main_body']=df['main_body'].str.replace(r'^\s+|\s+?$', ' ', regex=True)

# replace next line by white space
df['subject']=df['subject'].str.replace(r'\n'," ", regex=True)
df['main_body']=df['main_body'].str.replace(r'\n'," ", regex=True)

# 这个要放最后
# replace large white space by single white space
df['subject']=df['subject'].str.replace(r'\s+', ' ', regex=True)
df['main_body']=df['main_body'].str.replace(r'\s+', ' ', regex=True)

df

Unnamed: 0,label,subject,main_body
0,non-spam,re numbers numbers s np np,date sun numbers dec numbers numbers numbers ...
1,non-spam,s np np,the discussion of s np np reminds me that some...
2,non-spam,numbers numbers s np np,for me it s much more restrictive than s np n...
3,non-spam,gent conference,for the listserv international conference num...
4,non-spam,query causatives in korean,could anyone point me to any books and article...
...,...,...,...
2884,spam,angels sent to serve mankind,learn to put angels to work angels are another...
2885,spam,lucky you,congratulations you ve been selected to enter ...
2886,spam,review any book pc or mac software pgm consume...,we are celebrating our numbersth issue of our ...
2887,spam,stock market information for you,sender trinity ventures inc address numbers we...


In [9]:
# 在完成后续的操作以后可能原本不是空的main body会变成空
# 所以可能需要再次处理
df["subject"].replace(" ", np.nan, inplace=True)
df["main_body"].replace(" ", np.nan, inplace=True)

df["subject"].fillna("missing", inplace=True)
df.dropna(subset=["main_body"], inplace=True)

df.insert(len(df.columns)-1, 'length', df['main_body'].apply(len))

df

Unnamed: 0,label,subject,length,main_body
0,non-spam,re numbers numbers s np np,947,date sun numbers dec numbers numbers numbers ...
1,non-spam,s np np,394,the discussion of s np np reminds me that some...
2,non-spam,numbers numbers s np np,93,for me it s much more restrictive than s np n...
3,non-spam,gent conference,7118,for the listserv international conference num...
4,non-spam,query causatives in korean,176,could anyone point me to any books and article...
...,...,...,...,...
2884,spam,angels sent to serve mankind,1176,learn to put angels to work angels are another...
2885,spam,lucky you,541,congratulations you ve been selected to enter ...
2886,spam,review any book pc or mac software pgm consume...,5837,we are celebrating our numbersth issue of our ...
2887,spam,stock market information for you,3918,sender trinity ventures inc address numbers we...


In [10]:
# df.to_csv("cleaned_data.csv", index=False)

## Data Transformation

In [11]:
# import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [12]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# removing stopwords
stop = stopwords.words('english')
df['subject'] = df['subject'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df['main_body'] = df['main_body'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# Apply lemmatization
lemm = WordNetLemmatizer()
df['subject'] = df['subject'].apply(lambda x: ' '.join([lemm.lemmatize(word, pos="v") for word in x.split()]))
df['main_body'] = df['main_body'].apply(lambda x: ' '.join([lemm.lemmatize(word, pos="v") for word in x.split()]))

In [13]:
df["label"].replace("non-spam", 0, inplace=True)
df["label"].replace("spam", 1, inplace=True)

In [14]:
# To see the change in text length after removing stop words
df['length']=df['main_body'].apply(len)
df

Unnamed: 0,label,subject,length,main_body
0,0,number number np np,626,date sun number dec number number number numbe...
1,0,np np,295,discussion np np remind years ago read source ...
2,0,number number np np,51,much restrictive np np np pro quite overrestri...
3,0,gent conference,5491,listserv international conference number secon...
4,0,query causatives korean,137,could anyone point book article causative cons...
...,...,...,...,...
2884,1,angels send serve mankind,750,learn put angels work angels another race be o...
2885,1,lucky,352,congratulations select enter vacation adventur...
2886,1,review book pc mac software pgm consumer produ...,4237,celebrate numbersth issue inet review newslett...
2887,1,stock market information,2959,sender trinity venture inc address number west...


In [15]:
# 更正由于drop stop words导致的空值
df['subject']=df['subject'].str.replace(r'\s+', ' ', regex=True)
df['main_body']=df['main_body'].str.replace(r'\s+', ' ', regex=True)

df["subject"].replace(" ", np.nan, inplace=True)
df["main_body"].replace(" ", np.nan, inplace=True)

df["subject"].fillna("missing", inplace=True)
df.dropna(subset=["main_body"], inplace=True)

In [16]:
# df.to_csv("removedstop_cleaned_data.csv", index=False)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df['main_body'])
df1 = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names_out())
df1

Unnamed: 0,aa,aaa,aaai,aaainumbers,aaal,aaanumbers,aaarghh,aaas,aabb,aabyhoej,...,zwischen,zwitserlood,zxgahnumbersqabjh,zybatov,zybatow,zygmunt,zyokyoozyu,zytkow,zzlsa,zznumbers
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2593,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2594,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# from sklearn.decomposition import TruncatedSVD

# # n_components must be between 1 and min(vectors.shape)
# svd = TruncatedSVD(n_components=1750)
# svd.fit(vectors)
# print(svd.explained_variance_ratio_.sum())

In [19]:
# transformed_vectors = svd.transform(vectors)
# print(transformed_vectors)
# print("dimension =", transformed_vectors.shape)

In [20]:
# 降维后的
# df1 = pd.DataFrame(transformed_vectors)
# df1

In [21]:
'''# https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
from transformers import BertModel,BertTokenizer
import torch

bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text, model, tokenizer):

    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1]*len(indexed_tokens)

    # convert inputs to tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensor = torch.tensor([segments_ids])

    with torch.no_grad():
        # obtain hidden states
        outputs = model(tokens_tensor, segments_tensor)
        hidden_states = outputs[2]

    # `token_vecs` is a tensor with shape [22 x 768]
    token_vecs = hidden_states[-2][0]

    # Calculate the average of all 22 token vectors.
    sentence_embedding = torch.mean(token_vecs, dim=0)

    return sentence_embedding'''

'# https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/\nfrom transformers import BertModel,BertTokenizer\nimport torch\n\nbert_model = BertModel.from_pretrained(\'bert-base-uncased\', output_hidden_states = True)\nbert_tokenizer = BertTokenizer.from_pretrained(\'bert-base-uncased\')\n\ndef get_bert_embeddings(text, model, tokenizer):\n\n    marked_text = "[CLS] " + text + " [SEP]"\n    tokenized_text = tokenizer.tokenize(marked_text)\n    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n    segments_ids = [1]*len(indexed_tokens)\n\n    # convert inputs to tensors\n    tokens_tensor = torch.tensor([indexed_tokens])\n    segments_tensor = torch.tensor([segments_ids])\n\n    with torch.no_grad():\n        # obtain hidden states\n        outputs = model(tokens_tensor, segments_tensor)\n        hidden_states = outputs[2]\n\n    # `token_vecs` is a tensor with shape [22 x 768]\n    token_vecs = hidden_states[-2][0]\n\n    # Calculate the average of all 22 token

In [22]:
# 如果需要做word vectors的话
"""
def get_bert_word_vectors(text, model, tokenizer):

    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1]*len(indexed_tokens)

    # convert inputs to tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensor = torch.tensor([segments_ids])
    
    with torch.no_grad():
        # obtain hidden states
        outputs = model(tokens_tensor, segments_tensor)
        hidden_states = outputs[2]

    # concatenate the tensors for all layers
    # use "stack" to create new dimension in tensor
    token_embeddings = torch.stack(hidden_states, dim=0)

    # # remove dimension 1, the "batches"
    token_embeddings = torch.squeeze(token_embeddings, dim=1)

    # # swap dimensions 0 and 1 so we can loop over tokens
    token_embeddings = token_embeddings.permute(1,0,2)

    # # intialized list to store embeddings
    token_vecs_sum = []

    # # "token_embeddings" is a [Y x 12 x 768] tensor
    # # where Y is the number of tokens in the sentence

    # # loop over tokens in sentence
    for token in token_embeddings:

        # "token" is a [12 x 768] tensor

        # sum the vectors from the last four layers
        sum_vec = torch.sum(token[-4:], dim=0)
        token_vecs_sum.append(sum_vec)

    return token_vecs_sum
"""

'\ndef get_bert_word_vectors(text, model, tokenizer):\n\n    marked_text = "[CLS] " + text + " [SEP]"\n    tokenized_text = tokenizer.tokenize(marked_text)\n    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n    segments_ids = [1]*len(indexed_tokens)\n\n    # convert inputs to tensors\n    tokens_tensor = torch.tensor([indexed_tokens])\n    segments_tensor = torch.tensor([segments_ids])\n    \n    with torch.no_grad():\n        # obtain hidden states\n        outputs = model(tokens_tensor, segments_tensor)\n        hidden_states = outputs[2]\n\n    # concatenate the tensors for all layers\n    # use "stack" to create new dimension in tensor\n    token_embeddings = torch.stack(hidden_states, dim=0)\n\n    # # remove dimension 1, the "batches"\n    token_embeddings = torch.squeeze(token_embeddings, dim=1)\n\n    # # swap dimensions 0 and 1 so we can loop over tokens\n    token_embeddings = token_embeddings.permute(1,0,2)\n\n    # # intialized list to store embeddings\n

In [23]:
'''# Bert Embedding 前510字 （能运行）
df['truncated'] = df['main_body'].apply(lambda x: x[:510] if len(x)>510 else x)
df['bert_embedding'] = df['truncated'].apply(lambda x: get_bert_embeddings(x, bert_model, bert_tokenizer))'''

"# Bert Embedding 前510字 （能运行）\ndf['truncated'] = df['main_body'].apply(lambda x: x[:510] if len(x)>510 else x)\ndf['bert_embedding'] = df['truncated'].apply(lambda x: get_bert_embeddings(x, bert_model, bert_tokenizer))"

In [24]:
# 网上找的几种方法 但都经常报错（运行整个dataframe时）

# 1 https://huggingface.co/facebook/bart-large-cnn
"""
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
text = df['main_body'][3]
summary = summarizer(text, max_length=510, min_length=50, do_sample=False)[0]['summary_text']
print(summary)
print(len(summary))

df['summary'] = df['main_body'].apply(lambda x: summarizer(x, max_length=510, min_length=50, do_sample=False)[0]['summary_text'] if len(x)>510 else x)
"""

# 2 https://github.com/christianversloot/machine-learning-articles/blob/main/transformers-for-long-text-code-examples-with-longformer.md
"""
from transformers import LongformerTokenizer, EncoderDecoderModel

# Load model and tokenizer
long_model = EncoderDecoderModel.from_pretrained("patrickvonplaten/longformer2roberta-cnn_dailymail-fp16")
long_tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096") 

def get_summary(text, model, tokenizer):

    # Tokenize and summarize
    input_ids = tokenizer(text, return_tensors="pt").input_ids
    output_ids = model.generate(input_ids)

    # Get the summary from the output tokens
    summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    return summary

df['summary'] = df['main_body'].apply(lambda x: get_summary(x, long_model, long_tokenizer) if len(x)>510 else x)   
"""

# https://towardsdatascience.com/text-summarization-with-nlp-textrank-vs-seq2seq-vs-bart-474943efeb09

'\nfrom transformers import LongformerTokenizer, EncoderDecoderModel\n\n# Load model and tokenizer\nlong_model = EncoderDecoderModel.from_pretrained("patrickvonplaten/longformer2roberta-cnn_dailymail-fp16")\nlong_tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096") \n\ndef get_summary(text, model, tokenizer):\n\n    # Tokenize and summarize\n    input_ids = tokenizer(text, return_tensors="pt").input_ids\n    output_ids = model.generate(input_ids)\n\n    # Get the summary from the output tokens\n    summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)\n    \n    return summary\n\ndf[\'summary\'] = df[\'main_body\'].apply(lambda x: get_summary(x, long_model, long_tokenizer) if len(x)>510 else x)   \n'

In [25]:
# 试了word2vec 但df3里面的vector维度都不一样
# 也试了下doc2vec 也不行
# 有兴趣可以看下 https://towardsdatascience.com/how-to-vectorize-text-in-dataframes-for-nlp-tasks-3-simple-techniques-82925a5600db
"""
import gensim
from nltk.tokenize import word_tokenize

# Tokenize the text in the column
df['tokenized_text'] = df['main_body'].apply(lambda x: word_tokenize(x))

model = gensim.models.Word2Vec(df['tokenized_text'], vector_size=5000, window=5, min_count=1, workers=4)

df['embedding_vectors'] = df['tokenized_text'].apply(lambda x: [model.wv[word] for word in x])

df3 = pd.DataFrame(df['embedding_vectors'].tolist())
df3
"""

"\nimport gensim\nfrom nltk.tokenize import word_tokenize\n\n# Tokenize the text in the column\ndf['tokenized_text'] = df['main_body'].apply(lambda x: word_tokenize(x))\n\nmodel = gensim.models.Word2Vec(df['tokenized_text'], vector_size=5000, window=5, min_count=1, workers=4)\n\ndf['embedding_vectors'] = df['tokenized_text'].apply(lambda x: [model.wv[word] for word in x])\n\ndf3 = pd.DataFrame(df['embedding_vectors'].tolist())\ndf3\n"

Bag of Words 

In [26]:
# Count how many times a word appears in the dataset

from collections import Counter

total_counts = Counter()
for i in range(len(df['main_body'])):
    for word in df['main_body'].values[i].split(" "):
        total_counts[word] += 1

print("Total words in transformed df: ", len(total_counts))

Total words in transformed df:  51284


In [27]:
# Sort in decreasing order (Word with highest frequency appears first)
vocab = sorted(total_counts, key=total_counts.get, reverse=True)
print('Top 30 words: ', '\n', vocab[:31])

Top 30 words:  
 ['number', 'university', 'language', 'paper', 'email', 'information', 'linguistics', 'address', 'use', 'de', 'one', 'conference', 'send', 'e', 'order', 'please', 'languages', 'make', 'work', 'english', 'include', 'mail', 'http', 'program', 'also', 'edu', 'would', 'new', 'name', 'may', 'fax']


In [28]:
# Map words to index
vocab_size = len(vocab)
word2idx = {}

# print vocab_size
for i, word in enumerate(vocab):
    word2idx[word] = i

In [29]:
# Define a function to convert text to vectors
def text_to_vector(text, vocab):
    vector = np.zeros(len(vocab), dtype=np.int_)
    for word in text.split():
        if word in vocab:
            index = vocab.index(word)
            vector[index] += 1
    return vector

# Convert all text to vectors
word_vectors = np.zeros((len(df['main_body']), len(vocab)), dtype=np.int_)

for i, text in enumerate(df['main_body']):
    word_vectors[i] = text_to_vector(text, vocab)
    
word_vectors.shape

(2596, 51284)

In [30]:
# convert transformed vectors to dataframe to visualize
df2 = pd.DataFrame(word_vectors, columns=vocab)
df2

Unnamed: 0,number,university,language,paper,email,information,linguistics,address,use,de,...,lifet,erform,promoti,onal,tow,urchase,crespo,tvsrnumbers,promotio,uarantee
0,7,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,89,6,4,2,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2591,6,0,0,1,3,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2592,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2593,22,0,0,0,10,3,0,3,4,0,...,0,0,0,0,0,0,0,0,0,0
2594,26,0,0,0,3,3,0,4,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# apply SVD
# from sklearn.decomposition import TruncatedSVD

# # n_components must be between 1 and min(vectors.shape)
# svd = TruncatedSVD(n_components=500)
# svd.fit(word_vectors)
# print(svd.explained_variance_ratio_.sum()) #95%

In [32]:
# bow_transformed_vectors = svd.transform(word_vectors)
# print(bow_transformed_vectors)
# print("dimension =", bow_transformed_vectors.shape)

In [33]:
# 降维后的
# df2 = pd.DataFrame(transformed_vectors)
# df2

# Model Development

In [34]:
# IDIDF - NB
# Naive Bayes does not work with SVD or other matrix factorization

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df['main_body'])
df1 = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names_out())
df1

Unnamed: 0,aa,aaa,aaai,aaainumbers,aaal,aaanumbers,aaarghh,aaas,aabb,aabyhoej,...,zwischen,zwitserlood,zxgahnumbersqabjh,zybatov,zybatow,zygmunt,zyokyoozyu,zytkow,zzlsa,zznumbers
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2593,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2594,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
from sklearn.model_selection import train_test_split

train_val_X_tfidf, test_X_tfidf, train_val_y_tfidf, test_y_tfidf = train_test_split(df1, df['label'], test_size=0.2, random_state=42)
train_X_tfidf, val_X_tfidf, train_y_tfidf, val_y_tfidf = train_test_split(train_val_X_tfidf, train_val_y_tfidf, test_size=0.25, random_state=42)

In [37]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score

In [38]:
mnb_1 = MultinomialNB(alpha=1.0) # the default value of alpha
mnb_1.fit(train_X_tfidf,train_y_tfidf)

# Evaluate the performance of the model on the validation set
y_pred = mnb_1.predict(val_X_tfidf)
print("Confusion Matrix:\n", confusion_matrix(y_pred,val_y_tfidf))
print("Accuracy: ", accuracy_score(y_pred,val_y_tfidf))
print("F1 score: ", f1_score(y_pred,val_y_tfidf, average = 'weighted'))

Confusion Matrix:
 [[442  74]
 [  0   3]]
Accuracy:  0.8574181117533719
F1 score:  0.9178554189243003


In [39]:
mnb_05 = MultinomialNB(alpha=0.5)
mnb_05.fit(train_X_tfidf,train_y_tfidf)

# Evaluate the performance of the model on the validation set
y_pred = mnb_05.predict(val_X_tfidf)
print("Confusion Matrix:\n", confusion_matrix(y_pred,val_y_tfidf))
print("Accuracy: ", accuracy_score(y_pred,val_y_tfidf))
print("F1 score: ", f1_score(y_pred,val_y_tfidf, average = 'weighted'))

Confusion Matrix:
 [[442  67]
 [  0  10]]
Accuracy:  0.8709055876685935
F1 score:  0.9160668658212969


In [40]:
mnb_025 = MultinomialNB(alpha=0.25)
mnb_025.fit(train_X_tfidf,train_y_tfidf)

# Evaluate the performance of the model on the validation set
y_pred = mnb_025.predict(val_X_tfidf)
print("Confusion Matrix:\n", confusion_matrix(y_pred,val_y_tfidf))
print("Accuracy: ", accuracy_score(y_pred,val_y_tfidf))
print("F1 score: ", f1_score(y_pred,val_y_tfidf, average = 'weighted'))

Confusion Matrix:
 [[442  33]
 [  0  44]]
Accuracy:  0.9364161849710982
F1 score:  0.9439426125654781


In [41]:
mnb_01 = MultinomialNB(alpha=0.1)
mnb_01.fit(train_X_tfidf,train_y_tfidf)

# Evaluate the performance of the model on the validation set
y_pred = mnb_01.predict(val_X_tfidf)
print("Confusion Matrix:\n", confusion_matrix(y_pred,val_y_tfidf))
print("Accuracy: ", accuracy_score(y_pred,val_y_tfidf))
print("F1 score: ", f1_score(y_pred,val_y_tfidf, average = 'weighted'))

Confusion Matrix:
 [[442   8]
 [  0  69]]
Accuracy:  0.9845857418111753
F1 score:  0.9849389280206742


In [42]:
mnb_001 = MultinomialNB(alpha=0.01)
mnb_001.fit(train_X_tfidf,train_y_tfidf)

# Evaluate the performance of the model on the validation set
y_pred = mnb_001.predict(val_X_tfidf)
print("Confusion Matrix:\n", confusion_matrix(y_pred,val_y_tfidf))
print("Accuracy: ", accuracy_score(y_pred,val_y_tfidf))
print("F1 score: ", f1_score(y_pred,val_y_tfidf, average = 'weighted'))

Confusion Matrix:
 [[441   1]
 [  1  76]]
Accuracy:  0.9961464354527938
F1 score:  0.9961464354527938


We see that the optimal alpha for tfidf vectors is 0.01. Now we can evaluate the final model with the alpha = 0.01 on the test set to obtain an unbiased estimate of its performance.

In [43]:
# Evaluate the performance of the model on the test set
y_pred = mnb_001.predict(test_X_tfidf)
print("Confusion Matrix:\n", confusion_matrix(y_pred,test_y_tfidf))
print("Accuracy: ", accuracy_score(y_pred,test_y_tfidf))
print("F1 score: ", f1_score(y_pred,test_y_tfidf, average = 'weighted'))

Confusion Matrix:
 [[422   1]
 [  7  90]]
Accuracy:  0.9846153846153847
F1 score:  0.9844240566146468


In [44]:
# BoW - NB

In [45]:
df2 = pd.DataFrame(word_vectors, columns=vocab)
df2

Unnamed: 0,number,university,language,paper,email,information,linguistics,address,use,de,...,lifet,erform,promoti,onal,tow,urchase,crespo,tvsrnumbers,promotio,uarantee
0,7,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,89,6,4,2,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2591,6,0,0,1,3,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2592,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2593,22,0,0,0,10,3,0,3,4,0,...,0,0,0,0,0,0,0,0,0,0
2594,26,0,0,0,3,3,0,4,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
train_val_X_bag, test_X_bag, train_val_y_bag, test_y_bag = train_test_split(df2, df['label'], test_size=0.2, random_state=42)
train_X_bag, val_X_bag, train_y_bag, val_y_bag = train_test_split(train_val_X_bag, train_val_y_bag, test_size=0.25, random_state=42)

In [47]:
mnb_1 = MultinomialNB(alpha=1.0) # the default value of alpha
mnb_1.fit(train_X_bag,train_y_bag)

# Evaluate the performance of the model on the validation set
y_pred = mnb_1.predict(val_X_bag)
print("Confusion Matrix:\n", confusion_matrix(y_pred,val_y_bag))
print("Accuracy: ", accuracy_score(y_pred,val_y_bag))
print("F1 score: ", f1_score(y_pred,val_y_bag, average = 'weighted'))

Confusion Matrix:
 [[439   2]
 [  3  75]]
Accuracy:  0.9903660886319846
F1 score:  0.9903404667144894


In [48]:
mnb_05 = MultinomialNB(alpha=0.5) 
mnb_05.fit(train_X_bag,train_y_bag)

# Evaluate the performance of the model on the validation set
y_pred = mnb_05.predict(val_X_bag)
print("Confusion Matrix:\n", confusion_matrix(y_pred,val_y_bag))
print("Accuracy: ", accuracy_score(y_pred,val_y_bag))
print("F1 score: ", f1_score(y_pred,val_y_bag, average = 'weighted'))

Confusion Matrix:
 [[437   2]
 [  5  75]]
Accuracy:  0.9865125240847784
F1 score:  0.9864066267703201


In [49]:
mnb_075 = MultinomialNB(alpha=0.75) 
mnb_075.fit(train_X_bag,train_y_bag)

# Evaluate the performance of the model on the validation set
y_pred = mnb_075.predict(val_X_bag)
print("Confusion Matrix:\n", confusion_matrix(y_pred,val_y_bag))
print("Accuracy: ", accuracy_score(y_pred,val_y_bag))
print("F1 score: ", f1_score(y_pred,val_y_bag, average = 'weighted'))

Confusion Matrix:
 [[438   2]
 [  4  75]]
Accuracy:  0.9884393063583815
F1 score:  0.9883783067102637


In [50]:
mnb_01 = MultinomialNB(alpha=0.1) 
mnb_01.fit(train_X_bag,train_y_bag)

# Evaluate the performance of the model on the validation set
y_pred = mnb_01.predict(val_X_bag)
print("Confusion Matrix:\n", confusion_matrix(y_pred,val_y_bag))
print("Accuracy: ", accuracy_score(y_pred,val_y_bag))
print("F1 score: ", f1_score(y_pred,val_y_bag, average = 'weighted'))

Confusion Matrix:
 [[438   2]
 [  4  75]]
Accuracy:  0.9884393063583815
F1 score:  0.9883783067102637


We see that the optimal alpha for bag of words vectors is 1.0. Now we can evaluate the final model with the alpha = 1.0 on the test set to obtain an unbiased estimate of its performance.

In [51]:
# Evaluate the performance of the model on the test set
y_pred = mnb_1.predict(test_X_bag)
print("Confusion Matrix:\n", confusion_matrix(y_pred,test_y_bag))
print("Accuracy: ", accuracy_score(y_pred,test_y_bag))
print("F1 score: ", f1_score(y_pred,test_y_bag, average = 'weighted'))

Confusion Matrix:
 [[427   2]
 [  2  89]]
Accuracy:  0.9923076923076923
F1 score:  0.9923076923076923


In [52]:
from sklearn.naive_bayes import GaussianNB
gauss = GaussianNB()

In [53]:
gauss.fit(train_X_tfidf,train_y_tfidf)

# Evaluate the performance of the model on the validation set
y_pred = gauss.predict(val_X_tfidf)
print("Confusion Matrix:\n", confusion_matrix(y_pred,val_y_tfidf))
print("Accuracy: ", accuracy_score(y_pred,val_y_tfidf))
print("F1 score: ", f1_score(y_pred,val_y_tfidf, average = 'weighted'))

Confusion Matrix:
 [[441  26]
 [  1  51]]
Accuracy:  0.9479768786127167
F1 score:  0.9523024892891224


In [54]:
from sklearn.model_selection import RepeatedStratifiedKFold

cv_method = RepeatedStratifiedKFold(n_splits=5, 
                                    n_repeats=3, 
                                    random_state=999)

In [55]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PowerTransformer
params_NB = {'var_smoothing': np.logspace(0,-9, num=10)}

gs_NB = GridSearchCV(estimator=gauss, 
                     param_grid=params_NB, 
                     cv=cv_method,
                     verbose=1, 
                     scoring='f1')

Data_transformed = PowerTransformer().fit_transform(val_X_tfidf)

gs_NB.fit(Data_transformed, val_y_tfidf)

  loglike = -n_samples / 2 * np.log(x_trans.var())


Fitting 15 folds for each of 10 candidates, totalling 150 fits


GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=999),
             estimator=GaussianNB(),
             param_grid={'var_smoothing': array([1.e+00, 1.e-01, 1.e-02, 1.e-03, 1.e-04, 1.e-05, 1.e-06, 1.e-07,
       1.e-08, 1.e-09])},
             scoring='f1', verbose=1)

In [56]:
gs_NB.best_params_

{'var_smoothing': 0.001}

In [57]:
gauss_0001 = GaussianNB(var_smoothing = 0.001)

gauss_0001.fit(train_X_tfidf,train_y_tfidf)

# Evaluate the performance of the model on the validation set
y_pred = gauss_0001.predict(val_X_tfidf)
print("Confusion Matrix:\n", confusion_matrix(y_pred,val_y_tfidf))
print("Accuracy: ", accuracy_score(y_pred,val_y_tfidf))
print("F1 score: ", f1_score(y_pred,val_y_tfidf, average = 'weighted'))

Confusion Matrix:
 [[367   0]
 [ 75  77]]
Accuracy:  0.8554913294797688
F1 score:  0.8385257318428402


It turned out that the default var_smoothing for gaussian naive bayes is the best. The "best" var_smoothing value found by grid search yielded worse results than the default value.

In [58]:
# Evaluate the performance of the model on the test set
y_pred = gauss.predict(test_X_tfidf)
print("Confusion Matrix:\n", confusion_matrix(y_pred,test_y_tfidf))
print("Accuracy: ", accuracy_score(y_pred,test_y_tfidf))
print("F1 score: ", f1_score(y_pred,test_y_tfidf, average = 'weighted'))

Confusion Matrix:
 [[428  30]
 [  1  61]]
Accuracy:  0.9403846153846154
F1 score:  0.9450598925431474


Now do the same things with bag of words

In [59]:
gauss.fit(train_X_bag,train_y_bag)

# Evaluate the performance of the model on the validation set
y_pred = gauss.predict(val_X_bag)
print("Confusion Matrix:\n", confusion_matrix(y_pred,val_y_bag))
print("Accuracy: ", accuracy_score(y_pred,val_y_bag))
print("F1 score: ", f1_score(y_pred,val_y_bag, average = 'weighted'))

Confusion Matrix:
 [[442  27]
 [  0  50]]
Accuracy:  0.9479768786127167
F1 score:  0.95273597104176


In [60]:
gauss_0001 = GaussianNB(var_smoothing = 0.001) # Supposedly the best performing var_smoothing

gauss_0001.fit(train_X_bag,train_y_bag)

# Evaluate the performance of the model on the validation set
y_pred = gauss_0001.predict(val_X_bag)
print("Confusion Matrix:\n", confusion_matrix(y_pred,val_y_bag))
print("Accuracy: ", accuracy_score(y_pred,val_y_bag))
print("F1 score: ", f1_score(y_pred,val_y_bag, average = 'weighted'))

Confusion Matrix:
 [[210   1]
 [232  76]]
Accuracy:  0.5510597302504817
F1 score:  0.4957843892277233


In [61]:
# Evaluate the performance of the model on the test set
y_pred = gauss.predict(test_X_bag)
print("Confusion Matrix:\n", confusion_matrix(y_pred,test_y_bag))
print("Accuracy: ", accuracy_score(y_pred,test_y_bag))
print("F1 score: ", f1_score(y_pred,test_y_bag, average = 'weighted'))

Confusion Matrix:
 [[427  23]
 [  2  68]]
Accuracy:  0.9519230769230769
F1 score:  0.9544842307121577


Schneider [25] found that the multinomial nb surprisingly performs even better when term frequencies are replaced by Boolean attributes.

In [62]:
from sklearn.preprocessing import Binarizer
binarizer = Binarizer()
df1_bool = binarizer.transform(df1.values)

In [63]:
train_val_X_tfidf, boo_test_X_tfidf, train_val_y_tfidf, boo_test_y_tfidf = train_test_split(df1_bool, df['label'], test_size=0.2, random_state=42)
boo_train_X_tfidf, boo_val_X_tfidf, boo_train_y_tfidf, boo_val_y_tfidf = train_test_split(train_val_X_tfidf, train_val_y_tfidf, test_size=0.25, random_state=42)

In [71]:
from sklearn.naive_bayes import BernoulliNB
berNB = BernoulliNB(alpha = 1.0) #default alpha
berNB.fit(boo_train_X_tfidf,boo_train_y_tfidf)

# Evaluate the performance of the model on the validation set
y_pred = berNB.predict(boo_val_X_tfidf)
print("Confusion Matrix:\n", confusion_matrix(y_pred,boo_val_y_tfidf))
print("Accuracy: ", accuracy_score(y_pred,boo_val_y_tfidf))
print("F1 score: ", f1_score(y_pred,boo_val_y_tfidf, average = 'weighted'))

Confusion Matrix:
 [[438  45]
 [  4  32]]
Accuracy:  0.905587668593449
F1 score:  0.920623188025101


In [72]:
berNB_001 = BernoulliNB(alpha = 0.01) #optimal
berNB_001.fit(boo_train_X_tfidf,boo_train_y_tfidf)

# Evaluate the performance of the model on the validation set
y_pred = berNB_001.predict(boo_val_X_tfidf)
print("Confusion Matrix:\n", confusion_matrix(y_pred,boo_val_y_tfidf))
print("Accuracy: ", accuracy_score(y_pred,boo_val_y_tfidf))
print("F1 score: ", f1_score(y_pred,boo_val_y_tfidf, average = 'weighted'))

Confusion Matrix:
 [[435   1]
 [  7  76]]
Accuracy:  0.9845857418111753
F1 score:  0.9843493927782971


In [73]:
# Evaluate the performance of the model on the test set
y_pred = berNB_001.predict(boo_test_X_tfidf)
print("Confusion Matrix:\n", confusion_matrix(y_pred,boo_test_y_tfidf))
print("Accuracy: ", accuracy_score(y_pred,boo_test_y_tfidf))
print("F1 score: ", f1_score(y_pred,boo_test_y_tfidf, average = 'weighted'))

Confusion Matrix:
 [[416   1]
 [ 13  90]]
Accuracy:  0.9730769230769231
F1 score:  0.9724351944027311


In [74]:
df2_bool = binarizer.transform(df2.values)
train_val_X_bag, boo_test_X_bag, train_val_y_bag, boo_test_y_bag = train_test_split(df2_bool, df['label'], test_size=0.2, random_state=42)
boo_train_X_bag, boo_val_X_bag, boo_train_y_bag, boo_val_y_bag = train_test_split(train_val_X_bag, train_val_y_bag, test_size=0.25, random_state=42)

In [78]:
berNB_001 = BernoulliNB(alpha = 0.01) #optimal
berNB_001.fit(boo_train_X_bag,boo_train_y_bag)

# Evaluate the performance of the model on the validation set
y_pred = berNB_001.predict(boo_val_X_bag)
print("Confusion Matrix:\n", confusion_matrix(y_pred,boo_val_y_bag))
print("Accuracy: ", accuracy_score(y_pred,boo_val_y_bag))
print("F1 score: ", f1_score(y_pred,boo_val_y_bag, average = 'weighted'))

Confusion Matrix:
 [[435   1]
 [  7  76]]
Accuracy:  0.9845857418111753
F1 score:  0.9843493927782971


In [79]:
# Evaluate the performance of the model on the test set
y_pred = berNB_001.predict(boo_test_X_bag)
print("Confusion Matrix:\n", confusion_matrix(y_pred,boo_test_y_bag))
print("Accuracy: ", accuracy_score(y_pred,boo_test_y_bag))
print("F1 score: ", f1_score(y_pred,boo_test_y_bag, average = 'weighted'))

Confusion Matrix:
 [[417   0]
 [ 12  91]]
Accuracy:  0.9769230769230769
F1 score:  0.9763730237737694
