In [1]:
# packages about dataframe
import os
import pandas as pd
import numpy as np

# packages about stopwords etc.
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# packages about word vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# packages about dimensionality reduction
from sklearn.decomposition import TruncatedSVD

# packages about model training and evaluation
from fast_ml.model_development import train_valid_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# packages about hyperparameter tuning
from sklearn.model_selection import GridSearchCV

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/emmazhang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/emmazhang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## 1 Load the data

In [2]:
# root directory
root_dir = "lingspam_public/bare"

# create DataFrame with 3 cols
# df = pd.DataFrame(columns=["label", "subject", "main_body"])
df = []

# 遍历根目录下的10个文件夹
for i in range(1, 11):
    folder = os.path.join(root_dir, "part{}".format(i))
    file_list = os.listdir(folder)

    # iterate through all files
    for file_name in file_list:
        # remove duplicate files: df.drop_duplicates(subset=['file_name'], keep='first', inplace=True)

        with open(os.path.join(folder, file_name), "r") as f:
            file_content = f.readlines()

            # get label from file name
            if "spm" in file_name:
                label = "spam"
            else:
                label = "non-spam"

            # get mainbody and subject
            if file_name == "Icon": continue
            if len(file_content) == 0:
                print(folder, file_name)
            else:
                subject = file_content[0].replace("Subject: ", "").strip()
            
            main_body = "".join([line.strip() for line in file_content[1:]])

            # add data in dataframe
            df.append(pd.DataFrame({"label": label, "subject": subject, "main_body": main_body}, index=[0]))

# concat all columns
df = pd.concat(df, ignore_index=True)

# replace all "" with NA for further handling
df["label"].replace("", np.nan, inplace=True)
df["subject"].replace("", np.nan, inplace=True)
df["main_body"].replace("", np.nan, inplace=True)

df

Unnamed: 0,label,subject,main_body
0,non-spam,conference announcement,* * * * * * * * * * * * * * * * * * * * * firs...
1,spam,""" life without debt """,pardon the intrusion . no offence is meant . i...
2,spam,do want the best and economical hunting vacati...,if you want the best hunting and camping vacat...
3,non-spam,query : uninflected tags,does anybody know of recent work on uninflecte...
4,non-spam,no accent allowed !,has anybody else seen a weird piece in a newsp...
...,...,...,...
2888,non-spam,sla conference paris 1999,call for papers xi th international conference...
2889,non-spam,conference announcement,southern illinois university edwardsville and ...
2890,non-spam,semantics,we would like to bring to your attention to tw...
2891,non-spam,honored by two keynote speakers,international conference on natural language p...


In [3]:
# df.to_csv("dataframe.csv", index=False)

## 2 Data pre-processing

In [4]:
# check null values
df.isnull().sum()

label         0
subject      62
main_body     0
dtype: int64

#### Handle Missing Value

In [5]:
'''
Data Cleaning: Handling of Incomplete & Missing Data

miss label - drop
miss subject - keep
miss main body - drop
'''
# if subject = na, replace it with "missing"
# df["subject"] = df["subject"].fillna("missing")
df["subject"].fillna("missing", inplace=True)

# if label or mainbody = na, drop row
df.dropna(subset=["label"], inplace=True)
df.dropna(subset=["main_body"], inplace=True)

df

Unnamed: 0,label,subject,main_body
0,non-spam,conference announcement,* * * * * * * * * * * * * * * * * * * * * firs...
1,spam,""" life without debt """,pardon the intrusion . no offence is meant . i...
2,spam,do want the best and economical hunting vacati...,if you want the best hunting and camping vacat...
3,non-spam,query : uninflected tags,does anybody know of recent work on uninflecte...
4,non-spam,no accent allowed !,has anybody else seen a weird piece in a newsp...
...,...,...,...
2888,non-spam,sla conference paris 1999,call for papers xi th international conference...
2889,non-spam,conference announcement,southern illinois university edwardsville and ...
2890,non-spam,semantics,we would like to bring to your attention to tw...
2891,non-spam,honored by two keynote speakers,international conference on natural language p...


#### Handle Noisy Data

In [6]:
'''
Data Cleaning: Handling of Noisy Data

noisy data -> meaningless data: all punctuations -> drop
noisy data -> redundant data: repetitive data -> drop
# REPLACING EMAIL IDs BY 'MAILID'
# REPLACING URLs  BY 'Links'
# REPLACING CURRENCY SIGNS BY 'MONEY'
# REPLACINg NUMBERS by 'numbers'
'''

# drop all punctuations
df['subject'] = df['subject'].str.replace(r'[^\w\s]', '', regex=True)
df['main_body'] = df['main_body'].str.replace(r'[^\w\s]', '', regex=True)

# drop repetitive data
df.drop_duplicates(subset=["subject"], inplace=True) # 可以删掉
df.drop_duplicates(subset=["main_body"], inplace=True)

# replace email by 'MailID'
df['subject']=df['subject'].str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','MailID', regex=True) # 可以删掉
df['main_body']=df['main_body'].str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','MailID', regex=True)

# replace links by 'Links'
df['subject']=df['subject'].str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','Links', regex=True) # 可以删掉
df['main_body']=df['main_body'].str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','Links', regex=True)

# replace currency by 'money'
df['subject']=df['subject'].str.replace(r'£|\$', 'money', regex=True)
df['main_body']=df['main_body'].str.replace(r'£|\$', 'money', regex=True)

# replace numbers by 'numbers'
df['subject']=df['subject'].str.replace(r'\d+(\.\d+)?', 'numbers')
df['main_body']=df['main_body'].str.replace(r'\d+(\.\d+)?', 'numbers')

df

Unnamed: 0,label,subject,main_body
0,non-spam,conference announcement,first announcement ...
1,spam,life without debt,pardon the intrusion no offence is meant if ...
2,spam,do want the best and economical hunting vacati...,if you want the best hunting and camping vacat...
3,non-spam,query uninflected tags,does anybody know of recent work on uninflecte...
4,non-spam,no accent allowed,has anybody else seen a weird piece in a newsp...
...,...,...,...
2886,non-spam,program info workshop on comparative slavic ...,workshop on comparative slavic morphosyntax pr...
2887,non-spam,tsdnumbers numbersrd call for papers,...
2888,non-spam,sla conference paris numbers,call for papers xi th international conference...
2891,non-spam,honored by two keynote speakers,international conference on natural language p...


#### Handle Inconsistent Data

In [7]:
'''
Data Cleaning: Handling of Inconsistent Data

lowercase / uppercase -> all lowercase
# REPLACING NEXT LINES BY 'WHITE SPACE'
# REPLACING LARGE WHITE SPACE BY SINGLE WHITE SPACE
# REPLACING LEADING AND TRAILING WHITE SPACE BY SINGLE WHITE SPACE
# REPLACING SPECIAL CHARACTERS  BY WHITE SPACE
'''

# convert to lowercase
df['subject']=df['subject'].str.lower()
df['main_body']=df['main_body'].str.lower()

# replace special characters by white space
df['subject']=df['subject'].str.replace(r"[^a-zA-Z0-9]+", " ", regex=True)
df['main_body']=df['main_body'].str.replace(r"[^a-zA-Z0-9]+", " ", regex=True)

# replace leading and trailing white space by single white space
df['subject']=df['subject'].str.replace(r'^\s+|\s+?$', ' ', regex=True)
df['main_body']=df['main_body'].str.replace(r'^\s+|\s+?$', ' ', regex=True)

# replace next line by white space
df['subject']=df['subject'].str.replace(r'\n'," ", regex=True)
df['main_body']=df['main_body'].str.replace(r'\n'," ", regex=True)

# 这个要放最后
# replace large white space by single white space
df['subject']=df['subject'].str.replace(r'\s+', ' ', regex=True)
df['main_body']=df['main_body'].str.replace(r'\s+', ' ', regex=True)

df

Unnamed: 0,label,subject,main_body
0,non-spam,conference announcement,first announcement groningen assembly on lang...
1,spam,life without debt,pardon the intrusion no offence is meant if yo...
2,spam,do want the best and economical hunting vacati...,if you want the best hunting and camping vacat...
3,non-spam,query uninflected tags,does anybody know of recent work on uninflecte...
4,non-spam,no accent allowed,has anybody else seen a weird piece in a newsp...
...,...,...,...
2886,non-spam,program info workshop on comparative slavic mo...,workshop on comparative slavic morphosyntax pr...
2887,non-spam,tsdnumbers numbersrd call for papers,please pay attention deadline for submissions...
2888,non-spam,sla conference paris numbers,call for papers xi th international conference...
2891,non-spam,honored by two keynote speakers,international conference on natural language p...


In [8]:
# handle missing values again since previous steps might cause new missing 
df["subject"].replace(" ", np.nan, inplace=True)
df["main_body"].replace(" ", np.nan, inplace=True)

df["subject"].fillna("missing", inplace=True)
df.dropna(subset=["main_body"], inplace=True)

df.insert(len(df.columns)-1, 'length', df['main_body'].apply(len))

df

Unnamed: 0,label,subject,length,main_body
0,non-spam,conference announcement,1278,first announcement groningen assembly on lang...
1,spam,life without debt,1658,pardon the intrusion no offence is meant if yo...
2,spam,do want the best and economical hunting vacati...,592,if you want the best hunting and camping vacat...
3,non-spam,query uninflected tags,266,does anybody know of recent work on uninflecte...
4,non-spam,no accent allowed,1371,has anybody else seen a weird piece in a newsp...
...,...,...,...,...
2886,non-spam,program info workshop on comparative slavic mo...,11168,workshop on comparative slavic morphosyntax pr...
2887,non-spam,tsdnumbers numbersrd call for papers,6044,please pay attention deadline for submissions...
2888,non-spam,sla conference paris numbers,2034,call for papers xi th international conference...
2891,non-spam,honored by two keynote speakers,1879,international conference on natural language p...


In [9]:
# df.to_csv("cleaned_data.csv", index=False)

## 3 Data Transformation

In [10]:
# Removing stopwords
stop = stopwords.words('english')
df['subject'] = df['subject'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df['main_body'] = df['main_body'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# Apply lemmatization
lemm = WordNetLemmatizer()
df['subject'] = df['subject'].apply(lambda x: ' '.join([lemm.lemmatize(word, pos="v") for word in x.split()]))
df['main_body'] = df['main_body'].apply(lambda x: ' '.join([lemm.lemmatize(word, pos="v") for word in x.split()]))

# Convert categorical data to numeric
df["label"].replace("non-spam", 0, inplace=True)
df["label"].replace("spam", 1, inplace=True)

# To see the change in text length after removing stop words
df['length']=df['main_body'].apply(len)
df

Unnamed: 0,label,subject,length,main_body
0,0,conference announcement,1064,first announcement groningen assembly language...
1,1,life without debt,1185,pardon intrusion offence mean interest simply ...
2,1,want best economical hunt vacation life,445,want best hunt camp vacation life come felton ...
3,0,query uninflected tag,190,anybody know recent work uninflected tag like ...
4,0,accent allow,1004,anybody else see weird piece newspaper read su...
...,...,...,...,...
2886,0,program info workshop comparative slavic morph...,8525,workshop comparative slavic morphosyntax progr...
2887,0,tsdnumbers numbersrd call paper,4713,please pay attention deadline submissions may ...
2888,0,sla conference paris number,1720,call paper xi th international conference acqu...
2891,0,honor two keynote speakers,1677,international conference natural language proc...


In [11]:
# Remove missing values again since previous steps might cause new missing values
df['subject']=df['subject'].str.replace(r'\s+', ' ', regex=True)
df['main_body']=df['main_body'].str.replace(r'\s+', ' ', regex=True)

df["subject"].replace(" ", np.nan, inplace=True)
df["main_body"].replace(" ", np.nan, inplace=True)

df["subject"].fillna("missing", inplace=True)
df.dropna(subset=["main_body"], inplace=True)

df

Unnamed: 0,label,subject,length,main_body
0,0,conference announcement,1064,first announcement groningen assembly language...
1,1,life without debt,1185,pardon intrusion offence mean interest simply ...
2,1,want best economical hunt vacation life,445,want best hunt camp vacation life come felton ...
3,0,query uninflected tag,190,anybody know recent work uninflected tag like ...
4,0,accent allow,1004,anybody else see weird piece newspaper read su...
...,...,...,...,...
2886,0,program info workshop comparative slavic morph...,8525,workshop comparative slavic morphosyntax progr...
2887,0,tsdnumbers numbersrd call paper,4713,please pay attention deadline submissions may ...
2888,0,sla conference paris number,1720,call paper xi th international conference acqu...
2891,0,honor two keynote speakers,1677,international conference natural language proc...


In [12]:
# df.to_csv("removedstop_cleaned_data.csv", index=False)

In [13]:
df['label']

0       0
1       1
2       1
3       0
4       0
       ..
2886    0
2887    0
2888    0
2891    0
2892    0
Name: label, Length: 2596, dtype: int64

### TF-IDF

In [14]:
# use TFIDF to vectorize words
vectorizer = TfidfVectorizer()
vectors_tfidf = vectorizer.fit_transform(df['main_body'])
df1 = pd.DataFrame(vectors_tfidf.toarray(), columns=vectorizer.get_feature_names_out())
df1

Unnamed: 0,aa,aaa,aaai,aaainumbers,aaal,aaanumbers,aaarghh,aaas,aabb,aabyhoej,...,zwischen,zwitserlood,zxgahnumbersqabjh,zybatov,zybatow,zygmunt,zyokyoozyu,zytkow,zzlsa,zznumbers
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2593,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2594,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# use SVD to reduce dimension
# n_components must be between 1 and min(vectors.shape)
svd = TruncatedSVD(n_components=1750)
svd.fit(vectors_tfidf)
print(svd.explained_variance_ratio_.sum()) # 90%

0.9094339125912662


In [16]:
# print the transformed vectors
tfidf_transformed_vectors = svd.transform(vectors_tfidf)
print(tfidf_transformed_vectors)
print("dimension =", tfidf_transformed_vectors.shape)

[[ 3.23518798e-01 -1.13847806e-01 -1.07750493e-01 ...  2.22754570e-03
  -1.16295285e-02  5.32830923e-03]
 [ 5.41332088e-01  2.05756532e-01 -1.32397800e-02 ...  1.04758137e-02
  -7.81861185e-03 -2.01791011e-02]
 [ 2.49632674e-01  9.97849454e-02 -2.80745456e-02 ... -1.21049869e-02
   7.28293782e-03 -2.29325547e-03]
 ...
 [ 4.38571737e-01 -2.09359247e-02 -1.29308157e-01 ... -9.05231387e-03
   4.07506826e-04  2.99531486e-02]
 [ 2.55450518e-01 -4.69068676e-02 -8.36932479e-02 ... -2.75148411e-02
  -6.20852922e-03  1.90190431e-02]
 [ 4.14364348e-01 -1.63993582e-02  1.29983099e-03 ...  3.50831524e-02
   1.85282244e-02 -5.11265081e-03]]
dimension = (2596, 1750)


In [17]:
# visualize tfidf + svd transformed vectors
df1 = pd.DataFrame(tfidf_transformed_vectors)
df1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1740,1741,1742,1743,1744,1745,1746,1747,1748,1749
0,0.323519,-0.113848,-0.107750,0.030892,-0.057147,-0.006141,-0.049097,-0.006113,0.022331,-0.021330,...,0.008859,-0.004634,-0.011653,0.009238,-0.000402,-0.008667,0.011104,0.002228,-0.011630,0.005328
1,0.541332,0.205757,-0.013240,0.021064,-0.059343,-0.014257,0.025846,0.007105,-0.003030,0.025589,...,-0.002995,0.008628,-0.014745,0.006576,-0.001355,-0.024890,-0.003093,0.010476,-0.007819,-0.020179
2,0.249633,0.099785,-0.028075,-0.011970,-0.023579,0.026146,-0.005913,0.000554,-0.033448,0.026775,...,-0.003099,0.011659,-0.010484,-0.005580,-0.001315,0.000459,-0.006974,-0.012105,0.007283,-0.002293
3,0.030223,-0.021128,0.100462,-0.011657,0.000726,-0.024053,0.002939,0.010541,-0.060843,0.019402,...,0.007865,-0.004586,-0.008644,-0.004356,-0.004678,0.002822,0.015734,0.010218,0.001650,0.006345
4,0.189612,-0.049302,0.064732,-0.053181,-0.061634,-0.064175,0.017099,0.074334,-0.024842,-0.029952,...,0.010538,0.007894,-0.002646,-0.016462,-0.026604,0.029775,-0.000548,-0.001399,0.004095,0.006943
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2591,0.558124,0.058706,-0.120592,-0.008570,-0.094630,-0.013373,0.028327,-0.010855,-0.054237,0.007673,...,-0.015217,-0.002321,0.017321,-0.027076,0.000671,0.001108,0.020527,-0.001771,-0.004753,0.018250
2592,0.283135,-0.117495,-0.034985,0.120626,0.045321,0.009844,0.085793,-0.080928,-0.010419,0.009684,...,0.001580,-0.000644,0.001283,0.000022,-0.001462,-0.000159,0.001506,0.000177,0.001143,0.002083
2593,0.438572,-0.020936,-0.129308,0.027597,0.063673,-0.061608,0.028273,-0.066322,-0.008901,0.060160,...,-0.016098,-0.005434,-0.000102,0.019905,0.010720,0.009318,-0.008498,-0.009052,0.000408,0.029953
2594,0.255451,-0.046907,-0.083693,-0.038572,0.181689,-0.124315,0.107780,0.095599,-0.029179,0.119886,...,-0.001616,0.010835,0.010183,0.006773,0.017375,0.004420,-0.009111,-0.027515,-0.006209,0.019019


### Bag of Words 

In [18]:
# Count how many times a word appears in the dataset
total_counts = Counter()
for i in range(len(df['main_body'])):
    for word in df['main_body'].values[i].split(" "):
        total_counts[word] += 1

print("Total words in dataframe: ", len(total_counts))

Total words in dataframe:  51312


In [19]:
# Sort in decreasing order (Word with highest frequency appears first)
vocab = sorted(total_counts, key = total_counts.get, reverse = True)
print('Top 30 words: ', '\n', vocab[:31])

Top 30 words:  
 ['number', 'university', 'language', 'paper', 'email', 'information', 'linguistics', 'address', 'use', 'de', 'one', 'conference', 'send', 'e', 'order', 'please', 'make', 'languages', 'english', 'include', 'work', 'mail', 'http', 'program', 'also', 'edu', 'new', 'would', 'name', 'may', 'fax']


In [20]:
# Map words to index
vocab_size = len(vocab)
word2idx = {}

# print vocab_size
for i, word in enumerate(vocab):
    word2idx[word] = i

In [21]:
# Define a function to convert text to vectors
def text_to_vector(text, vocab):
    vector = np.zeros(len(vocab), dtype=np.int_)
    for word in text.split():
        if word in vocab:
            index = vocab.index(word)
            vector[index] += 1
    return vector

# Convert all text to vectors
word_vectors = np.zeros((len(df['main_body']), len(vocab)), dtype = np.int_)

for i, text in enumerate(df['main_body']):
    word_vectors[i] = text_to_vector(text, vocab)
    
word_vectors.shape

(2596, 51312)

In [22]:
# convert transformed vectors to dataframe to visualize
df2 = pd.DataFrame(word_vectors, columns = vocab)
df2

Unnamed: 0,number,university,language,paper,email,information,linguistics,address,use,de,...,singledialect,corpuses,undescribed,cech,viktor,elsik,mozes,heinschink,hubschmannova,igla
0,13,6,3,1,1,1,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,36,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,13,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8,1,1,1,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2591,158,37,0,9,4,4,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2592,36,5,3,7,9,1,0,5,0,0,...,0,0,0,0,0,0,0,0,0,0
2593,35,0,2,5,2,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2594,18,1,5,0,1,0,0,0,0,5,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# apply SVD

# n_components must be between 1 and min(vectors.shape)
svd = TruncatedSVD(n_components = 500)
svd.fit(word_vectors)
print(svd.explained_variance_ratio_.sum()) #95%

0.9509950856821713


In [24]:
# print transfored vectors
bow_transformed_vectors = svd.transform(word_vectors)
print(bow_transformed_vectors)
print("dimension =", bow_transformed_vectors.shape)

[[ 1.37109776e+01 -3.28812491e+00  1.01686404e-02 ... -2.55046772e-01
   6.80130840e-04  1.15060778e-01]
 [ 3.59186423e+01 -5.65145797e-01 -3.79417928e-01 ... -1.69530354e-01
  -1.50251472e-01 -5.89102765e-02]
 [ 1.28994798e+01 -7.53032106e-01 -4.40158164e-02 ... -1.23224474e-01
   4.06798613e-02 -1.23493733e-01]
 ...
 [ 3.53182755e+01 -3.60892096e+00 -8.15405263e-02 ...  3.03080458e-01
  -6.59779010e-01 -1.98755700e-01]
 [ 1.87583132e+01 -3.81712881e+00  1.51511940e-01 ... -6.75478020e-01
  -2.84351366e-01 -3.50963855e-01]
 [ 5.05553846e+01 -5.64882460e+00  1.30458971e+00 ... -1.41509918e+00
   7.77682152e-01  1.64769963e-01]]
dimension = (2596, 500)


In [25]:
# visualize bow + svd transformed vectors
df2 = pd.DataFrame(bow_transformed_vectors)
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,13.710978,-3.288125,0.010169,3.118854,-1.771366,-2.182045,-0.743751,-2.730162,-0.877912,0.812125,...,-0.101940,0.156249,-0.355623,0.082566,0.152873,0.048748,0.425127,-0.255047,0.000680,0.115061
1,35.918642,-0.565146,-0.379418,-3.905108,-1.306191,1.046967,0.225118,1.469410,0.360801,-0.588404,...,0.262604,-0.210689,-0.059775,-0.008094,-0.165109,0.050064,-0.200113,-0.169530,-0.150251,-0.058910
2,12.899480,-0.753032,-0.044016,-1.574052,-0.393521,0.717289,0.224887,0.723840,0.138641,0.126723,...,0.225040,0.111929,0.159556,0.168072,-0.056920,0.091418,-0.106527,-0.123224,0.040680,-0.123494
3,0.126565,0.497219,-0.028405,0.463633,0.161745,0.235655,0.040815,0.037720,0.306927,0.244349,...,0.009544,-0.019697,-0.132576,0.035903,-0.181913,-0.166119,0.097924,0.012458,0.258942,0.041086
4,8.459377,-0.293892,-0.138912,2.053423,-0.095933,0.963966,-0.318633,-0.736101,-0.030577,0.312204,...,0.048896,0.064954,-0.412037,0.049198,-0.400780,0.322876,0.277298,-0.157914,-0.089003,-0.245053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2591,160.056049,-21.477895,-0.578358,-3.588650,-13.737201,-10.723416,-3.170077,-11.644288,12.743010,7.841726,...,-0.036112,-0.365327,-0.561862,-0.452011,0.120471,0.138716,-0.020748,-1.110170,0.350588,-0.146417
2592,38.523175,-2.051582,0.689642,8.999953,-2.769078,-9.959161,1.447135,6.047701,-2.044883,-3.710199,...,0.009888,-0.129176,0.098172,0.173418,0.208365,0.204486,0.028290,0.038821,0.037955,0.089828
2593,35.318275,-3.608921,-0.081541,0.041269,0.507978,-1.716646,0.292880,2.606521,-2.553619,-3.767196,...,-0.637356,-0.721950,-0.095805,-0.707521,-0.340234,-0.055272,0.658145,0.303080,-0.659779,-0.198756
2594,18.758313,-3.817129,0.151512,1.971991,6.004395,-1.196212,-0.388405,-0.737272,-3.659581,-1.044254,...,0.153600,-0.052630,-0.044032,-0.197488,0.048968,0.329945,-0.153837,-0.675478,-0.284351,-0.350964


## 5 Data preparation

### With TFIDF transformed data

In [26]:
# concat dataframe created by TFIDF
tfidf_df = df1
tfidf_df['label'] = df['label']

# drop missing value
#tfidf_df.dropna(subset = ["label"], inplace=True)

tfidf_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1741,1742,1743,1744,1745,1746,1747,1748,1749,label
0,0.323519,-0.113848,-0.107750,0.030892,-0.057147,-0.006141,-0.049097,-0.006113,0.022331,-0.021330,...,-0.004634,-0.011653,0.009238,-0.000402,-0.008667,0.011104,0.002228,-0.011630,0.005328,0.0
1,0.541332,0.205757,-0.013240,0.021064,-0.059343,-0.014257,0.025846,0.007105,-0.003030,0.025589,...,0.008628,-0.014745,0.006576,-0.001355,-0.024890,-0.003093,0.010476,-0.007819,-0.020179,1.0
2,0.249633,0.099785,-0.028075,-0.011970,-0.023579,0.026146,-0.005913,0.000554,-0.033448,0.026775,...,0.011659,-0.010484,-0.005580,-0.001315,0.000459,-0.006974,-0.012105,0.007283,-0.002293,1.0
3,0.030223,-0.021128,0.100462,-0.011657,0.000726,-0.024053,0.002939,0.010541,-0.060843,0.019402,...,-0.004586,-0.008644,-0.004356,-0.004678,0.002822,0.015734,0.010218,0.001650,0.006345,0.0
4,0.189612,-0.049302,0.064732,-0.053181,-0.061634,-0.064175,0.017099,0.074334,-0.024842,-0.029952,...,0.007894,-0.002646,-0.016462,-0.026604,0.029775,-0.000548,-0.001399,0.004095,0.006943,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2591,0.558124,0.058706,-0.120592,-0.008570,-0.094630,-0.013373,0.028327,-0.010855,-0.054237,0.007673,...,-0.002321,0.017321,-0.027076,0.000671,0.001108,0.020527,-0.001771,-0.004753,0.018250,
2592,0.283135,-0.117495,-0.034985,0.120626,0.045321,0.009844,0.085793,-0.080928,-0.010419,0.009684,...,-0.000644,0.001283,0.000022,-0.001462,-0.000159,0.001506,0.000177,0.001143,0.002083,0.0
2593,0.438572,-0.020936,-0.129308,0.027597,0.063673,-0.061608,0.028273,-0.066322,-0.008901,0.060160,...,-0.005434,-0.000102,0.019905,0.010720,0.009318,-0.008498,-0.009052,0.000408,0.029953,0.0
2594,0.255451,-0.046907,-0.083693,-0.038572,0.181689,-0.124315,0.107780,0.095599,-0.029179,0.119886,...,0.010835,0.010183,0.006773,0.017375,0.004420,-0.009111,-0.027515,-0.006209,0.019019,0.0


In [27]:
# split training: validation: testing = 6:2:2
X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(tfidf_df, target = 'label', 
                                                                            train_size=0.6, valid_size=0.2, test_size=0.2)

print('Training, Validation and Testing Sets of TFIDF+SVD:')
print('X_train:', X_train.shape)
print('y_train:', y_train.shape)
print('X_valid:', X_valid.shape)
print('y_valid:', y_valid.shape)
print('X_test:', X_test.shape)
print('y_test:', y_test.shape)

Training, Validation and Testing Sets of TFIDF+SVD:
X_train: (1557, 1750)
y_train: (1557,)
X_valid: (519, 1750)
y_valid: (519,)
X_test: (520, 1750)
y_test: (520,)


### With BoW transformed data

In [28]:
bow_df = df2
bow_df['label'] = df['label']

# drop missing value
#bow_df.dropna(subset = ["label"], inplace=True)

bow_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,label
0,13.710978,-3.288125,0.010169,3.118854,-1.771366,-2.182045,-0.743751,-2.730162,-0.877912,0.812125,...,0.156249,-0.355623,0.082566,0.152873,0.048748,0.425127,-0.255047,0.000680,0.115061,0.0
1,35.918642,-0.565146,-0.379418,-3.905108,-1.306191,1.046967,0.225118,1.469410,0.360801,-0.588404,...,-0.210689,-0.059775,-0.008094,-0.165109,0.050064,-0.200113,-0.169530,-0.150251,-0.058910,1.0
2,12.899480,-0.753032,-0.044016,-1.574052,-0.393521,0.717289,0.224887,0.723840,0.138641,0.126723,...,0.111929,0.159556,0.168072,-0.056920,0.091418,-0.106527,-0.123224,0.040680,-0.123494,1.0
3,0.126565,0.497219,-0.028405,0.463633,0.161745,0.235655,0.040815,0.037720,0.306927,0.244349,...,-0.019697,-0.132576,0.035903,-0.181913,-0.166119,0.097924,0.012458,0.258942,0.041086,0.0
4,8.459377,-0.293892,-0.138912,2.053423,-0.095933,0.963966,-0.318633,-0.736101,-0.030577,0.312204,...,0.064954,-0.412037,0.049198,-0.400780,0.322876,0.277298,-0.157914,-0.089003,-0.245053,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2591,160.056049,-21.477895,-0.578358,-3.588650,-13.737201,-10.723416,-3.170077,-11.644288,12.743010,7.841726,...,-0.365327,-0.561862,-0.452011,0.120471,0.138716,-0.020748,-1.110170,0.350588,-0.146417,
2592,38.523175,-2.051582,0.689642,8.999953,-2.769078,-9.959161,1.447135,6.047701,-2.044883,-3.710199,...,-0.129176,0.098172,0.173418,0.208365,0.204486,0.028290,0.038821,0.037955,0.089828,0.0
2593,35.318275,-3.608921,-0.081541,0.041269,0.507978,-1.716646,0.292880,2.606521,-2.553619,-3.767196,...,-0.721950,-0.095805,-0.707521,-0.340234,-0.055272,0.658145,0.303080,-0.659779,-0.198756,0.0
2594,18.758313,-3.817129,0.151512,1.971991,6.004395,-1.196212,-0.388405,-0.737272,-3.659581,-1.044254,...,-0.052630,-0.044032,-0.197488,0.048968,0.329945,-0.153837,-0.675478,-0.284351,-0.350964,0.0


In [29]:
# split training: validation: testing = 6:2:2
X_train2, y_train2, X_valid2, y_valid2, X_test2, y_test2 = train_valid_test_split(bow_df, target = 'label', 
                                                                            train_size=0.6, valid_size=0.2, test_size=0.2)

print('Training, Validation and Testing Sets of BOW+SVD:')
print('X_train:', X_train2.shape)
print('y_train:', y_train2.shape)
print('X_valid:', X_valid2.shape)
print('y_valid:', y_valid2.shape)
print('X_test:', X_test2.shape)
print('y_test:', y_test2.shape)

Training, Validation and Testing Sets of BOW+SVD:
X_train: (1557, 500)
y_train: (1557,)
X_valid: (519, 500)
y_valid: (519,)
X_test: (520, 500)
y_test: (520,)


## 6 Model Training (Logistic Regression)

### TFIDF

In [30]:
X = vectors_tfidf  # features
y = df['label']  # target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

lr = LogisticRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

In [31]:
# model evaluation
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("TFIDF performance BEFORE hyperparameter tuning:")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("Confusion matrix:")
confusion

TFIDF performance BEFORE hyperparameter tuning:
Accuracy: 0.9384615384615385
F1 Score: 0.7499999999999999
Confusion matrix:


array([[440,   0],
       [ 32,  48]])

### TFIDF + Tuning

In [32]:
### Create a logistic regression object
logreg = LogisticRegression()

# Define the parameter grid to search over
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l1', 'l2']  # Type of regularization
}

# Create a GridSearchCV object
grid_search = GridSearchCV(logreg, param_grid, cv = 5, scoring = 'f1')

# Train the GridSearchCV object on the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best hyperparameters:", grid_search.best_params_)

# Use the best hyperparameters to make predictions
y_pred = grid_search.predict(X_test)

Best hyperparameters: {'C': 100, 'penalty': 'l2'}


In [33]:
# model evaluation
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# final performance after tuning hyperparameter
print("TFIDF performance AFTER hyperparameter tuning:")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("Confusion matrix:")
confusion

TFIDF performance AFTER hyperparameter tuning:
Accuracy: 0.9711538461538461
F1 Score: 0.896551724137931
Confusion matrix:


array([[440,   0],
       [ 15,  65]])

### BoW

In [34]:
X = word_vectors  # features
y = df['label']  # target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

lr2 = LogisticRegression()
lr2.fit(X_train, y_train)

y_pred = lr2.predict(X_test)

In [35]:
# model evaluation
accuracy2 = accuracy_score(y_test, y_pred)
confusion2 = confusion_matrix(y_test, y_pred)
f1_2 = f1_score(y_test, y_pred)

print("BoW performance BEFORE hyperparameter tuning:")
print(f"Accuracy: {accuracy2}")
print(f"F1 Score: {f1_2}")
print("Confusion matrix:")
confusion2

BoW performance BEFORE hyperparameter tuning:
Accuracy: 0.9807692307692307
F1 Score: 0.935064935064935
Confusion matrix:


array([[438,   2],
       [  8,  72]])

### BoW + Tuning

In [36]:
# Create a logistic regression object
logreg2 = LogisticRegression()

# Define the parameter grid to search over
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l1', 'l2']  # Type of regularization
}

# Create a GridSearchCV object
grid_search2 = GridSearchCV(logreg2, param_grid, cv = 5, scoring = 'f1')

# Train the GridSearchCV object on the data
grid_search2.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best hyperparameters:", grid_search2.best_params_)

# Use the best hyperparameters to make predictions
y_pred = grid_search2.predict(X_test)

Best hyperparameters: {'C': 100, 'penalty': 'l2'}


In [37]:
# model evaluation
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# final performance after tuning hyperparameter
print("BoW performance AFTER hyperparameter tuning:")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("Confusion matrix:")
confusion

BoW performance AFTER hyperparameter tuning:
Accuracy: 0.9788461538461538
F1 Score: 0.9290322580645162
Confusion matrix:


array([[437,   3],
       [  8,  72]])