# Linear Algebra Assignment 4
## 110062219 翁君牧

In [None]:
import sys

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Best-effort attempt to enable the optional sklearnex patch for scikit-learn.
# The notebook works without it: any failure (for example the package not
# being installed) is reported on stderr and execution continues.
# NOTE(review): the sklearnex documentation asks for patch_sklearn() to run
# before scikit-learn is imported, but TfidfVectorizer is imported above this
# block - confirm whether the ordering matters for this notebook.
try:
    from sklearnex import patch_sklearn
    patch_sklearn()
    print("Intel Acceleration Extension for Sci-Kit Learn enabled.", file=sys.stderr)
except Exception as e:
    # Deliberately broad: neither a missing package nor a failing patch call
    # should stop the notebook.
    print(e, file=sys.stderr)

In [None]:
# ---------- using linear least square ----------------
def LSTSQDistance(spam, ham, test):
    """Classify test vectors by their distance to the spam and ham subspaces.

    Every column of ``test`` is fitted, in the least-squares sense, by the
    columns of ``spam`` and by the columns of ``ham``.  The Euclidean norm of
    each residual is the distance from the test vector to the corresponding
    column space.  A test vector is counted as spam only when it is strictly
    closer to the spam subspace; ties are counted as not spam.

    Args:
        spam: 2-D array whose columns span the spam subspace.
        ham: 2-D array whose columns span the ham subspace.
        test: 2-D array holding one test vector per column, with as many rows
            as ``spam`` and ``ham``.  A 1-D array is accepted and treated as
            a single test vector.

    Returns:
        A tuple ``(pos, neg)`` of Python ints: the number of test vectors
        classified as spam and as not spam, respectively.
    """
    test = np.asarray(test)
    if test.ndim == 1:
        # Promote a single vector to one column so the column-wise norms
        # below work the same way as for a batch of vectors.
        test = test[:, np.newaxis]

    spam_coef = np.linalg.lstsq(spam, test, rcond=None)[0]
    ham_coef = np.linalg.lstsq(ham, test, rcond=None)[0]

    # One residual norm per test column: the distance to each subspace.
    spam_dist = np.linalg.norm(np.dot(spam, spam_coef) - test, axis=0)
    ham_dist = np.linalg.norm(np.dot(ham, ham_coef) - test, axis=0)

    # Count the "not spam" side explicitly so that ties stay on that side.
    neg = int(np.count_nonzero(spam_dist >= ham_dist))
    pos = int(spam_dist.shape[0]) - neg
    return pos, neg

In [None]:
# ---------- main program --------------------
# Load the labelled corpus; the 'spam' and 'fold' columns hold the per-email
# labels that the later cells use to split the data.
emails = pd.read_csv("emails.csv")
df = pd.DataFrame(emails)
spam, fold = df['spam'], df['fold']

# initialize
# Build the TF-IDF matrix from the 'text' column; the token pattern only
# accepts tokens of at least two characters that start with an ASCII letter.
cv = TfidfVectorizer(
    stop_words='english',
    max_features=20000,
    token_pattern=r"(?u)\b[a-zA-Z]\w+\b",
)
dt_matrix = cv.fit_transform(df['text'])
m, n = dt_matrix.shape

In [None]:
# Each email carries two labels: `spam` and `fold`.
#   spam = 1 means the email is spam; spam = 0 means it is not.
#   fold is the cross-validation fold the email belongs to; this file has
#   5 folds, numbered from 0 to 4.
# In this example, folds 0-3 are the training data and fold 4 is the test data.
TEST_FOLD = 4

# Boolean row masks over the emails, built once instead of re-scanning the
# label columns with a Python loop for each of the four subsets.
is_spam = spam.astype(bool).to_numpy()
is_test = (fold == TEST_FOLD).to_numpy()


def _term_columns(row_mask):
    """Return the rows of dt_matrix selected by row_mask, dense and transposed."""
    return dt_matrix[np.flatnonzero(row_mask), :].toarray().transpose()


spam_train = _term_columns(~is_test & is_spam)
spam_test = _term_columns(is_test & is_spam)
ham_train = _term_columns(~is_test & ~is_spam)
ham_test = _term_columns(is_test & ~is_spam)

# Compute the confusion matrix
# First printed row: spam test emails (classified as spam, classified as not spam).
# Second printed row: ham test emails (classified as spam, classified as not spam).
p1, n1 = LSTSQDistance(spam_train, ham_train, spam_test)
p2, n2 = LSTSQDistance(spam_train, ham_train, ham_test)
print(p1, n1)
print(p2, n2)