<a href="https://colab.research.google.com/github/michaelogenyi23/codesample/blob/main/EmailClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Email Classifier

In [None]:
import numpy as np
import pandas as pd
import random

### Importing the Data

Below we import data on 3921 e-mails.  We want to use this data to train a linear classifier that predicts whether an e-mail is spam based on features such as how many times the words "password" or "viagra" appear in the e-mail. We also look at whether there are exclamation points in the subject line, whether the e-mail has attachments, and some other features.

In [None]:
# Names of the predictor columns fed to the classifier.
input_variables = [
    'to_multiple', 'cc', 'attach', 'dollar', 'winner',
    'inherit', 'viagra', 'password', 're_subj', 'exclaim_subj',
]
df = pd.read_excel("http://people.hsc.edu/faculty-staff/blins/StatsExamples/email.xlsx")

# Design matrix: a leading column of ones (so the first weight acts as an
# intercept) followed by the predictor columns.
X = np.hstack((np.ones((len(df), 1)), df[input_variables].to_numpy()))

# Label vector: +1 when an e-mail is spam and -1 when it is not.
y = df['spam'].to_numpy()*2 - 1
print(df)

  warn("Workbook contains no default style, apply openpyxl's default")


      spam  to_multiple  cc  attach  dollar  winner  inherit  viagra  \
0        0            0   0       0       0       0        0       0   
1        0            0   0       0       0       0        0       0   
2        0            0   0       0       4       0        1       0   
3        0            0   0       0       0       0        0       0   
4        0            0   0       0       0       0        0       0   
...    ...          ...  ..     ...     ...     ...      ...     ...   
3916     1            0   0       0       0       0        0       0   
3917     1            0   0       0       1       0        0       0   
3918     0            1   0       0       0       0        0       0   
3919     0            1   0       0       0       0        0       0   
3920     1            0   0       0       2       1        0       0   

      password  re_subj  exclaim_subj  
0            0        0             0  
1            0        0             0  
2            0 

### Loss Functions and Their Gradients

In [None]:
def hingeLoss(w, x, y):
    """Hinge loss of the linear classifier w on one example.

    w -- weight vector
    x -- feature vector of the example
    y -- label of the example (+1 or -1 in this notebook)

    Returns 1 - y*(w @ x) when that margin is below 1, and 0 otherwise.
    """
    score = y*(w @ x)
    return 1-score if score < 1 else 0

def hingeLossGradient(w, x, y):
    """(Sub)gradient with respect to w of the hinge loss on one example.

    w -- weight vector
    x -- feature vector of the example
    y -- label of the example (+1 or -1 in this notebook)

    Returns -y*x when the margin y*(w @ x) is below 1.  Otherwise the loss is
    flat and the gradient is a zero vector with the same shape as x, so the
    caller always receives an array rather than a bare scalar 0.
    """
    margin = y*(w @ x)
    if margin < 1:
        return -y*x
    else:
        # Zero vector (not the scalar 0) keeps the return type consistent.
        return np.zeros_like(x)

def zeroOneLoss(w, x, y):
    """Zero-one loss of the linear classifier w on one example.

    w -- weight vector
    x -- feature vector of the example
    y -- label of the example (+1 or -1 in this notebook)

    Returns 1 when the margin y*(w @ x) is strictly negative and 0 otherwise
    (a margin of exactly zero counts as no loss).
    """
    misclassified = y*(w @ x) < 0
    return 1 if misclassified else 0

def logisticLoss(w, x, y):
    """Logistic (negative log-likelihood) loss on one example.

    w -- weight vector
    x -- feature vector of the example
    y -- label; y > 0 is treated as the positive class, anything else as
         the negative class

    With z = x @ w and p = exp(z)/(1 + exp(z)), returns -log(p) for the
    positive class and -log(1 - p) for the negative class.

    The value is computed as log(1 + exp(-z)) or log(1 + exp(z)) through
    np.logaddexp, which is mathematically the same quantity but does not
    overflow: evaluating exp(z) directly gives inf for large z, which turns
    p into nan.
    """
    z = x @ w
    if y > 0:
        # -log(p) = log(1 + exp(-z)) = log(exp(0) + exp(-z))
        return np.logaddexp(0, -z)
    else:
        # -log(1 - p) = log(1 + exp(z)) = log(exp(0) + exp(z))
        return np.logaddexp(0, z)

def logisticLossGradient(w, x, y):
    """Gradient with respect to w of the logistic loss on one example.

    w -- weight vector
    x -- feature vector of the example
    y -- label; y > 0 is treated as the positive class, anything else as
         the negative class

    With z = x @ w and p = exp(z)/(1 + exp(z)), returns -(1 - p)*x for the
    positive class and p*x for the negative class.

    The probabilities are computed as exp(-log(1 + exp(+/-z))) through
    np.logaddexp rather than from exp(z) directly, because exp(z) overflows
    to inf for large z and makes p (and the whole gradient) nan.
    """
    z = x @ w
    if y > 0:
        # 1 - p = 1/(1 + exp(z)) = exp(-log(1 + exp(z)))
        return -np.exp(-np.logaddexp(0, z))*x
    else:
        # p = 1/(1 + exp(-z)) = exp(-log(1 + exp(-z)))
        return np.exp(-np.logaddexp(0, -z))*x

### A General Purpose Stochastic Gradient Descent Function

In [None]:
def stochasticDescent(X, y, n = 10**5, eta = 10**(-2), gradientFunc = hingeLossGradient, R = 0.1, batch_size = 5, seed = None):
    """Fit a weight vector by mini-batch stochastic (sub)gradient descent.

    X            -- sequence of feature vectors, one per example
    y            -- sequence of labels, aligned with X
    n            -- number of descent steps
    eta          -- step size (learning rate)
    gradientFunc -- function (w, x, y) -> gradient of the per-example loss
    R            -- regularization constant; each step adds R*np.sign(w) to
                    the gradient, applied to every component of w
    batch_size   -- number of examples drawn (without replacement) per step;
                    clamped to len(y) so small data sets do not make
                    random.sample raise ValueError
    seed         -- if not None, sampling uses a private random.Random(seed)
                    so the run is reproducible; if None, the module-level
                    random state is used

    Starts from the zero vector and returns the weights after n steps.  The
    per-step gradient is the SUM (not the mean) of the per-example gradients
    over the batch, plus the regularization term.
    """
    w = np.zeros(len(X[0]))
    # The random module and random.Random instances both expose .sample().
    rng = random if seed is None else random.Random(seed)
    indices = list(range(len(y)))
    batch_size = min(batch_size, len(indices))
    for k in range(n):
        batch = rng.sample(indices, batch_size)
        gradient = sum(gradientFunc(w,X[i],y[i]) for i in batch) + R*np.sign(w)
        w = w - eta*gradient
    return w

# Train with the default settings, then list each fitted weight beside the
# name of the feature it multiplies (the first weight is the intercept).
w = stochasticDescent(X,y)
for label, weight in zip(["intercept"]+input_variables, w):
    print(label, "%.4f" % weight)

intercept -1.0010
to_multiple -0.0010
cc -0.3130
attach -0.0010
dollar -0.0180
winner -0.0000
inherit 0.0010
viagra -0.0000
password -0.0070
re_subj -0.0070
exclaim_subj 0.0000
