# TEST 3 - Logistic regression on spam mail data
## Student: Luis Vasquez
___

The exercise asks for a 3-set cross validation, optimizing de strength of $l_2$ regularizer, on 3 versions of the original *spamData.mat* dataset, containing the frequency of words in mails.
The three transformations that are requerid are:

* Normalize ($\mu = 0, \sigma = 1$)
* Apply tranformation log(x + 0.1), for all x in training data
* Apply tranformation bin(x > 0), for all x in training data

The procedure of the problem will consider the loading of the data to python, it's transformations and the final grid search optimization.

### **1. Loading the data from spamData.mat**

In [1]:
from scipy.io import loadmat
import pandas as pd
import numpy as np

In [2]:
def load_data(data_path, features_path):
    spam_data = loadmat(data_path)
    x_train = spam_data['Xtrain']
    x_test = spam_data['Xtest']
    y_train = spam_data['ytrain']
    y_test = spam_data['ytest']
    
    with open(features_path, 'r') as file:
        feature_names = file.read()
        feature_names = feature_names.split("\n")
    
    train_df = pd.DataFrame(np.c_[x_train, y_train], columns=feature_names+['target'])
    test_df = pd.DataFrame(np.c_[x_test, y_test], columns=feature_names+['target'])
    
    return train_df, test_df
    
train_df, test_df = load_data('spamData.mat', 'spamFeatures.txt')

In [3]:
train_df

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,target
0,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101.0,1028.0,1.0
1,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40.0,191.0,1.0
2,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40.0,191.0,1.0
3,0.00,0.00,0.00,0.0,1.85,0.00,0.00,1.85,0.00,0.00,...,0.000,0.223,0.0,0.000,0.000,0.000,3.000,15.0,54.0,1.0
4,0.00,0.00,0.00,0.0,1.92,0.00,0.00,0.00,0.00,0.64,...,0.000,0.054,0.0,0.164,0.054,0.000,1.671,4.0,112.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3060,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3.0,88.0,0.0
3061,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4.0,14.0,0.0
3062,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6.0,118.0,0.0
3063,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5.0,78.0,0.0


In [4]:
test_df

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,target
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.00,0.000,0.000,0.778,0.000,0.000,3.756,61.0,278.0,1.0
1,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.000,0.276,0.184,0.010,9.821,485.0,2259.0,1.0
2,0.15,0.00,0.46,0.0,0.61,0.00,0.30,0.00,0.92,0.76,...,0.00,0.271,0.000,0.181,0.203,0.022,9.744,445.0,1257.0,1.0
3,0.06,0.12,0.77,0.0,0.19,0.32,0.38,0.00,0.06,0.00,...,0.04,0.030,0.000,0.244,0.081,0.000,1.729,43.0,749.0,1.0
4,0.00,0.69,0.34,0.0,0.34,0.00,0.00,0.00,0.00,0.00,...,0.00,0.056,0.000,0.786,0.000,0.000,3.728,61.0,261.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1531,0.00,0.00,1.23,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.000,0.406,0.000,0.000,0.000,1.666,13.0,70.0,0.0
1532,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.125,0.000,0.000,0.125,0.000,1.272,4.0,28.0,0.0
1533,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.185,0.000,0.000,0.000,0.092,2.468,11.0,79.0,0.0
1534,0.00,0.00,1.25,0.0,2.50,0.00,0.00,0.00,0.00,0.00,...,0.00,0.111,0.000,0.000,0.000,0.000,1.285,4.0,27.0,0.0


### **2. Preprocessing/Transforming the data**

#### 2.1 Version 1: Standarized columns, $\mu = 0, \sigma = 1$

In [5]:
def normalize(df):
    sub_df = df.loc[:,df.columns != 'target']
    x = (sub_df - sub_df.mean())/sub_df.std()
    x['target'] = df['target']
    return x

normal_train_df = normalize(train_df)
normal_test_df = normalize(test_df)

# preview
normal_train_df

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,target
0,0.303072,0.037496,0.466564,-0.047176,-0.267926,0.706966,0.257926,-0.089738,-0.336761,1.018763,...,-0.161163,-0.022933,-0.149441,0.122263,0.388390,-0.010442,0.007831,0.219077,1.227820,1.0
1,-0.338634,-0.166313,-0.566358,-0.047176,0.470503,-0.367423,0.525336,1.260537,0.837239,0.566486,...,-0.161163,-0.005006,-0.149441,-0.147446,-0.304666,-0.102887,-0.050050,-0.057461,-0.150151,1.0
2,-0.338634,-0.166313,-0.566358,-0.047176,0.470503,-0.367423,0.525336,1.260537,0.837239,0.566486,...,-0.161163,-0.012177,-0.149441,-0.149741,-0.304666,-0.102887,-0.050050,-0.057461,-0.150151,1.0
3,-0.338634,-0.166313,-0.566358,-0.047176,2.309042,-0.367423,-0.303635,4.202206,-0.336761,-0.352659,...,-0.161163,0.303325,-0.149441,-0.304680,-0.304666,-0.102887,-0.069760,-0.170797,-0.375697,1.0
4,-0.338634,-0.166313,-0.566358,-0.047176,2.414532,-0.367423,-0.303635,-0.258522,-0.336761,0.581075,...,-0.161163,-0.302582,-0.149441,-0.116458,-0.096749,-0.102887,-0.118539,-0.220664,-0.280210,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3060,0.608647,-0.166313,0.714466,-0.047176,-0.478906,0.822079,-0.303635,-0.258522,-0.336761,-0.352659,...,-0.161163,0.335592,-0.149441,-0.304680,-0.304666,-0.102887,-0.137955,-0.225198,-0.319722,0.0
3061,-0.338634,-0.166313,-0.566358,-0.047176,-0.478906,-0.367423,-0.303635,-0.258522,-0.336761,-0.352659,...,-0.161163,-0.496185,-0.149441,0.100457,-0.304666,-0.102887,-0.122796,-0.220664,-0.441550,0.0
3062,0.578089,-0.166313,0.053395,-0.047176,-0.478906,-0.367423,-0.303635,-0.258522,-0.336761,-0.352659,...,0.271954,2.078022,-0.149441,-0.304680,-0.304666,-0.102887,-0.128338,-0.211598,-0.270332,0.0
3063,2.594881,-0.166313,-0.566358,-0.047176,0.003334,-0.367423,-0.303635,-0.258522,-0.336761,-0.352659,...,-0.161163,-0.291826,-0.149441,-0.304680,-0.304666,-0.102887,-0.137771,-0.216131,-0.336185,0.0


#### 2.2 Version 2: Transform $log(x_{ij} + 0.1)$

In [6]:
def transform_log(df):
    x_values = df.loc[:, df.columns != 'target']
    x_values = x_values.applymap(lambda x:np.log(x + 0.1))
    x_values['target'] = df['target']
    df = x_values
    return df

log_train_df = transform_log(train_df)
log_test_df = transform_log(test_df)

# preview
log_train_df

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,target
0,-1.171183,-0.967584,-0.510826,-2.302585,-1.427116,-0.967584,-1.171183,-1.771957,-2.302585,0.039221,...,-2.302585,-1.461018,-2.302585,-0.750776,-1.272966,-1.910543,1.651347,4.616110,6.935468,1.0
1,-2.302585,-2.302585,-2.302585,-2.302585,-0.314711,-2.302585,-0.891598,-0.314711,-0.891598,-0.314711,...,-2.302585,-1.439695,-2.302585,-1.439695,-2.302585,-2.302585,1.291159,3.691376,5.252797,1.0
2,-2.302585,-2.302585,-2.302585,-2.302585,-0.314711,-2.302585,-0.891598,-0.314711,-0.891598,-0.314711,...,-2.302585,-1.448170,-2.302585,-1.448170,-2.302585,-2.302585,1.291159,3.691376,5.252797,1.0
3,-2.302585,-2.302585,-2.302585,-2.302585,0.667829,-2.302585,-2.302585,0.667829,-2.302585,-2.302585,...,-2.302585,-1.130103,-2.302585,-2.302585,-2.302585,-2.302585,1.131402,2.714695,3.990834,1.0
4,-2.302585,-2.302585,-2.302585,-2.302585,0.703098,-2.302585,-2.302585,-2.302585,-2.302585,-0.301105,...,-2.302585,-1.870803,-2.302585,-1.331806,-1.870803,-2.302585,0.571544,1.410987,4.719391,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3060,-0.891598,-2.302585,-0.328504,-2.302585,-2.302585,-0.891598,-2.302585,-2.302585,-2.302585,-2.302585,...,-2.302585,-1.102620,-2.302585,-2.302585,-2.302585,-2.302585,0.216723,1.131402,4.478473,0.0
3061,-2.302585,-2.302585,-2.302585,-2.302585,-2.302585,-2.302585,-2.302585,-2.302585,-2.302585,-2.302585,...,-2.302585,-2.302585,-2.302585,-0.791863,-2.302585,-2.302585,0.503801,1.410987,2.646175,0.0
3062,-0.916291,-2.302585,-0.916291,-2.302585,-2.302585,-2.302585,-2.302585,-2.302585,-2.302585,-2.302585,...,-1.599488,-0.200893,-2.302585,-2.302585,-2.302585,-2.302585,0.408128,1.808289,4.771532,0.0
3063,0.058269,-2.302585,-2.302585,-2.302585,-0.867501,-2.302585,-2.302585,-2.302585,-2.302585,-2.302585,...,-2.302585,-1.851509,-2.302585,-2.302585,-2.302585,-2.302585,0.220741,1.629241,4.357990,0.0


#### 2.3 Version 2: Transform to binary ($x_{ij} > 0)$

In [7]:
def transform_binary(df):
    x_values = df.loc[:, df.columns != 'target']
    x_values = x_values.applymap(lambda x: int(x > 0))
    x_values['target'] = df['target']
    df = x_values
    return df

binary_train_df = transform_binary(train_df)
binary_test_df = transform_binary(test_df)

# preview
binary_train_df

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,target
0,1,1,1,0,1,1,1,1,0,1,...,0,1,0,1,1,1,1,1,1,1.0
1,0,0,0,0,1,0,1,1,1,1,...,0,1,0,1,0,0,1,1,1,1.0
2,0,0,0,0,1,0,1,1,1,1,...,0,1,0,1,0,0,1,1,1,1.0
3,0,0,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,1,1,1,1.0
4,0,0,0,0,1,0,0,0,0,1,...,0,1,0,1,1,0,1,1,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3060,1,0,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,1,1,0.0
3061,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,1,1,0.0
3062,1,0,1,0,0,0,0,0,0,0,...,1,1,0,0,0,0,1,1,1,0.0
3063,1,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,1,1,0.0


### **3. Optimizing via Cross Validation**

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

#regularizer_strength_range = np.arange(0.0001,1.,0.01)
params = {
    "C" : np.arange(0.01,20,.1)
}



In [9]:
import warnings
warnings.filterwarnings('ignore')

data = {
    "normal": [normal_train_df, normal_test_df],
    "log": [log_train_df, log_test_df],
    "binary": [binary_train_df, binary_test_df]
}

best_mean_errors = {}

for name, dataframes_list in data.items():
    for df, use_type in zip(dataframes_list, ['train', 'test']):
        x = (df.loc[:, df.columns != 'target']).values
        y = (df.loc[:, 'target']).values
        gs = GridSearchCV(LogisticRegression(), param_grid=params)
        gs.fit(x,y)

        print("-"*10)
        print(name + "_" + use_type)
        print("Optim. C value:", gs.best_params_['C'])
        print("Optim. regularizer strength value:", 1/(gs.best_params_['C']))
        lr = LogisticRegression(**gs.best_params_)
        lr.fit(x, y)
        best_mean_errors[name + "_" + use_type] = mean_absolute_error(y, lr.predict(x))

----------
normal_train
Optim. C value: 16.310000000000002
Optim. regularizer strength value: 0.06131207847946044
----------
normal_test
Optim. C value: 8.01
Optim. regularizer strength value: 0.12484394506866417
----------
log_train
Optim. C value: 0.21000000000000002
Optim. regularizer strength value: 4.761904761904762
----------
log_test
Optim. C value: 0.7100000000000001
Optim. regularizer strength value: 1.408450704225352
----------
binary_train
Optim. C value: 0.21000000000000002
Optim. regularizer strength value: 4.761904761904762
----------
binary_test
Optim. C value: 1.11
Optim. regularizer strength value: 0.9009009009009008


In [10]:
print(best_mean_errors)

{'normal_train': 0.06982055464926591, 'normal_test': 0.06966145833333333, 'log_train': 0.052528548123980424, 'log_test': 0.048177083333333336, 'binary_train': 0.0632952691680261, 'binary_test': 0.059244791666666664}


### **4. Conclusions**

Translating the results to a table, finally, the absolute mean error is as follows:

|method|train|test|
|:-|:-:|:-:|
|stnd|0.0698|0.0696|
|log|0.0525|0.04817|
|binary|0.0633|0.05924|

The discrepancy between what should have resulted (from the source material) and what we got may be due to a different path that the grid search took in order to get to the optim.