### Making the necessary imports

In [11]:
import csv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, confusion_matrix



import warnings
warnings.filterwarnings("ignore")

### Read data from file and load into dataframe 

In [12]:
# Read the raw data. header=None means the file has no header row, so pandas
# labels the columns with integers (0..57 in the output below).
file = pd.read_csv("spambase.data", header=None)
# Work on an independent copy (copy() is deep by default) so `file` keeps the
# data exactly as it was read.
df = file.copy()
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.778,0.000,0.000,3.756,61,278,1
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101,1028,1
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485,2259,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40,191,1
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40,191,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3,88,0
4597,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4,14,0
4598,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6,118,0
4599,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5,78,0


## Preprocessing Data

### Splitting the features and the target into separate variables

In [13]:
# Prefix every integer column label with "col" (0 -> "col0", ..., 57 -> "col57").
# The frame is rebuilt from the untouched raw frame `file` instead of renaming
# `df` in place: the in-place rename stacked another prefix on each re-run of
# this cell (col0 -> colcol0), after which the "col57" lookups raised KeyError.
df = file.rename(columns=lambda label: "col" + str(label))
# The last column (col57) is the class label; all other columns are features.
Xtrain = df.drop("col57", axis=1)
Ytrain = df["col57"]

### Using ANOVA feature selection and transformation technique

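The input features are numeric (continuous) and the output is categorical, so the ANOVA F-test (`f_classif`) is used as the scoring function for feature selection.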

In [14]:
# Score each feature against the label with the ANOVA F-test and keep the 55
# highest-scoring ones.
# NOTE(review): the selector is fit on all rows, i.e. before the
# cross-validation splits made further down, so the held-out folds have already
# influenced which features are kept -- confirm this is acceptable.
fvalue_selector = SelectKBest(f_classif, k=55)
X_kbest = fvalue_selector.fit_transform(Xtrain, Ytrain)
# get_support() is a boolean mask over the input columns; index the column
# labels with it to recover the names of the retained features.
column_names = list(Xtrain.columns[fvalue_selector.get_support()])
# fit_transform returned a plain array, so wrap it back into a labelled frame.
Xtrain = pd.DataFrame(X_kbest, columns=column_names)
Xtrain

Unnamed: 0,col0,col2,col3,col4,col5,col6,col7,col8,col9,col10,...,col47,col48,col49,col50,col51,col52,col53,col54,col55,col56
0,0.00,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.000,0.000,0.0,0.778,0.000,0.000,3.756,61.0,278.0
1,0.21,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,0.21,...,0.0,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101.0,1028.0
2,0.06,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,0.38,...,0.0,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485.0,2259.0
3,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,0.31,...,0.0,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40.0,191.0
4,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,0.31,...,0.0,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40.0,191.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.31,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,0.00,...,0.0,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3.0,88.0
4597,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4.0,14.0
4598,0.30,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6.0,118.0
4599,0.96,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5.0,78.0


### Checking for strongly correlated pairs among the remaining features (|correlation| > 0.85); when such a pair exists, one feature of the pair can safely be dropped

In [15]:
# Pairwise correlation between all remaining feature columns.
correlation_matrix = Xtrain.corr()
# True wherever the absolute correlation exceeds the 0.85 threshold.
exceeds_threshold = correlation_matrix.abs().to_numpy() > 0.85
# Keep only the strict lower triangle (row i, column j < i): each pair is then
# looked at once and the diagonal (a column against itself) is ignored.
below_diagonal = np.tril(exceeds_threshold, k=-1)
# A column is flagged when it is strongly correlated with any earlier column,
# so the later member of each pair is the one reported.
correlated_features = set(correlation_matrix.columns[below_diagonal.any(axis=1)])
correlated_features

{'col33'}

In [16]:
# Drop the columns flagged by the correlation check above instead of a
# hard-coded name, so this cell follows the check's result. errors="ignore"
# lets the cell be re-run after the columns are already gone (the original
# raised KeyError on a second run).
Xtrain = Xtrain.drop(columns=sorted(correlated_features), errors="ignore")

## Building Classifiers

### Logistic Regression

In [17]:
# 10-fold cross-validation of logistic regression. For every fold this prints
# the accuracy, the false-positive rate, the false-negative rate and the error
# rate (1 - accuracy), then the mean error rate over all folds.
errors = []
# NOTE(review): the features are not scaled here (the SVM cell scales them) and
# warnings are silenced at the top of the notebook, so a solver convergence
# warning would go unnoticed -- TODO confirm the fit converges.
logisticRegr = LogisticRegression()
# Seed the shuffle so the folds, and therefore the reported numbers, are the
# same on every run; 0 matches the seed already used for the random forest.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
print("Accuracy  F-pos   F-neg   Error rate")
for train, test in kf.split(Xtrain, Ytrain):
    logisticRegr.fit(Xtrain.iloc[train], Ytrain.iloc[train])
    predictions = logisticRegr.predict(Xtrain.iloc[test])
    score = accuracy_score(Ytrain.iloc[test], predictions)
    cm = confusion_matrix(Ytrain.iloc[test], predictions)
    # Normalise each row by its total: rows are the true classes, so
    # [0, 1] is the share of true 0s predicted as 1 (false-positive rate) and
    # [1, 0] is the share of true 1s predicted as 0 (false-negative rate).
    fraction_cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print("{0:.4f}   {1:.4f}   {2:.4f}   {3:.4f}".format(score, fraction_cm[0, 1], fraction_cm[1, 0], (1-score)))
    errors.append(1-score)
errors = np.array(errors)
print("Overall Error rate : {0:.4f}".format(errors.mean()))

Accuracy  F-pos   F-neg   Error rate
0.9046   0.0380   0.1717   0.0954
0.9152   0.0781   0.0942   0.0848
0.9109   0.0894   0.0886   0.0891
0.9130   0.0797   0.0978   0.0870
0.9239   0.0515   0.1117   0.0761
0.9413   0.0391   0.0894   0.0587
0.9239   0.0860   0.0608   0.0761
0.9022   0.0709   0.1354   0.0978
0.9370   0.0378   0.1065   0.0630
0.9261   0.0662   0.0867   0.0739
Overall Error rate : 0.0802


### Naive Bayes

In [19]:
# 10-fold cross-validation of Gaussian naive Bayes. For every fold this prints
# the accuracy, the false-positive rate, the false-negative rate and the error
# rate (1 - accuracy), then the mean error rate over all folds.
gnb = GaussianNB()
# Seed the shuffle so the folds, and therefore the reported numbers, are the
# same on every run; 0 matches the seed already used for the random forest.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
errors = []
print("Accuracy  F-pos   F-neg   Error rate")
for train, test in kf.split(Xtrain, Ytrain):
    gnb.fit(Xtrain.iloc[train], Ytrain.iloc[train])
    predictions = gnb.predict(Xtrain.iloc[test])
    score = accuracy_score(Ytrain.iloc[test], predictions)
    cm = confusion_matrix(Ytrain.iloc[test], predictions)
    # Normalise each row by its total: rows are the true classes, so
    # [0, 1] is the false-positive rate and [1, 0] the false-negative rate.
    fraction_cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print("{0:.4f}   {1:.4f}   {2:.4f}   {3:.4f}".format(score, fraction_cm[0, 1], fraction_cm[1, 0], (1-score)))
    errors.append(1-score)
errors = np.array(errors)
print("Overall Error rate : {0:.4f}".format(errors.mean()))

Accuracy  F-pos   F-neg   Error rate
0.7809   0.3112   0.0686   0.2191
0.8261   0.2482   0.0562   0.1739
0.8174   0.2885   0.0450   0.1826
0.8087   0.2786   0.0556   0.1913
0.8130   0.2818   0.0237   0.1870
0.8370   0.2602   0.0262   0.1630
0.8457   0.2313   0.0335   0.1543
0.7870   0.2896   0.0736   0.2130
0.8435   0.2407   0.0368   0.1565
0.8174   0.2904   0.0266   0.1826
Overall Error rate : 0.1823


### KNN Classifier

In [20]:
# 10-fold cross-validation of a 1-nearest-neighbour classifier. For every fold
# this prints the accuracy, the false-positive rate, the false-negative rate and
# the error rate (1 - accuracy), then the mean error rate over all folds.
# NOTE(review): the features are used unscaled here, unlike in the SVM cell --
# confirm that is intended for a distance-based model.
Kneighbors = KNeighborsClassifier(n_neighbors=1)
# Seed the shuffle so the folds, and therefore the reported numbers, are the
# same on every run; 0 matches the seed already used for the random forest.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
errors = []
print("Accuracy  F-pos   F-neg   Error rate")
for train, test in kf.split(Xtrain, Ytrain):
    Kneighbors.fit(Xtrain.iloc[train], Ytrain.iloc[train])
    predictions = Kneighbors.predict(Xtrain.iloc[test])
    score = accuracy_score(Ytrain.iloc[test], predictions)
    cm = confusion_matrix(Ytrain.iloc[test], predictions)
    # Normalise each row by its total: rows are the true classes, so
    # [0, 1] is the false-positive rate and [1, 0] the false-negative rate.
    fraction_cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print("{0:.4f}   {1:.4f}   {2:.4f}   {3:.4f}".format(score, fraction_cm[0, 1], fraction_cm[1, 0], (1-score)))
    errors.append(1-score)
errors = np.array(errors)
print("Overall Error rate : {0:.4f}".format(errors.mean()))

Accuracy  F-pos   F-neg   Error rate
0.8200   0.1301   0.2500   0.1800
0.8478   0.1155   0.2077   0.1522
0.8217   0.1317   0.2514   0.1783
0.8261   0.1684   0.1829   0.1739
0.8370   0.1383   0.2022   0.1630
0.8304   0.1455   0.2054   0.1696
0.8217   0.1538   0.2184   0.1783
0.8109   0.1530   0.2458   0.1891
0.7957   0.1906   0.2253   0.2043
0.8283   0.1241   0.2419   0.1717
Overall Error rate : 0.1760


### Random Forest Classifier

In [21]:
# 10-fold cross-validation of a shallow (max_depth=2) random forest. For every
# fold this prints the accuracy, the false-positive rate, the false-negative
# rate and the error rate (1 - accuracy), then the mean error rate over all folds.
randomforest = RandomForestClassifier(max_depth=2, random_state=0)
# The forest was already seeded, but the fold shuffle was not, so results still
# changed from run to run. Seed the splitter with the same value.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
errors = []
print("Accuracy  F-pos   F-neg   Error rate")
for train, test in kf.split(Xtrain, Ytrain):
    randomforest.fit(Xtrain.iloc[train], Ytrain.iloc[train])
    predictions = randomforest.predict(Xtrain.iloc[test])
    score = accuracy_score(Ytrain.iloc[test], predictions)
    cm = confusion_matrix(Ytrain.iloc[test], predictions)
    # Normalise each row by its total: rows are the true classes, so
    # [0, 1] is the false-positive rate and [1, 0] the false-negative rate.
    fraction_cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print("{0:.4f}   {1:.4f}   {2:.4f}   {3:.4f}".format(score, fraction_cm[0, 1], fraction_cm[1, 0], (1-score)))
    errors.append(1-score)
errors = np.array(errors)
print("Overall Error rate : {0:.4f}".format(errors.mean()))

Accuracy  F-pos   F-neg   Error rate
0.8937   0.0183   0.2340   0.1063
0.8848   0.0182   0.2581   0.1152
0.8935   0.0239   0.2515   0.1065
0.9022   0.0217   0.2120   0.0978
0.8696   0.0176   0.3125   0.1304
0.8804   0.0211   0.2800   0.1196
0.8957   0.0254   0.2228   0.1043
0.9000   0.0261   0.2031   0.1000
0.9000   0.0103   0.2529   0.1000
0.9000   0.0260   0.2042   0.1000
Overall Error rate : 0.1080


### Support Vector Machine

In [23]:
# 10-fold cross-validation of an SVM. For every fold this prints the accuracy,
# the false-positive rate, the false-negative rate and the error rate
# (1 - accuracy), then the mean error rate over all folds.
# The scaler sits inside the pipeline, so it is re-fit on the training rows of
# each fold when svm.fit is called in the loop.
svm = make_pipeline(StandardScaler(), SVC(gamma='auto'))
# Seed the shuffle so the folds, and therefore the reported numbers, are the
# same on every run; 0 matches the seed already used for the random forest.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
errors = []
print("Accuracy  F-pos   F-neg   Error rate")
for train, test in kf.split(Xtrain, Ytrain):
    svm.fit(Xtrain.iloc[train], Ytrain.iloc[train])
    predictions = svm.predict(Xtrain.iloc[test])
    score = accuracy_score(Ytrain.iloc[test], predictions)
    cm = confusion_matrix(Ytrain.iloc[test], predictions)
    # Normalise each row by its total: rows are the true classes, so
    # [0, 1] is the false-positive rate and [1, 0] the false-negative rate.
    fraction_cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print("{0:.4f}   {1:.4f}   {2:.4f}   {3:.4f}".format(score, fraction_cm[0, 1], fraction_cm[1, 0], (1-score)))
    errors.append(1-score)
errors = np.array(errors)
print("Overall Error rate : {0:.4f}".format(errors.mean()))

Accuracy  F-pos   F-neg   Error rate
0.9349   0.0246   0.1307   0.0651
0.9391   0.0521   0.0756   0.0609
0.9348   0.0265   0.1173   0.0652
0.9261   0.0414   0.1186   0.0739
0.9457   0.0396   0.0769   0.0543
0.9261   0.0474   0.1129   0.0739
0.9261   0.0436   0.1189   0.0739
0.9435   0.0305   0.1030   0.0565
0.9261   0.0565   0.1017   0.0739
0.9261   0.0393   0.1278   0.0739
Overall Error rate : 0.0672


| Classifier | Mean Error Rate |
|---|---|
| Logistic Regression | 0.0802 |
| Naive Bayes | 0.1823 |
| KNN | 0.1760 |
| Random forest | 0.1080 |
| SVM | 0.0672 |

### The table above shows that the SVM has the lowest mean error rate (0.0672)


