# Project Group 32
## SVM Model - Cancer Classification

In [9]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler
from IPython.display import display # Just for solution

drop_columns = ['id', 
                'perimeter_mean', 'area_mean', 'concavity_mean', 'concave points_mean',
                'perimeter_se', 'area_se', 'concavity_se', 'concave points_se', 'fractal_dimension_se',
                'perimeter_worst', 'area_worst', 'concavity_worst', 'concave points_worst', 'fractal_dimension_worst',
                ]

df = pd.read_csv('Cancer_Data.csv')
df = df.drop(drop_columns, axis = 1)

cats = ["radius_mean","texture_mean", "smoothness_mean","compactness_mean","symmetry_mean","fractal_dimension_mean","radius_se","texture_se","smoothness_se", "compactness_se","symmetry_se","radius_worst","texture_worst","smoothness_worst","compactness_worst","symmetry_worst"]
for cat in cats:
    print(df[cat].unique()) 

[17.99  20.57  19.69  11.42  20.29  12.45  18.25  13.71  13.    12.46
 16.02  15.78  19.17  15.85  13.73  14.54  14.68  16.13  19.81  13.54
 13.08   9.504 15.34  21.16  16.65  17.14  14.58  18.61  15.3   17.57
 18.63  11.84  17.02  19.27  16.74  14.25  13.03  14.99  13.48  13.44
 10.95  19.07  13.28  13.17  18.65   8.196 12.05  13.49  11.76  13.64
 11.94  18.22  15.1   11.52  19.21  14.71  13.05   8.618 10.17   8.598
  9.173 12.68  14.78   9.465 11.31   9.029 12.78  18.94   8.888 17.2
 13.8   12.31  16.07  13.53  18.05  20.18  12.86  11.45  13.34  25.22
 19.1   12.    18.46  14.48  19.02  12.36  14.64  14.62  15.37  13.27
 13.45  15.06  20.26  12.18   9.787 11.6   14.42  13.61   6.981  9.876
 10.49  13.11  11.64  22.27  11.34   9.777 12.63  14.26  10.51   8.726
 11.93   8.95  14.87  17.95  11.41  18.66  24.25  14.5   13.37  13.85
 19.    19.79  12.19  15.46  16.16  15.71  18.45  12.77  11.71  11.43
 14.95  11.28   9.738 16.11  12.9   10.75  11.9   11.8   14.44  13.74
  8.219  9.731 11.

In [12]:
df['diagnosis_N'] = (df['diagnosis'] == 'M')
print(df['diagnosis_N'])
df = df.drop(columns=['diagnosis'])

print(df.columns[df.isna().any()])
df = df.fillna(0)

print(df.shape)
display(df.head(100))

0       True
1       True
2       True
3       True
4       True
       ...  
564     True
565     True
566     True
567     True
568    False
Name: diagnosis_N, Length: 569, dtype: bool
Index([], dtype='object')
(569, 17)


Unnamed: 0,radius_mean,texture_mean,smoothness_mean,compactness_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,smoothness_se,compactness_se,symmetry_se,radius_worst,texture_worst,smoothness_worst,compactness_worst,symmetry_worst,diagnosis_N
0,17.990,10.38,0.11840,0.27760,0.2419,0.07871,1.0950,0.9053,0.006399,0.04904,0.03003,25.38,17.33,0.1622,0.66560,0.4601,True
1,20.570,17.77,0.08474,0.07864,0.1812,0.05667,0.5435,0.7339,0.005225,0.01308,0.01389,24.99,23.41,0.1238,0.18660,0.2750,True
2,19.690,21.25,0.10960,0.15990,0.2069,0.05999,0.7456,0.7869,0.006150,0.04006,0.02250,23.57,25.53,0.1444,0.42450,0.3613,True
3,11.420,20.38,0.14250,0.28390,0.2597,0.09744,0.4956,1.1560,0.009110,0.07458,0.05963,14.91,26.50,0.2098,0.86630,0.6638,True
4,20.290,14.34,0.10030,0.13280,0.1809,0.05883,0.7572,0.7813,0.011490,0.02461,0.01756,22.54,16.67,0.1374,0.20500,0.2364,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,20.260,23.03,0.09078,0.13130,0.2095,0.05649,0.7576,1.5090,0.006016,0.03482,0.02657,24.22,31.59,0.1190,0.35390,0.3689,True
96,12.180,17.84,0.10450,0.07057,0.1900,0.06635,0.3661,1.5110,0.005433,0.01179,0.02220,12.83,20.92,0.1140,0.09358,0.2227,False
97,9.787,19.94,0.10240,0.05301,0.1350,0.06890,0.3350,2.0430,0.011130,0.01463,0.01801,10.92,26.29,0.1316,0.09473,0.1934,False
98,11.600,12.84,0.08983,0.07525,0.1620,0.06582,0.2315,0.5391,0.006153,0.01330,0.01651,13.06,17.16,0.1431,0.18510,0.2772,False


## Exercies 4 - SVM Classifier (35 points in total)

### Exercise 4.1 - Additional Data Preprocessing (5 points)

To build a SVM Classifier, we need a different encoding for our categorical variables.

 - For each of the **categorical attribtues**, encode them with **one-hot encoding**.
     - You can find information about this encoding in the discussion materials.


 - Split the data into training and testing set with the ratio of 80:20.


In [15]:
cats = ["radius_mean","texture_mean", "smoothness_mean","compactness_mean","symmetry_mean","fractal_dimension_mean","radius_se","texture_se","smoothness_se", "compactness_se","symmetry_se","radius_worst","texture_worst","smoothness_worst","compactness_worst","symmetry_worst"]
nums = [col for col in df.drop(columns=['diagnosis_N']).columns if col not in cats]
df_svm = df.copy()
df_svm = pd.get_dummies(df_svm, columns=cats)
print(df_svm.columns)

svm_train, svm_test = train_test_split(df_svm, test_size=0.2)
X_svm_train, y_svm_train = svm_train.drop(columns=['diagnosis_N']), svm_train['diagnosis_N']
X_svm_test, y_svm_test = svm_test.drop(columns=['diagnosis_N']), svm_test['diagnosis_N']

Index(['diagnosis_N', 'radius_mean_6.981', 'radius_mean_7.691',
       'radius_mean_7.729', 'radius_mean_7.76', 'radius_mean_8.196',
       'radius_mean_8.219', 'radius_mean_8.571', 'radius_mean_8.597',
       'radius_mean_8.598',
       ...
       'symmetry_worst_0.4753', 'symmetry_worst_0.4761',
       'symmetry_worst_0.4824', 'symmetry_worst_0.4863',
       'symmetry_worst_0.4882', 'symmetry_worst_0.5166',
       'symmetry_worst_0.544', 'symmetry_worst_0.5558',
       'symmetry_worst_0.5774', 'symmetry_worst_0.6638'],
      dtype='object', length=7931)


### Exercise 4.2 - SVM with Different Kernels (20 points)

Using all the attributes we have, please build a SVM that predicts the column `satisfied`. <br >
Specifically, please 
 - Build one SVM with **linear kernel**.
 - Build another SVM but with **rbf kernel**.
 - Report the **testing results** of **both models** using `classification report`.

The kernel is the only setting requirement. <br >
Other hyperparameter tuning is not required. But make sure they are the same in these two SVMs if you'd like to tune the model. In other words, the only difference between the two SVMs should be the kernel setting.

**Remember to scale your data. The scaling method is up to you.**

In [16]:
svc_li = SVC(kernel='linear')

scaler = preprocessing.StandardScaler()
scaler.fit(X_svm_train)

Z_svm_train = scaler.transform(X_svm_train)
Z_svm_test = scaler.transform(X_svm_test)

svc_li.fit(Z_svm_train, np.asarray(y_svm_train))

print('Linear Kernel')
print(classification_report(y_svm_test, svc_li.predict(Z_svm_test)))

Linear Kernel
              precision    recall  f1-score   support

       False       0.67      1.00      0.80        75
        True       1.00      0.05      0.10        39

    accuracy                           0.68       114
   macro avg       0.83      0.53      0.45       114
weighted avg       0.78      0.68      0.56       114



### Exercise 4.3 - SVM with Over-sampling (10 points)
 - For the column `satisfied` in our **training set**, please **print out** the frequency of each class. 
 - Oversample the **training data**. 
 - For the column `satisfied` in the oversampled data, **print out** the frequency of each class  again.
 - Re-build the 2 SVMs with the same setting you have in Exercise 3.2, but **use oversampled training data** instead.
     - Do not forget to scale the data first. As always, the scaling method is up to you.
 - Report the **testing result** with `classification_report`.

You can use ANY methods listed on [here](https://imbalanced-learn.org/stable/references/over_sampling.html#) such as RandomOverSampler or SMOTE. <br > 
You are definitely welcomed to build your own oversampler. <br >

Note that you do not have to over-sample your testing data.

In [17]:
print(y_svm_train.value_counts())

ros = RandomOverSampler()
X_os, y_os = ros.fit_resample(X_svm_train, y_svm_train)

print(y_os.value_counts())

False    282
True     173
Name: diagnosis_N, dtype: int64
True     282
False    282
Name: diagnosis_N, dtype: int64


In [18]:
import numpy as np

class SVM:
    def __init__(self,lr=.005,iters=1000,lambda_p=0.01):
        self.lr=lr
        self.iters=iters
        self.lambda_p = lambda_p

        self.weight = None
        self.bias = None
        
    def fit(self,X,y):
        n_samples,n_feat = X.shape
        
        self.weight = np.zeros(n_feat)
        self.bias = 0
        
        y_true = np.where(y <= 0, -1, 1) # same as -1 if y <= 0 else 1
        
        for _ in range(self.iters):
            for idx, sample in enumerate(X):
                y_pred = y_true[idx]*(np.dot(sample,self.weight)-self.bias)

                if y_pred>=1:
                    self.weight -= self.lr * (2 * self.lambda_p * self.weight)
                else:
                    self.weight -= self.lr * (2 * self.lambda_p * self.weight - np.dot(sample, y_true[idx]))
                    self.bias -= self.lr * y_true[idx]
        
    def predict(self,X):
        # do a forward pass
        y = np.dot(X, self.weight) - self.bias
        return y>=0 # if > 0, belongs to class 0, else class 1

In [19]:
svm = SVM()
svm.fit(Z_svm_train, np.asarray(y_svm_train))

print(classification_report(y_svm_test, svm.predict(Z_svm_test)))

              precision    recall  f1-score   support

       False       0.67      1.00      0.80        75
        True       1.00      0.05      0.10        39

    accuracy                           0.68       114
   macro avg       0.83      0.53      0.45       114
weighted avg       0.78      0.68      0.56       114

