In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw dataset from 'data.csv' in the working directory, then print
# the per-column summary (non-null counts and dtypes).
df = pd.read_csv(filepath_or_buffer='data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [None]:
# Remove the 'Unnamed: 32' and 'id' columns before modelling.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises KeyError once the columns
# have already been dropped.
df = df.drop(columns=['Unnamed: 32', 'id'], errors='ignore')

In [None]:
# Encode the target labels as integers: 'M' -> 1, 'B' -> 0.
# The guard makes the cell safe to re-run: mapping an already-encoded (numeric)
# column through the same dict would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
  df['diagnosis'] = df['diagnosis'].map({'M':1,'B':0})

In [None]:
# Result registries keyed by experiment name:
#   Traing_score  - cross-validation scores measured on the training split
#   Testing_score - scores measured on the held-out test split
#   Model_accuracy - created here; not referenced in the visible cells
Traing_score, Testing_score, Model_accuracy = {}, {}, {}


# Scaling and OverSampling

In [None]:
def scaling(X,Y, scale,over_sampling = False, random_state = 42):
  """Scale the features and optionally balance the classes by oversampling.

  Parameters
  ----------
  X : array-like
      Feature matrix passed to the scaler.
  Y : array-like
      Target labels aligned with ``X``.
  scale : object
      Scaler exposing ``fit`` and ``transform``. It is fitted in place on
      ``X``, so the caller can reuse the same object afterwards to transform
      other data with the statistics learned here.
  over_sampling : bool, default False
      When True, the scaled data is resampled with ``RandomOverSampler``.
  random_state : int or None, default 42
      Seed handed to ``RandomOverSampler`` so repeated runs produce the same
      resampled data. Ignored when ``over_sampling`` is False.

  Returns
  -------
  tuple
      ``(x_scaled, y_osr)``: the scaled (and possibly resampled) features and
      the matching labels. Without oversampling ``y_osr`` is ``Y`` itself.
  """
  scale.fit(X)
  scaled_data = scale.transform(X)
  if not over_sampling:
    return scaled_data, Y
  # Build the sampler only when it is actually needed, and seed it so the
  # resampling (and every score derived from it) is reproducible.
  osr = RandomOverSampler(random_state = random_state)
  x_scaled, y_osr = osr.fit_resample(scaled_data, Y)
  return x_scaled, y_osr

# Hyperparameter Grid

In [None]:
# Hyper-parameter grid explored by GridSearchCV for the SVC models.
# 'laplacian' was removed from 'kernel': SVC only accepts 'linear', 'poly',
# 'rbf', 'sigmoid' (or 'precomputed' / a callable), so every candidate using
# it failed to fit and one fifth of the search was wasted on errors that the
# global warnings filter hides.
grid = {
    'C':[0.1,1,10,100],
    'gamma':[1,0.1,0.01,0.001],
    'kernel':['linear','poly','sigmoid','rbf'],
    # degree is only used by the 'poly' kernel; other kernels ignore it
    'degree':[1,2,3,4,5]
}

In [None]:
# Separate the target column ('diagnosis') from the feature matrix.
Y = df['diagnosis']
X = df.drop(columns = 'diagnosis')

# Without Scaling

In [None]:
# Baseline: default SVC on the raw (unscaled) features.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
svc_m = SVC(random_state=101)
# The "training" score is the mean 5-fold cross-validation accuracy on the training split.
cross_val = cross_val_score(svc_m, X_train, y_train, cv = 5)
Traing_score['Default_valuse'] = np.mean(cross_val)
print("The traing accuracy", Traing_score['Default_valuse'])
# cross_val_score works on clones, so fit the model itself once before scoring
# (the original fitted it twice).
svc_m.fit(X_train, y_train)
# Use the same key as in Traing_score (it was 'Default_values' here) so the
# training and test results of this experiment line up in the summary table.
Testing_score['Default_valuse'] = svc_m.score(X_test, y_test)
print('The test accuracy',Testing_score['Default_valuse'])



The traing accuracy 0.8943354430379747
The test accuracy 0.935672514619883


# With Scaling

In [None]:
# Default SVC evaluated once per scaler.
for i in [StandardScaler(), MinMaxScaler()]:
  # Split BEFORE scaling so the scaler never sees the held-out rows
  # (fitting it on all of X leaks test-set statistics into training).
  X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
  # scaling() fits the scaler `i` in place on the training split only ...
  X_train, y_train = scaling(X_train, y_train, i, False)
  # ... and the fitted scaler is then reused to transform the test split.
  X_test = i.transform(X_test)
  svc_m = SVC(random_state=101)
  # "Training" score = mean 5-fold cross-validation accuracy on the training split.
  cross_val = cross_val_score(svc_m, X_train, y_train, cv = 5)
  Traing_score[f'Default_valuse-{i}'] = np.mean(cross_val)
  print(f"The traing accuracy-{i}", Traing_score[f'Default_valuse-{i}'])
  # Fit once (the original fitted twice) and score on the untouched test split.
  svc_m.fit(X_train, y_train)
  Testing_score[f'Default_valuse-{i}'] = svc_m.score(X_test, y_test)
  print(f'The test accuracy-{i}',Testing_score[f'Default_valuse-{i}'])



The traing accuracy-StandardScaler() 0.9672784810126581
The test accuracy-StandardScaler() 0.9707602339181286
The traing accuracy-MinMaxScaler() 0.9723734177215189
The test accuracy-MinMaxScaler() 0.9824561403508771


# Scaling and Oversampling

In [None]:
# Default SVC evaluated once per scaler, with the training split oversampled.
for i in [StandardScaler(), MinMaxScaler()]:
  # Split BEFORE scaling/oversampling: oversampling the full data set first
  # puts duplicates of training rows into the test split and inflates the score.
  X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
  # Fit the scaler `i` and the oversampler on the training split only.
  X_train, y_train = scaling(X_train, y_train, i, True)
  # The test split is only transformed with the already-fitted scaler; it is
  # never resampled, so it keeps the original class distribution.
  X_test = i.transform(X_test)
  svc_m = SVC(random_state=101)
  # "Training" score = mean 5-fold cross-validation accuracy on the training split.
  cross_val = cross_val_score(svc_m, X_train, y_train, cv = 5)
  Traing_score[f'Default_valuse-{i}-OverSamp'] = np.mean(cross_val)
  print(f"The traing accuracy-{i}-OverSamp", Traing_score[f'Default_valuse-{i}-OverSamp'])
  # Fit once (the original fitted twice) and score on the untouched test split.
  svc_m.fit(X_train, y_train)
  Testing_score[f'Default_valuse-{i}-OverSamp'] = svc_m.score(X_test, y_test)
  print(f'The test accuracy-{i}-Oversamp',Testing_score[f'Default_valuse-{i}-OverSamp'])

The traing accuracy-StandardScaler()-OverSamp 0.9759595959595959
The test accuracy-StandardScaler()-Oversamp 0.9627906976744186
The traing accuracy-MinMaxScaler()-OverSamp 0.9779595959595959
The test accuracy-MinMaxScaler()-Oversamp 0.9767441860465116


# Grid Search without Scaling

In [None]:
# 5-fold grid search over `grid` for an SVC trained on the raw (unscaled) features.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)
svc_m = SVC(random_state = 101)
svc_grid = GridSearchCV(estimator = svc_m, param_grid = grid, cv = 5)
svc_grid.fit(X_train, y_train)
print(f"the best parameters :{svc_grid.best_params_}")

# Record the best mean cross-validation score found by the search.
Traing_score['Grid_valuse'] = svc_grid.best_score_
print("The traing accuracy", Traing_score['Grid_valuse'])

# Score the fitted search object on the held-out split.
Testing_score['Grid_valuse'] = svc_grid.score(X_test, y_test)
print('The test accuracy',Testing_score['Grid_valuse'])

# Grid with Scaling

In [None]:
# Grid-searched SVC evaluated once per scaler.
for i in [StandardScaler(), MinMaxScaler()]:
  # Split BEFORE scaling so the scaler never sees the held-out rows
  # (fitting it on all of X leaks test-set statistics into training).
  X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
  # scaling() fits the scaler `i` in place on the training split only ...
  X_train, y_train = scaling(X_train, y_train, i, False)
  # ... and the fitted scaler is then reused to transform the test split.
  X_test = i.transform(X_test)
  svc_m = SVC(random_state = 101)
  svc_grid = GridSearchCV(svc_m , grid , cv = 5)
  svc_grid.fit(X_train, y_train)
  print(f"the best parameters :{svc_grid.best_params_}")
  # best_score_ is the best mean cross-validation score found by the search.
  Traing_score[f'Grid_valuse-{i}'] = svc_grid.best_score_
  print(f"The traing accuracy-{i}", Traing_score[f'Grid_valuse-{i}'])
  Testing_score[f'Grid_valuse-{i}'] = svc_grid.score(X_test, y_test)
  print(f'The test accuracy-{i}',Testing_score[f'Grid_valuse-{i}'])

the best parameters :{'C': 1, 'degree': 1, 'gamma': 1, 'kernel': 'linear'}
The traing accuracy-StandardScaler() 0.9748101265822784
The test accuracy-StandardScaler() 0.9766081871345029
the best parameters :{'C': 1, 'degree': 2, 'gamma': 1, 'kernel': 'poly'}
The traing accuracy-MinMaxScaler() 0.977373417721519
The test accuracy-MinMaxScaler() 0.9883040935672515


# Grid with Scaling and Oversampling

In [None]:
# Grid-searched SVC evaluated once per scaler, with the training split oversampled.
for i in [StandardScaler(), MinMaxScaler()]:
  # Split BEFORE scaling/oversampling: oversampling the full data set first
  # puts duplicates of training rows into the test split and inflates the score.
  X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
  # Fit the scaler `i` and the oversampler on the training split only.
  X_train, y_train = scaling(X_train, y_train, i, True)
  # The test split is only transformed with the already-fitted scaler; it is
  # never resampled, so it keeps the original class distribution.
  X_test = i.transform(X_test)
  svc_m = SVC(random_state = 101)
  svc_grid = GridSearchCV(svc_m , grid , cv = 5)
  svc_grid.fit(X_train, y_train)
  print(f"the best parameters :{svc_grid.best_params_}")
  # best_score_ is the best mean cross-validation score found by the search.
  Traing_score[f'Grid_valuse-{i}-OVS'] = svc_grid.best_score_
  print(f"The traing accuracy-{i}-OVS", Traing_score[f'Grid_valuse-{i}-OVS'])
  Testing_score[f'Grid_valuse-{i}-OVS'] = svc_grid.score(X_test, y_test)
  print(f'The test accuracy-{i}-OVS',Testing_score[f'Grid_valuse-{i}-OVS'])

In [None]:
# Summary table of every experiment recorded so far: one row per score type
# ('Traing_score', 'Testing_score'), one column per experiment key.
df1 = pd.DataFrame(Traing_score, index=['Traing_score'])
df2 = pd.DataFrame(Testing_score, index=['Testing_score'])

# Stack the two single-row frames vertically. With axis=1 the differing row
# labels could not align, which produced duplicated columns half filled with NaN.
Scores = pd.concat([df1, df2], axis=0)
Scores

# LinearSVC

In [None]:
from sklearn.svm import LinearSVC

# LinearSVC on the raw (unscaled) features.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
# Seeded like the SVC models above so repeated runs give the same result.
clf = LinearSVC(random_state=101)

# Score this model itself. The original read svc_grid.best_score_ and
# svc_grid.score(...), i.e. it reported the previous grid-search SVC instead of
# the LinearSVC, and built the key from a loop variable `i` left over from an
# earlier cell.
cross_val = cross_val_score(clf, X_train, y_train, cv = 5)
Traing_score['LinearSVC_valuse'] = np.mean(cross_val)
print("The traing accuracy", Traing_score['LinearSVC_valuse'])
clf.fit(X_train, y_train)
Testing_score['LinearSVC_valuse'] = clf.score(X_test, y_test)
print('The test accuracy',Testing_score['LinearSVC_valuse'])




# LinearSVC with scaling

In [None]:
# LinearSVC evaluated once per scaler.
for i in [StandardScaler(), MinMaxScaler()]:
  # Split BEFORE scaling so the scaler never sees the held-out rows.
  X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
  # scaling() fits the scaler `i` in place on the training split only; the
  # fitted scaler is then reused to transform the test split.
  X_train, y_train = scaling(X_train, y_train, i, False)
  X_test = i.transform(X_test)
  # Seeded like the SVC models so repeated runs give the same result.
  clf = LinearSVC(random_state=101)

  # Score this model itself (the original reported svc_grid's scores, i.e. the
  # last grid-search SVC, not the LinearSVC) under a LinearSVC-specific key.
  cross_val = cross_val_score(clf, X_train, y_train, cv = 5)
  Traing_score[f'LinearSVC_valuse-{i}'] = np.mean(cross_val)
  print(f"The traing accuracy-{i}", Traing_score[f'LinearSVC_valuse-{i}'])
  clf.fit(X_train, y_train)
  Testing_score[f'LinearSVC_valuse-{i}'] = clf.score(X_test, y_test)
  print(f'The test accuracy-{i}',Testing_score[f'LinearSVC_valuse-{i}'])

#  LinearSVC with scaling and oversampling

In [None]:
# LinearSVC evaluated once per scaler, with the training split oversampled.
for i in [StandardScaler(), MinMaxScaler()]:
  # Split BEFORE scaling/oversampling so no duplicate of a training row can
  # end up in the test split.
  X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
  # Fit the scaler `i` and the oversampler on the training split only; the
  # test split is merely transformed with the fitted scaler.
  X_train, y_train = scaling(X_train, y_train, i, True)
  X_test = i.transform(X_test)
  # Seeded like the SVC models so repeated runs give the same result.
  clf = LinearSVC(random_state=101)

  # Score this model itself (the original reported svc_grid's scores, i.e. the
  # last grid-search SVC, not the LinearSVC) under a LinearSVC-specific key.
  cross_val = cross_val_score(clf, X_train, y_train, cv = 5)
  Traing_score[f'LinearSVC_valuse-{i}-ovs'] = np.mean(cross_val)
  print(f"The traing accuracy-{i}-ovs", Traing_score[f'LinearSVC_valuse-{i}-ovs'])
  clf.fit(X_train, y_train)
  Testing_score[f'LinearSVC_valuse-{i}-ovs'] = clf.score(X_test, y_test)
  print(f'The test accuracy-{i}-ovs',Testing_score[f'LinearSVC_valuse-{i}-ovs'])

# SGDClassifier


In [None]:
from sklearn.linear_model import SGDClassifier

# SGDClassifier on the raw (unscaled) features.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
# SGD shuffles the data, so seed it to make the scores reproducible.
clf = SGDClassifier(random_state=101)

# Score this model itself. The original read svc_grid.best_score_ and
# svc_grid.score(...), i.e. it reported the grid-search SVC, and stored the
# result under a 'clf_valuse-...' key built from a stale loop variable `i`,
# overwriting a LinearSVC entry.
cross_val = cross_val_score(clf, X_train, y_train, cv = 5)
Traing_score['SGD_valuse'] = np.mean(cross_val)
print("The traing accuracy", Traing_score['SGD_valuse'])
clf.fit(X_train, y_train)
Testing_score['SGD_valuse'] = clf.score(X_test, y_test)
print('The test accuracy',Testing_score['SGD_valuse'])

In [None]:
# SGDClassifier evaluated once per scaler.
for i in [StandardScaler(), MinMaxScaler()]:
  # Split BEFORE scaling so the scaler never sees the held-out rows.
  X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
  # scaling() fits the scaler `i` in place on the training split only; the
  # fitted scaler is then reused to transform the test split.
  X_train, y_train = scaling(X_train, y_train, i, False)
  X_test = i.transform(X_test)
  # SGD shuffles the data, so seed it to make the scores reproducible.
  clf = SGDClassifier(random_state=101)

  # Score this model itself (the original reported svc_grid's scores) and use an
  # SGD-specific key so the LinearSVC results are not overwritten.
  cross_val = cross_val_score(clf, X_train, y_train, cv = 5)
  Traing_score[f'SGD_valuse-{i}'] = np.mean(cross_val)
  print(f"The traing accuracy-{i}", Traing_score[f'SGD_valuse-{i}'])
  clf.fit(X_train, y_train)
  Testing_score[f'SGD_valuse-{i}'] = clf.score(X_test, y_test)
  print(f'The test accuracy-{i}',Testing_score[f'SGD_valuse-{i}'])

In [None]:
# SGDClassifier evaluated once per scaler, with the training split oversampled.
for i in [StandardScaler(), MinMaxScaler()]:
  # Split BEFORE scaling/oversampling so no duplicate of a training row can
  # end up in the test split.
  X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
  # Fit the scaler `i` and the oversampler on the training split only; the
  # test split is merely transformed with the fitted scaler.
  X_train, y_train = scaling(X_train, y_train, i, True)
  X_test = i.transform(X_test)
  # SGD shuffles the data, so seed it to make the scores reproducible.
  clf = SGDClassifier(random_state=101)

  # Score this model itself (the original reported svc_grid's scores) and use an
  # SGD-specific key so the LinearSVC results are not overwritten.
  cross_val = cross_val_score(clf, X_train, y_train, cv = 5)
  Traing_score[f'SGD_valuse-{i}-ovs'] = np.mean(cross_val)
  print(f"The traing accuracy-{i}-ovs", Traing_score[f'SGD_valuse-{i}-ovs'])
  clf.fit(X_train, y_train)
  Testing_score[f'SGD_valuse-{i}-ovs'] = clf.score(X_test, y_test)
  print(f'The test accuracy-{i}-ovs',Testing_score[f'SGD_valuse-{i}-ovs'])

In [None]:
# Example with different hyperparameters
# learning_rate='adaptive' requires a strictly positive initial step size:
# with the documented default eta0=0.0 the estimator raises at fit time, so
# eta0 is set explicitly. 0.01 is a starting value to tune, not a tuned result.
clf = SGDClassifier(alpha=0.001, penalty='l1', learning_rate='adaptive', eta0=0.01)