In [None]:
!pip install -U imbalanced-learn

Requirement already up-to-date: imbalanced-learn in /usr/local/lib/python3.7/dist-packages (0.8.0)


# Minimally exploring the breast-cancer data


Class Distribution:

- no-recurrence-events: 201 instances
- recurrence-events: 85 instances


In [None]:
import pandas as pd
# Read the raw data file. It has no header row (header=None), so the column
# names are supplied here; "?" entries are parsed as missing values (NaN).
headers =["recurrence", "age", "menopause", "tumor-size", "inv-nodes",
           "node-caps", "deg-malig", "breast", "breast-quad",
           "irradiant"]
cancer = pd.read_csv('breast-cancer.data', sep=",", 
                     header = None,names=headers, na_values="?" )


# Per-column mapping from the category label found in the file to a numeric code.
# Range labels ("40-49", "10-14", ...) are mapped to the lower bound of the range.
# Columns without an entry here (deg-malig) are left as read.
cleanup = {"recurrence": {"no-recurrence-events": 0, "recurrence-events": 1},
           "age": {"10-19": 10, "20-29": 20, "30-39": 30, "40-49": 40,
                   "50-59": 50, "60-69": 60, "70-79": 70, "80-89": 80, "90-99": 90},
           "menopause": {"premeno": 0, "lt40":1, "ge40":2},
           "tumor-size": {"0-4": 0, "5-9":5, "10-14":10, "15-19":15, "20-24":20, 
                         "25-29":25, "30-34":30, "35-39":35, "40-44":40,
                          "45-49":45, "50-54":50, "55-59":55},
           "inv-nodes": {"0-2": 0, "3-5":3, "6-8":6, "9-11":9, "12-14":12, 
                         "15-17":15, "18-20":18, "21-23":21, "24-26":24,
                         "27-29":27, "30-32":30, "33-35":33, "36-39":36},
           "node-caps": {"yes":1, "no":0},
           "breast": {"left":0, "right":1},
           # Quadrants get signed codes: left side negative, right side positive, central 0.
           "breast-quad": {"left_up":-2, "left_low":-1, 
                           "right_up":1,	"right_low":2, "central":0},
           "irradiant": {"yes":1, "no":0}}

# Apply the encoding, then drop every row that still contains a missing value.
cancer = cancer.replace(cleanup).dropna()

print(cancer)

     recurrence  age  menopause  ...  breast  breast-quad  irradiant
0             0   30          0  ...       0         -1.0          0
1             0   40          0  ...       1          1.0          0
2             0   40          0  ...       0         -1.0          0
3             0   60          2  ...       1         -2.0          0
4             0   40          0  ...       1          2.0          0
..          ...  ...        ...  ...     ...          ...        ...
281           1   30          0  ...       0         -2.0          0
282           1   30          0  ...       0         -2.0          1
283           1   60          2  ...       1         -2.0          0
284           1   40          2  ...       0         -1.0          0
285           1   50          2  ...       0         -1.0          0

[277 rows x 10 columns]


### Import libraries

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import auc, accuracy_score
import matplotlib.pyplot as plt
import numpy as np


from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import NeighbourhoodCleaningRule
from warnings import simplefilter
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE



from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)


We split the data in 80-20 proportion and preprocess:

In [None]:
from sklearn import preprocessing
# Target is the encoded recurrence flag; every other column is a feature.
y = cancer['recurrence']
X = cancer.drop(columns='recurrence')

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=11, test_size=0.2)

# The scaler is fitted on the training part only and then applied to both parts.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Functions to test resampling techniques

In [None]:
def _fitAndScore(model, X_fit, y_fit, X_test, y_test):
  """Fit `model` on (X_fit, y_fit) and return (accuracy, AUC) measured on (X_test, y_test)."""
  model.fit(X_fit, y_fit)

  # The ROC curve is built from column 1 of predict_proba.
  y_scores = model.predict_proba(X_test)
  fpr, tpr, _ = roc_curve(y_test, y_scores[:, 1])

  acc = accuracy_score(y_test, model.predict(X_test))
  return acc, auc(fpr, tpr)


def testResamplingTechniques(model, resampling, X_train, y_train, X_test, y_test,niter=1):
  """Score `model` on the test set after optionally resampling the training set.

  Args:
    model: classifier exposing fit, predict and predict_proba. It is fitted in
      place (no copy is made), so it is left fitted on the last training set used.
    resampling: the string "none" to train on the data as given, otherwise an
      object exposing fit_resample(X, y).
    X_train, y_train: training data.
    X_test, y_test: evaluation data.
    niter: number of resample/fit/score repetitions to average over. Ignored
      when resampling is "none".

  Returns:
    (accuracy, AUC). With a resampler, both are means over the `niter` repetitions.

  Raises:
    ValueError: if a resampler is given and niter < 1 (nothing to average).
  """
  if resampling == "none":
    return _fitAndScore(model, X_train, y_train, X_test, y_test)

  if niter < 1:
    raise ValueError("niter must be at least 1 when a resampler is used")

  accVec = []
  aucVec = []
  for _ in range(niter):
    # Resample on every repetition so that a sampler with randomness can
    # produce a different training set each time.
    Xmod, ymod = resampling.fit_resample(X_train, y_train)
    acc, aucc = _fitAndScore(model, Xmod, ymod, X_test, y_test)
    accVec.append(acc)
    aucVec.append(aucc)

  return np.mean(accVec), np.mean(aucVec)

  
  

In [None]:
def selectTechnique(resname, perc):
  """Return the resampler selected by `resname`.

  Args:
    resname: one of "none", "oversample", "undersample", "NCL", "Tomek", "smote".
    perc: value passed as `sampling_strategy` to the ratio-based samplers
      ("oversample", "undersample", "smote"). Not used by the other techniques.

  Returns:
    The string "none" when no resampling is requested, otherwise a freshly
    constructed sampler object.

  Raises:
    ValueError: if `resname` is not one of the supported names.
  """
  if resname == "none":
    return "none"
  if resname == "oversample":
    return RandomOverSampler(sampling_strategy=perc)
  if resname == "undersample":
    return RandomUnderSampler(sampling_strategy=perc)
  if resname == "NCL":
    return NeighbourhoodCleaningRule()
  if resname == "Tomek":
    return TomekLinks()
  if resname == "smote":
    return SMOTE(sampling_strategy=perc)

  # Previously an unknown name fell through and failed on an unassigned local.
  raise ValueError("Unknown resampling technique: {!r}".format(resname))

## Tomek Links and NCL change the class ratio of the training data:

In [None]:
from collections import Counter

# For each variant of the training labels, print the class counts and the
# ratio of class 1 to class 0. The ratios are derived from the counts
# themselves so they stay correct if the data or the split changes.
class_counts = Counter(y_train)
print(class_counts)
print("Ratio for no resampling is:", class_counts[1] / class_counts[0])

X_aux, y_aux = TomekLinks().fit_resample(X_train, y_train)
class_counts = Counter(y_aux)
print(class_counts)
print("Ratio for Tomek Link is:", class_counts[1] / class_counts[0])

X_aux, y_aux = NeighbourhoodCleaningRule().fit_resample(X_train, y_train)
class_counts = Counter(y_aux)
print(class_counts)
print("Ratio for NCL is:", class_counts[1] / class_counts[0])

Counter({0: 157, 1: 64})
Ratio for no resampling is: 0.40764331210191085
Counter({0: 144, 1: 64})
Ratio for Tomek Link is: 0.4444444444444444
Counter({0: 83, 1: 64})
Ratio for NCL is: 0.7710843373493976


## Knn

https://stackoverflow.com/questions/52910061/implementing-roc-curves-for-k-nn-machine-learning-algorithm-using-python-and-sci

In [None]:
def ResultsRatiokNN(perc, kinit, kfinal, step):
  """Compare the resampling techniques for kNN over a range of neighbour counts.

  For every k in range(kinit, kfinal + 1, step) a KNeighborsClassifier is
  scored with each technique (100 repetitions per technique) on the notebook's
  global train/test split. `perc` is forwarded to selectTechnique.

  Returns (accuracy table, AUC table): one row per k, one column per technique.
  """
  techniques = ["none", "oversample", "undersample", "NCL", "Tomek", "smote"]
  accTable = pd.DataFrame(columns=tuple(techniques))
  aucTable = pd.DataFrame(columns=tuple(techniques))

  for n_neighbors in range(kinit, kfinal + 1, step):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    scores = [
        testResamplingTechniques(knn, selectTechnique(name, perc),
                                 X_train, y_train, X_test, y_test, niter=100)
        for name in techniques
    ]
    accTable.loc[n_neighbors] = [acc for acc, _ in scores]
    aucTable.loc[n_neighbors] = [aucc for _, aucc in scores]

  return accTable, aucTable

We explore the performance of kNN for different ratios between the minority and majority classes:

In [None]:
# Sweep k = 1..10; perc=0.6 is the sampling_strategy given to the ratio-based resamplers.
[dfAcc, dfAUC] = ResultsRatiokNN(perc = 0.6, kinit=1, kfinal=10, step=1)
print("Data set for accuracy") # each row is a number of neighbours k
print(dfAcc)
print("Data set for AUC")
print(dfAUC)

Data set for accuracy
        none  oversample  undersample       NCL     Tomek     smote
1   0.642857    0.625357     0.608393  0.571429  0.660714  0.618214
2   0.732143    0.694286     0.698214  0.660714  0.750000  0.671429
3   0.625000    0.617500     0.626607  0.607143  0.607143  0.608750
4   0.750000    0.661071     0.705893  0.660714  0.732143  0.645000
5   0.678571    0.627500     0.645179  0.607143  0.660714  0.617679
6   0.714286    0.674464     0.708571  0.696429  0.714286  0.658393
7   0.732143    0.637500     0.683214  0.642857  0.678571  0.615357
8   0.714286    0.674107     0.676786  0.660714  0.696429  0.641071
9   0.678571    0.643571     0.668750  0.607143  0.678571  0.626607
10  0.696429    0.666071     0.715714  0.696429  0.714286  0.647143
Data set for AUC
        none  oversample  undersample       NCL     Tomek     smote
1   0.610860    0.604932     0.597066  0.609351  0.656863  0.600468
2   0.678733    0.659457     0.627285  0.663650  0.686275  0.641931
3   0.599

In [None]:
# Sweep k = 1..10; perc=0.8 is the sampling_strategy given to the ratio-based resamplers.
[dfAcc, dfAUC] = ResultsRatiokNN(perc = 0.8, kinit=1, kfinal=10, step=1)
print("Data set for accuracy") # each row is a number of neighbours k
print(dfAcc)
print("Data set for AUC")
print(dfAUC)

Data set for accuracy
        none  oversample  undersample       NCL     Tomek     smote
1   0.642857    0.628214     0.587679  0.571429  0.660714  0.603571
2   0.732143    0.668036     0.675357  0.660714  0.750000  0.630536
3   0.625000    0.617143     0.591429  0.607143  0.607143  0.598036
4   0.750000    0.646250     0.676429  0.660714  0.732143  0.616071
5   0.678571    0.602857     0.601250  0.607143  0.660714  0.593393
6   0.714286    0.648750     0.677143  0.696429  0.714286  0.625714
7   0.732143    0.607857     0.648929  0.642857  0.678571  0.591964
8   0.714286    0.633214     0.656786  0.660714  0.696429  0.607500
9   0.678571    0.617500     0.625357  0.607143  0.678571  0.611250
10  0.696429    0.635893     0.696250  0.696429  0.714286  0.623571
Data set for AUC
        none  oversample  undersample       NCL     Tomek     smote
1   0.610860    0.606154     0.597790  0.609351  0.656863  0.589291
2   0.678733    0.651893     0.621259  0.663650  0.686275  0.620588
3   0.599

In [None]:
# Sweep k = 1..10; perc=1 is the sampling_strategy given to the ratio-based resamplers.
[dfAcc, dfAUC] = ResultsRatiokNN(perc = 1, kinit=1, kfinal=10, step=1)
print("Data set for accuracy") # each row is a number of neighbours k
print(dfAcc)
print("Data set for AUC")
print(dfAUC)

Data set for accuracy
        none  oversample  undersample       NCL     Tomek     smote
1   0.642857    0.631607     0.571607  0.571429  0.660714  0.602500
2   0.732143    0.661250     0.666786  0.660714  0.750000  0.620179
3   0.625000    0.605893     0.575536  0.607143  0.607143  0.595000
4   0.750000    0.624821     0.641964  0.660714  0.732143  0.604821
5   0.678571    0.573036     0.568750  0.607143  0.660714  0.566071
6   0.714286    0.609107     0.653036  0.696429  0.714286  0.600893
7   0.732143    0.593750     0.611964  0.642857  0.678571  0.573929
8   0.714286    0.614286     0.625893  0.660714  0.696429  0.596429
9   0.678571    0.607679     0.589464  0.607143  0.678571  0.602143
10  0.696429    0.619821     0.669107  0.696429  0.714286  0.614286
Data set for AUC
        none  oversample  undersample       NCL     Tomek     smote
1   0.610860    0.609751     0.601018  0.609351  0.656863  0.595490
2   0.678733    0.655181     0.621742  0.663650  0.686275  0.618635
3   0.599

## SVM

In [None]:
from sklearn.svm import SVC

def ResultsRatioSVM(perc):
  """Compare the resampling techniques for an SVC with probability estimates enabled.

  Each technique is scored with 100 repetitions on the notebook's global
  train/test split. `perc` is forwarded to selectTechnique.

  Returns (accuracy table, AUC table): a single row (index 1), one column per technique.
  """
  techniques = ["none", "oversample", "undersample", "NCL", "Tomek", "smote"]
  accTable = pd.DataFrame(columns=tuple(techniques))
  aucTable = pd.DataFrame(columns=tuple(techniques))

  classifier = SVC(probability=True)
  scores = [
      testResamplingTechniques(classifier, selectTechnique(name, perc),
                               X_train, y_train, X_test, y_test, niter=100)
      for name in techniques
  ]

  accTable.loc[1] = [acc for acc, _ in scores]
  aucTable.loc[1] = [aucc for _, aucc in scores]
  return accTable, aucTable

In [None]:
# SVM with each resampling technique; perc=0.6 is forwarded to the ratio-based resamplers.
[dfAcc, dfAUC] = ResultsRatioSVM(perc = 0.6)
print("Data set for accuracy") # single row (index 1); columns are the resampling techniques
print(dfAcc)
print("Data set for AUC")
print(dfAUC)

Data set for accuracy
       none  oversample  undersample       NCL     Tomek     smote
1  0.714286       0.735     0.728036  0.642857  0.732143  0.736429
Data set for AUC
     none  oversample  undersample       NCL     Tomek     smote
1  0.6727    0.695204       0.6969  0.716433  0.710407  0.687255


In [None]:
# SVM with each resampling technique; perc=0.8 is forwarded to the ratio-based resamplers.
[dfAcc, dfAUC] = ResultsRatioSVM(perc = 0.8)
print("Data set for accuracy") # single row (index 1); columns are the resampling techniques
print(dfAcc)
print("Data set for AUC")
print(dfAUC)

Data set for accuracy
       none  oversample  undersample       NCL     Tomek     smote
1  0.714286    0.696607     0.671429  0.642857  0.732143  0.683214
Data set for AUC
     none  oversample  undersample       NCL   Tomek     smote
1  0.6727    0.707722     0.696305  0.716433  0.7104  0.707858


In [None]:
# SVM with each resampling technique; perc=1 is forwarded to the ratio-based resamplers.
[dfAcc, dfAUC] = ResultsRatioSVM(perc = 1)
print("Data set for accuracy") # single row (index 1); columns are the resampling techniques
print(dfAcc)
print("Data set for AUC")
print(dfAUC)

Data set for accuracy
       none  oversample  undersample       NCL     Tomek     smote
1  0.714286     0.66125     0.636071  0.642857  0.732143  0.639821
Data set for AUC
     none  oversample  undersample       NCL     Tomek     smote
1  0.6727    0.715747     0.712511  0.716425  0.710415  0.713477


## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression


def ResultsRatioLR(perc):
  """Compare the resampling techniques for a default LogisticRegression.

  Each technique is scored with 100 repetitions on the notebook's global
  train/test split. `perc` is forwarded to selectTechnique.

  Returns (accuracy table, AUC table): a single row (index 1), one column per technique.
  """
  techniques = ["none", "oversample", "undersample", "NCL", "Tomek", "smote"]
  accTable = pd.DataFrame(columns=tuple(techniques))
  aucTable = pd.DataFrame(columns=tuple(techniques))

  classifier = LogisticRegression()
  scores = [
      testResamplingTechniques(classifier, selectTechnique(name, perc),
                               X_train, y_train, X_test, y_test, niter=100)
      for name in techniques
  ]

  accTable.loc[1] = [acc for acc, _ in scores]
  aucTable.loc[1] = [aucc for _, aucc in scores]
  return accTable, aucTable

In [None]:
# Logistic regression with each resampling technique; perc=0.6 is forwarded to the ratio-based resamplers.
[dfAcc, dfAUC] = ResultsRatioLR(perc = 0.6)
print("Data set for accuracy") # single row (index 1); columns are the resampling techniques
print(dfAcc)
print("Data set for AUC")
print(dfAUC)

Data set for accuracy
       none  oversample  undersample       NCL  Tomek     smote
1  0.767857    0.731429     0.728571  0.660714   0.75  0.728393
Data set for AUC
      none  oversample  undersample       NCL     Tomek     smote
1  0.73454     0.73098     0.730603  0.740573  0.742081  0.726757


In [None]:
# Logistic regression with each resampling technique; perc=0.8 is forwarded to the ratio-based resamplers.
[dfAcc, dfAUC] = ResultsRatioLR(perc = 0.8)
print("Data set for accuracy") # single row (index 1); columns are the resampling techniques
print(dfAcc)
print("Data set for AUC")
print(dfAUC)

Data set for accuracy
       none  oversample  undersample       NCL  Tomek     smote
1  0.767857    0.690536      0.68625  0.660714   0.75  0.690179
Data set for AUC
      none  oversample  undersample       NCL     Tomek     smote
1  0.73454    0.730709     0.730875  0.740573  0.742081  0.727451


In [None]:
# Logistic regression with each resampling technique; perc=1 is forwarded to the ratio-based resamplers.
[dfAcc, dfAUC] = ResultsRatioLR(perc = 1)
print("Data set for accuracy") # single row (index 1); columns are the resampling techniques
print(dfAcc)
print("Data set for AUC")
print(dfAUC)

Data set for accuracy
       none  oversample  undersample       NCL  Tomek     smote
1  0.767857    0.663214     0.661607  0.660714   0.75  0.659286
Data set for AUC
      none  oversample  undersample       NCL     Tomek     smote
1  0.73454    0.731885     0.732685  0.740573  0.742081  0.727617


## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

def ResultsRatioNB(perc):
  """Compare the resampling techniques for a GaussianNB classifier.

  Each technique is scored with 100 repetitions on the notebook's global
  train/test split. `perc` is forwarded to selectTechnique.

  Returns (accuracy table, AUC table): a single row (index 1), one column per technique.
  """
  techniques = ["none", "oversample", "undersample", "NCL", "Tomek", "smote"]
  accTable = pd.DataFrame(columns=tuple(techniques))
  aucTable = pd.DataFrame(columns=tuple(techniques))

  classifier = GaussianNB()
  scores = [
      testResamplingTechniques(classifier, selectTechnique(name, perc),
                               X_train, y_train, X_test, y_test, niter=100)
      for name in techniques
  ]

  accTable.loc[1] = [acc for acc, _ in scores]
  aucTable.loc[1] = [aucc for _, aucc in scores]
  return accTable, aucTable

In [None]:
# Naive Bayes with each resampling technique; perc=0.6 is forwarded to the ratio-based resamplers.
[dfAcc, dfAUC] = ResultsRatioNB(perc = 0.6)
print("Data set for accuracy") # single row (index 1); columns are the resampling techniques
print(dfAcc)
print("Data set for AUC")
print(dfAUC)

Data set for accuracy
       none  oversample  undersample       NCL  Tomek     smote
1  0.767857    0.749821     0.738393  0.714286   0.75  0.740536
Data set for AUC
       none  oversample  undersample      NCL     Tomek     smote
1  0.717949    0.719321     0.719864  0.73454  0.720965  0.721161


In [None]:
# Naive Bayes with each resampling technique; perc=0.8 is forwarded to the ratio-based resamplers.
[dfAcc, dfAUC] = ResultsRatioNB(perc = 0.8)
print("Data set for accuracy") # single row (index 1); columns are the resampling techniques
print(dfAcc)
print("Data set for AUC")
print(dfAUC)

Data set for accuracy
       none  oversample  undersample       NCL  Tomek     smote
1  0.767857     0.72125     0.724107  0.714286   0.75  0.713571
Data set for AUC
       none  oversample  undersample      NCL     Tomek     smote
1  0.717949    0.719397     0.720166  0.73454  0.720965  0.723092


In [None]:
# Naive Bayes with each resampling technique; perc=1 is forwarded to the ratio-based resamplers.
[dfAcc, dfAUC] = ResultsRatioNB(perc = 1)
print("Data set for accuracy") # single row (index 1); columns are the resampling techniques
print(dfAcc)
print("Data set for AUC")
print(dfAUC)

Data set for accuracy
       none  oversample  undersample       NCL  Tomek     smote
1  0.767857    0.703214     0.717321  0.714286   0.75  0.705357
Data set for AUC
       none  oversample  undersample      NCL     Tomek     smote
1  0.717949    0.718959     0.720724  0.73454  0.720965  0.724887


## Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

def ResultsRatioDTBal(perc):
  """Compare the resampling techniques for a class-weight-balanced decision tree.

  The tree is DecisionTreeClassifier(class_weight="balanced", random_state=11).
  Each technique is scored with 100 repetitions on the notebook's global
  train/test split. `perc` is forwarded to selectTechnique.

  Returns (accuracy table, AUC table): a single row (index 1), one column per technique.
  """
  techniques = ["none", "oversample", "undersample", "NCL", "Tomek", "smote"]
  accTable = pd.DataFrame(columns=tuple(techniques))
  aucTable = pd.DataFrame(columns=tuple(techniques))

  classifier = DecisionTreeClassifier(class_weight="balanced", random_state=11)
  scores = [
      testResamplingTechniques(classifier, selectTechnique(name, perc),
                               X_train, y_train, X_test, y_test, niter=100)
      for name in techniques
  ]

  accTable.loc[1] = [acc for acc, _ in scores]
  aucTable.loc[1] = [aucc for _, aucc in scores]
  return accTable, aucTable

In [None]:
from sklearn.tree import DecisionTreeClassifier

def ResultsRatioDT(perc):
  """Compare the resampling techniques for a plain decision tree.

  The tree is DecisionTreeClassifier(random_state=11), i.e. without class weights.
  Each technique is scored with 100 repetitions on the notebook's global
  train/test split. `perc` is forwarded to selectTechnique.

  Returns (accuracy table, AUC table): a single row (index 1), one column per technique.
  """
  techniques = ["none", "oversample", "undersample", "NCL", "Tomek", "smote"]
  accTable = pd.DataFrame(columns=tuple(techniques))
  aucTable = pd.DataFrame(columns=tuple(techniques))

  classifier = DecisionTreeClassifier(random_state=11)
  scores = [
      testResamplingTechniques(classifier, selectTechnique(name, perc),
                               X_train, y_train, X_test, y_test, niter=100)
      for name in techniques
  ]

  accTable.loc[1] = [acc for acc, _ in scores]
  aucTable.loc[1] = [aucc for _, aucc in scores]
  return accTable, aucTable

In [None]:
# Class-weight-balanced decision tree with each resampling technique; perc=0.6 goes to the ratio-based resamplers.
[dfAcc, dfAUC] = ResultsRatioDTBal(perc = 0.6)
print("Data set for accuracy") # single row (index 1); columns are the resampling techniques
print(dfAcc)
print("Data set for AUC")
print(dfAUC)

Data set for accuracy
    none  oversample  undersample       NCL  Tomek     smote
1  0.625    0.585357     0.565536  0.446429  0.625  0.588214
Data set for AUC
       none  oversample  undersample       NCL     Tomek     smote
1  0.578431    0.547934      0.55319  0.466063  0.611614  0.555136


In [None]:
# Class-weight-balanced decision tree with each resampling technique; perc=0.8 goes to the ratio-based resamplers.
[dfAcc, dfAUC] = ResultsRatioDTBal(perc = 0.8)
print("Data set for accuracy") # single row (index 1); columns are the resampling techniques
print(dfAcc)
print("Data set for AUC")
print(dfAUC)

Data set for accuracy
    none  oversample  undersample       NCL  Tomek  smote
1  0.625    0.591786     0.550893  0.446429  0.625  0.585
Data set for AUC
       none  oversample  undersample       NCL     Tomek     smote
1  0.578431    0.557903     0.559525  0.466063  0.611614  0.551787


In [None]:
# Class-weight-balanced decision tree with each resampling technique; perc=1 goes to the ratio-based resamplers.
[dfAcc, dfAUC] = ResultsRatioDTBal(perc = 1)
print("Data set for accuracy") # single row (index 1); columns are the resampling techniques
print(dfAcc)
print("Data set for AUC")
print(dfAUC)

Data set for accuracy
    none  oversample  undersample       NCL  Tomek    smote
1  0.625    0.586786     0.539821  0.446429  0.625  0.57875
Data set for AUC
       none  oversample  undersample       NCL     Tomek     smote
1  0.578431    0.554465     0.554925  0.466063  0.611614  0.553507


In [None]:
# Unweighted decision tree with each resampling technique; perc=0.6 goes to the ratio-based resamplers.
[dfAcc, dfAUC] = ResultsRatioDT(perc = 0.6)
print("Data set for accuracy") # single row (index 1); columns are the resampling techniques
print(dfAcc)
print("Data set for AUC")
print(dfAUC)

Data set for accuracy
       none  oversample  undersample       NCL     Tomek  smote
1  0.660714      0.6125     0.581071  0.428571  0.660714  0.595
Data set for AUC
       none  oversample  undersample       NCL     Tomek    smote
1  0.608597    0.575038     0.566802  0.468326  0.638763  0.56856


In [None]:
# Unweighted decision tree with each resampling technique; perc=0.8 goes to the ratio-based resamplers.
[dfAcc, dfAUC] = ResultsRatioDT(perc = 0.8)
print("Data set for accuracy") # single row (index 1); columns are the resampling techniques
print(dfAcc)
print("Data set for AUC")
print(dfAUC)

Data set for accuracy
       none  oversample  undersample       NCL     Tomek     smote
1  0.660714    0.601607     0.557143  0.428571  0.660714  0.583929
Data set for AUC
       none  oversample  undersample       NCL     Tomek     smote
1  0.608597    0.565943     0.558439  0.468326  0.638763  0.561667


In [None]:
# Unweighted decision tree with each resampling technique; perc=1 goes to the ratio-based resamplers.
[dfAcc, dfAUC] = ResultsRatioDT(perc = 1)
print("Data set for accuracy") # single row (index 1); columns are the resampling techniques
print(dfAcc)
print("Data set for AUC")
print(dfAUC)

Data set for accuracy
       none  oversample  undersample       NCL     Tomek     smote
1  0.660714       0.595     0.535357  0.428571  0.660714  0.582857
Data set for AUC
       none  oversample  undersample       NCL     Tomek     smote
1  0.608597     0.56083     0.550309  0.468326  0.638763  0.555618


# Functions to test ensemble-resampling techniques

In [None]:
def testEnsembleTechniques(initial_model, resname, X_train, y_train, X_test, y_test, perc):
  """Fit one (optionally ensemble-wrapped) model and score it on the test set.

  With resname == "none" the base model itself is fitted; otherwise the
  estimator built by selectEnsemble(resname, initial_model, perc) is used.

  Returns (accuracy, AUC); the AUC is computed from column 1 of predict_proba.
  """
  if resname == "none":
    estimator = initial_model
  else:
    estimator = selectEnsemble(resname, initial_model, perc)

  estimator.fit(X_train, y_train)

  probabilities = estimator.predict_proba(X_test)
  false_pos_rate, true_pos_rate, _ = roc_curve(y_test, probabilities[:, 1])
  area = auc(false_pos_rate, true_pos_rate)

  accuracy = accuracy_score(y_test, estimator.predict(X_test))
  return accuracy, area

  

In [None]:
def selectEnsemble(resname, initial_model, perc):
  """Build the ensemble estimator selected by `resname` around `initial_model`.

  Args:
    resname: one of "none", "SMOTEBagg", "SMOTEBoost", "RUSBagg", "RUSBoost".
    initial_model: the base classifier to wrap.
    perc: passed as `sampling_strategy` to the samplers in the bagging
      variants, and used to derive `n_samples` in the boosting variants.

  Returns:
    `initial_model` itself for "none", otherwise a new (unfitted) ensemble.

  Raises:
    ValueError: if `resname` is not one of the supported names.
  """
  # Hard-coded class counts (157 of class 0, 64 of class 1), matching the
  # Counter(y_train) printed earlier in the notebook.
  # NOTE(review): these go stale if the data or the split changes.
  n_majority_train = 157
  n_minority_train = 64

  if resname == "none":
    return initial_model

  if resname == "SMOTEBagg":
    # Each bagged member is a pipeline: SMOTE oversampling, then the base model.
    resampling = SMOTE(sampling_strategy=perc, k_neighbors=1, random_state=1)
    pipeline = Pipeline(steps=[('over', resampling), ('model', initial_model)])
    return BaggingClassifier(base_estimator=pipeline,n_estimators=10,max_samples=200,random_state=1)

  if resname == "SMOTEBoost":
    pipeline = Pipeline(steps=[('model', initial_model)])
    # presumably n_samples is the number of synthetic samples to add - TODO confirm in smote2.py
    return smt.SMOTEBoost(base_estimator=pipeline,n_estimators=1,
                          n_samples=int(perc*n_majority_train)-n_minority_train)

  if resname == "RUSBagg":
    # Each bagged member is a pipeline: random undersampling, then the base model.
    resampling = RandomUnderSampler(sampling_strategy=perc,random_state=1)
    pipeline = Pipeline(steps=[('under', resampling), ('model', initial_model)])
    return BaggingClassifier(base_estimator=pipeline, n_estimators=10,max_samples=200, random_state=1, n_jobs=5)

  if resname == "RUSBoost":
    pipeline = Pipeline(steps=[('model', initial_model)])
    # NOTE(review): this branch is named RUSBoost but constructs smt.SMOTEBoost;
    # kept as in the original - confirm whether a RUSBoost class was intended.
    return smt.SMOTEBoost(base_estimator=pipeline,n_estimators=10,k_neighbors=2,
                          n_samples=int(perc*n_majority_train)-n_minority_train)

  # Previously an unknown name fell through and failed on an unassigned local.
  raise ValueError("Unknown ensemble technique: {!r}".format(resname))

In [None]:
!cp drive/MyDrive/Colab\ Notebooks/projectCS/smoteMy.py smote2.py
!cp drive/MyDrive/Colab\ Notebooks/projectCS/smoteMy.py rus.py

In [None]:
from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import BalancedBaggingClassifier 
import smote2 as smt



def ResultsRatioEnsemble(perc, initial_model, weight_support):
  """Score the ensemble-resampling techniques for one base model.

  Args:
    perc: forwarded to testEnsembleTechniques for every technique.
    initial_model: base classifier wrapped by each ensemble.
    weight_support: when truthy, the boosting variants (SMOTEBoost, RUSBoost)
      are evaluated in addition to the bagging ones.

  Returns (accuracy table, AUC table): a single row (index 1), one column per
  evaluated technique. Uses the notebook's global train/test split.
  """
  if weight_support:
    techniques = ["SMOTEBoost", "SMOTEBagg", "RUSBagg", "RUSBoost"]
  else:
    techniques = ["SMOTEBagg", "RUSBagg"]

  accTable = pd.DataFrame(columns=tuple(techniques))
  aucTable = pd.DataFrame(columns=tuple(techniques))

  scores = [
      testEnsembleTechniques(initial_model, name, X_train, y_train, X_test, y_test, perc)
      for name in techniques
  ]

  accTable.loc[1] = [acc for acc, _ in scores]
  aucTable.loc[1] = [aucc for _, aucc in scores]
  return accTable, aucTable

## Knn

In [None]:
# Collect one row per k: bagging ensembles around kNN for k = 1..9 with perc = 0.6.
dfAccfin = pd.DataFrame(columns=("SMOTEBagg",  "RUSBagg" ))
dfAUCfin = pd.DataFrame(columns=( "SMOTEBagg",  "RUSBagg"))

for k in range(1, 10, 1):
  [dfAcc, dfAUC] =  ResultsRatioEnsemble(perc = 0.6, initial_model=KNeighborsClassifier(k),weight_support=False)
  # ResultsRatioEnsemble returns single-row tables (index 1); store that row under k.
  dfAccfin.loc[k] = dfAcc.loc[1]
  dfAUCfin.loc[k] = dfAUC.loc[1]

print("Data set for accuracy") # each row is a number of neighbours k
print(dfAccfin)
print("Data set for AUC")
print(dfAUCfin)

Data set for accuracy
   SMOTEBagg   RUSBagg
1   0.642857  0.607143
2   0.625000  0.607143
3   0.607143  0.660714
4   0.625000  0.660714
5   0.625000  0.642857
6   0.625000  0.660714
7   0.607143  0.678571
8   0.660714  0.642857
9   0.660714  0.696429
Data set for AUC
   SMOTEBagg   RUSBagg
1   0.599548  0.628959
2   0.594268  0.641026
3   0.606335  0.628205
4   0.595023  0.638763
5   0.591252  0.623680
6   0.615385  0.625943
7   0.617647  0.602564
8   0.610860  0.609351
9   0.594268  0.624434


In [None]:
# Collect one row per k: bagging ensembles around kNN for k = 1..9 with perc = 0.8.
dfAccfin = pd.DataFrame(columns=("SMOTEBagg",  "RUSBagg" ))
dfAUCfin = pd.DataFrame(columns=( "SMOTEBagg",  "RUSBagg"))

for k in range(1, 10, 1):
  [dfAcc, dfAUC] =  ResultsRatioEnsemble(perc = 0.8, initial_model=KNeighborsClassifier(k),weight_support=False)
  # ResultsRatioEnsemble returns single-row tables (index 1); store that row under k.
  dfAccfin.loc[k] = dfAcc.loc[1]
  dfAUCfin.loc[k] = dfAUC.loc[1]

print("Data set for accuracy") # each row is a number of neighbours k
print(dfAccfin)
print("Data set for AUC")
print(dfAUCfin)

Data set for accuracy
   SMOTEBagg   RUSBagg
1   0.625000  0.589286
2   0.589286  0.589286
3   0.589286  0.571429
4   0.607143  0.642857
5   0.589286  0.625000
6   0.607143  0.660714
7   0.607143  0.642857
8   0.642857  0.625000
9   0.625000  0.642857
Data set for AUC
   SMOTEBagg   RUSBagg
1   0.598039  0.650830
2   0.609351  0.636501
3   0.607089  0.646305
4   0.608597  0.638763
5   0.604827  0.651584
6   0.606335  0.619910
7   0.622926  0.613122
8   0.622172  0.623680
9   0.628205  0.636501


In [None]:
# Collect one row per k: bagging ensembles around kNN for k = 1..9 with perc = 1.
dfAccfin = pd.DataFrame(columns=("SMOTEBagg",  "RUSBagg" ))
dfAUCfin = pd.DataFrame(columns=( "SMOTEBagg",  "RUSBagg"))

for k in range(1, 10, 1):
  [dfAcc, dfAUC] =  ResultsRatioEnsemble(perc =1 , initial_model=KNeighborsClassifier(k),weight_support=False)
  # ResultsRatioEnsemble returns single-row tables (index 1); store that row under k.
  dfAccfin.loc[k] = dfAcc.loc[1]
  dfAUCfin.loc[k] = dfAUC.loc[1]

print("Data set for accuracy") # each row is a number of neighbours k
print(dfAccfin)
print("Data set for AUC")
print(dfAUCfin)

Data set for accuracy
   SMOTEBagg   RUSBagg
1   0.642857  0.571429
2   0.607143  0.589286
3   0.553571  0.553571
4   0.571429  0.571429
5   0.571429  0.589286
6   0.589286  0.607143
7   0.589286  0.571429
8   0.571429  0.607143
9   0.589286  0.571429
Data set for AUC
   SMOTEBagg   RUSBagg
1   0.619155  0.635747
2   0.596531  0.636501
3   0.601810  0.631976
4   0.587481  0.633484
5   0.604827  0.633484
6   0.610106  0.616893
7   0.618401  0.632730
8   0.622172  0.637255
9   0.620664  0.644042


In [None]:
# Collect one row per k: bagging ensembles around kNN for k = 10, 20, ..., 90 with perc = 0.6.
dfAccfin = pd.DataFrame(columns=("SMOTEBagg",  "RUSBagg" ))
dfAUCfin = pd.DataFrame(columns=( "SMOTEBagg",  "RUSBagg"))

for k in range(10, 100, 10):
  [dfAcc, dfAUC] =  ResultsRatioEnsemble(perc =0.6 , initial_model=KNeighborsClassifier(k),weight_support=False)
  # ResultsRatioEnsemble returns single-row tables (index 1); store that row under k.
  dfAccfin.loc[k] = dfAcc.loc[1]
  dfAUCfin.loc[k] = dfAUC.loc[1]

print("Data set for accuracy") # each row is a number of neighbours k
print(dfAccfin)
print("Data set for AUC")
print(dfAUCfin)

Data set for accuracy
    SMOTEBagg   RUSBagg
10   0.642857  0.696429
20   0.714286  0.714286
30   0.732143  0.714286
40   0.714286  0.714286
50   0.714286  0.732143
60   0.714286  0.732143
70   0.714286  0.714286
80   0.732143  0.696429
90   0.732143  0.696429
Data set for AUC
    SMOTEBagg   RUSBagg
10   0.588989  0.622926
20   0.625189  0.689291
30   0.659879  0.727753
40   0.726244  0.739819
50   0.753394  0.751885
60   0.741327  0.773756
70   0.728507  0.789593
80   0.741327  0.774510
90   0.751131  0.766968


In [None]:
# Collect one row per k: bagging ensembles around kNN for k = 10, 20, ..., 90 with perc = 0.8.
dfAccfin = pd.DataFrame(columns=("SMOTEBagg",  "RUSBagg" ))
dfAUCfin = pd.DataFrame(columns=( "SMOTEBagg",  "RUSBagg"))

for k in range(10, 100, 10):
  [dfAcc, dfAUC] =  ResultsRatioEnsemble(perc =0.8 , initial_model=KNeighborsClassifier(k),weight_support=False)
  # ResultsRatioEnsemble returns single-row tables (index 1); store that row under k.
  dfAccfin.loc[k] = dfAcc.loc[1]
  dfAUCfin.loc[k] = dfAUC.loc[1]

print("Data set for accuracy") # each row is a number of neighbours k
print(dfAccfin)
print("Data set for AUC")
print(dfAUCfin)

Data set for accuracy
    SMOTEBagg   RUSBagg
10   0.625000  0.625000
20   0.660714  0.714286
30   0.696429  0.732143
40   0.696429  0.767857
50   0.732143  0.732143
60   0.750000  0.750000
70   0.750000  0.750000
80   0.767857  0.750000
90   0.767857  0.714286
Data set for AUC
    SMOTEBagg   RUSBagg
10   0.606335  0.634992
20   0.631222  0.693816
30   0.670437  0.728507
40   0.707391  0.743590
50   0.731523  0.767722
60   0.749623  0.783560
70   0.739065  0.774510
80   0.738311  0.759427
90   0.745098  0.752640


In [None]:
# Collect one row per k: bagging ensembles around kNN for k = 10, 20, ..., 90 with perc = 1.
dfAccfin = pd.DataFrame(columns=("SMOTEBagg",  "RUSBagg" ))
dfAUCfin = pd.DataFrame(columns=( "SMOTEBagg",  "RUSBagg"))

for k in range(10, 100, 10):
  [dfAcc, dfAUC] =  ResultsRatioEnsemble(perc =1 , initial_model=KNeighborsClassifier(k),weight_support=False)
  # ResultsRatioEnsemble returns single-row tables (index 1); store that row under k.
  dfAccfin.loc[k] = dfAcc.loc[1]
  dfAUCfin.loc[k] = dfAUC.loc[1]

print("Data set for accuracy") # each row is a number of neighbours k
print(dfAccfin)
print("Data set for AUC")
print(dfAUCfin)

Data set for accuracy
    SMOTEBagg   RUSBagg
10   0.589286  0.571429
20   0.607143  0.678571
30   0.678571  0.696429
40   0.660714  0.696429
50   0.660714  0.750000
60   0.714286  0.767857
70   0.714286  0.785714
80   0.696429  0.750000
90   0.696429  0.785714
Data set for AUC
    SMOTEBagg   RUSBagg
10   0.622172  0.649321
20   0.634992  0.714178
30   0.669683  0.752640
40   0.693816  0.750377
50   0.722474  0.777526
60   0.739819  0.776772
70   0.753394  0.757919
80   0.741327  0.752640
90   0.743590  0.705882


## SVM

In [None]:
# Bagging ensembles (SMOTEBagg, RUSBagg) around an SVC with probability estimates; perc = 0.6.
[dfAcc, dfAUC] =  ResultsRatioEnsemble(perc = 0.6, initial_model=SVC(probability=True),weight_support=False)
print("Data set for accuracy") # single row (index 1); columns are the ensemble techniques
print(dfAcc)
print("Data set for AUC")
print(dfAUC)

Data set for accuracy
   SMOTEBagg   RUSBagg
1       0.75  0.732143
Data set for AUC
   SMOTEBagg   RUSBagg
1   0.705882  0.692308


In [None]:
# Bagging ensembles (SMOTEBagg, RUSBagg) around an SVC with probability estimates; perc = 0.8.
[dfAcc, dfAUC] =  ResultsRatioEnsemble(perc = 0.8, initial_model=SVC(probability=True),weight_support=False)
print("Data set for accuracy") # single row (index 1); columns are the ensemble techniques
print(dfAcc)
print("Data set for AUC")
print(dfAUC)

Data set for accuracy
   SMOTEBagg   RUSBagg
1   0.714286  0.678571
Data set for AUC
   SMOTEBagg   RUSBagg
1   0.695324  0.711916


In [None]:
# Bagging ensembles (SMOTEBagg, RUSBagg) around an SVC with probability estimates; perc = 1.
[dfAcc, dfAUC] =  ResultsRatioEnsemble(perc = 1, initial_model=SVC(probability=True),weight_support=False)
print("Data set for accuracy") # single row (index 1); columns are the ensemble techniques
print(dfAcc)
print("Data set for AUC")
print(dfAUC)

Data set for accuracy
   SMOTEBagg   RUSBagg
1   0.696429  0.660714
Data set for AUC
   SMOTEBagg   RUSBagg
1   0.689291  0.713424


## Logistic Regression

In [None]:
# Logistic-regression base learner (default settings), perc = 0.6.
dfAcc, dfAUC = ResultsRatioEnsemble(
    perc=0.6,
    initial_model=LogisticRegression(),
    weight_support=False,
)
# Each frame holds a single row (labelled 1 in the recorded output) with one
# column per ensemble: SMOTEBagg and RUSBagg.
print("Data set for accuracy", dfAcc, sep="\n")
print("Data set for AUC", dfAUC, sep="\n")

Data set for accuracy
   SMOTEBagg   RUSBagg
1   0.714286  0.714286
Data set for AUC
   SMOTEBagg   RUSBagg
1    0.73454  0.742081


In [None]:
# Logistic-regression base learner (default settings), perc = 0.8.
dfAcc, dfAUC = ResultsRatioEnsemble(
    perc=0.8,
    initial_model=LogisticRegression(),
    weight_support=False,
)
# Each frame holds a single row (labelled 1 in the recorded output) with one
# column per ensemble: SMOTEBagg and RUSBagg.
print("Data set for accuracy", dfAcc, sep="\n")
print("Data set for AUC", dfAUC, sep="\n")

Data set for accuracy
   SMOTEBagg   RUSBagg
1   0.714286  0.732143
Data set for AUC
   SMOTEBagg  RUSBagg
1   0.739065  0.73454


In [None]:
# Logistic-regression base learner (default settings), perc = 1.
dfAcc, dfAUC = ResultsRatioEnsemble(
    perc=1,
    initial_model=LogisticRegression(),
    weight_support=False,
)
# Each frame holds a single row (labelled 1 in the recorded output) with one
# column per ensemble: SMOTEBagg and RUSBagg.
print("Data set for accuracy", dfAcc, sep="\n")
print("Data set for AUC", dfAUC, sep="\n")

Data set for accuracy
   SMOTEBagg   RUSBagg
1   0.678571  0.642857
Data set for AUC
   SMOTEBagg  RUSBagg
1   0.748115  0.74359


## Naive Bayes

In [None]:
# Gaussian naive Bayes base learner, perc = 0.6.
dfAcc, dfAUC = ResultsRatioEnsemble(
    perc=0.6,
    initial_model=GaussianNB(),
    weight_support=False,
)
# Each frame holds a single row (labelled 1 in the recorded output) with one
# column per ensemble: SMOTEBagg and RUSBagg.
print("Data set for accuracy", dfAcc, sep="\n")
print("Data set for AUC", dfAUC, sep="\n")

Data set for accuracy
   SMOTEBagg  RUSBagg
1   0.767857     0.75
Data set for AUC
   SMOTEBagg   RUSBagg
1   0.726998  0.736048


In [None]:
# Gaussian naive Bayes base learner, perc = 0.8.
dfAcc, dfAUC = ResultsRatioEnsemble(
    perc=0.8,
    initial_model=GaussianNB(),
    weight_support=False,
)
# Each frame holds a single row (labelled 1 in the recorded output) with one
# column per ensemble: SMOTEBagg and RUSBagg.
print("Data set for accuracy", dfAcc, sep="\n")
print("Data set for AUC", dfAUC, sep="\n")

Data set for accuracy
   SMOTEBagg   RUSBagg
1       0.75  0.714286
Data set for AUC
   SMOTEBagg   RUSBagg
1   0.726998  0.726998


In [None]:
# Gaussian naive Bayes base learner, perc = 1.
dfAcc, dfAUC = ResultsRatioEnsemble(
    perc=1,
    initial_model=GaussianNB(),
    weight_support=False,
)
# Each frame holds a single row (labelled 1 in the recorded output) with one
# column per ensemble: SMOTEBagg and RUSBagg.
print("Data set for accuracy", dfAcc, sep="\n")
print("Data set for AUC", dfAUC, sep="\n")

Data set for accuracy
   SMOTEBagg   RUSBagg
1       0.75  0.696429
Data set for AUC
   SMOTEBagg  RUSBagg
1    0.72549  0.73454


## Decision Trees

In [None]:
# Decision-tree base learner (fixed random_state=11), perc = 0.6.
dfAcc, dfAUC = ResultsRatioEnsemble(
    perc=0.6,
    initial_model=DecisionTreeClassifier(random_state=11),
    weight_support=False,
)
# Each frame holds a single row (labelled 1 in the recorded output) with one
# column per ensemble: SMOTEBagg and RUSBagg.
print("Data set for accuracy", dfAcc, sep="\n")
print("Data set for AUC", dfAUC, sep="\n")

Data set for accuracy
   SMOTEBagg   RUSBagg
1   0.696429  0.660714
Data set for AUC
   SMOTEBagg   RUSBagg
1   0.573906  0.626697


In [None]:
# Decision-tree base learner (fixed random_state=11), perc = 0.8.
dfAcc, dfAUC = ResultsRatioEnsemble(
    perc=0.8,
    initial_model=DecisionTreeClassifier(random_state=11),
    weight_support=False,
)
# Each frame holds a single row (labelled 1 in the recorded output) with one
# column per ensemble: SMOTEBagg and RUSBagg.
print("Data set for accuracy", dfAcc, sep="\n")
print("Data set for AUC", dfAUC, sep="\n")

Data set for accuracy
   SMOTEBagg   RUSBagg
1   0.678571  0.589286
Data set for AUC
   SMOTEBagg   RUSBagg
1   0.546003  0.588989


In [None]:
# Decision-tree base learner (fixed random_state=11), perc = 1.
dfAcc, dfAUC = ResultsRatioEnsemble(
    perc=1,
    initial_model=DecisionTreeClassifier(random_state=11),
    weight_support=False,
)
# Each frame holds a single row (labelled 1 in the recorded output) with one
# column per ensemble: SMOTEBagg and RUSBagg.
print("Data set for accuracy", dfAcc, sep="\n")
print("Data set for AUC", dfAUC, sep="\n")

Data set for accuracy
   SMOTEBagg   RUSBagg
1   0.642857  0.660714
Data set for AUC
   SMOTEBagg  RUSBagg
1   0.559578  0.61991


In [None]:
# Decision-tree base learner with class_weight="balanced" (random_state=11),
# perc = 0.6.
dfAcc, dfAUC = ResultsRatioEnsemble(
    perc=0.6,
    initial_model=DecisionTreeClassifier(random_state=11, class_weight="balanced"),
    weight_support=False,
)
# Each frame holds a single row (labelled 1 in the recorded output) with one
# column per ensemble: SMOTEBagg and RUSBagg.
print("Data set for accuracy", dfAcc, sep="\n")
print("Data set for AUC", dfAUC, sep="\n")

Data set for accuracy
   SMOTEBagg   RUSBagg
1   0.696429  0.607143
Data set for AUC
   SMOTEBagg   RUSBagg
1   0.596531  0.616893


In [None]:
# Decision-tree base learner with class_weight="balanced" (random_state=11),
# perc = 0.8.
# NOTE(review): the output recorded under this cell is identical to the
# perc = 0.6 balanced run -- confirm the cell was re-executed after editing.
dfAcc, dfAUC = ResultsRatioEnsemble(
    perc=0.8,
    initial_model=DecisionTreeClassifier(random_state=11, class_weight="balanced"),
    weight_support=False,
)
# Each frame holds a single row (labelled 1 in the recorded output) with one
# column per ensemble: SMOTEBagg and RUSBagg.
print("Data set for accuracy", dfAcc, sep="\n")
print("Data set for AUC", dfAUC, sep="\n")

Data set for accuracy
   SMOTEBagg   RUSBagg
1   0.696429  0.607143
Data set for AUC
   SMOTEBagg   RUSBagg
1   0.596531  0.616893


In [None]:
# Decision-tree base learner with class_weight="balanced" (random_state=11),
# perc = 1.
# NOTE(review): the output recorded under this cell is identical to the
# unweighted decision-tree run at perc = 1 -- confirm whether that is expected
# or the output is stale.
dfAcc, dfAUC = ResultsRatioEnsemble(
    perc=1,
    initial_model=DecisionTreeClassifier(random_state=11, class_weight="balanced"),
    weight_support=False,
)
# Each frame holds a single row (labelled 1 in the recorded output) with one
# column per ensemble: SMOTEBagg and RUSBagg.
print("Data set for accuracy", dfAcc, sep="\n")
print("Data set for AUC", dfAUC, sep="\n")

Data set for accuracy
   SMOTEBagg   RUSBagg
1   0.642857  0.660714
Data set for AUC
   SMOTEBagg  RUSBagg
1   0.559578  0.61991
