## Demonstrate how both the OVA (one-vs-all) and AVA (all-vs-all) methods work by predicting the class with the highest probability using binary logistic regression on the Vertebrae data (e.g., DH, SL, or NO). Verify using multi-class logistic regression.

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd 

# Location of the vertebral-column data file on the mounted Drive.
my_path = '/content/drive/My Drive/column_3C.dat'
# The file is space-separated and has no header row, so pandas labels the
# columns with the integers 0, 1, 2, ...
vert = pd.read_csv(my_path, sep=' ',header=None)
vert.head()  # preview the first rows

Unnamed: 0,0,1,2,3,4,5,6
0,63.03,22.55,39.61,40.48,98.67,-0.25,DH
1,39.06,10.06,25.02,29.0,114.41,4.56,DH
2,68.83,22.22,50.09,46.61,105.99,-3.53,DH
3,69.3,24.65,44.31,44.64,101.87,11.21,DH
4,49.71,9.65,28.32,40.06,108.17,7.92,DH


In [None]:
# One-hot encode the class label (column 6) into one indicator column per class.
# dtype=int is requested explicitly: pandas >= 2.0 returns boolean dummies by
# default, while the binary logistic models expect 0/1 integer targets.
vertDumms = pd.get_dummies(vert.iloc[:, 6].astype(str), drop_first=False, dtype=int)
# Feature columns 0-5 followed by the DH / NO / SL indicator columns.
vert_dataset = pd.concat([vert.iloc[:, :6], vertDumms], axis=1)
vert_dataset.head()

Unnamed: 0,0,1,2,3,4,5,DH,NO,SL
0,63.03,22.55,39.61,40.48,98.67,-0.25,1,0,0
1,39.06,10.06,25.02,29.0,114.41,4.56,1,0,0
2,68.83,22.22,50.09,46.61,105.99,-3.53,1,0,0
3,69.3,24.65,44.31,44.64,101.87,11.21,1,0,0
4,49.71,9.65,28.32,40.06,108.17,7.92,1,0,0


# OVA

In [None]:
def OVA_method(vert_dataset, test_size, random_state=None):
  """Run a one-vs-all (OVA) comparison with one binary logistic model per class.

  The first six columns of ``vert_dataset`` are used as predictors and the
  ``'DH'``, ``'NO'`` and ``'SL'`` indicator columns as binary targets.  The rows
  are split once into train and test sets; for each class a binomial GLM is
  fitted on the training rows, and the share of test rows whose predicted
  probability is at least 0.5 is recorded.

  Parameters
  ----------
  vert_dataset : pandas.DataFrame
      Six feature columns followed by 0/1 columns named 'DH', 'NO' and 'SL'.
  test_size : float or int
      Passed straight to ``train_test_split``.
  random_state : int or None, optional
      Seed forwarded to ``train_test_split``.  ``None`` (the default) keeps
      the split random on every call.

  Returns
  -------
  dict
      ``{'DH': share, 'NO': share, 'SL': share}`` where each share is the
      fraction of test rows that the class's model labelled as that class.
      Note that this is a predicted-positive rate, not an accuracy.
  """
  from sklearn.model_selection import train_test_split
  import statsmodels.api as sm
  import warnings

  class_labels = ['DH', 'NO', 'SL']
  X = vert_dataset.iloc[:, :6]
  Y = vert_dataset.loc[:, class_labels]

  # A single split shared by all three models, so the shares are measured on
  # the same test rows and can be compared with each other.
  X_train, X_test, Y_train, _ = train_test_split(
      X, Y, test_size=test_size, random_state=random_state)

  predicted_share = {}
  # Silence fitting warnings only while the models run instead of changing
  # the interpreter-wide warning filters as a side effect.
  with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for label in class_labels:
      # NOTE(review): no constant column is added to X, so the model has no
      # intercept term -- confirm this is intended.
      model = sm.GLM(Y_train[label], X_train, family=sm.families.Binomial()).fit()
      probabilities = model.predict(X_test)
      # Anything not below the 0.5 cut-off counts as a prediction of the class.
      calls = [0 if p < 0.5 else 1 for p in probabilities]
      predicted_share[label] = calls.count(1) / len(calls)

  return predicted_share

To find the best class and its probability for one run:

In [None]:
import warnings

# Keep model-fitting warnings out of the cell output.
warnings.filterwarnings('ignore')

# One OVA run, passing 0.2 as the test size.
dictionary = OVA_method(vert_dataset, 0.2)
print("Each class was found to have the following probability:")
print(dictionary)

# The winner is the (class, value) pair with the largest value; on a tie the
# first class in the dictionary is kept.
best_class, probability = max(dictionary.items(), key=lambda pair: pair[1])
print("\nThe best class using OVA was", best_class, "with a probability of", round(probability,4))

Each class was found to have the following probability:
{'DH': 0.1774193548387097, 'NO': 0.3548387096774194, 'SL': 0.4838709677419355}

The best class using OVA was SL with a probability of 0.4839


# AVA

In [None]:
def AVA_method(vert, test_size, random_state=None):
  """Run an all-vs-all (AVA) comparison with one binary logistic model per class pair.

  For each of the three class pairs the rows of the remaining class are
  dropped, a binomial GLM is fitted on a training split to predict the first
  class of the pair, and the share of test rows predicted as that first class
  is computed.  The first class wins the duel when that share is at least
  0.5; otherwise the second class wins with the complementary share.

  Parameters
  ----------
  vert : pandas.DataFrame
      Six feature columns followed by 0/1 columns named 'DH', 'NO' and 'SL'.
  test_size : float or int
      Passed straight to ``train_test_split``.
  random_state : int or None, optional
      Seed forwarded to ``train_test_split``.  ``None`` (the default) keeps
      the splits random on every call.

  Returns
  -------
  list of [str, float]
      One ``[winning_class, share]`` entry per duel, in the order
      DH-vs-NO, DH-vs-SL, NO-vs-SL.
  """
  from sklearn.model_selection import train_test_split
  import statsmodels.api as sm
  import warnings

  # (class being predicted, its opponent, class left out of the duel)
  duels = [('DH', 'NO', 'SL'),
           ('DH', 'SL', 'NO'),
           ('NO', 'SL', 'DH')]

  duel_results = []
  # Silence fitting warnings only while the models run instead of changing
  # the interpreter-wide warning filters as a side effect.
  with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for positive, negative, excluded in duels:
      # Keep only the rows belonging to the two classes in this duel.
      subset = vert[vert[excluded] != 1]
      X = subset.iloc[:, :6]
      y = subset[positive]

      X_train, X_test, y_train, _ = train_test_split(
          X, y, test_size=test_size, random_state=random_state)
      model = sm.GLM(y_train, X_train, family=sm.families.Binomial()).fit()
      probabilities = model.predict(X_test)

      # Anything not below the 0.5 cut-off counts as a prediction of `positive`.
      calls = [0 if p < 0.5 else 1 for p in probabilities]
      share = calls.count(1) / len(calls)

      if share >= 0.5:
        duel_results.append([positive, share])
      else:
        # The opponent wins with the complementary share.
        duel_results.append([negative, 1 - share])

  return duel_results

In [None]:
# If this raises a PerfectSeparationError, just run the cell again: each call
# draws new random train/test splits.
import warnings
from statistics import mode

warnings.filterwarnings('ignore')

options = AVA_method(vert_dataset, 0.2)
# Show every pairwise winner together with its share.
print("Each class was found to have the following probability:")
print(options)

# Keep only the class names of the pairwise winners ...
just_classes = [item[0] for item in options]
# ... and report the class that won most often as the overall AVA winner.
print("\nThe best class using AVA was found to be ", mode(just_classes))

Each class was found to have the following probability:
[['NO', 0.6875], ['SL', 0.6666666666666667], ['SL', 0.54]]

The best class using AVA was found to be  SL


Comparison between the two models built:

In [None]:
# Compare the binary "SL vs. rest" model with scikit-learn's multi-class
# logistic regression on the same feature matrix.
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

COMPARISON_SEED = 42  # fixed so the accuracies printed below are reproducible

X = vert.iloc[:, :6]
Y_SL = pd.DataFrame(vert_dataset.loc[:,'SL'])

# Encode the class names as integer codes in a NEW series.  Writing the codes
# back into `vert` would overwrite the raw labels (so re-running earlier cells
# would fail) and can leave the column with object dtype, which scikit-learn
# rejects as a classification target.
le = preprocessing.LabelEncoder()
Y = pd.Series(le.fit_transform(vert.iloc[:, 6].astype(str)), index=vert.index, name='class_code')
sl_code = int(le.transform(['SL'])[0])  # look the code up instead of hard-coding 2

# Binary model for 'SL', the class that won the recorded OVA/AVA runs.
X_train_SL, X_test_SL, y_train_SL, y_test_SL = train_test_split(
    X, Y_SL, test_size=0.2, random_state=COMPARISON_SEED)
model_SL = sm.GLM(y_train_SL, X_train_SL, family=sm.families.Binomial()).fit()
prediction_binary = model_SL.predict(X)  # predicted P(SL) for every row, training rows included
# NOTE(review): "not SL" is coded as 0, which is also the code of the first
# class in le.classes_; rows of the remaining class therefore always count as
# errors for the binary model, so the two accuracies are not directly comparable.
predictions_nominal_SL = [0 if x < 0.5 else sl_code for x in prediction_binary]

# Multi-class model fitted on its own train split, then used on all of X.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=COMPARISON_SEED)
model = LogisticRegression()
model.fit(X_train, y_train)
prediction_multi_class = model.predict(X)

accuracy1 = accuracy_score(Y,predictions_nominal_SL)
print("The prediction accuracy for the binomial model on the entire data set:")
print(accuracy1)
accuracy2 = accuracy_score(Y,prediction_multi_class)
print("\nThe prediction accuracy for the multiclass model:")
print(accuracy2)

The prediction accuracy for the binomial model on the entire data set:
0.6580645161290323

The prediction accuracy for the multiclass model:
0.8709677419354839


In [None]:
# Inspect the predictions and the true label for one specific row.
row_to_check = 5  # positional row number; change it to look at another observation

# NOTE: this rebinds `prediction_binary` from the full prediction series to a
# single value; re-run the comparison cell to get the series back.
prediction_binary = predictions_nominal_SL[row_to_check]
prediction_multi_class1 = prediction_multi_class[row_to_check]
# .iloc keeps this lookup positional, like the list/array indexing above,
# instead of relying on Y's index labels happening to equal the positions.
real_value_of_Y = Y.iloc[row_to_check]

print("\nThe prediction generated using the binary model:")
print(prediction_binary)
print("\nThe prediction generated using the multi class model:")
print(prediction_multi_class1)
print("\nThe true value of Y:")
print(real_value_of_Y)


The prediction generated using the binary model:
0

The prediction generated using the multi class model:
1

The true value of Y:
0
