In [2]:
import numpy as np 
import pandas as pd 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import accuracy_score
from graphviz import Source
import matplotlib.pyplot as plt

# Question 4

In [5]:
# A)

# Function to read data from csv and split it into train/validation/test groups
def load_data(csvFile):
    """Read the dataset from a csv file and split it into train/validation/test sets.

    Missing values are replaced with 0, the 'Dataset' column is separated out
    as the target, and the 'Gender' column is encoded as Male -> 0 and
    Female -> 1. The rows are first split 70/30 into training and held-out
    data; the held-out part is then split 70/30 again into test and
    validation data. Both splits use random_state=1.

    Parameters
    ----------
    csvFile
        Location of the csv file; passed unchanged to pd.read_csv. The file
        must contain the 'Dataset' and 'Gender' columns.

    Returns
    -------
    tuple
        (X_train, y_train, X_valid, y_valid, X_test, y_test)
    """
    # Read csv to dataframe and replace NAs w/ 0
    df = pd.read_csv(csvFile).fillna(0)

    # Define the labelled target column and drop it from the independent variables
    Target = df['Dataset']
    df = df.drop(columns=['Dataset'])

    # Convert the Gender data to quantitative values. The result is assigned
    # back to the column on purpose: calling replace(..., inplace=True) on
    # df['Gender'] is a chained in-place update, which pandas warns about and
    # which leaves df untouched when copy-on-write is active.
    df['Gender'] = df['Gender'].replace({'Male': 0, 'Female': 1})

    # Generate train test split (70% train, 30% held out)
    X_train, X_test, y_train, y_test = train_test_split(
        df, Target, test_size=0.3, random_state=1)

    # Split the held-out data further into test (70%) and validation (30%)
    X_test, X_valid, y_test, y_valid = train_test_split(
        X_test, y_test, test_size=0.3, random_state=1)

    return X_train, y_train, X_valid, y_valid, X_test, y_test

In [6]:
# Load the data for use: read 'hw1_data.csv' and unpack the six splits in the
# order load_data returns them (train, validation, test; features then labels)
X_train, y_train, X_valid, y_valid, X_test, y_test = load_data('hw1_data.csv')

# Test print to check if the data was properly split (disabled)
#print(len(X_train), len(y_train), len(X_valid), len(y_valid), len(X_test), len(y_test))

In [7]:
# B)

def select_knn_model(X_train, y_train, X_valid, y_valid, X_test, y_test,
                     metric='minkowski', k_values=None):
    """Fit a k-NN classifier for each k and report train/validation accuracy.

    For every k the classifier is fitted on the training data and scored on
    both the training and the validation data. The accuracies are plotted
    against k and returned as a dataframe.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used to fit every model.
    X_valid, y_valid
        Validation features and labels used to score every model.
    X_test, y_test
        Not used here; accepted so that existing callers keep working.
    metric
        Distance metric handed to KNeighborsClassifier. The default
        'minkowski' is the classifier's own default, so calls that omit it
        behave as before.
    k_values
        Iterable of neighbour counts to evaluate. Defaults to 1 through 20.

    Returns
    -------
    pandas.DataFrame
        One row per k with the columns 'K values', 'Train Accuracy' and
        'Validation Accuracy'.
    """
    # Default grid: K-values from 1 - 20
    if k_values is None:
        k_values = range(1, 21)
    k_values = list(k_values)

    # Lists to store the generated accuracies
    TrainAccuracies = []
    ValidAccuracies = []

    # For each k, fit the model on the training data, then score its
    # predictions on the training and validation data
    for k in k_values:
        model = KNeighborsClassifier(n_neighbors=k, metric=metric)
        model.fit(X_train, y_train)

        TrainAccuracies.append(accuracy_score(y_train, model.predict(X_train)))
        ValidAccuracies.append(accuracy_score(y_valid, model.predict(X_valid)))

    # Cat the results into a dataframe for easier visualization
    kAccuracies = pd.DataFrame({'K values': k_values,
                                'Train Accuracy': TrainAccuracies,
                                'Validation Accuracy': ValidAccuracies})

    # Plot both accuracy curves against the k values that were actually
    # evaluated, so the x-axis cannot drift from the evaluated grid
    plt.plot(kAccuracies['K values'], kAccuracies['Train Accuracy'],
             'b-', label='Training Accuracy')
    plt.plot(kAccuracies['K values'], kAccuracies['Validation Accuracy'],
             'r-', label='Validation Accuracy')

    plt.xlabel("k")
    plt.ylabel("Accuracy")
    plt.title("Accuracy for k-NN")
    plt.legend(loc='best')
    plt.show()

    return kAccuracies

In [None]:
# Run the Knn model function on the split data; the returned dataframe holds
# the train and validation accuracy for each k and is used below to pick the best k
kAccuracies = select_knn_model(X_train, y_train, X_valid, y_valid, X_test, y_test)

In [None]:
# best k-value selected and used on test data

# Find the row with the highest validation accuracy and read its k from the
# 'K values' column, rather than assuming that k equals the row label + 1
best_row = kAccuracies['Validation Accuracy'].idxmax()
best_k = int(kAccuracies.loc[best_row, 'K values'])

# Initiate the model with the k-value corresponding to the highest validation accuracy
model = KNeighborsClassifier(n_neighbors=best_k)

# Fit the model on the training data
model.fit(X_train, y_train)

# employ the model on the test data
test_pred = model.predict(X_test)

# Compute the accuracy score for the test predictions
test_accuracy = accuracy_score(y_test, test_pred)

print(test_accuracy)

# The model with a k value corresponding to the highest validation accuracy
# achieves an accuracy score of ~0.71311 on the test data

In [10]:
# C)

def select_knn_model_cos(X_train, y_train, X_valid, y_valid, X_test, y_test):
    """Evaluate cosine-metric k-NN classifiers for k = 1..20.

    Each classifier is fitted on the training data and scored on the training
    and validation data. The two accuracy curves are plotted against k and the
    scores are returned as a dataframe with the columns 'K values',
    'Train Accuracy' and 'Validation Accuracy'. X_test and y_test are accepted
    but not used.
    """
    # Neighbour counts to try: 1 through 20
    neighbour_counts = list(range(1, 21))

    # Accuracy of every fitted model, keyed by the output column it fills
    scores = {'Train Accuracy': [], 'Validation Accuracy': []}

    for n_neighbours in neighbour_counts:
        # Fit on the training data using the "cosine" metric
        knn = KNeighborsClassifier(n_neighbors=n_neighbours, metric='cosine')
        knn.fit(X_train, y_train)

        # Score the training predictions first, then the validation predictions
        scores['Train Accuracy'].append(
            accuracy_score(y_train, knn.predict(X_train)))
        scores['Validation Accuracy'].append(
            accuracy_score(y_valid, knn.predict(X_valid)))

    # Gather everything into one dataframe, k column first
    kAccuracies = pd.DataFrame({'K values': neighbour_counts, **scores})

    # Training curve in blue, validation curve in red, both against k
    curves = (('Train Accuracy', 'b-', 'Training Accuracy'),
              ('Validation Accuracy', 'r-', 'Validation Accuracy'))
    for column, line_style, legend_label in curves:
        plt.plot(neighbour_counts, kAccuracies[column], line_style,
                 label=legend_label)

    plt.xlabel("k")
    plt.ylabel("Accuracy")
    plt.title("Accuracy for k-NN")
    plt.legend(loc='best')
    plt.show()

    return kAccuracies

In [None]:
# Run the Knn model with cosine metric on the split data
# NOTE(review): this rebinds kAccuracies, which until now held the result of
# select_knn_model; re-running the best-k cell after this one would pick k
# from the cosine results instead
kAccuracies = select_knn_model_cos(X_train, y_train, X_valid, y_valid, X_test, y_test)

# Question 5

In [12]:
# A) 

def train_decision_tree(X_train, y_train, X_val, y_val, X_test, y_test,
                        min_samples_leaf=1, random_state=None):
    """Fit a gini decision tree and print its train/validation/test accuracy.

    Parameters
    ----------
    X_train, y_train
        Training features and labels the tree is fitted on.
    X_val, y_val
        Validation features and labels.
    X_test, y_test
        Test features and labels.
    min_samples_leaf
        Passed to DecisionTreeClassifier. Defaults to 1, the value that was
        previously hard-coded, so the setting can be varied without editing
        the function.
    random_state
        Passed to DecisionTreeClassifier. Defaults to None, which is the
        behaviour of the previous version; pass an int to make repeated runs
        build the same tree.

    Returns
    -------
    None
        The three accuracies are printed, not returned.
    """
    # Create an instance of the DecisionTreeClassifier class
    dt = DecisionTreeClassifier(criterion='gini', splitter='best',
                                min_samples_leaf=min_samples_leaf,
                                random_state=random_state)

    # Fit the model to the training data
    dt.fit(X_train, y_train)

    # Predict, score and print each split in the order train, validation, test
    splits = (("Train", X_train, y_train),
              ("Validation", X_val, y_val),
              ("Test", X_test, y_test))
    for split_name, features, labels in splits:
        split_acc = accuracy_score(labels, dt.predict(features))
        print("{} Accuracy: {:.4f}".format(split_name, split_acc))

In [None]:
train_decision_tree(X_train, y_train, X_valid, y_valid, X_test, y_test)

# The model is overfitting; however, adjusting min_samples_leaf lowers the
# Train Accuracy but does not improve the Test Accuracy

# min_samples_leaf = 1:
# Train Accuracy: 1.0000
# Validation Accuracy: 0.7358
# Test Accuracy: 0.5902

# min_samples_leaf = 2:
# Train Accuracy: 0.9485
# Validation Accuracy: 0.6981
# Test Accuracy: 0.5902

# min_samples_leaf = 3:
# Train Accuracy: 0.9265
# Validation Accuracy: 0.6981
# Test Accuracy: 0.5738

In [None]:
# B)

# Take the feature names from the columns of the training frame, so the node
# labels in the rendered tree always match the data the tree is fitted on
feature_names = list(X_train.columns)

# NOTE(review): no random_state is set, so this tree is fitted independently
# of the one built inside train_decision_tree and may differ between runs
dt = DecisionTreeClassifier(criterion='gini', splitter='best', min_samples_leaf=1)

# Fit the model to the training data
dt.fit(X_train, y_train)

# Export only the top of the tree (max_depth = 1) as Graphviz source and
# render it to a png file named 'dt'; view=True also opens the result
graph = Source(export_graphviz(dt, out_file=None, max_depth=1, feature_names=feature_names))
graph.format = 'png'
graph.render('dt', view=True)

# Question 6

In [None]:
# Build a logistic regression with default settings and train it on the
# training data (fit hands back the fitted estimator)
LogReg = LogisticRegression().fit(X_train, y_train)

# Predict labels for the train, validation and test sets, in that order
y_train_pred, y_val_pred, y_test_pred = (
    LogReg.predict(features) for features in (X_train, X_valid, X_test)
)

# Accuracy of each set of predictions against its true labels
train_acc = accuracy_score(y_true=y_train, y_pred=y_train_pred)
val_acc = accuracy_score(y_true=y_valid, y_pred=y_val_pred)
test_acc = accuracy_score(y_true=y_test, y_pred=y_test_pred)

# Report the three scores
print("Train accuracy: ", train_acc)
print("Validation accuracy: ", val_acc)
print("Test accuracy: ", test_acc)