# <u> Part A: Model Code </u>

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('datasets/DataScienceRepository/winequality-white.csv', delimiter=';')

In [3]:
fixed_acidity = df['fixed acidity']
volatile_acidity = df['volatile acidity']

### 5. Euclidean Distance

In [7]:
# Create class of distance functions
class DistanceClass:
    
    def __init__(self):
        pass
    
    def EuclideanDistance(self, a, b):
        norm = a - b
        norm = norm.dot(norm)
        return np.sqrt(norm)
    
    def ManhattanDistance(self, a, b):
        norm = a - b
        norm = norm.dot(norm)
        return norm

In [8]:
x = DistanceClass()
print("Euclidean distance: ", x.EuclideanDistance(fixed_acidity, volatile_acidity))
print("Manhattan distance: ", x.ManhattanDistance(fixed_acidity, volatile_acidity))

Euclidean distance:  464.1110898534962
Manhattan distance:  215399.10372500002


### 7. Accuracy and generalization error of two vectors

In [None]:
'''
This function returns the accuracy and the generalization error.
Y is the target from the data set.
Accuracy is the ratio of the correct predictions and the total number of predictions.
General Error = 1 - accuracy 
'''

def accuracyGeneralizationError(Y, Y_predicted ):
#     accuracy = np.mean(Y == Y_predicted)
#     gn = 1 - accuracy
    count = 0
    accuracy = 0
    for i, k in zip(Y, Y_predicted):
        if (i == 0 and k == 0) or (i ==1 and k == 1) :
            count += 1
            
    accuracy = count / len(Y)
#     print(accuracy)
    gn = 1 - accuracy
    
    return accuracy, gn  

### 8. precision, recall and F1 score

In [None]:
# Precision
def precision(Y, Y_predicted):
    #truePositive = np.sum(Y == Y_predicted).astype(np.int)
    truePositive = 0
    totalTruePositive = 0
    for i, k in zip(Y, Y_predicted):
        if i ==1 and k == 1:
            truePositive +=1
            totalTruePositive +=1
        if i == 0 and k == 1:
            totalTruePositive +=1
            
    return truePositive / totalTruePositive   

In [None]:
# recall
def recall(Y, Y_predicted):
    #truePositive = np.sum(Y == Y_predicted).astype(np.int)
    truePositive = 0
    falseNegative = 0
    for i, k in zip(Y, Y_predicted):
        if i == 1 and k == 0:
            falseNegative +=1
        if i ==1 and k ==1:
            truePositive += 1
            
    return truePositive / ( truePositive + falseNegative )


In [None]:
# F1 score
def F1_score(Y, Y_predicted):
    precision = precision(Y, Y_predicted)
    recall = recall(Y, Y_predicted)    
    
    return ( (precision*recall) / (precision+recall) )*2


### 9. Confusion matrix

In [None]:
def confusion_matrix(Y, Y_predicted):
    
    matrix = np.zeros((2,2))
    
    truePositive= 0
    trueNegative = 0 
    falseNegative = 0
    falsePositive = 0
    #truePositive = np.sum(Y == Y_predicted) #.astype(np.int)
    
    for i, k in zip(Y, Y_predicted):
        if i == 0 and k == 0 :  
            trueNegative +=1
        if i == 1 and k == 0 :
            falseNegative +=1
        if i == 0 and k == 1 :
            falsePositive += 1
        if i == 1 and k ==1:
            truePositive += 1
     
    matrix[0][0] = trueNegative
    matrix[0][1] = falsePositive
    matrix[1][1] = truePositive
    matrix[1][0] = falseNegative
            
    return matrix

#### 10. Receiver Operating Characteristic (ROC) curve

In [None]:
# takes arguments from cross_validation_predict()
def plot_roc_curve(fpr, tpr, label =None):
    
    import matplotlib.pyplot as plt
    
    # these 3 line below are unnecessaary
#     conf_matrix = confusion_matrix(Y, Y_predicted)
#     FPR = conf_matrix[0][1]/ (conf_matrix[0][1] + conf_matrix[0][0])
#     TPR = recall(Y, Y_predicted)
    
    plt.style.use('ggplot')

    fig = plt.figure(figsize=(10, 6))

    plt.plot(fpr, tpr, color='darkorange', linewidth=8, label=label) 
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.axis([0, 1, 0, 1])
    plt.title('ROC Curve (Test Data)')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    
    plt.show()

# 10.

# 11.

# 12.

# 13. KNN Model Class

In [None]:
class KNN(object): # check if this parameter is needed on udemy
    
    def __init__(self):
        #self.k = k
        pass
    
    # distance_f is a function
    def fit(self, training_features, training_labels, k, distance_f, **kwargs):
        self.training_features = training_features
        self.training_labels = training_labels
        self.k = k
        self.distance_f = distance_f
        self.kwargs = kwargs
    
    # predict
    # Need to import:
    # from sortedcontainers import SortedList
    # import numpy as np
    def predict(self, test_features):
        
        y = np.zeros(len(test_features))
        
        for i, x in enumerate(test_features):
            # to store (distance, label) turples
            sl = SortedList(key = self.k) 
            for j, xt in enumerate(self.training_features):
                # compute the distance; here it is Euclidian distance. May need to change it or call the one of the disctance function wrote above
                """ May have to change this code. Call the Euclidian distance function above
                        euclidianDistance(x, xt)
                """
                
#                 temp = x - xt
#                 distance = np.sqrt( temp.dot(temp)  )
                
                distance = euclidianDistance(x, xt)
                
                if len(sl) < self.k:
                    sl.add( ( distance, self.training_labels[j] )  )
                else:
                    if distance < sl[-1][0]:
                        del sl[-1]
                        sl.add( ( distance, self.training_labels[j] )  )
                        
            
            # count how many time a label appears in the sorted list
            labelCount = {}
            for _, l in sl:
                labelCount[l] = labelCount.get(l, 0) + 1 # get() return 0 if label not found; the value otherwise
            
            # Classify; find the label that appears the most
            maxCount = 0
            label = -1
            for l, labelCount in iteritems(labelCount):
                if labelCount > maxCount:
                    labelCount = maxCount
                    label = l
                    
            y[i] = label
            
            
        return y
    
    
        
        
    
        

In [None]:
# Austin's kNN class, fit, and predict functions
import math

class KNNModel:
    
    training_features = []
    training_labels = []
    k = 0
    distance_f = ""
    
    def __init__(self):
        pass
    
    def fit(self, training_features, training_labels, k, distance_f,\
            **kwargs):
        self.training_features = training_features
        self.training_labels = training_labels
        self.k = k
        self.distance_f = distance_f
        arguments = {}
    
    def predict(self, test_features):
        return 0

In [None]:
# Create test instantiation method for KNNModel class
def TestKNNModel():
    df = pd.read_csv('datasets/DataScienceRepository/winequality-white.csv',\
                     delimiter=';')
    training_features = np.array(df[['quality', 'fixed acidity']])
    testKNNModel = KNNModel()
    count = 0
    training_labels = []
    for item in training_features:
        training_labels.append("Row Label #%i" % count)
        count = count + 1
    training_labels = np.array(training_labels)
    distanceObject = DistancesClass()
    k = 2
    testKNNModel.fit(training_features, training_labels, k, distanceObject.EuclideanDistance(3, 4))
    print("Training features: ", testKNNModel.training_features)
    print("Training labels: ", testKNNModel.training_labels)
    print("K: ", testKNNModel.k)
    print("Distance: ", testKNNModel.distance_f)

In [None]:
TestKNNModel()

# Part B: Data Processing

### 14. Read in the file as a pandas data frame

### 15. Convert target into a two-category variable

In [None]:
df["quality"] = (df["quality"] > 5).astype(np.int)

### 16. Summary of each variable in terms of mean, standard deviation, and quartiles

In [None]:
df.describe()

### 17 Shuffle the rows without affecting the order of the data

In [None]:
df =  df.sample(frac=1)
df.head(5)

### 18. Generate pair plot using seaborn package

In [None]:
# Commented out since it runs so long
# # Matplotlib and seaborn for plotting
# import matplotlib.pyplot as plt
# %matplotlib inline

# import seaborn as sns
# from scipy import stats

# # Calculate correlation coefficient
# def corrfunc(x, y, **kws):
#     r, _ = stats.pearsonr(x, y)
#     ax = plt.gca()
#     ax.annotate("r = {:.2f}".format(r),
#                 xy=(.1, .6), xycoords=ax.transAxes,
#                size = 24)
    
# cmap = sns.cubehelix_palette(light=1, dark = 0.1,
#                              hue = 0.5, as_cmap=True)

# sns.set_context(font_scale=2)

# # Pair grid set up
# g = sns.PairGrid(df)

# # Scatter plot on the upper triangle
# g.map_upper(plt.scatter, s=10, color = 'red')

# # Distribution on the diagonal
# g.map_diag(sns.distplot, kde=False, color = 'red')

# # Density Plot and Correlation coefficients on the lower triangle
# g.map_lower(sns.kdeplot, cmap = cmap)
# g.map_lower(corrfunc);

In [None]:
# Remove redundant features
"""
    Need to go over lecture notes. Read 'information gain'. It seems like correrlation is not suffisient to evaluate 
the independence between 2 variables

"""

### 19. Drop the redundant features

### 20. Function to partition the data into train and test set

In [None]:
"""
This function takes 3 arguments: feature matrix (numpy array with rows representing data samples and columns representing 
features.), target vector (numpy array with labels corresponding to each row of the feature matrix), and t ( a real number
to determine the size of partition). 
"""
def partition(features, target, t):
    
    rowIndex = int((features.shape[0]*t))
    
    X_test = features.iloc[:rowIndex]
    X_train = features.iloc[rowIndex:]

    target_test = target.iloc[:rowIndex]
    target_train = target.iloc[rowIndex:]
    
    return  X_train, X_test, target_train, target_test 


### 21. Naively run your kNN model on the train dataset with k = 5 and using Euclidean distance

In [None]:
target = df["quality"]
features = df.drop(["quality"], axis=1)


In [None]:
X_train, X_test, target_train, target_test  = partition(features, target, t =0.2)

In [None]:
from sortedcontainers import SortedList

knn = KNN()
knn.fit(X_train, target_train, 5, euclidianDistance)

In [None]:
# Commented out since it errors
# y_predicted = knn.predict(X_train)
# y_predicted

# Part C: Model Evaluation

<font size="6"><u><strong>22.</strong></u></font>

In [None]:
def sFold(folds, data, labels, model, model_args, error_function):
    return 0

In [None]:
# Example data for k-fold cross validation
ex = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])

In [None]:
# Get Number of Rows and Columns
def GetRowsCols(array):
    print("Data array has %i data samples (rows)" % array.shape[0])
    print("Data array has %i features (columns)" % array.shape[1])

In [None]:
GetRowsCols(ex)

In [None]:
# Partition the data array into n equal chunks
def SPartician(array, n):
    arr_split = np.array_split(array, n)
    return arr_split

In [None]:
SPartician(ex, 4)