# <u> Part A: Model Code </u>

### 5. Euclidean Distance

In [23]:
import numpy as np
from numpy.linalg import norm
 
def euclidianDistance(a,b):
    """Return the Euclidean (L2) distance between two vectors.

    Parameters
    ----------
    a, b : array-like
        Numeric vectors of the same (or broadcastable) shape. Plain Python
        sequences are accepted as well as NumPy arrays.

    Returns
    -------
    float
        The L2 norm of ``a - b``.
    """
    # np.asarray makes list/tuple inputs subtractable; arrays pass through as-is.
    difference = np.asarray(a) - np.asarray(b)
    return np.linalg.norm(difference)

### 6. Manhattan Distance

In [24]:
def ManhattanDistance(a,b):
    """Return the Manhattan (L1) distance between two vectors.

    Parameters
    ----------
    a, b : array-like
        Numeric vectors of the same (or broadcastable) shape.

    Returns
    -------
    number
        The sum of the absolute coordinate differences, ``sum(|a_i - b_i|)``.
    """
    # The L1 distance sums absolute differences; the dot product of the
    # difference with itself would give the *squared Euclidean* distance.
    difference = np.asarray(a) - np.asarray(b)
    return np.abs(difference).sum()

### 7. Accuracy and generalization error of two vectors

In [25]:
def accuracyGeneralizationError(Y, Y_predicted ):
    """Return the accuracy and the generalization error of a prediction.

    Parameters
    ----------
    Y : array-like
        The target labels from the data set.
    Y_predicted : array-like
        The predicted labels, with the same shape as ``Y``.

    Returns
    -------
    tuple
        ``(accuracy, generalization_error)`` where accuracy is the ratio of
        correct predictions to the total number of predictions and
        generalization error is ``1 - accuracy``.

    Raises
    ------
    ValueError
        If ``Y`` and ``Y_predicted`` do not have the same shape.
    """
    # Convert first: comparing two plain lists with == yields a single bool,
    # not an element-wise result.
    actual = np.asarray(Y)
    predicted = np.asarray(Y_predicted)
    if actual.shape != predicted.shape:
        # Different shapes would broadcast (or compare as a whole) and give a
        # meaningless accuracy.
        raise ValueError(
            "Y and Y_predicted must have the same shape, got %s and %s"
            % (actual.shape, predicted.shape)
        )

    accuracy = np.mean(actual == predicted)
    gn = 1 - accuracy

    return accuracy, gn

### 8. Precision, recall and F1 score

In [26]:
# Precision
def precision(Y, Y_predicted):
    """Return the precision of binary predictions: TP / (TP + FP).

    Parameters
    ----------
    Y : iterable
        True labels (0 or 1).
    Y_predicted : iterable
        Predicted labels (0 or 1), paired positionally with ``Y``.

    Returns
    -------
    float
        Fraction of predicted positives that are truly positive, or 0.0 when
        nothing was predicted positive (the ratio is undefined in that case).
    """
    truePositive = 0
    falsePositive = 0
    for actual, predicted in zip(Y, Y_predicted):
        if predicted != 1:
            # Only predicted positives contribute to precision.
            continue
        if actual == 1:
            truePositive += 1
        elif actual == 0:
            falsePositive += 1

    predictedPositive = truePositive + falsePositive
    if predictedPositive == 0:
        return 0.0
    return truePositive / predictedPositive

In [27]:
# recall
def recall(Y, Y_predicted):
    """Return the recall of binary predictions: TP / (TP + FN).

    Parameters
    ----------
    Y : iterable
        True labels (0 or 1).
    Y_predicted : iterable
        Predicted labels (0 or 1), paired positionally with ``Y``.

    Returns
    -------
    float
        Fraction of actual positives that were predicted positive, or 0.0
        when there are no actual positives (the ratio is undefined then).
    """
    truePositive = 0
    falseNegative = 0
    for actual, predicted in zip(Y, Y_predicted):
        if actual != 1:
            # Only actual positives contribute to recall.
            continue
        if predicted == 1:
            truePositive += 1
        elif predicted == 0:
            falseNegative += 1

    actualPositive = truePositive + falseNegative
    if actualPositive == 0:
        return 0.0
    return truePositive / actualPositive


In [28]:
# F1 score
def F1_score(Y, Y_predicted):
    """Return the F1 score: the harmonic mean of precision and recall.

    Parameters
    ----------
    Y : iterable
        True labels (0 or 1).
    Y_predicted : iterable
        Predicted labels (0 or 1), paired positionally with ``Y``.

    Returns
    -------
    float
        ``2 * precision * recall / (precision + recall)``, or 0.0 when
        precision and recall are both zero (the ratio is undefined then).
    """
    prec = precision(Y, Y_predicted)
    recal = recall(Y, Y_predicted)

    if prec + recal == 0:
        return 0.0

    return ( (prec*recal) / (prec+recal) )*2


### 9. Confusion matrix

In [29]:
def confusion_matrix(Y, Y_predicted):
    """Build the 2x2 confusion matrix of binary predictions.

    Rows are indexed by the true label and columns by the predicted label:

        [[true negatives,  false positives],
         [false negatives, true positives]]

    Pairs in which either label is neither 0 nor 1 are not counted.

    Returns
    -------
    numpy.ndarray
        A float array of shape (2, 2) holding the four counts.
    """
    counts = np.zeros((2, 2))
    binary_labels = (0, 1)

    for actual, predicted in zip(Y, Y_predicted):
        if actual in binary_labels and predicted in binary_labels:
            # counts[true label][predicted label]
            counts[int(actual)][int(predicted)] += 1

    return counts

#### 10. Receiver Operating Characteristic (ROC) curve

In [30]:
# takes arguments from cross_validation_predict()
def plot_roc_curve(fpr, tpr, label =None):
    """Plot a ROC curve from false-positive and true-positive rates.

    Parameters
    ----------
    fpr : sequence of float
        False positive rates (x axis).
    tpr : sequence of float
        True positive rates (y axis), paired with ``fpr``.
    label : str, optional
        Legend entry for the curve. A legend is drawn only when a label is
        given.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    import matplotlib.pyplot as plt

    plt.style.use('ggplot')

    plt.figure(figsize=(10, 6))

    plt.plot(fpr, tpr, color='darkorange', linewidth=8, label=label)
    # Dashed diagonal (TPR == FPR) as a visual reference line.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.axis([0, 1, 0, 1])
    plt.title('ROC Curve (Test Data)')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

    # Passing label= to plot() only registers the entry; legend() draws it.
    if label is not None:
        plt.legend(loc='lower right')

    plt.show()

# 10.

# 11.

# 12.

# 13. KNN Model Class

In [45]:
class KNN(object):
    """k-nearest-neighbours classifier with a pluggable distance function.

    Usage: call ``fit`` with the training data, ``k`` and a distance
    function, then ``predict`` with the samples to classify. Only NumPy and
    the standard library are required.
    """

    # Not used by the algorithm; kept so that ``KNN.distance`` remains available.
    distance = 0

    def __init__(self):
        pass

    def fit(self, training_features, training_labels, k, distance_f, **kwargs):
        """Store the training data and the model settings.

        Parameters
        ----------
        training_features : array-like
            One row per training sample.
        training_labels : array-like
            One numeric label per training sample, in the same row order.
        k : int
            Number of nearest neighbours that vote on each prediction.
        distance_f : callable
            ``distance_f(x, xt)`` returns the distance between two samples.
        **kwargs
            Stored on the instance as ``self.kwargs``; not used by ``predict``.
        """
        # Convert to arrays so rows and labels are always paired by *position*
        # (a pandas Series with a shuffled index would otherwise be looked up
        # by index label).
        self.training_features = np.asarray(training_features)
        self.training_labels = np.asarray(training_labels)
        self.k = k
        self.distance_f = distance_f
        self.distance = 0
        self.kwargs = kwargs

    def predict(self, test_features):
        """Predict a label for every row of ``test_features``.

        For each sample the ``k`` training samples with the smallest distance
        are kept and the most frequent label among them is returned. On a tie
        in votes, the label that appears first among the neighbours (sorted by
        ``(distance, label)``) wins. A sample with no neighbours (for example
        when ``k`` is 0) gets the label -1.

        Returns
        -------
        numpy.ndarray
            A float array with one predicted label per test sample.
        """
        from bisect import insort

        test_features = np.array(test_features)

        y = np.zeros(len(test_features))

        for i, x in enumerate(test_features):
            # Ascending list of (distance, label) tuples, never longer than k.
            neighbours = []

            for xt, xt_label in zip(self.training_features, self.training_labels):

                distance = self.distance_f(x, xt)

                if len(neighbours) < self.k:
                    insort(neighbours, (distance, xt_label))
                elif neighbours and distance < neighbours[-1][0]:
                    # Closer than the current farthest neighbour: replace it.
                    neighbours.pop()
                    insort(neighbours, (distance, xt_label))

            # Count how many times each label appears among the neighbours.
            votes = {}
            for _, neighbour_label in neighbours:
                votes[neighbour_label] = votes.get(neighbour_label, 0) + 1

            # Classify: pick the label with the most votes.
            maxvotes = 0
            label = -1
            for candidate, count in votes.items():
                if count > maxvotes:
                    maxvotes = count
                    label = candidate

            y[i] = label

        return y

# Part B: Data Processing

### 14. Read in the file as a pandas data frame

In [46]:
import pandas as pd
import numpy as np

# Load the white-wine quality data; the file is semicolon-delimited.
df = pd.read_csv("datasets/winequality-white.csv", sep=";")

### 15. Convert target into a two-category variable

In [33]:
# Binarize the target: 1 when quality > 5, otherwise 0.
# The built-in int replaces the np.int alias, which NumPy has removed.
df["quality"] = (df["quality"] > 5).astype(int)

### 16. Summary of each variable in terms of mean, standard deviation, and quartiles

In [34]:
# Per-column count, mean, standard deviation, min, quartiles and max.
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,0.665169
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.471979
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,0.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,0.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,1.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,1.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,1.0


In [35]:
# For each column, report whether any value is missing (the recorded output shows none are).
df.isnull().any()

fixed acidity           False
volatile acidity        False
citric acid             False
residual sugar          False
chlorides               False
free sulfur dioxide     False
total sulfur dioxide    False
density                 False
pH                      False
sulphates               False
alcohol                 False
quality                 False
dtype: bool

In [36]:
# Drop rows that contain a missing value.
df = df.dropna()

### 17. Shuffle the rows without affecting the order of the data within each row

In [37]:
# sample(frac=1) returns every row exactly once, in random order; each row keeps its original index label.
df =  df.sample(frac=1)
df.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
3817,6.3,0.4,0.24,5.1,0.036,43.0,131.0,0.99186,3.24,0.44,11.3,1
4665,7.3,0.17,0.36,8.2,0.028,44.0,111.0,0.99272,3.14,0.41,12.4,1
263,7.2,0.29,0.4,13.6,0.045,66.0,231.0,0.9977,3.08,0.59,9.6,1
4875,7.4,0.22,0.26,1.2,0.035,18.0,97.0,0.99245,3.12,0.41,9.7,1
2422,6.2,0.44,0.18,7.7,0.096,28.0,210.0,0.99771,3.56,0.72,9.2,0


### 18. Generate pair plot using seaborn package

In [38]:
# # Matplotlib and seaborn for plotting
# import matplotlib.pyplot as plt
# %matplotlib inline

# import seaborn as sns
# from scipy import stats

# # Calculate correlation coefficient
# def corrfunc(x, y, **kws):
#     r, _ = stats.pearsonr(x, y)
#     ax = plt.gca()
#     ax.annotate("r = {:.2f}".format(r),
#                 xy=(.1, .6), xycoords=ax.transAxes,
#                size = 24)
    
# cmap = sns.cubehelix_palette(light=1, dark = 0.1,
#                              hue = 0.5, as_cmap=True)

# sns.set_context(font_scale=2)

# # Pair grid set up
# g = sns.PairGrid(df)

# # Scatter plot on the upper triangle
# g.map_upper(plt.scatter, s=10, color = 'red')

# # Distribution on the diagonal
# g.map_diag(sns.distplot, kde=False, color = 'red')

# # Density Plot and Correlation coefficients on the lower triangle
# g.map_lower(sns.kdeplot, cmap = cmap)
# g.map_lower(corrfunc);

### 19. Drop the redundant features

In [39]:
# Absolute correlation of every attribute with the target 'quality', strongest first
df.corr().abs()['quality'].sort_values(ascending=False)

quality                 1.000000
alcohol                 0.383280
density                 0.268696
volatile acidity        0.225440
chlorides               0.183939
total sulfur dioxide    0.170924
residual sugar          0.092756
fixed acidity           0.089749
pH                      0.083687
sulphates               0.051858
free sulfur dioxide     0.001278
citric acid             0.000700
Name: quality, dtype: float64

In [40]:
# Absolute correlation of every attribute with 'density', strongest first
df.corr().abs()['density'].sort_values(ascending=False)

density                 1.000000
residual sugar          0.838966
alcohol                 0.780138
total sulfur dioxide    0.529881
free sulfur dioxide     0.294210
quality                 0.268696
fixed acidity           0.265331
chlorides               0.257211
citric acid             0.149503
pH                      0.093591
sulphates               0.074493
volatile acidity        0.027114
Name: density, dtype: float64

In [41]:
# 'residual sugar' and 'alcohol' are both highly correlated with 'density', so check whether they are also correlated with each other
df.corr().abs()['alcohol'].sort_values(ascending=False)

alcohol                 1.000000
density                 0.780138
residual sugar          0.450631
total sulfur dioxide    0.448892
quality                 0.383280
chlorides               0.360189
free sulfur dioxide     0.250104
pH                      0.121432
fixed acidity           0.120881
citric acid             0.075729
volatile acidity        0.067718
sulphates               0.017433
Name: alcohol, dtype: float64

In [42]:
# 'residual sugar' and 'alcohol' are not highly correlated with each other, so only 'density' is dropped
# axis is passed by keyword: pandas 2.x no longer accepts it as a positional argument
df = df.drop(['density'], axis=1)
print(df.shape)

(4898, 11)


### 20. Function to partition the data into train and test set

In [43]:
# """
# This function takes 3 arguments: feature matrix (numpy array with rows representing data samples and columns representing 
# features.), target vector (numpy array with labels corresponding to each row of the feature matrix), and t ( a real number
# to determine the size of partition). 
# """
# def partition(features, target, t):
    
#     # Check if 'features' or 'target' is an instance of 'np.ndarray'
#     if isinstance(np_features, np.ndarray):
#         features = pd.DataFrame(np_features, index=range(np_features.shape[0]),
#                           columns=range(np_features.shape[1]))
#     if isinstance(target, np.ndarray):
#         target = pd.DataFrame(target, index=range(target.shape[0]),
#                           columns=range(target.shape[1]))
        
        
#     rowIndex = int((features.shape[0]*t))
    
#     X_test = np.array( features.iloc[:rowIndex]   )
#     X_train = np.array( features.iloc[rowIndex:] )

#     target_test = np.array( target.iloc[:rowIndex]  )
#     target_train = np.array( target.iloc[rowIndex:] )
    
#     return  X_train, X_test, target_train, target_test 


In [70]:
import random
def partition(features, target, t):
    """Split features and target at row ``int(rows * t)`` and report the shapes.

    The first ``int(rows * t)`` rows form the test split and the remaining
    rows form the train split, so ``t`` is the fraction of rows used for
    testing.

    Returns
    -------
    tuple
        ``(X_train.shape, X_test.shape, y_train.shape, y_test.shape)`` -- the
        shapes of the four splits, not the splits themselves.
    """
    feature_rows = np.array(features)
    labels = np.array(target)

    # Rows before the cut are the test split; rows from the cut on are train.
    cut = int(feature_rows.shape[0] * t)

    train_X, test_X = feature_rows[cut:], feature_rows[:cut]
    train_y, test_y = labels[cut:], labels[:cut]

    return train_X.shape, test_X.shape, train_y.shape, test_y.shape

In [71]:
# With t=0.2 the first 20% of the rows are reported as the test split and the rest as the train split.
# NOTE(review): `df` still contains the 'quality' column, so the target is also passed in as a feature -- confirm this is intended.
partition(df, df['quality'], 0.2)

((3919, 12), (979, 12), (3919,), (979,))

### 21. Naively run your kNN model on the train dataset with k = 5 and using Euclidean distance

In [44]:
# NOTE(review): the recorded output is a NameError -- the cell that defines `partition` has to be run before this one.
# NOTE(review): `df` still contains the 'quality' column, so the target is also passed in as a feature -- confirm this is intended.
partition(df, df['quality'], 0.8)

NameError: name 'partition' is not defined

### Create Target Vector

In [None]:
# Build the binary target (1 = quality above 5) and the feature frame.
# 'quality' is binarized earlier in this notebook; thresholding a 0/1 column
# at 5 again would make every label 0, so only threshold raw scores.
quality = df['quality']
if quality.max() > 1:
    target_vector = (quality > 5).astype(int)
else:
    target_vector = quality.astype(int)
features = df.drop(['quality'], axis=1)
print("Target Vector Shape: ", target_vector.shape)
print("Features Shape: ", features.shape)

### Shuffle the Rows of Data

In [None]:
def shuffle(df):
    """Return all rows of ``df`` in random order (``sample(frac=1)``)."""
    return df.sample(frac=1)

In [None]:
# Shuffle the features, then reorder the target by the same index so every
# row keeps its own label. Shuffling the two independently would pair each
# feature row with a random label once they are sliced by position.
features = shuffle(features)
target_vector = target_vector.loc[features.index]
print("Target Vector Shape: ", target_vector.shape)
print("Features Shape: ", features.shape)

### Partition Data

In [None]:
def partition(features, target, t):
    """Split features and target into train and test sets.

    Parameters
    ----------
    features : array-like
        Feature matrix, one row per sample.
    target : array-like
        One label per row of ``features``, in the same order.
    t : float
        Fraction of the rows used for training; the first ``int(rows * t)``
        rows form the training set and the remaining rows the test set.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, X_test, y_train, y_test)``. The shape of each split is
        also printed.

    Raises
    ------
    ValueError
        If ``features`` and ``target`` do not have the same number of rows.
    """
    features = np.array(features)
    # Convert the target as well so it is sliced by position and returned as
    # an array, whatever container (list, Series, ...) it came in.
    target = np.array(target)

    numRowsTotal = features.shape[0]
    if target.shape[0] != numRowsTotal:
        raise ValueError(
            "features and target must have the same number of rows, got %d and %d"
            % (numRowsTotal, target.shape[0])
        )
    numRowsTraining = int(numRowsTotal * t)

    X_train = features[:numRowsTraining]
    X_test = features[numRowsTraining:]
    y_train = target[:numRowsTraining]
    y_test = target[numRowsTraining:]

    print("X_train.shape: ", X_train.shape)
    print("X_test.shape: ", X_test.shape)
    print("y_train.shape: ", y_train.shape)
    print("y_test.shape: ", y_test.shape)

    return X_train, X_test, y_train, y_test

In [None]:
# 80% of the rows go to the training split and the remaining 20% to the test split.
X_train, X_test, y_train, y_test = partition(features, target_vector, 0.8)

In [None]:
# from sortedcontainers import SortedList
# from future.utils import iteritems

# print(X_train.shape)
# print(target_train.shape)

# k= 15
# knn = KNN()
# knn.fit(X_train, target_train, k, euclidianDistance)

# y_predicted = knn.predict(X_train)


In [None]:
#  """ ??????????????????????????????????????????????"""
# y_predicted

In [None]:
def RunKNN(X_train, y_train, distanceFunction, k=5):
    """Fit a KNN model on the training data and return it.

    Parameters
    ----------
    X_train : array-like
        Training features, one row per sample.
    y_train : array-like
        Training labels, one per row of ``X_train``.
    distanceFunction : callable
        Distance between two samples, e.g. ``euclidianDistance``.
    k : int, optional
        Number of neighbours that vote on each prediction (default 5).

    Returns
    -------
    KNN
        The fitted model; call ``predict`` on it to obtain labels.
    """
    knn = KNN()
    knn.fit(X_train, y_train, k, distanceFunction)
    # Return the model: without this the fitted classifier is discarded.
    return knn

In [None]:
# Fit a KNN model on the training split using the Euclidean distance (RunKNN uses k = 5).
RunKNN(X_train, y_train, euclidianDistance)

#### a. Use accuracy and F1 score to compare your predictions to the expected label.

In [None]:
# return a matrix: accurary and generalizationerror
accuracy = accuracyGeneralizationError(target_train, y_predicted)
accuracy[0] 

In [None]:
F1 = F1_score(target_train, y_predicted)
F1

#### b. Standardize each feature of your training set (subtract mean and divide by standard deviation)

In [None]:
def standardization(dataset):
    """Standardize every column of ``dataset`` to zero mean and unit variance.

    Each column becomes ``(column - mean) / std``, using the population
    standard deviation. A constant column (std of 0) is only centred, so it
    becomes all zeros instead of triggering a division by zero.

    Parameters
    ----------
    dataset : numpy.ndarray
        2-D float array with one row per sample and one column per feature.
        It is modified in place; an integer array would have the result
        truncated on assignment.

    Returns
    -------
    numpy.ndarray
        The same array object, now standardized.
    """
    mean = dataset.mean(axis=0)
    std = dataset.std(axis=0)
    # Dividing by 1 leaves a constant column at its centred value (zero).
    safe_std = np.where(std == 0, 1.0, std)

    # Slice assignment keeps the update in place, so callers holding a
    # reference to the array see the standardized values.
    dataset[:] = (dataset - mean) / safe_std
    return dataset
    

In [None]:
# Convert the features to a NumPy array (np.array makes a copy, so `features` itself is not modified later).
np_features = np.array(features)
np_features

In [None]:
# Standardize every feature column; the result replaces np_features.
# NOTE(review): this standardizes all rows, not only the training split named in the heading -- confirm this is intended.
np_features = standardization(np_features)
np_features

#### c. Rerun the kNN model on the standardized data, find the accuracy and F1 score with the expected labels

In [None]:
# we are using 'np_features' (the standardized features) here
# `partition` takes the *training* fraction, so t=0.8 trains on 80% of the rows.
# np.asarray drops the pandas index so labels are paired with rows by position.
X_train, X_test, target_train, target_test  = partition(np_features, np.asarray(target_vector), t=0.8)

knn = KNN()
knn.fit(X_train, target_train, 5, euclidianDistance)

# Predict on the training split itself; the scores below are training scores.
y_predicted = knn.predict(X_train)


In [None]:
# Show the shape of the predictions, then the predictions themselves.
print(y_predicted.shape)
y_predicted

In [None]:

# accuracyGeneralizationError returns (accuracy, generalization error); index 0 is the accuracy
accuracy = accuracyGeneralizationError(target_train, y_predicted)
print("Accuracy: ", accuracy[0] )

# F1 score of the predictions against the training labels
F1 = F1_score(target_train, y_predicted)
print("F1 score: ", F1)

In [None]:
# Sanity check: the labels and the predictions should have the same shape.
print(target_train.shape, "\n", y_predicted.shape)