In [8]:
# import statements
import numpy as np
import pandas as pd
from numpy.linalg import norm
from sortedcontainers import SortedList
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
import random

# <u> Part A: Model Code </u>

### 5. Euclidean Distance

In [47]:
import math
def euclideanDistance(instance1, instance2, length):
    distance = 0
    for x in range(length):
        distance += pow((instance1[x] - instance2[x]), 2)
    return math.sqrt(distance)

In [102]:
TestEuclideanDistance = True
if TestEuclideanDistance:
    data1 = [2, 2, 2, 'a']
    data2 = [4, 4, 4, 'b']
    distance = euclideanDistance(data1, data2, 3)
    print('Distance: ', distance)

Distance:  3.4641016151377544


### 6. Manhattan Distance

In [10]:
def ManhattanDistance(a,b):
    norm = a - b
    norm = norm.dot(norm)
    return norm

### 7. Accuracy and generalization error of two vectors

In [11]:
'''
This function returns the accuracy and the generalization error.
Y is the target from the data set.
Accuracy is the ratio of the correct predictions and the total number of predictions.
General Error = 1 - accuracy 
'''

def accuracyGeneralizationError(Y, Y_predicted ):
    accuracy = np.mean(Y == Y_predicted)
    gn = 1 - accuracy
#     count = 0
#     accuracy = 0
#     for i, k in zip(Y, Y_predicted):
#         if (i == 0 and k == 0) or (i ==1 and k == 1) :
#             count += 1
            
#     accuracy = count / len(Y)
# #     print(accuracy)
#     gn = 1 - accuracy
    
    return accuracy, gn  

### 8. precision, recall and F1 score

In [12]:
# Precision
def precision(Y, Y_predicted):
    #truePositive = np.sum(Y == Y_predicted).astype(np.int)
    truePositive = 0
    totalTruePositive = 0
    falsePositive =0
    for i, k in zip(Y, Y_predicted):
        if i ==1 and k == 1:
            truePositive +=1
            totalTruePositive +=1
        if i == 0 and k == 1:
            falsePositive +=1
            
    return truePositive / (truePositive + falsePositive )  

In [13]:
# recall
def recall(Y, Y_predicted):
    #truePositive = np.sum(Y == Y_predicted).astype(np.int)
    truePositive = 0
    falseNegative = 0
    for i, k in zip(Y, Y_predicted):
        if i == 1 and k == 0:
            falseNegative +=1
        if i ==1 and k ==1:
            truePositive += 1
                
    return truePositive / ( truePositive + falseNegative )


In [14]:
# F1 score
def F1_score(Y, Y_predicted):
    prec =0
    prec = precision(Y, Y_predicted)
    recal = 0
    recal = recall(Y, Y_predicted)    
    
    return ( (prec*recal) / (prec+recal) )*2


### 9. Confusion matrix

In [15]:
def confusion_matrix(Y, Y_predicted):
    
    matrix = np.zeros((2,2))
    
    truePositive= 0
    trueNegative = 0 
    falseNegative = 0
    falsePositive = 0
    #truePositive = np.sum(Y == Y_predicted) #.astype(np.int)
    
    for i, k in zip(Y, Y_predicted):
        if i == 0 and k == 0 :  
            trueNegative +=1
        if i == 1 and k == 0 :
            falseNegative +=1
        if i == 0 and k == 1 :
            falsePositive += 1
        if i == 1 and k ==1:
            truePositive += 1
     
    matrix[0][0] = trueNegative
    matrix[0][1] = falsePositive
    matrix[1][1] = truePositive
    matrix[1][0] = falseNegative
            
    return matrix

#### 10. Receiver Operating Characteristic (ROC) curve

In [16]:
# takes arguments from cross_validation_predict()
def plot_roc_curve(fpr, tpr, label =None):
    
    import matplotlib.pyplot as plt
    
    # these 3 line below are unnecessaary
#     conf_matrix = confusion_matrix(Y, Y_predicted)
#     FPR = conf_matrix[0][1]/ (conf_matrix[0][1] + conf_matrix[0][0])
#     TPR = recall(Y, Y_predicted)
    
    plt.style.use('ggplot')

    fig = plt.figure(figsize=(10, 6))

    plt.plot(fpr, tpr, color='darkorange', linewidth=8, label=label) 
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.axis([0, 1, 0, 1])
    plt.title('ROC Curve (Test Data)')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    
    plt.show()

# 10.

# 11.

# 12.

# 13. KNN Model Class

In [99]:
import operator
class KNN(object): # check if this parameter is needed on udemy
    
    def __init__(self):
        #self.k = k
        pass
    
    # distance_f is a function
    def fit(self, training_features, training_labels, k, distance_f, **kwargs):
        
        self.training_features = training_features
        self.training_labels = training_labels
        self.k = k
        self.distance_f = distance_f
        self.distance = 0
        self.kwargs = kwargs
        
    
    def predict(self, testInstance):
        distances = []
        length = self.kwargs["testLength"]
        for x in range(len(self.training_features)):
            dist = self.distance_f(testInstance,\
                                   self.training_features[x], length)
            distances.append((self.training_features[x], dist))
        distances.sort(key=operator.itemgetter(1))
        neighbors = []
        for x in range(self.k):
            neighbors.append(distances[x][0])
        return neighbors
        
    
    # predict
    # Need to import:
    # from sortedcontainers import SortedList
    # import numpy as np
    
#     distance = 0
#     def predict(self, test_features):
        
#         test_features = np.array(test_features)
        
#         y = np.zeros(len(test_features))
        
#         for i, x in enumerate(test_features):
#             # to store (distance, label) turples
#             sl = SortedList() 
           
#             for j, xt in enumerate(self.training_features):
                
#                 distance = self.distance_f(x, xt)
                
#                 if len(sl) < self.k:
                   
#                     sl.add(  (distance, self.training_labels[j])   )
#                 else:
#                     if distance < sl[-1][0]:
#                         del sl[-1]
#                         sl.add( ( distance, self.training_labels[j] )  )      
            
#             # count how many time a label appears in the sorted list
#             labelCount = {}
#             for _, l in sl:
#                 labelCount[l] = labelCount.get(l, 0) + 1 # get() return 0 if label not found; the value otherwise
            
#             # Classify; find the label that appears the most
#             maxvotes = 0
#             label = -1
#             for l, labelCount in labelCount.items():
#                 if labelCount > maxvotes:
#                     maxvotes = labelCount
#                     label = l
                    
#             y[i] = label
            
            
#         return y

In [100]:
def TestPredict():
    trainSet = [[2, 2, 2], [4, 4, 4]]
    trainLabels = ['a', 'b']
    testInstance = [5, 5, 5]
    testInstanceLength = len(testInstance)
    k = 1
    knn = KNN()
    knn.fit(trainSet, trainLabels, 1,\
            euclideanDistance,\
            testLength=testInstanceLength)
    print(knn.predict(testInstance))

In [101]:
TestPredict()

[[4, 4, 4]]


In [80]:
def test_var_args_call(arg1, arg2, arg3):
    print("arg1:", arg1)
    print("arg2:", arg2)
    print("arg3:", arg3)

args = ("two", 3)
test_var_args_call(1, *args)

arg1: 1
arg2: two
arg3: 3


# Part B: Data Processing

### 14. Read in the file as a pandas data frame

In [18]:
def ReadToDataFrame():
    df = pd.read_csv("datasets/winequality-white.csv", sep=";")
    return df

### 15. Convert target into a two-category variable

In [19]:
df = ReadToDataFrame()
df["quality"] = (df["quality"] > 5).astype(np.int)

### 16. Summary of each variable in terms of mean, standard deviation, and quartiles

In [20]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,0.665169
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.471979
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,0.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,0.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,1.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,1.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,1.0


In [21]:
df.isnull().any()

fixed acidity           False
volatile acidity        False
citric acid             False
residual sugar          False
chlorides               False
free sulfur dioxide     False
total sulfur dioxide    False
density                 False
pH                      False
sulphates               False
alcohol                 False
quality                 False
dtype: bool

In [22]:
df = df.dropna()

### 17 Shuffle the rows without affecting the order of the data

In [23]:
df =  df.sample(frac=1)
df.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
4306,6.1,0.24,0.27,11.5,0.05,51.0,133.0,0.99476,3.22,0.37,10.8,1
4582,6.3,0.37,0.51,6.3,0.048,35.0,146.0,0.9943,3.1,1.01,10.5,1
4159,7.4,0.16,0.3,13.7,0.056,33.0,168.0,0.99825,2.9,0.44,8.7,1
1691,7.2,0.25,0.28,14.4,0.055,55.0,205.0,0.9986,3.12,0.38,9.0,1
727,7.0,0.21,0.34,8.0,0.057,19.0,101.0,0.9954,2.99,0.59,9.4,0


### 18. Generate pair plot using seaborn package

In [24]:
# Calculate correlation coefficient
def corrfunc(x, y, **kws):
    r, _ = stats.pearsonr(x, y)
    ax = plt.gca()
    ax.annotate("r = {:.2f}".format(r),
                xy=(.1, .6), xycoords=ax.transAxes,
               size = 24)

def RunSeaborn():
    cmap = sns.cubehelix_palette(light=1, dark = 0.1,
                                 hue = 0.5, as_cmap=True)

    sns.set_context(font_scale=2)

    # Pair grid set up
    g = sns.PairGrid(df)

    # Scatter plot on the upper triangle
    g.map_upper(plt.scatter, s=10, color = 'red')

    # Distribution on the diagonal
    g.map_diag(sns.distplot, kde=False, color = 'red')

    # Density Plot and Correlation coefficients on the lower triangle
    g.map_lower(sns.kdeplot, cmap = cmap)
    g.map_lower(corrfunc);

### 19. Drop the redundant features

In [25]:
# Looking for numerical correlations between the attributes and target
df.corr().abs()['quality'].sort_values(ascending=False)

quality                 1.000000
alcohol                 0.383280
density                 0.268696
volatile acidity        0.225440
chlorides               0.183939
total sulfur dioxide    0.170924
residual sugar          0.092756
fixed acidity           0.089749
pH                      0.083687
sulphates               0.051858
free sulfur dioxide     0.001278
citric acid             0.000700
Name: quality, dtype: float64

In [26]:
# correlation: density vs others
df.corr().abs()['density'].sort_values(ascending=False)

density                 1.000000
residual sugar          0.838966
alcohol                 0.780138
total sulfur dioxide    0.529881
free sulfur dioxide     0.294210
quality                 0.268696
fixed acidity           0.265331
chlorides               0.257211
citric acid             0.149503
pH                      0.093591
sulphates               0.074493
volatile acidity        0.027114
Name: density, dtype: float64

In [27]:
# Because 'residual sugar' and 'alcohol' are highly correlated to density, let check there are related to each other
df.corr().abs()['alcohol'].sort_values(ascending=False)

alcohol                 1.000000
density                 0.780138
residual sugar          0.450631
total sulfur dioxide    0.448892
quality                 0.383280
chlorides               0.360189
free sulfur dioxide     0.250104
pH                      0.121432
fixed acidity           0.120881
citric acid             0.075729
volatile acidity        0.067718
sulphates               0.017433
Name: alcohol, dtype: float64

In [28]:
# Because 'residual sugar' and 'alcohol' are not highly correlated, we drop only 'density'
df = df.drop(['density' ], 1)
print(df.shape)

(4898, 11)


### 20. Function to partition the data into train and test set

In [29]:
def partition(features, target, t):
    features = np.array(features)
    target = np.array(target)
    
    rows = features.shape[0]
    training_size = int(rows * t)
    test_size = int(rows - training_size)
    
    # slice the features and target arrays for train and test data
    X_train = features[training_size:]
    X_test = features[:training_size]
    
    y_train = target[training_size:]
    y_test = target[:training_size]
    
    return X_train, X_test, y_train, y_test

### 21. Naively run your kNN model on the train dataset with k = 5 and using Euclidean distance

In [30]:
data = np.array(df)
target_vector = df['quality']
X_train, X_test, y_train, y_test = partition(data, target_vector, 0.2)
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [44]:
target_vector

4306    1
4582    1
4159    1
1691    1
727     0
1904    1
3263    1
1921    0
1254    0
2196    1
2486    0
2546    0
4148    1
760     1
1102    0
1042    0
3697    1
4891    1
4791    1
2816    0
3015    1
3333    1
340     1
495     1
1397    1
3486    1
4864    0
1580    0
4287    0
3423    0
       ..
2826    1
2433    1
1132    0
3052    1
4644    1
1802    1
3106    1
1950    0
2272    1
2184    1
2360    1
3811    0
874     0
3181    1
4353    1
3497    1
605     1
3144    1
1503    1
462     0
1269    1
516     0
2770    1
1204    1
3586    1
118     0
1359    1
3805    1
790     1
1286    1
Name: quality, Length: 4898, dtype: int64

In [31]:
def NaiveKNN(X_train, y_train, k, distance_f):
    knn = KNN()
    knn.fit(X_train, y_train, k, distance_f)
    y_predicted = knn.predict(y_train)
    return y_predicted

In [32]:
def PredictionLabelsToFile(arr, name_of_file):
    with open(name_of_file, 'w') as f:
        for item in arr:
            f.write("%s\n" % item)
        f.close()

In [43]:
runNaiveKNN = True
if runNaiveKNN:
    PredictionLabelsToFile(NaiveKNN(X_train, y_train, 5, euclideanDistance),\
                           "naiveknnpredictionlabels.txt")

#### a. Use accuracy and F1 score to compare your predictions to the expected label.

In [34]:
# return a matrix: accurary and generalizationerror
def GetAccuracy(data, prediction):
    accuracy = accuracyGeneralizationError(data, prediction)
    return accuracy[0]

In [35]:
def GetF1_Score(data, prediction):
    F1 = F1_score(data, prediction)
    return F1

#### b. Standardize each feature of your training set (subtract mean and divide by standard deviation

In [36]:
def standardization(dataset):
    
    for index in range( 0, dataset.shape[1] ):
        mean = dataset[:,index].mean()
        std = dataset[:, index].std()
        dataset[:, index] =  ( dataset[:, index] * mean  ) / std
        dataset = np.array(dataset)
    return dataset
    

#### c. Rerun the kNN model on the standardized data, find the accuracy and F1 score with the expected labels

In [37]:
# accuracy = accuracyGeneralizationError(target_train, y_predicted)
# print("Accuracy: ", accuracy[0] )

# F1 = F1_score(target_train, y_predicted)
# print("F1 score: ", F1)

# Part C: Model Evaluation

<font size="6"><u><strong>22.</strong></u></font>

In [38]:
def SPartition(df, k):
    data = np.array(df)
    numRows = data.shape[0]
    numFolds = int(numRows / k)
    folds = np.array_split(data, numFolds)
    return folds

In [39]:
def MakeKModels(folds):
    k = 3
    predictedLabels = []
    for fold in folds:
        testFold = np.delete(data, fold)
        knn = KNN()
        knn.fit(fold, testFold, k, euclideanDistance)
        predictedLabels.append(knn.predict(fold))
    return predictedLabels

In [40]:
def computeClassification(target_vector, foldPredictedLabels):
    accuracyGeneralizationError = \
    accuracyGeneralizationError(target_vector, foldPredictedLabels)
    
    

In [41]:
def sFold(folds, trainingData, labels, model, model_args, error_function):
    return 0

In [42]:
SPartition(df, 3)
#np.array(df)

[array([[6.10e+00, 2.40e-01, 2.70e-01, 1.15e+01, 5.00e-02, 5.10e+01,
         1.33e+02, 3.22e+00, 3.70e-01, 1.08e+01, 1.00e+00],
        [6.30e+00, 3.70e-01, 5.10e-01, 6.30e+00, 4.80e-02, 3.50e+01,
         1.46e+02, 3.10e+00, 1.01e+00, 1.05e+01, 1.00e+00],
        [7.40e+00, 1.60e-01, 3.00e-01, 1.37e+01, 5.60e-02, 3.30e+01,
         1.68e+02, 2.90e+00, 4.40e-01, 8.70e+00, 1.00e+00],
        [7.20e+00, 2.50e-01, 2.80e-01, 1.44e+01, 5.50e-02, 5.50e+01,
         2.05e+02, 3.12e+00, 3.80e-01, 9.00e+00, 1.00e+00]]),
 array([[7.00e+00, 2.10e-01, 3.40e-01, 8.00e+00, 5.70e-02, 1.90e+01,
         1.01e+02, 2.99e+00, 5.90e-01, 9.40e+00, 0.00e+00],
        [7.90e+00, 2.60e-01, 3.30e-01, 1.03e+01, 3.90e-02, 7.30e+01,
         2.12e+02, 2.93e+00, 4.90e-01, 9.50e+00, 1.00e+00],
        [6.50e+00, 2.60e-01, 3.10e-01, 3.60e+00, 3.00e-02, 3.60e+01,
         9.20e+01, 3.22e+00, 6.20e-01, 1.26e+01, 1.00e+00],
        [7.60e+00, 1.30e-01, 3.40e-01, 9.30e+00, 6.20e-02, 4.00e+01,
         1.26e+02, 3.21e+0