In [1]:
import os

import numpy as np
import pandas as pd
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn import neighbors, datasets
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [4]:
# Location of the red-wine quality CSV. The default is the original absolute
# path on the author's machine; set the WINE_CSV environment variable to point
# at the file when running the notebook anywhere else.
WINE_CSV = os.environ.get(
    'WINE_CSV',
    'C://Users//Payam//Documents//0_MetroC//Z_My_Teaching//KNN//winequality-red.csv',
)
wdata = pd.read_csv(WINE_CSV)
wdata.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
# class balance: how many wines carry each quality label
label_counts = wdata.groupby('quality').size()
label_counts

quality
3     10
4     53
5    681
6    638
7    199
8     18
dtype: int64

In [6]:
# keep the original (named) column labels for the renaming step
col_names = wdata.columns
col_names

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [7]:
# map every column name to its positional index (name -> 0, 1, 2, ...)
coldic = {name: position for position, name in enumerate(col_names)}
coldic

{'alcohol': 10,
 'chlorides': 4,
 'citric acid': 2,
 'density': 7,
 'fixed acidity': 0,
 'free sulfur dioxide': 5,
 'pH': 8,
 'quality': 11,
 'residual sugar': 3,
 'sulphates': 9,
 'total sulfur dioxide': 6,
 'volatile acidity': 1}

In [8]:
# Replace each column name with its integer position from `coldic`
# (0 = 'fixed acidity' ... 11 = 'quality'); the later cells select columns
# by these numbers.
wdata = wdata.rename(columns=coldic)
wdata.columns

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], dtype='int64')

In [9]:
# per-column flag: True if the column holds at least one null value
has_nulls = wdata.isnull().any()
has_nulls

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
dtype: bool

In [10]:
# keep only the three most frequent quality labels (for the sake of simplicity)
kept_labels = [5, 6, 7]
keep_row = wdata[11].isin(kept_labels)
wdata = wdata[keep_row]
wdata.groupby(11).size()

11
5    681
6    638
7    199
dtype: int64

In [11]:
# split the frame into predictors (all columns but the last) and the label
# (the last column)
ncols = wdata.shape[1]
feature_cols = wdata.columns[:ncols - 1]
target_col = wdata.columns[ncols - 1]
features = wdata[feature_cols]
target = wdata[target_col]

In [None]:
# Scratch cell (not executed in the saved notebook): optional standardisation
# of the features before the split.
# NOTE(review): the commented-out call below refers to `sklearn.preprocessing`,
# but this notebook only imports `neighbors` and `datasets` from sklearn, so an
# extra import would be needed before re-enabling it.
#feat_scaled = pd.DataFrame(sklearn.preprocessing.scale(features))
#feat_scaled[6].mean()
#feat_scaled[6].std()
#features[6].mean()
# spread of column 6 ('total sulfur dioxide') on the unscaled features
features[6].std()

# train-test split on the scaled features (alternative to the unscaled split
# performed in the following cell)
#data_train, data_test, target_train, target_test = train_test_split(feat_scaled, target, test_size=0.2, random_state=10)


In [12]:
# hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split = train_test_split(features, target, test_size=0.2, random_state=10)
data_train, data_test, target_train, target_test = split

In [13]:
# 1-nearest-neighbour classifier with uniform weights, fitted on the training
# split and used to predict the held-out rows
n_neigh = 1
knnc = neighbors.KNeighborsClassifier(n_neighbors=n_neigh, weights='uniform')
pred = knnc.fit(data_train, target_train).predict(data_test)

In [17]:
# number of test rows assigned to each predicted label
pred_series = pd.Series(pred)
pred_series.groupby(pred_series).size()

5    135
6    133
7     36
dtype: int64

In [18]:
# number of test rows per true label, for comparison with the predicted counts
true_counts = target_test.groupby(target_test).size()
true_counts

11
5    137
6    126
7     41
Name: 11, dtype: int64

In [16]:
# per-class precision/recall/F1, followed by the overall accuracy
# (fraction of test rows predicted correctly)
report = classification_report(target_test, pred)
print(report)
accuracy_score(target_test, pred)

             precision    recall  f1-score   support

          5       0.66      0.65      0.65       137
          6       0.58      0.61      0.59       126
          7       0.47      0.41      0.44        41

avg / total       0.60      0.60      0.60       304



0.60197368421052633

In [19]:
# Sequential Feature Selector
# Forward selection (no floating) that scores each candidate subset by 5-fold
# cross-validated accuracy on the training split only. k_features is taken
# from the width of data_train instead of a hard-coded 11, so subsets of every
# size up to the full feature set are still evaluated if the columns change.
n_features = data_train.shape[1]
sfs1 = SFS(knnc, k_features=n_features, forward=True, floating=False, verbose=2, scoring='accuracy', cv=5)
sfs1 = sfs1.fit(np.array(data_train), np.array(target_train))

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.1s finished

[2017-07-22 19:56:43] Features: 1/11 -- score: 0.502418611053[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished

[2017-07-22 19:56:43] Features: 2/11 -- score: 0.626084767669[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.1s finished

[2017-07-22 19:56:43] Features: 3/11 -- score: 0.668877185826[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s finished

[2017-07-22 19:56:43] Features: 4/11 -- score: 0.668890818451[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s finished

[

In [21]:
# cross-validation scores and chosen feature indices for each subset size
subset_scores = sfs1.subsets_
subset_scores

{1: {'avg_score': 0.50241861105287566,
  'cv_scores': array([ 0.51639344,  0.52459016,  0.52263374,  0.41735537,  0.53112033]),
  'feature_idx': (10,)},
 2: {'avg_score': 0.62608476766930909,
  'cv_scores': array([ 0.57786885,  0.6352459 ,  0.65020576,  0.62809917,  0.63900415]),
  'feature_idx': (0, 10)},
 3: {'avg_score': 0.668877185825837,
  'cv_scores': array([ 0.65983607,  0.6557377 ,  0.69135802,  0.67355372,  0.66390041]),
  'feature_idx': (0, 1, 10)},
 4: {'avg_score': 0.66889081845104226,
  'cv_scores': array([ 0.6557377 ,  0.6557377 ,  0.69135802,  0.66942149,  0.67219917]),
  'feature_idx': (0, 1, 10, 7)},
 5: {'avg_score': 0.66722080792483485,
  'cv_scores': array([ 0.67213115,  0.63934426,  0.69547325,  0.67355372,  0.65560166]),
  'feature_idx': (0, 1, 7, 8, 10)},
 6: {'avg_score': 0.66724127080122231,
  'cv_scores': array([ 0.67622951,  0.63114754,  0.69135802,  0.66942149,  0.66804979]),
  'feature_idx': (0, 1, 4, 7, 8, 10)},
 7: {'avg_score': 0.65736481191686258,
  'cv

In [24]:
# restrict both splits to the 4-feature subset reported by the selector:
# fixed acidity (0), volatile acidity (1), alcohol (10), density (7)
best_cols = [0, 1, 10, 7]
data_train_best = data_train[best_cols]
data_test_best = data_test[best_cols]

In [25]:
# re-fit the same classifier on the selected features only; from here on
# `knnc` and `pred` refer to the reduced-feature model
pred = knnc.fit(data_train_best, target_train).predict(data_test_best)

In [26]:
# per-class report and overall accuracy for the reduced-feature model
report = classification_report(target_test, pred)
print(report)
accuracy_score(target_test, pred)

             precision    recall  f1-score   support

          5       0.71      0.80      0.75       137
          6       0.66      0.58      0.62       126
          7       0.59      0.54      0.56        41

avg / total       0.67      0.67      0.67       304



0.67434210526315785