# Linear Classifier and ANNs

## Necessary imports

In [59]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

## Data loading and visualizing

In [60]:
df = pd.read_csv("winequality-red.csv")
df.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


### Quality change to binary (1-6.5 = 0 / 6.5-10 = 1)

In [61]:
for i in range(0,len(df)):
    if df["quality"][i]>6.5:
        df["quality"][i]=1
    else:
        df["quality"][i]=0

In [62]:
df["quality"].value_counts()

0    1382
1     217
Name: quality, dtype: int64

## Finding and dropping duplicates

In [63]:
df.loc[df.duplicated()]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
4,7.4,0.700,0.00,1.90,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0
11,7.5,0.500,0.36,6.10,0.071,17.0,102.0,0.99780,3.35,0.80,10.5,0
27,7.9,0.430,0.21,1.60,0.106,10.0,37.0,0.99660,3.17,0.91,9.5,0
40,7.3,0.450,0.36,5.90,0.074,12.0,87.0,0.99780,3.33,0.83,10.5,0
65,7.2,0.725,0.05,4.65,0.086,4.0,11.0,0.99620,3.41,0.39,10.9,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1563,7.2,0.695,0.13,2.00,0.076,12.0,20.0,0.99546,3.29,0.54,10.1,0
1564,7.2,0.695,0.13,2.00,0.076,12.0,20.0,0.99546,3.29,0.54,10.1,0
1567,7.2,0.695,0.13,2.00,0.076,12.0,20.0,0.99546,3.29,0.54,10.1,0
1581,6.2,0.560,0.09,1.70,0.053,24.0,32.0,0.99402,3.54,0.60,11.3,0


In [64]:
df.drop_duplicates()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,0
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,0
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,0
5,7.4,0.660,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1593,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,0
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,0
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,0


## Data standarizing and splitting

In [65]:
X=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [66]:
y.value_counts()

0    1382
1     217
Name: quality, dtype: int64

In [55]:
df.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,0
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,0
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,1
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,1
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,0


In [45]:
sc=StandardScaler()
X=sc.fit_transform(X)

In [46]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=100)

## Logisitic regression with/out crossvalidation

In [73]:
model=LogisticRegression()
model.fit(X_train,y_train)
pred=model.predict(X_test)
acc_score=accuracy_score(y_test,pred)
print("accuracy_score = ",acc_score)

accuracy_score =  0.8770833333333333


In [74]:
cls=classification_report(y_test,pred)
print("classification_report \n",cls)

classification_report 
               precision    recall  f1-score   support

           0       0.91      0.95      0.93       417
           1       0.54      0.41      0.47        63

    accuracy                           0.88       480
   macro avg       0.73      0.68      0.70       480
weighted avg       0.87      0.88      0.87       480



In [69]:
fold=KFold(n_splits=10,shuffle=True,random_state=20)
score=cross_val_score(model,X,y,cv=fold)
#print("10 KFold scores \n",score)
mean=np.array(score).mean()
print("K-fold of Logistic regression model \n",mean)

K-fold of Logistic regression model 
 0.880562106918239


## Multi-layer perceptron 

In [71]:
model2=MLPClassifier()
model2.fit(X_train,y_train)
pred=model2.predict(X_test)
acc_score=accuracy_score(y_test,pred)
print("accuracy_score = ",acc_score)

accuracy_score =  0.8708333333333333


In [72]:
cls=classification_report(y_test,pred)
print("classification_report \n",cls)

classification_report 
               precision    recall  f1-score   support

           0       0.92      0.94      0.93       417
           1       0.51      0.43      0.47        63

    accuracy                           0.87       480
   macro avg       0.71      0.68      0.70       480
weighted avg       0.86      0.87      0.87       480



### Tweaking hyperparameters

In [85]:
model3 = MLPClassifier(learning_rate = 'adaptive',activation='tanh')
model3.fit(X_train,y_train)
pred=model3.predict(X_test)
acc_score=accuracy_score(y_test,pred)
print("accuracy_score = ",acc_score)

accuracy_score =  0.8729166666666667


In [86]:
model4 = MLPClassifier(max_iter = 100,activation='logistic')
model4.fit(X_train,y_train)
pred=model3.predict(X_test)
acc_score=accuracy_score(y_test,pred)
print("accuracy_score = ",acc_score)

accuracy_score =  0.8729166666666667


Data is linearly separable